In [1]:
# import libraries needed
import pandas as pd
from stop_words import get_stop_words
from nltk.corpus import stopwords
import random
import nltk
import math


In [2]:
# setting the path to our file
path = 'netflixMoviesData.csv'
# convert our csv file to a pandas DataFrame
# converters allows pandas to know that abilities is a list, not a string
movieData = pd.read_csv(path)

In [3]:
movieData

Unnamed: 0,show_id,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,type
0,81193313,Chocolate,,"Ha Ji-won, Yoon Kye-sang, Jang Seung-jo, Kang ...",South Korea,"November 30, 2019",2019,TV-14,1 Season,"International TV Shows, Korean TV Shows, Roman...",Brought together by meaningful meals in the pa...,TV Show
1,81197050,Guatemala: Heart of the Mayan World,"Luis Ara, Ignacio Jaunsolo",Christian Morales,,"November 30, 2019",2019,TV-G,67 min,"Documentaries, International Movies","From Sierra de las Minas to Esquipulas, explor...",Movie
2,81213894,The Zoya Factor,Abhishek Sharma,"Sonam Kapoor, Dulquer Salmaan, Sanjay Kapoor, ...",India,"November 30, 2019",2019,TV-14,135 min,"Comedies, Dramas, International Movies",A goofy copywriter unwittingly convinces the I...,Movie
3,81082007,Atlantics,Mati Diop,"Mama Sane, Amadou Mbow, Ibrahima Traore, Nicol...","France, Senegal, Belgium","November 29, 2019",2019,TV-14,106 min,"Dramas, Independent Movies, International Movies","Arranged to marry a rich man, young Ada is cru...",Movie
4,80213643,Chip and Potato,,"Abigail Oliver, Andrea Libman, Briana Buckmast...","Canada, United Kingdom",,2019,TV-Y,2 Seasons,Kids' TV,"Lovable pug Chip starts kindergarten, makes ne...",TV Show
...,...,...,...,...,...,...,...,...,...,...,...,...
5832,70141644,Mad Ron's Prevues from Hell,Jim Monaco,"Nick Pawlow, Jordu Schell, Jay Kushwara, Micha...",United States,"November 1, 2010",1987,NR,84 min,"Cult Movies, Horror Movies","This collection cherry-picks trailers, forgott...",Movie
5833,70127998,Splatter,Joe Dante,"Corey Feldman, Tony Todd, Tara Leigh, Erin Way...",United States,"November 18, 2009",2009,TV-14,29 min,Horror Movies,"After committing suicide, a washed-up rocker r...",Movie
5834,70084180,Just Another Love Story,Ole Bornedal,"Anders W. Berthelsen, Rebecka Hemse, Nikolaj L...",Denmark,"May 5, 2009",2007,NR,104 min,"Dramas, International Movies",When he causes a car accident that leaves a yo...,Movie
5835,70157452,Dinner for Five,,,United States,"February 4, 2008",2007,TV-MA,1 Season,Stand-Up Comedy & Talk Shows,"In each episode, four celebrities join host Jo...",TV Show


In [4]:
#Creating variables for the movie's titles and descriptions

movieTitles = movieData.title
movieDescriptions = movieData.description

In [5]:
#Testing the description
movieDescriptions[10]

"After an expensive night out, two flatmates get tangled in an overnight misadventure to recover their rent money to pay their dead landlord's daughter."

In [6]:
#Working out how many unique titles there are in the dataset
len(movieTitles.unique())

5780

In [7]:
# establishing a base line for our algorithm
humanAccuracy = 1/5780
#print(humanAccuracy)
roundedHumanAccuracy = round(humanAccuracy, 4)
strHumanAccuracy = str(roundedHumanAccuracy)
print(strHumanAccuracy)

0.0002


In [8]:
#Function that creates the features through the movies' descriptions
def movies_features(movieDescript):
    # stopwords list is all lowercase so we need to match
    #print(type(movieDescript))
    descriptions = movieDescript.lower()
    words = descriptions.split()
    keywords = [x for x in words if x not in get_stop_words('english','spanish')]
    while len(keywords) < 3:
        keywords.append('None')
    return {'first_keyword': keywords[0], 'second_keyword': keywords[1], 'third_keyword': keywords[2]}
    

movies_features('After an expensive night out, two flatmates get tangled in an overnight misadventure to recover')


{'first_keyword': 'expensive',
 'second_keyword': 'night',
 'third_keyword': 'out,'}

In [9]:
#Creating tuples for the movie descriptions and titles
zipped_features = zip(movieDescriptions,movieTitles)

In [10]:
# Creating feature sets for each of the elements in my zipped variable
featuresets = [(movies_features(x), y) for x, y in zipped_features ]
featuresets

[({'first_keyword': 'brought',
   'second_keyword': 'together',
   'third_keyword': 'meaningful'},
  'Chocolate'),
 ({'first_keyword': 'sierra', 'second_keyword': 'de', 'third_keyword': 'las'},
  'Guatemala: Heart of the Mayan World'),
 ({'first_keyword': 'goofy',
   'second_keyword': 'copywriter',
   'third_keyword': 'unwittingly'},
  'The Zoya Factor'),
 ({'first_keyword': 'arranged',
   'second_keyword': 'marry',
   'third_keyword': 'rich'},
  'Atlantics'),
 ({'first_keyword': 'lovable',
   'second_keyword': 'pug',
   'third_keyword': 'chip'},
  'Chip and Potato'),
 ({'first_keyword': 'nollywood',
   'second_keyword': 'star',
   'third_keyword': 'ramsey'},
  'Crazy people'),
 ({'first_keyword': 'romance,',
   'second_keyword': 'mystery',
   'third_keyword': 'adventure'},
  'I Lost My Body'),
 ({'first_keyword': 'life',
   'second_keyword': 'times',
   'third_keyword': 'iconic'},
  'Kalushi: The Story of Solomon Mahlangu'),
 ({'first_keyword': 'compelling',
   'second_keyword': 'show

In [11]:
#Shuffling my sets
random.shuffle(featuresets)

In [12]:
# Working out the 80% of my data

print(f'Features length:  {len(featuresets)}')
split_num = math.floor(len(featuresets)*.8)
print(f'80% split number:  {split_num}')

# split feature sets into training and test sets (here we'll try 80% train, 20% test)
train_set, test_set = featuresets[:split_num], featuresets[split_num+1:]


Features length:  5837
80% split number:  4669


In [13]:
# Printing training set
train_set

[({'first_keyword': 'longtime',
   'second_keyword': 'mayor',
   'third_keyword': 'marseille'},
  'Marseille'),
 ({'first_keyword': 'middle',
   'second_keyword': 'schooler',
   'third_keyword': 'valt'},
  'Beyblade Burst'),
 ({'first_keyword': 'old-school',
   'second_keyword': 'brooklyn',
   'third_keyword': 'native'},
  "It's Bruno!"),
 ({'first_keyword': 'join',
   'second_keyword': 'historian',
   'third_keyword': 'dan'},
  'Secrets of Great British Castles'),
 ({'first_keyword': 'teens',
   'second_keyword': 'korean',
   'third_keyword': 'descent'},
  'Seoul Searching'),
 ({'first_keyword': 'obsessed',
   'second_keyword': 'aspiring',
   'third_keyword': 'writer,'},
  'You'),
 ({'first_keyword': 'intimate',
   'second_keyword': 'lives',
   'third_keyword': 'young'},
  'Hong Kong West Side Stories'),
 ({'first_keyword': 'future',
   'second_keyword': 'technology',
   'third_keyword': 'rendered'},
  'Anon'),
 ({'first_keyword': 'successful',
   'second_keyword': "dj's",
   'third_k

In [14]:
# build a classifier based on the training set
# note the train_set is a list of tuples where the first item of the tuple is a dictionary of features
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [31]:
#Testing the classifier
classifier.classify(movies_features("iconic rockstar"))

'John Mellencamp: Plain Spoken'

In [26]:
classifier.classify(movies_features("fun friendship movie"))

'True: Magical Friends'

In [50]:
classifier.classify(movies_features("sad lonely tired"))

'Yours Fatefully'

In [51]:
#Rounding the AI accuracy and converting it into a string to print it and compare it to the human accuracy

roundedClassifierAccuracy = round(nltk.classify.accuracy(classifier, test_set), 4)
strClassifierAccuracy = str(roundedClassifierAccuracy)

In [52]:
print('human accuracy: ' + strHumanAccuracy + ' -- vs -- my classifier: ' + strClassifierAccuracy)

human accuracy: 0.0002 -- vs -- my classifier: 0.0009


In [130]:
classifier.show_most_informative_features(12)

Most Informative Features
           first_keyword = 'documentary'  #Rucke : Limitl =      1.0 : 1.0
           first_keyword = 'encountering' At the : The Si =      1.0 : 1.0
           first_keyword = 'group'        Assimi : Oh My  =      1.0 : 1.0
           first_keyword = 'possessed'    Fear F : Oh My  =      1.0 : 1.0
           first_keyword = 'skilled'      Kickbo : Oh My  =      1.0 : 1.0
           first_keyword = 'world'        Angel  : The Si =      1.0 : 1.0
          second_keyword = 'attack'       Kuromu : The Si =      1.0 : 1.0
          second_keyword = 'career'       Hostil : Limitl =      1.0 : 1.0
          second_keyword = 'follows'      Alt-Ri : Limitl =      1.0 : 1.0
          second_keyword = 'friends'         5CM : Oh My  =      1.0 : 1.0
          second_keyword = 'ghost'        Cyborg : Oh My  =      1.0 : 1.0
          second_keyword = 'musician'     Covere : Limitl =      1.0 : 1.0


In [199]:
#Trying to improve the accuracy by adding more keywrods. It seems more accurate by somehow the accuracy number has decreased to 0
# Adding more keywords to our classifier, and remove number
# and using nltk stopwords


def movies_features2(movieDescript):
    description = movieDescript.lower()
    words = description.split()
    s_words = stopwords.words('english', 'spanish')
    keywords = [x for x in words if x not in s_words and not x.isdigit()]
    while len(keywords) < 8:
        keywords.append('None')
    return {'first_keyword': keywords[0], 'second_keyword': keywords[1], 'third_keyword': keywords[2], 'fourth_keyword': keywords[3], 'fifth_keyword': keywords[4], 'sixth_keyword': keywords[5],'seventh_keyword': keywords[6], 'eight_keyword': keywords[7]}

movies_features2('I am feeling like a sad christmas movie about love tonight')



{'first_keyword': 'feeling',
 'second_keyword': 'like',
 'third_keyword': 'sad',
 'fourth_keyword': 'christmas',
 'fifth_keyword': 'movie',
 'sixth_keyword': 'love',
 'seventh_keyword': 'tonight',
 'eight_keyword': 'None'}

In [200]:
zipped_features2 = zip(movieDescriptions, movieTitles)
featuresets2 = [(movies_features2(x), y) for x, y in zipped_features2 ]
featuresets2


[({'first_keyword': 'brought',
   'second_keyword': 'together',
   'third_keyword': 'meaningful',
   'fourth_keyword': 'meals',
   'fifth_keyword': 'past',
   'sixth_keyword': 'present,',
   'seventh_keyword': 'doctor',
   'eight_keyword': 'chef'},
  'Chocolate'),
 ({'first_keyword': 'sierra',
   'second_keyword': 'de',
   'third_keyword': 'las',
   'fourth_keyword': 'minas',
   'fifth_keyword': 'esquipulas,',
   'sixth_keyword': 'explore',
   'seventh_keyword': "guatemala's",
   'eight_keyword': 'cultural'},
  'Guatemala: Heart of the Mayan World'),
 ({'first_keyword': 'goofy',
   'second_keyword': 'copywriter',
   'third_keyword': 'unwittingly',
   'fourth_keyword': 'convinces',
   'fifth_keyword': 'indian',
   'sixth_keyword': 'cricket',
   'seventh_keyword': 'team',
   'eight_keyword': 'she’s'},
  'The Zoya Factor'),
 ({'first_keyword': 'arranged',
   'second_keyword': 'marry',
   'third_keyword': 'rich',
   'fourth_keyword': 'man,',
   'fifth_keyword': 'young',
   'sixth_keyword':

In [201]:
random.shuffle(featuresets2)

In [202]:
split_num2 = math.floor(len(featuresets2)*.8)
print(split_num2)

# split feature sets into training and test sets (here we'll try 80% train, 20% test)
train_set2, test_set2 = featuresets2[:split_num2], featuresets2[split_num2+1:]

print('improved train set: ' + str(len(train_set2)) + ' & improved test set: ' + str(len(test_set2)))

4669
improved train set: 4669 & improved test set: 1167


In [203]:
classifier2 = nltk.NaiveBayesClassifier.train(train_set2)

In [217]:
classifier2.classify(movies_features2("a movie about caste differences thro wrench blossoming relationship couple"))

'Evvarikee Cheppoddu'

In [212]:
print(nltk.classify.accuracy(classifier2, test_set2))

0.0


In [215]:
classifier2.show_most_informative_features(30)

Most Informative Features
          fourth_keyword = 'expose'       Sarkar : The Re =      1.7 : 1.0
           third_keyword = 'mission'      Sarkar : Valor  =      1.7 : 1.0
           sixth_keyword = 'fraud'        Sarkar : The Ad =      1.7 : 1.0
         seventh_keyword = 'brings'       Sarkar : Bob Ro =      1.7 : 1.0
           first_keyword = 'documentary'  100 Ye : Limitl =      1.0 : 1.0
           first_keyword = 'encountering' At the : The Si =      1.0 : 1.0
           first_keyword = 'failed'       Outlaw : Limitl =      1.0 : 1.0
           first_keyword = 'man'          100 Me : Love   =      1.0 : 1.0
           first_keyword = 'rebellious'   Crazy  : Love   =      1.0 : 1.0
           first_keyword = 'world'        Angel  : The Si =      1.0 : 1.0
          second_keyword = 'attack'       Kuromu : The Si =      1.0 : 1.0
          second_keyword = 'career'       Hostil : Limitl =      1.0 : 1.0
          second_keyword = 'five'         Chupan : Love   =      1.0 : 1.0