In [331]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from rake_nltk import Rake

In [332]:
#pip install python-rake


In [333]:
#pip install rake-nltk

In [334]:
netflix=pd.read_csv('C:/Users/SAJAN P MENON/Desktop/week-9-case_study/discover_dollar/NETFLIX TITLES.csv')
netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81088285,Movie,The Mayo Clinic,"Ken Burns, Christopher Loren Ewers, Erik Ewers",Peter Coyote,United States,"April 19, 2019",2018,TV-14,116 min,Documentaries,A look at how a world-renowned medical institu...
1,81077597,Movie,I Am,Onir,"Juhi Chawla, Rahul Bose, Nandita Das, Sanjay S...","India, Japan","March 4, 2019",2010,TV-MA,106 min,"Dramas, Independent Movies, International Movies",Four individuals in modern India grapple with ...
2,1150871,Movie,Love Jones,Theodore Witcher,"Larenz Tate, Nia Long, Isaiah Washington, Lisa...",United States,"November 1, 2019",1997,R,109 min,"Comedies, Dramas, Independent Movies","In this urban romantic comedy set in Chicago, ..."
3,20077944,Movie,Ghayal,Rajkumar Santoshi,"Sunny Deol, Meenakshi Sheshadri, Amrish Puri, ...",India,"December 31, 2019",1990,TV-14,163 min,"Action & Adventure, Dramas, International Movies","Framed for his older brother's murder, a boxer..."
4,80223779,Movie,Marriage Story,Noah Baumbach,"Scarlett Johansson, Adam Driver, Laura Dern, A...","United States, United Kingdom","December 6, 2019",2019,R,137 min,Dramas,Academy Award-nominated filmmaker Noah Baumbac...


In [335]:
#checking shape of the dataset
netflix.shape

(4675, 12)

In [336]:
#we shall take following features as main parameters which would help in recommendation

data=netflix[['title','director','cast','country','release_year','listed_in','description']]

### PreProcessing details - 'description

"""as we cant use the 'description' feature directly in our model and have to vectorize it, we shall clean our data first using natural language processing libraries first

Approach:
1) tokenization has to be done to get each words as separate tokens
2) then we remove stop words from to reduce redundancy
3) then we apply lemmatizer to reduce different forms of the same word into its root form.
4) lastly, we shall change the strings into lower case so that duplicated wont be an issue later"""



In [337]:


def text_extractor(x):
    
    
    rw_words=[]
    for words in x.split():
        for word in nltk.word_tokenize(words):
            if word.isalnum()==True:
                    rw_words.append(word)
                    
    from nltk.corpus import stopwords
    stop_words=stopwords.words("english")
    rw_words_nostop=[]
    for i in rw_words:
        if i not in stop_words:
            rw_words_nostop.append(i)
            
    from nltk.stem import WordNetLemmatizer
    lemmatizer=WordNetLemmatizer()
    words_postLemma_rw=[]
    for i in rw_words_nostop:
        words_postLemma_rw.append(lemmatizer.lemmatize(i))
        
    lower_case_rw=[]
    for i in words_postLemma_rw:
        lower_case_rw.append(i.lower())
        
    return(lower_case_rw)
        
        
data['imp_words']=data['description'].apply(text_extractor)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [338]:
data=data.drop(columns=['description'],axis=1)
data.head(3)

Unnamed: 0,title,director,cast,country,release_year,listed_in,imp_words
0,The Mayo Clinic,"Ken Burns, Christopher Loren Ewers, Erik Ewers",Peter Coyote,United States,2018,Documentaries,"[a, look, medical, institution, prioritizes, p..."
1,I Am,Onir,"Juhi Chawla, Rahul Bose, Nandita Das, Sanjay S...","India, Japan",2010,"Dramas, Independent Movies, International Movies","[four, individual, modern, india, grapple, ide..."
2,Love Jones,Theodore Witcher,"Larenz Tate, Nia Long, Isaiah Washington, Lisa...",United States,1997,"Comedies, Dramas, Independent Movies","[in, urban, romantic, comedy, set, chicago, up..."


### PreProcessing details - 'cast'

In [339]:
#checking null values in 'cast' feature
data['cast'].isnull().sum()*100/data.shape[0]

9.390374331550802

#as the missing values are almost 10% of the data and also there is no cross sectional relation between different rows, we shall flag it as 'missing_cast' rather than filling with mode. 

In [340]:
data['cast']=data['cast'].fillna('missingcast')

In [341]:

def text_comb(x):
    text_comb_list=[]
    for i in str(x).split(',')[:5]:
        text_comb_list.append(str(i).lower().replace(' ',''))
    return(text_comb_list)
        
data['cast_comb']=data['cast'].apply(text_comb)

### PreProcessing details - 'director'

In [342]:
#checking null values in 'director' feature
data['director'].isnull().sum()*100/data.shape[0]

30.86631016042781

#as the missing values are almost 30% of the data and also there is no cross sectional relation between different rows, we shall flag it as 'missingdirector' rather than filling with mode. 

In [343]:
data['director']=data['director'].fillna('missingdirector')

In [344]:

def director_comb(x):
    director_comb_list=[]
    for i in str(x).split(','):
        director_comb_list.append(str(i).lower().replace(' ',''))
    return(director_comb_list)
        
data['director_comb']=data['director'].apply(director_comb)

### PreProcessing details - 'country'

In [345]:
#checking null values in 'director' feature
data['country'].isnull().sum()*100/data.shape[0]

7.46524064171123

#as the missing values are almost 7.5% of the data and also there is no cross sectional relation between different rows, we shall flag it as 'missingcountry' rather than filling with mode. 

In [346]:
data['country']=data['country'].fillna('missingcountry')

In [347]:

def country_comb(x):
    country_comb_list=[]
    for i in str(x).split(','):
        country_comb_list.append(str(i).lower().replace(' ',''))
    return(country_comb_list)
        
data['country_comb']=data['country'].apply(country_comb)

### PreProcessing details - 'country'

In [348]:
#checking null values in 'director' feature
data['listed_in'].isnull().sum()*100/data.shape[0]

0.0

In [349]:

def listedin_comb(x):
    listedin_comb_list=[]
    for i in str(x).split(','):
        listedin_comb_list.append(str(i).lower().replace(' ',''))
    return(listedin_comb_list)
        
data['listedin_comb']=data['listed_in'].apply(listedin_comb)

In [350]:
df=data.drop(columns=['cast','director','country','listed_in'],axis=1)
df.head(2)

Unnamed: 0,title,release_year,imp_words,cast_comb,director_comb,country_comb,listedin_comb
0,The Mayo Clinic,2018,"[a, look, medical, institution, prioritizes, p...",[petercoyote],"[kenburns, christopherlorenewers, erikewers]",[unitedstates],[documentaries]
1,I Am,2010,"[four, individual, modern, india, grapple, ide...","[juhichawla, rahulbose, nanditadas, sanjaysuri...",[onir],"[india, japan]","[dramas, independentmovies, internationalmovies]"


# Bag Of words Model

o	Bag of Words Modelling concept is implemented, which is a way of extracting features from text for using in modelling.
o	A bag-of-words is a representation of text that describes the occurrence of words within a document. It involves two things:
o	A vocabulary of known words.
o	A measure of the presence of known words (we are taking count of words here as I assumed repetition of words can help in find the similarity)


In [351]:
#Copying the dataframe
df1=df.copy()

In [352]:
df1.set_index('title', inplace = True)
df1.head()

Unnamed: 0_level_0,release_year,imp_words,cast_comb,director_comb,country_comb,listedin_comb
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
The Mayo Clinic,2018,"[a, look, medical, institution, prioritizes, p...",[petercoyote],"[kenburns, christopherlorenewers, erikewers]",[unitedstates],[documentaries]
I Am,2010,"[four, individual, modern, india, grapple, ide...","[juhichawla, rahulbose, nanditadas, sanjaysuri...",[onir],"[india, japan]","[dramas, independentmovies, internationalmovies]"
Love Jones,1997,"[in, urban, romantic, comedy, set, chicago, up...","[larenztate, nialong, isaiahwashington, lisani...",[theodorewitcher],[unitedstates],"[comedies, dramas, independentmovies]"
Ghayal,1990,"[framed, older, brother, murder, boxer, seek, ...","[sunnydeol, meenakshisheshadri, amrishpuri, mo...",[rajkumarsantoshi],[india],"[action&adventure, dramas, internationalmovies]"
Marriage Story,2019,"[academy, filmmaker, noah, baumbach, directs, ...","[scarlettjohansson, adamdriver, lauradern, ala...",[noahbaumbach],"[unitedstates, unitedkingdom]",[dramas]


### creating a list with bag of words

In [353]:
df1['bag_of_words'] = ''
columns = df1.columns
bow_list=[]

for index, row in df1.iterrows():
    words = ''    
    for col in columns:
        if col !='release_year':
            words = words + ' '.join(row[col])+ ' '
        else:
            words = words + ''.join(str(row[col]))+ ' '

    bow_list.append(words)


In [354]:
bow_list

['2018 a look medical institution prioritizes patient need adapted healthcare evolving demand 150 year petercoyote kenburns christopherlorenewers erikewers unitedstates documentaries  ',
 '2010 four individual modern india grapple identity amid social taboo trauma brutal sexual discrimination quartet story juhichawla rahulbose nanditadas sanjaysuri manishakoirala onir india japan dramas independentmovies internationalmovies  ',
 '1997 in urban romantic comedy set chicago ups down courtship play young black poet pretty shutterbug larenztate nialong isaiahwashington lisanicolecarson billbellamy theodorewitcher unitedstates comedies dramas independentmovies  ',
 '1990 framed older brother murder boxer seek violent revenge true killer torn family apart sunnydeol meenakshisheshadri amrishpuri moushumichatterjee kulbhushankharbanda rajkumarsantoshi india action&adventure dramas internationalmovies  ',
 '2019 academy filmmaker noah baumbach directs incisive compassionate look marriage coming 

In [355]:
df_title=list(df['title'])
df_title

['The Mayo Clinic',
 'I Am',
 'Love Jones',
 'Ghayal',
 'Marriage Story',
 'Chloe',
 "DC's Legends of Tomorrow",
 'Filosofi Kopi The Movie',
 'Little Lunch',
 'Heartthrob',
 'Camp X-Ray',
 'The Last Exorcism',
 'Monster High: Why Do Ghouls Fall in Love?',
 'Goon: Last of the Enforcers',
 'Jimmy Carr: Funny Business',
 'Winter Sun',
 'A.D. Kingdom and Empire',
 'Let It Snow',
 'Secrets of Althorp - The Spencers',
 'Snow Day',
 "Ricardo O'Farrill Abrazo Genial",
 'Parmanu: The Story of Pokhran',
 'XXX: State of the Union',
 'The Spooky Tale of Captain Underpants Hack-a-ween',
 'My Little Pony Equestria Girls: Rollercoaster of Friendship',
 '13TH: A Conversation with Oprah Winfrey & Ava DuVernay',
 'The Vendor',
 'This Is Not What I Expected',
 'Una',
 'Get Him to the Greek',
 'Guru',
 'Görümce',
 'Thithi',
 'Catwalk: Tales from the Cat Show Circuit',
 'Billu',
 "Joan Rivers: Don't Start with Me",
 'The Great British Baking Show',
 'Fakta Ladh Mhana',
 'Gagarin: First in Space',
 'One Hea

In [356]:
#creating a dataframe with title name and bag of words against it.

final_df=pd.DataFrame({'bag_of_words':bow_list},index=df_title)
final_df.head()

Unnamed: 0,bag_of_words
The Mayo Clinic,2018 a look medical institution prioritizes pa...
I Am,2010 four individual modern india grapple iden...
Love Jones,1997 in urban romantic comedy set chicago ups ...
Ghayal,1990 framed older brother murder boxer seek vi...
Marriage Story,2019 academy filmmaker noah baumbach directs i...


### Vectorization of the bag of words

In [357]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
count_matrix = count.fit_transform(final_df['bag_of_words'])


In [358]:
indices = pd.Series(final_df.index)
indices[:5]

0    The Mayo Clinic
1               I Am
2         Love Jones
3             Ghayal
4     Marriage Story
dtype: object

### calculating cosine similiarity between bag-of-words

In [359]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.        , 0.04499213, ..., 0.        , 0.086711  ,
        0.        ],
       [0.        , 1.        , 0.0728357 , ..., 0.08104409, 0.03509312,
        0.        ],
       [0.04499213, 0.0728357 , 1.        , ..., 0.        , 0.14824986,
        0.        ],
       ...,
       [0.        , 0.08104409, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.086711  , 0.03509312, 0.14824986, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

# function to take movie title as input and shows top 3 recommended movies.

In [360]:
# function that takes in movie title as input and returns the top 10 recommended movies
def top_recommendations(title, cosine_sim = cosine_sim):
    
    recommended_movies = []
    idx = indices[indices == title].index[0]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    top_3_indexes = list(score_series.iloc[1:4].index)
    
    for i in top_3_indexes:
        recommended_movies.append(list(final_df.index)[i])
        
    return recommended_movies



# The movie name we search should be in the netflix_title dataset so that the calculated cosine similarity could be filtered.

top_recommendations("Badla")   # top three recommendations for this movie will be displayed.

['Deewangee', 'Oththa Seruppu Size 7', 'Shuddhi']