In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def content_based_recommendation(title,cs):
    try:
        index = (df[df["title"]==title].index.values[0])
    except IndexError:
        return
    
    similarity = cs[index]
    similarity_list = list(enumerate(similarity))
    similarity_list = sorted(similarity_list, key = lambda x: x[1]) 
    similarity_list = similarity_list[::-1]
    top_7 = [i[0] for i in similarity_list[1:8]]

    return (df['title'].iloc[top_7])

In [3]:
def random_recommendation(title):
    genre = df.loc[df['title'] == title, 'listed_in'].iloc[0]
    data = df[df['listed_in'] == genre]
    data.drop(data.index[data['title'] == title], inplace = True)
    if (data.shape[0]) > 7:
        data = (data.sample(7))
    return(data["title"])

In [4]:
df = pd.read_csv('netflix_titles.csv')

In [5]:
df = df[df['director'].notnull()]
df = df[df['cast'].notnull()]
df['rating'].fillna('PG-13',inplace = True)
df['duration'].fillna('90 min',inplace = True)
m = df['country'].mode()[0]
df['country'].fillna(m,inplace = True)

In [20]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,data_soup
2,s3,TV Show,Ganglands,julienleclercq,"[samibouajila, tracygotoas, samueljouy]",United States,"September 24, 2021",2021,TV-MA,1 Season,"[crimetvshows, internationaltvshows, tvaction&...",To protect his family from a powerful drug lor...,samibouajila tracygotoas samueljouy julienlec...
5,s6,TV Show,Midnight Mass,mikeflanagan,"[katesiegel, zachgilford, hamishlinklater]",United States,"September 24, 2021",2021,TV-MA,1 Season,"[tvdramas, tvhorror, tvmysteries]",The arrival of a charismatic young priest brin...,katesiegel zachgilford hamishlinklater mikefl...
6,s7,Movie,My Little Pony: A New Generation,robertcullen,"[vanessahudgens, kimikoglenn, jamesmarsden]",United States,"September 24, 2021",2021,PG,91 min,[children&familymovies],Equestria's divided. But a bright-eyed hero be...,vanessahudgens kimikoglenn jamesmarsden rober...
7,s8,Movie,Sankofa,hailegerima,"[kofighanaba, oyafunmikeogunlano, alexandraduah]","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"[dramas, independentmovies, internationalmovies]","On a photo shoot in Ghana, an American model s...",kofighanaba oyafunmikeogunlano alexandraduah ...
8,s9,TV Show,The Great British Baking Show,andydevonshire,"[melgiedroyc, sueperkins, maryberry]",United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,"[britishtvshows, realitytv]",A talented batch of amateur bakers face off in...,melgiedroyc sueperkins maryberry andydevonshi...


In [7]:
# First model is based on the description of a particular movie using cosine similarity and bag of words.
# Word vector will be calculated using TF-IDF

tfidf = TfidfVectorizer(stop_words='english')
df["description"].fillna('')
word_matrix = tfidf.fit_transform(df['description'])

In [8]:
print(word_matrix.shape)

(5700, 14767)


In [9]:
cs = cosine_similarity(word_matrix, word_matrix)

In [10]:
top_7_recommendations = content_based_recommendation("Ganglands",cs)

In [11]:
print("Top recommendations based on the plot:")
print(top_7_recommendations)

Top recommendations based on the plot:
4364    My Little Pony Friendship Is Magic: Best Gift ...
4588                                      My Friend Pinto
7110                      Jack and the Cuckoo-Clock Heart
5485                                            Ram Jaane
555                                           Snowpiercer
6641                                          Dragonheart
2314                                             Stardust
Name: title, dtype: object


In [12]:
top_7_random_recommendations = random_recommendation("Ganglands")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [13]:
print("Top recommendations based on the genre:")
print(top_7_random_recommendations)

Top recommendations based on the genre:
11      Bangkok Breaking
1223              Dealer
3356         Nowhere Man
Name: title, dtype: object


In [14]:
def processData(x):
    if len(x.split(",")) > 3:
        return x.split(",")[0:3]
    else:
        return x.split(",")


In [15]:
def processing(x):
    if type(x) == list:
            return [i.replace(" ", "").lower() for i in x]
    else:
            return (x.replace(" ", "").lower())

In [16]:
def create_soup(data):
    return  ' ' + ' '.join(data['cast']) + ' ' + data['director'] + ' ' + ' '.join(data['listed_in'])

In [21]:
def recommendation_director(title):
    data = df
    data["cast"] = data["cast"].apply(processData)
    data["listed_in"] = data["listed_in"].apply(processData)
    data["director"] = data["director"].apply(lambda x: x.split(",")[0])
    
    categories = ["director","listed_in","cast"]

    for c in categories:
        data[c] = data[c].apply(processing)

    data["data_soup"] = data.apply(create_soup, axis=1)
    
    count = CountVectorizer(stop_words='english')
    cm = count.fit_transform(data['data_soup'])

    #Use cosine similarity 
    cs = cosine_similarity(cm, cm)
    res = content_based_recommendation(title,cs)
    return res

In [18]:
top_7_director = recommendation_director("Ganglands")

In [22]:
print("Top Movies based on combination of Cast, director and Genre:")
print(top_7_director)

Top Movies based on combination of Cast, director and Genre:
6433             Cats & Dogs: The Revenge of Kitty Galore
3016                                                  Hop
3248                          The Knight Before Christmas
1681                  The Princess Switch: Switched Again
2858    Calico Critters: Everyone's Big Dream Flying i...
2188                                           Sugar High
1304           Animals on the Loose: A You vs. Wild Movie
Name: title, dtype: object
