# **NETFLIX RECOMMENDATION SYSTEM**
This is a content based recommendation system (unorganised)


In [2]:
%pip install neattext

Collecting neattext
  Downloading neattext-0.1.3-py3-none-any.whl.metadata (12 kB)
Downloading neattext-0.1.3-py3-none-any.whl (114 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/114.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.7/114.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neattext
Successfully installed neattext-0.1.3


In [4]:
import pandas as pd
import numpy as np
import neattext.functions as nfx
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

In [126]:
df=pd.read_csv('netflix_titles.csv')
print(df.shape)
df.head(4)

(8807, 12)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."


In [127]:
df.isna().sum()

Unnamed: 0,0
show_id,0
type,0
title,0
director,2634
cast,825
country,831
date_added,10
release_year,0
rating,4
duration,3


In [128]:
df.rename(columns={'listed_in':'genre'},inplace=True)

In [129]:
df.type.value_counts()

Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
Movie,6131
TV Show,2676


# **Movie Data Vectorization**

In [130]:
df_movie=df[df['type']=='Movie'].copy()
#df_movie.drop('type', axis=1,inplace=True)
df_movie.reset_index(drop=True,inplace=True)
print(df_movie.shape)
df_movie.head(4)

(6131, 12)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,genre,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",,"September 24, 2021",2021,PG,91 min,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...
2,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
3,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"September 24, 2021",2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...


In [131]:
# filling NaN manually at rating column so pandas can treat it as a Non null-value
df_movie['rating'].fillna('NaN', inplace= True)

# Dropping null values
df_movie.dropna(inplace= True)
df_movie.reset_index(drop=True,inplace=True)

In [132]:
df_movie.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
release_year,5186.0,2012.613768,9.705851,1942.0,2011.0,2016.0,2018.0,2021.0


In [133]:
movies=df_movie[['title','director','cast','country','genre','rating']].copy()
movies.head(4)

Unnamed: 0,title,director,cast,country,genre,rating
0,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","Dramas, Independent Movies, International Movies",TV-MA
1,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"Comedies, Dramas",PG-13
2,Je Suis Karl,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...","Germany, Czech Republic","Dramas, International Movies",TV-MA
3,Jeans,S. Shankar,"Prashanth, Aishwarya Rai Bachchan, Sri Lakshmi...",India,"Comedies, International Movies, Romantic Movies",TV-14


### **PREPARING DATA FOR VECTORIZATION**

In [134]:
#Removing stopwords and special characters since they have negligible influence on text analysis
def remove_stop_char(data,column,sw=True):
  if sw:
    data[column]=data[column].apply(nfx.remove_stopwords)
  else:
    data[column]=data[column].apply(nfx.remove_special_characters)

remove_stop_char(movies,'director')
remove_stop_char(movies,'cast')
remove_stop_char(movies,'country')
remove_stop_char(movies,'genre')

remove_stop_char(movies,'country',False)

movies.head(4)

Unnamed: 0,title,director,cast,country,genre,rating
0,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...",United States Ghana Burkina Faso United Kingdo...,"Dramas, Independent Movies, International Movies",TV-MA
1,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"Comedies, Dramas",PG-13
2,Je Suis Karl,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...",Germany Czech Republic,"Dramas, International Movies",TV-MA
3,Jeans,S. Shankar,"Prashanth, Aishwarya Rai Bachchan, Sri Lakshmi...",India,"Comedies, International Movies, Romantic Movies",TV-14


### **VECTORIZATION**

In [135]:
#Vectorizing Data
def vectorization(data,column,token=True):
  if token:
    countVector = CountVectorizer(binary=True, tokenizer=lambda x:x.split(','))
    return countVector.fit_transform(data[column]).toarray()
  else:
    countVector = CountVectorizer(binary=True)
    return countVector.fit_transform(data[column]).toarray()

country = vectorization(movies,'country',False)
director = vectorization(movies,'director')
cast = vectorization(movies,'cast')
genre = vectorization(movies,'genre')

In [136]:
binary_director=pd.DataFrame(director).transpose()
binary_cast=pd.DataFrame(cast).transpose()
binary_country=pd.DataFrame(country).transpose()
binary_genre=pd.DataFrame(genre).transpose()

In [137]:
# Concating Dataframe
movie_binary = pd.concat([binary_director,binary_cast,binary_country,binary_genre],axis=0, ignore_index=True)
movie_binary.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30813,30814,30815,30816,30817,30818,30819,30820,30821,30822
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5181,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5182,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5183,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5184,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [138]:
movie_sim=cosine_similarity(movie_binary.T)
movie_sim

array([[1.        , 0.1118034 , 0.16269784, ..., 0.12909944, 0.11952286,
        0.12403473],
       [0.1118034 , 1.        , 0.        , ..., 0.21650635, 0.13363062,
        0.        ],
       [0.16269784, 0.        , 1.        , ..., 0.        , 0.        ,
        0.13453456],
       ...,
       [0.12909944, 0.21650635, 0.        , ..., 1.        , 0.15430335,
        0.        ],
       [0.11952286, 0.13363062, 0.        , ..., 0.15430335, 1.        ,
        0.        ],
       [0.12403473, 0.        , 0.13453456, ..., 0.        , 0.        ,
        1.        ]])

In [139]:
movie_sim.shape

(5186, 5186)

# **TV DATA VECTORIZATION**

In [140]:
df_tv=df[df.type=='TV Show'].copy()
df_tv.reset_index(drop=True, inplace=True)
print(df_tv.shape)
df_tv.head(4)

(2676, 12)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,genre,description
0,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
1,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
2,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
3,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [141]:
# filling NaN manually at director column so pandas can treat it as a Non null-value
df_tv['director'].fillna('NaN', inplace= True)

# Dropping null values
df_tv.dropna(inplace= True)
df_tv.reset_index(drop=True,inplace=True)

In [143]:
tv=df_tv[['title','director','cast','country','genre','rating']].copy()
tv.head(4)

Unnamed: 0,title,director,cast,country,genre,rating
0,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"International TV Shows, TV Dramas, TV Mysteries",TV-MA
1,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"International TV Shows, Romantic TV Shows, TV ...",TV-MA
2,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,"British TV Shows, Reality TV",TV-14
3,Dear White People,,"Logan Browning, Brandon P. Bell, DeRon Horton,...",United States,"TV Comedies, TV Dramas",TV-MA


In [144]:
tv.describe().T

Unnamed: 0,count,unique,top,freq
title,2013,2013,Blood & Water,1
director,2013,142,,1866
cast,2013,1980,David Attenborough,14
country,2013,184,United States,618
genre,2013,219,Kids' TV,161
rating,2013,9,TV-MA,881


### **Preparing Data for Vectorization**

In [145]:
remove_stop_char(tv,'director')
remove_stop_char(tv,'cast')
remove_stop_char(tv,'country')
remove_stop_char(tv,'genre')

remove_stop_char(tv,'country',False)

### **Vectorizing**

In [146]:
tv_country = vectorization(df_tv,'country',False)
tv_director = vectorization(df_tv,'director')
tv_cast = vectorization(df_tv,'cast')
tv_genre = vectorization(df_tv,'genre')

In [147]:
tv_binary_director=pd.DataFrame(tv_director).transpose()
tv_binary_cast=pd.DataFrame(tv_cast).transpose()
tv_binary_country=pd.DataFrame(tv_country).transpose()
tv_binary_genre=pd.DataFrame(tv_genre).transpose()

In [148]:
# Concating Dataframe
tv_binary = pd.concat([tv_binary_cast,  tv_binary_country, tv_binary_genre], axis=0,ignore_index=True)
tv_binary.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14061,14062,14063,14064,14065,14066,14067,14068,14069,14070
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2010,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2011,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [149]:
tv_sim = cosine_similarity(tv_binary.T)
tv_sim

array([[1.        , 0.05892557, 0.        , ..., 0.08908708, 0.05455447,
        0.1132277 ],
       [0.05892557, 1.        , 0.        , ..., 0.06299408, 0.        ,
        0.16012815],
       [0.        , 0.        , 1.        , ..., 0.        , 0.09449112,
        0.        ],
       ...,
       [0.08908708, 0.06299408, 0.        , ..., 1.        , 0.        ,
        0.12104551],
       [0.05455447, 0.        , 0.09449112, ..., 0.        , 1.        ,
        0.        ],
       [0.1132277 , 0.16012815, 0.        , ..., 0.12104551, 0.        ,
        1.        ]])

# **RECOMENDATION ENGINE TESTING**

In [150]:
def recommedation(title):
  if title in df_movie.title.values:
    index=df_movie[df_movie.title == title].index.item()
    scores=dict(enumerate(movie_sim[index]))
    sorted_scores=dict(sorted(scores.items(),key=lambda x:x[1],reverse=True))

    selected_movies_index=[id for id, scores in sorted_scores.items()]
    selected_movies_score=[scores for id, scores in sorted_scores.items()]

    recommend_movies=df_movie.iloc[selected_movies_index]
    recommend_movies['similarity'] = selected_movies_score

    movie_recommend = recommend_movies.reset_index(drop=True)
    return movie_recommend[1:6]

  elif title in df_tv['title'].values:
    index=df_tv[df_tv.title == title].index.item()
    scores=dict(enumerate(tv_sim[index]))
    sorted_scores=dict(sorted(scores.items(),key=lambda x:x[1],reverse=True))

    selected_tv_index=[id for id, scores in sorted_scores.items()]
    selected_tv_score=[scores for id, scores in sorted_scores.items()]

    recommend_tv=df_tv.iloc[selected_tv_index]
    recommend_tv['similarity'] = selected_tv_score

    tv_recommend = recommend_tv.reset_index(drop=True)
    return tv_recommend[1:6]

  else:
    print('Title not found')

In [151]:
recommedation("Child's Play")

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,genre,description,similarity
1,s6416,Movie,Candyman,Bernard Rose,"Virginia Madsen, Tony Todd, Xander Berkeley, K...","United States, United Kingdom","October 1, 2019",1992,R,99 min,"Cult Movies, Horror Movies",Grad student Helen Lyle unintentionally summon...,0.276026
2,s8238,Movie,The Car,Elliot Silverstein,"James Brolin, Kathleen Lloyd, John Marley, R.G...",United States,"June 1, 2020",1977,PG,96 min,"Cult Movies, Horror Movies","In his small Southwestern town, sheriff Wade P...",0.276026
3,s797,Movie,Hostel: Part III,Scott Spiegel,"Kip Pardue, Brian Hallisay, John Hensley, Sara...",United States,"June 2, 2021",2011,R,88 min,"Cult Movies, Horror Movies",In this installment in the popular horror fran...,0.266667
4,s4525,Movie,Tales From the Hood 2,"Rusty Cundieff, Darin Scott","Keith David, Bryan Batt, Alexandria Deberry, B...",United States,"October 10, 2018",2018,R,110 min,"Cult Movies, Horror Movies, Independent Movies",Buckle up for an anthology of socially conscio...,0.266667
5,s6545,Movie,Cult of Chucky,Don Mancini,"Fiona Dourif, Michael Therriault, Adam Hurtig,...",United States,"October 3, 2017",2017,R,90 min,Horror Movies,Following a string of murders in the asylum wh...,0.266667


In [91]:
recommedation('After')

Unnamed: 0,title,director,cast,country,genre,rating,similarity
1,After We Collided,Roger Kumble,"Josephine Langford, Hero Fiennes Tiffin, Dylan...",United States,"Dramas, Romantic Movies",R,0.726722
2,Rodney King,Spike Lee,Roger Guenveur Smith,United States,Dramas,TV-MA,0.33541
3,The World We Make,Brian Baugh,"Caleb Castille, Rose Reid, Kevin Sizemore, Gre...",United States,"Dramas, Romantic Movies",PG,0.333333
4,Blue Jay,Alex Lehmann,"Sarah Paulson, Mark Duplass, Clu Gulager",United States,"Dramas, Independent Movies, Romantic Movies",TV-MA,0.333333
5,Burlesque,Steve Antin,"Cher, Christina Aguilera, Alan Cumming, Eric D...",United States,"Dramas, Romantic Movies",PG-13,0.322749


In [92]:
recommedation('Coffee & Kareem')

Unnamed: 0,title,director,cast,country,genre,rating,similarity
1,The Legacy of a Whitetail Deer Hunter,Jody Hill,"Josh Brolin, Danny McBride, Montana Jordan, Sc...",United States,"Action & Adventure, Comedies, Dramas",TV-14,0.402015
2,The Paper Tigers,Quoc Bao Tran,"Alain Uy, Ron Yuan, Mykel Shannon Jenkins, Jae...",United States,"Action & Adventure, Comedies",PG-13,0.3849
3,Spenser Confidential,Peter Berg,"Mark Wahlberg, Winston Duke, Alan Arkin, Bokee...",United States,"Action & Adventure, Comedies",R,0.3849
4,The Last Boy Scout,Tony Scott,"Bruce Willis, Damon Wayans, Chelsea Field, Nob...",United States,"Action & Adventure, Comedies",R,0.3698
5,Due Date,Todd Phillips,"Robert Downey Jr., Zach Galifianakis, Michelle...",United States,"Action & Adventure, Comedies",R,0.3698
