# **NETFLIX MOVIE RECOMMENDATION SYSTEM**
In this notebook, I have created classes and functions to automate the code.\
You can call individual functions in the class or call the automate() function to run them all in sequence.\
This approach makes the code cleaner but reduces flexibility.

In [None]:
%pip install neattext

Collecting neattext
  Downloading neattext-0.1.3-py3-none-any.whl.metadata (12 kB)
Downloading neattext-0.1.3-py3-none-any.whl (114 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/114.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.7/114.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neattext
Successfully installed neattext-0.1.3


In [50]:
import pandas as pd
import numpy as np
import neattext.functions as nfx
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import plotly.graph_objects as go
import plotly.io as pio
#pio.renderers.default = 'colab'
pio.renderers.default = 'notebook'
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

import time

import warnings
warnings.filterwarnings('ignore')

In [None]:
class recommendationEngine:

  def __init__(self):
    self.df=pd.read_csv('netflix_titles.csv')

  def clean_data(self,type):
    self.df.rename(columns={'listed_in':'genre'},inplace=True)
    df_temp=self.df[self.df['type']==type].copy()
    df_temp.reset_index(drop=True,inplace=True)

    # filling NaN manually at the director and ratings column so pandas can treat it as a Non null-value
    df_temp['director'].fillna('NaN', inplace = True)
    df_temp['rating'].fillna('NaN', inplace = True)

    # Dropping null values
    df_temp.dropna(inplace= True)
    df_temp.reset_index(drop=True, inplace=True)

    time.sleep(1)
    print(f'Shape of {type} dataframe is {df_temp.shape[0]} rows and {df_temp.shape[1]} columns\n')

    temp=df_temp[['title','director','cast','country','genre','rating','type']].copy()
    time.sleep(1)
    print(f'Few statistics about the columns of the {type} Dataset are\n{temp.describe().T}\n')

    return df_temp, temp

  #Removing stopwords and special characters since they have negligible influence on text analysis
  #sw=stopward
  def remove_stop_char(self,data):
    data['cast']=data['cast'].apply(nfx.remove_stopwords)
    data['country']=data['country'].apply(nfx.remove_stopwords)
    data['genre']=data['genre'].apply(nfx.remove_stopwords)
    data['country']=data['country'].apply(nfx.remove_special_characters)

    if data['type'].unique()[0]=='Movie':
      data['director']=data['director'].apply(nfx.remove_stopwords)
      time.sleep(1)
      print('Removed Stopwords and Special Characters from Movies Dataset')
    else:
      time.sleep(1)
      print('Removed Stopwords and Special Characters from TV Show Dataset')
    return data

  #Vectorizing Data
  def vectorization(self,data,column,token=True):
    if token:
      countVector = CountVectorizer(binary=True, tokenizer=lambda x:x.split(','))
      time.sleep(1)
      print(f'Vectorized {column} column of {data.type.unique()[0]} dataset')
      return countVector.fit_transform(data[column]).toarray()
    else:
      countVector = CountVectorizer(binary=True)
      time.sleep(1)
      print(f'Vectorized {column} column of {data.type.unique()[0]} dataset')
      return countVector.fit_transform(data[column]).toarray()

  def binary(self,type,cast,country,genre,director=1):
    if type=='Movie':
      time.sleep(1)
      print('Converting Movies data to Binary')
      binary_director=pd.DataFrame(director).transpose()
    else:
      time.sleep(1)
      print('Converting TV Show data to Binary')
    binary_cast=pd.DataFrame(cast).transpose()
    binary_country=pd.DataFrame(country).transpose()
    binary_genre=pd.DataFrame(genre).transpose()

    # Concating Dataframe
    if type=='Movie':
      df_binary = pd.concat([binary_director,binary_cast,binary_country,binary_genre],axis=0, ignore_index=True)
    else:
      df_binary = pd.concat([binary_cast,  binary_country, binary_genre], axis=0,ignore_index=True)
    time.sleep(1)
    print('Converted\n')
    return df_binary.T

  def cosine(self,df_binary,data):
    time.sleep(1)
    print(f'Calculating Cosine Similarity of {data.type.unique()[0]} data')
    cos_sim = cosine_similarity(df_binary)
    time.sleep(1)
    print('Cosine Similarity Calculated\n')
    return cos_sim

  def recommedation(self,title,df_movies,df_tv,movie_sim,tv_sim):
    if title in df_movies.title.values:
      index=df_movies[df_movies.title == title].index.item()
      scores=dict(enumerate(movie_sim[index]))
      sorted_scores=dict(sorted(scores.items(),key=lambda x:x[1],reverse=True))

      selected_movies_index=[id for id, scores in sorted_scores.items()]
      selected_movies_score=[scores for id, scores in sorted_scores.items()]

      recommend_movies=df_movies.iloc[selected_movies_index]
      recommend_movies['similarity'] = selected_movies_score

      movie_recommend = recommend_movies.reset_index(drop=True)
      return movie_recommend[1:6]

    elif title in df_tv['title'].values:
      index=df_tv[df_tv.title == title].index.item()
      scores=dict(enumerate(tv_sim[index]))
      sorted_scores=dict(sorted(scores.items(),key=lambda x:x[1],reverse=True))

      selected_tv_index=[id for id, scores in sorted_scores.items()]
      selected_tv_score=[scores for id, scores in sorted_scores.items()]

      recommend_tv=df_tv.iloc[selected_tv_index]
      recommend_tv['similarity'] = selected_tv_score

      tv_recommend = recommend_tv.reset_index(drop=True)
      return tv_recommend[1:6]

    else:
      print('Title not found')

  def table(self,df):
    fig = go.Figure(data=[go.Table(
        columnorder=[1, 2, 3, 4, 5],
        columnwidth=[20, 20, 20, 30, 50],
        header=dict(values=list(['Type', 'Title', 'Country', 'Genre(s)', 'Description']),
                    line_color='black', font=dict(color='black', family="Gravitas One", size=20), height=40,
                    fill_color='#FF6865',
                    align='center'),
        cells=dict(values=[df.type, df.title, df.country, df.genre, df.description],
                   font=dict(color='black', family="Lato", size=16),
                   fill_color='#FFB3B2',
                   align='left'))
    ])

    fig.update_layout(height=700,
                      title={'text': "Top Movie Recommendations", 'font': {'size': 22, 'family': 'Gravitas One'}},
                      title_x=0.5
                      )
    fig.show()

  def automate():
    engine=recommendationEngine()
    time.sleep(1)
    print('RECOMMENDATION ENGINE CALLED\n')

    df_movies,movies=engine.clean_data('Movie')
    df_tv,tv=engine.clean_data('TV Show')

    movies=engine.remove_stop_char(movies)
    tv=engine.remove_stop_char(tv)
    print('\n')

    country = engine.vectorization(movies,'country',False)
    director = engine.vectorization(movies,'director')
    cast = engine.vectorization(movies,'cast')
    genre = engine.vectorization(movies,'genre')

    tv_country = engine.vectorization(tv,'country',False)
    tv_cast = engine.vectorization(tv,'cast')
    tv_genre = engine.vectorization(tv,'genre')
    print('\n')

    movie_binary = engine.binary('Movie',cast,country,genre,director)
    movie_sim = engine.cosine(movie_binary,movies)

    tv_binary = engine.binary('TV Show',tv_cast,tv_country,tv_genre)
    tv_sim = engine.cosine(tv_binary,tv)

    time.sleep(1)
    print('RECOMENDATION ENGINE CREATED\n')
    time.sleep(1)
    print('USE engine.recommedation("title",df_movies,df_tv,movie_sim,tv_sim) TO GET RECOMMENDATIONS IN TABULAR FORM USING PANDAS\n')
    time.sleep(1)
    print('USE engine.table(engine.recommedation("title",df_movies,df_tv,movie_sim,tv_sim)) TO GET RECOMMENDATIONS IN TABULAR FORM USING PLOTLY\n')

    return engine,df_movies,df_tv,movie_sim,tv_sim,engine.df

In [None]:
engine,df_movies,df_tv,movie_sim,tv_sim,df = recommendationEngine.automate()

RECOMMENDATION ENGINE CALLED

Shape of Movie dataframe is 5278 rows and 12 columns

Few statistics about the columns of the Movie Dataset are
         count unique                           top  freq
title     5278   5278                       Sankofa     1
director  5278   3830                           NaN    92
cast      5278   5139                   Samuel West    10
country   5278    597                 United States  1861
genre     5278    270  Dramas, International Movies   337
rating    5278     15                         TV-MA  1776
type      5278      1                         Movie  5278

Shape of TV Show dataframe is 2015 rows and 12 columns

Few statistics about the columns of the TV Show Dataset are
         count unique                 top  freq
title     2015   2015       Blood & Water     1
director  2015    142                 NaN  1868
cast      2015   1982  David Attenborough    14
country   2015    184       United States   618
genre     2015    219            Kids

In [51]:
engine.table(engine.recommedation('Coffee & Kareem',df_movies,df_tv,movie_sim,tv_sim))

In [None]:
engine.recommedation('The Paper Tigers',df_movies,df_tv,movie_sim,tv_sim)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,genre,description,similarity
1,s2710,Movie,Coffee & Kareem,Michael Dowse,"Ed Helms, Taraji P. Henson, Terrence Little Ga...",United States,"April 3, 2020",2020,TV-MA,88 min,"Action & Adventure, Comedies",An inept Detroit cop must team up with his gir...,0.3849
2,s4783,Movie,The Legacy of a Whitetail Deer Hunter,Jody Hill,"Josh Brolin, Danny McBride, Montana Jordan, Sc...",United States,"July 6, 2018",2018,TV-14,83 min,"Action & Adventure, Comedies, Dramas",A star of hunting videos strives to bond with ...,0.348155
3,s2837,Movie,Spenser Confidential,Peter Berg,"Mark Wahlberg, Winston Duke, Alan Arkin, Bokee...",United States,"March 6, 2020",2020,R,111 min,"Action & Adventure, Comedies","Spenser, an ex-cop and ex-con, teams up with a...",0.333333
4,s355,Movie,The Last Boy Scout,Tony Scott,"Bruce Willis, Damon Wayans, Chelsea Field, Nob...",United States,"August 1, 2021",1991,R,105 min,"Action & Adventure, Comedies",Private eye Joe Hallenbeck is forced to team u...,0.320256
5,s931,Movie,Due Date,Todd Phillips,"Robert Downey Jr., Zach Galifianakis, Michelle...",United States,"May 1, 2021",2010,R,95 min,"Action & Adventure, Comedies","Days before his pregnant wife's due date, Pete...",0.320256
