# CONTENT BASED FILTERING FOR MOVIE RECOMMENDATION

In [None]:
import pandas as pd

In [None]:
# Load the streaming-platform movie catalogue. read_csv already returns a
# DataFrame, so no extra DataFrame(...) wrapper is needed.
df = pd.read_csv("../input/movies-on-netflix-prime-video-hulu-and-disney/MoviesOnStreamingPlatforms_updated.csv")
df.head(4)

## CLEANING DATA

In [None]:
# Remove, in place, the columns that are not used anywhere later on.
df.drop(columns=['Unnamed: 0', 'Type', 'Rotten Tomatoes'], inplace=True)

In [None]:
# Preview the first five rows of the frame.
df.head(n=5)

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Normalise the textual age rating: strip the '+' character and map 'all' to
# '0'. regex=False makes both replacements literal regardless of the pandas
# version's default ('+' is a regex metacharacter).
df['Age'] = df['Age'].str.replace('+', '', regex=False)
df['Age'] = df['Age'].str.replace('all', '0', regex=False)
# The column still holds strings at this point; convert it to numbers so that
# median() is well defined (it raises TypeError on a string column in current
# pandas), then fill the missing ratings with that median.
df['Age'] = pd.to_numeric(df['Age'])
median = df['Age'].median()
df['Age'] = df['Age'].fillna(median)

In [None]:
# Averages of the Runtime and IMDb columns:
#   mean      -> Runtime average
#   mean_IMDb -> IMDb rating average
mean, mean_IMDb = df['Runtime'].mean(), df['IMDb'].mean()

In [None]:
# Fill missing runtimes with the Runtime average and store them as whole
# minutes.
df['Runtime'] = df['Runtime'].fillna(mean).astype(int)
# Fill missing ratings with the IMDb average. `mean` is the Runtime average,
# so using it here would put runtime-sized numbers into the rating column.
df['IMDb'] = df['IMDb'].fillna(mean_IMDb)

In [None]:
# Re-check how many missing values remain in each column.
df.isna().sum()

In [None]:
# Discard, in place, every row that still contains at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# The numeric columns are turned into text so they can be concatenated with
# the other text features.
df['Runtime'] = df['Runtime'].astype(str)
df['IMDb'] = df['IMDb'].astype(str)
df['Age'] = df['Age'].astype(str)
# Columns that describe a movie for the similarity computation.
feat = ['Title','Age','IMDb','Directors','Genres','Country','Language','Runtime']
# Take an explicit copy: df1 is modified column by column afterwards, and
# assigning into a plain selection of df triggers SettingWithCopyWarning.
df1 = df[feat].copy()
df1.head()

## MAKING DATA FIT FOR CONTENT BASED FILTERING ANALYSIS

In [None]:
def clean_data(x):
    """Return *x* with every space character removed and all letters lower-cased."""
    without_spaces = x.replace(" ", "")
    return without_spaces.lower()

In [None]:
# Run clean_data over every value of each feature column, one column at a time.
for column in feat:
    cleaned = df1[column].apply(clean_data)
    df1[column] = cleaned

In [None]:
# Show the first two rows of the feature frame.
df1.head(n=2)

In [None]:
# BOW => Bag of Words
def BOW(x):
    """Build the bag-of-words string for one movie.

    x is a row (any mapping from column name to string) containing the
    feature columns listed below. The result is their values joined by
    single spaces, each feature appearing exactly once.
    """
    # 'Language' takes the slot where 'Country' used to be repeated, so the
    # country is no longer counted twice and the language is no longer ignored.
    fields = ('Title', 'Age', 'IMDb', 'Directors', 'Genres',
              'Country', 'Language', 'Runtime')
    return ' '.join(x[name] for name in fields)

In [None]:
# Call BOW once per row and store the resulting string in a new 'BOW' column.
df1['BOW'] = df1.apply(BOW, axis='columns')
df1.head(n=5)

## CREATING AND RUNNING THE FILTERING ALGORITHM

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Token-count representation of every bag-of-words string, with English stop
# words left out.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df1['BOW'])

# Cosine similarity between every pair of rows. With a single argument the
# matrix is compared against itself.
cos_sim = cosine_similarity(count_matrix)

In [None]:
# Renumber the rows 0..n-1. drop=True discards the old labels instead of
# storing them in an extra 'index' column; without it, running this cell a
# second time fails because that column already exists.
df1 = df1.reset_index(drop=True)
# Lookup table: cleaned title -> row position in df1.
indices = pd.Series(df1.index, index = df1['Title'])

In [None]:
import numpy as np
def get_recom(t, cos_sim=cos_sim, top_n=10):
    """Return the movies most similar to the title *t*.

    Parameters
    ----------
    t : str
        Movie title. Spaces are removed and the text is lower-cased before
        it is looked up in ``indices``.
    cos_sim : 2-D array-like
        Similarity matrix; ``cos_sim[p]`` holds the scores of row ``p``
        against every row.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        One row per recommended movie, best match first, with the columns
        'Title', 'IMDb Rating', 'Directed By', 'Genre' and
        'Available on Platform'.

    Raises
    ------
    KeyError
        If the cleaned title is not present in ``indices``.
    """
    key = t.replace(' ', '').lower()
    if key not in indices.index:
        raise KeyError("Title not found: {!r}".format(t))

    position = indices[key]
    # A title that occurs more than once yields a Series of positions; use
    # the first occurrence so the lookup below gets a single row.
    if isinstance(position, pd.Series):
        position = position.iloc[0]
    position = int(position)

    ranked = sorted(enumerate(cos_sim[position]),
                    key=lambda pair: pair[1], reverse=True)
    # Exclude the queried movie explicitly instead of assuming it is ranked
    # first: another movie with an equal score can sort ahead of it.
    best = [row for row, _ in ranked if row != position][:top_n]

    # (column in df, label shown in the result)
    platforms = (
        ('Netflix', "Netflix"),
        ('Prime Video', "Prime Videos"),
        ('Hulu', "Hulu"),
        ('Disney+', "Disney+"),
    )

    rows = []
    for row in best:
        movie = df.iloc[row]
        available = [label for column, label in platforms if movie[column] == 1]
        rows.append([
            movie['Title'],
            movie['IMDb'],
            movie['Directors'],
            movie['Genres'],
            ', '.join(available),
        ])

    cols = ['Title', 'IMDb Rating', 'Directed By', 'Genre','Available on Platform']

    # Building the frame from a list of rows works for any number of
    # results, unlike a fixed reshape(10, 5).
    return pd.DataFrame(rows, columns=cols)

In [None]:
# Recommendations for "Inception".
get_recom('Inception', cos_sim=cos_sim)

In [None]:
# Recommendations for "The Matrix".
get_recom('The Matrix', cos_sim=cos_sim)

In [None]:
# Recommendations for "Mohabbatein".
get_recom('Mohabbatein', cos_sim=cos_sim)

In [None]:
# Recommendations for "Avengers: Infinity War".
get_recom('Avengers: Infinity War', cos_sim=cos_sim)

In [None]:
# Recommendations for "The Conjuring".
get_recom('The Conjuring', cos_sim=cos_sim)