# Importing

In [1]:

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
#from surprise import Reader, Dataset, SVDb

import warnings; warnings.simplefilter('ignore')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Movie Overviews and Taglines


In [3]:
df = pd.read_csv("/content/drive/MyDrive/movie dataset/indian_movies.csv")
df = df.iloc[:, 1:]   # remove unnecessory columns, position is at 0th place
df.shape

(828, 24)

In [None]:
#df = df.drop([19730, 29503, 35587])

In [4]:
df['genres'] = df['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])


## Genre is in list of dictionary (dictionary of id and genre name), convert it into list of genres
# Literal eval is convert the sting into list, our data in csv is in string type, list is also in string type, so w euse literal_eval to convert the string into original one.

In [5]:
vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.531400966183575

In [6]:
m = vote_counts.quantile(0.95)
m

84.0

In [7]:
df['year'] = pd.to_datetime(df['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
# Year columns is added in the original dataset by using the release_date

In [8]:
df['id'] = df['id'].astype('int')

In [9]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [10]:
links_df = pd.read_csv('/content/drive/MyDrive/movie dataset/links_small.csv')
links_df = links_df[links_df['tmdbId'].notnull()]['tmdbId'].astype('int')

In [11]:
df.shape

(828, 25)

In [12]:
small_df = df[df['id'].isin(links_df)]
small_df.shape

(39, 25)

In [None]:
small_df['tagline'] = small_df['tagline'].fillna('')
small_df['description'] = small_df['overview'] + small_df['tagline']
small_df['description'] = small_df['description'].fillna('')

In [None]:
func = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')

matrix = func.fit_transform(small_df['description'])

In [None]:
matrix.shape

(39, 2394)

In [None]:
cosine_sim = linear_kernel(matrix, matrix)

In [None]:
cosine_sim[0]

array([1.        , 0.06083   , 0.05352798, 0.00360787, 0.01168117,
       0.0184591 , 0.00908338, 0.01627048, 0.0310815 , 0.0330157 ,
       0.02144284, 0.13721876, 0.        , 0.02441336, 0.        ,
       0.        , 0.03317478, 0.01920172, 0.02648269, 0.00463108,
       0.        , 0.0124538 , 0.        , 0.01674144, 0.00661898,
       0.02647744, 0.00773867, 0.0103507 , 0.00510464, 0.        ,
       0.        , 0.00807022, 0.0175017 , 0.03406363, 0.00259932,
       0.00751258, 0.        , 0.01133472, 0.00537892])

In [None]:
small_df = small_df.reset_index()
titles = small_df['title']
indices = pd.Series(small_df.index, index=small_df['title'])

In [None]:
def recommendations(any_title, bool=False):
    index = indices[any_title]
    scores = list(enumerate(cosine_sim[index]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    scores = scores[1:31]
    mve_index = [i[0] for i in scores]
    if bool:
      return mve_index
    return titles.iloc[mve_index]

In [None]:
recommendations('Gandhi').head(10)

0                       Pather Panchali
35                      Bridge of Spies
10    Lagaan: Once Upon a Time in India
12             11'09''01 - September 11
5                       Monsoon Wedding
4                          Bandit Queen
16                            Parineeta
18                         The Namesake
33             The Hundred-Foot Journey
21                  Slumdog Millionaire
Name: title, dtype: object

In [None]:
recommendations('3 Idiots').head(5)

29     Student of the Year
20           The Happening
1         The World of Apu
7     Bend It Like Beckham
5          Monsoon Wedding
Name: title, dtype: object

In [None]:
recommendations('Yeh Jawaani Hai Deewani').head(5)

5     Monsoon Wedding
37             Rustom
25           The Help
9                Fire
0     Pather Panchali
Name: title, dtype: object

# Movie Cast Crew Keyword nad Genres


In [None]:
keywords_df = pd.read_csv("/content/drive/MyDrive/movie dataset/keywords.csv")
credits_df = pd.read_csv("/content/drive/MyDrive/movie dataset/credits.csv")

In [None]:
keywords_df['id'] = keywords_df['id'].astype('int')
credits_df['id'] = credits_df['id'].astype('int')
df['id'] = df['id'].astype('int')

In [None]:
df = df.merge(credits_df, on='id')
df = df.merge(keywords_df, on='id')

In [None]:
df.shape

(868, 28)

In [None]:
small_df = df[df['id'].isin(links_df)]
small_df.shape

(46, 28)

In [None]:
small_df['cast'] = small_df['cast'].apply(literal_eval)
small_df['crew'] = small_df['crew'].apply(literal_eval)
small_df['keywords'] = small_df['keywords'].apply(literal_eval)
small_df['cast_size'] = small_df['cast'].apply(lambda x: len(x))
small_df['crew_size'] = small_df['crew'].apply(lambda x: len(x))

In [None]:
def make_director(y):
    for j in y:
        if j['job'] == 'Director':
            return j['name']
    return np.nan

In [None]:
small_df['director'] = small_df['crew'].apply(make_director)

In [None]:
small_df['cast'] = small_df['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
small_df['cast'] = small_df['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [None]:
small_df['keywords'] = small_df['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [None]:
small_df['director'] = small_df['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
small_df['director'] = small_df['director'].apply(lambda x: [x,x, x])

In [None]:
a = small_df.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
a.name = 'keyword'

In [None]:
a = a.value_counts()
a[:5]

bollywood         9
woman director    7
indian lead       5
india             4
based on novel    3
Name: keyword, dtype: int64

In [None]:
a = a[a > 1]

In [None]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [None]:
def filtering_keywords(y):
    word = []
    for j in y:
        if j in a:
            word.append(j)
    return word

In [None]:
small_df['keywords'] = small_df['keywords'].apply(filtering_keywords)
small_df['keywords'] = small_df['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
small_df['keywords'] = small_df['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [None]:
small_df['keywords'].head(2), small_df['cast'].head(2), small_df['director'].head(2), small_df['genres'].head(2)

(0                       [dyinganddeath, writer]
 1    [arrangedmarriag, dyinganddeath, calcutta]
 Name: keywords, dtype: object,
 0    [Kanu Bannerjee, Karuna Bannerjee, Chunibala D...
 1    [Soumitra Chatterjee, Sharmila Tagore, Alok Ch...
 Name: cast, dtype: object,
 0    [satyajitray, satyajitray, satyajitray]
 1    [satyajitray, satyajitray, satyajitray]
 Name: director, dtype: object,
 0    [Drama]
 1    [Drama]
 Name: genres, dtype: object)

In [None]:
small_df.head(4)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,video,vote_average,vote_count,year,cast,crew,keywords,cast_size,crew_size,director
0,False,"{'id': 158391, 'name': 'The Apu Collection', '...",0,[Drama],,5801,tt0048473,bn,পথের পাঁচালী,The film is a coming-of-age story of a young b...,...,False,7.9,63.0,1955,"[Kanu Bannerjee, Karuna Bannerjee, Chunibala D...","[{'credit_id': '52fe4423c3a36847f80845c7', 'de...","[dyinganddeath, writer]",19,15,"[satyajitray, satyajitray, satyajitray]"
1,False,"{'id': 158391, 'name': 'The Apu Collection', '...",0,[Drama],,896,tt0052572,bn,অপুর সংসার,Apu is a jobless ex-student dreaming vaguely o...,...,False,8.2,40.0,1959,"[Soumitra Chatterjee, Sharmila Tagore, Alok Ch...","[{'credit_id': '52fe4289c3a36847f80268c1', 'de...","[arrangedmarriag, dyinganddeath, calcutta]",7,23,"[satyajitray, satyajitray, satyajitray]"
2,False,,22000000,"[Drama, History]",,783,tt0083987,en,Gandhi,"In the early years of the 20th century, Mohand...",...,False,7.4,730.0,1982,"[Ben Kingsley, Candice Bergen, Edward Fox]","[{'credit_id': '574dc2c3925141120e0000cb', 'de...","[indianlead, hinduism, calcutta]",92,30,"[richardattenborough, richardattenborough, ric..."
3,False,,3000000,"[Drama, History, Romance]",,28005,tt0116743,en,Kama Sutra - A Tale of Love,Tara and Maya are two inseparable friends in I...,...,False,5.7,36.0,1996,"[Indira Varma, Sarita Choudhury, Ramon Tikaram]","[{'credit_id': '52fe4572c3a368484e05ba07', 'de...",[womandirector],4,3,"[miranair, miranair, miranair]"


In [None]:
small_df['soup'] = small_df['keywords'] + small_df['cast'] + small_df['director'] + small_df['genres']
small_df['soup'] = small_df['soup'].apply(lambda x: ' '.join(x))

In [None]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')

count_matrix = count.fit_transform(small_df['soup'])

In [None]:
cosine = cosine_similarity(count_matrix, count_matrix)

In [None]:
small_df = small_df.reset_index()
titles = small_df['title']
indices = pd.Series(small_df.index, index=small_df['title'])

In [None]:
recommendations('Monsoon Wedding').head(5)

9               Fire
18       The Warrior
8     Salaam Bombay!
32        Real Steel
25          The Fall
Name: title, dtype: object

In [None]:
recommendations('Gandhi').head(5)

0                       Pather Panchali
35                  Student of the Year
10    Lagaan: Once Upon a Time in India
12             11'09''01 - September 11
5                       Monsoon Wedding
Name: title, dtype: object

In [None]:
def modify_recommendations(any_title):
    index_movie = recommendations(any_title, bool=True)
    movies = small_df.iloc[index_movie][['title', 'vote_count', 'vote_average', 'year']]
    counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualify = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualify['vote_count'] = qualify['vote_count'].astype('int')
    qualify['vote_average'] = qualify['vote_average'].astype('int')
    qualify['wr'] = qualify.apply(weighted_rating, axis=1)
    qualify = qualify.sort_values('wr', ascending=False).head(10)
    return qualify

In [None]:
modify_recommendations('Gandhi').head(5)

Unnamed: 0,title,vote_count,vote_average,year,wr
31,The Help,1966,7,2011,6.939823
25,The Fall,430,7,2008,6.759995
10,Lagaan: Once Upon a Time in India,125,7,2001,6.40975
1,The World of Apu,40,8,1959,6.327723
0,Pather Panchali,63,7,1955,6.160801
