In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import string
import nltk
from nltk.stem.snowball import SnowballStemmer
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
# Show every column when rendering frames. Use the canonical option key:
# 'display.max.columns' only resolved through pandas' regex fallback matching.
pd.set_option('display.max_columns', None)

In [None]:
# Single place to repoint the notebook at a different copy of the dataset.
DATA_DIR = '/content/drive/MyDrive/Dataset/data'

# Load the five CSV tables used by the rest of the notebook.
metadata = pd.read_csv(f'{DATA_DIR}/movies_metadata_small.csv')
keywords = pd.read_csv(f'{DATA_DIR}/keywords_small.csv')
credits = pd.read_csv(f'{DATA_DIR}/credits_small.csv')
links = pd.read_csv(f'{DATA_DIR}/links_small.csv')
ratings = pd.read_csv(f'{DATA_DIR}/ratings_small.csv')

In [None]:
# Count missing values per column of the metadata table.
metadata.isna().sum()

genres             0
id                 0
overview           0
popularity         0
release_date       0
tagline         2139
title              0
vote_average       0
dtype: int64

In [None]:
# Replace missing values with empty strings so the later text concatenation
# never meets a NaN. Rebind instead of mutating in place so the step reads as
# an explicit transformation of the frame.
metadata = metadata.fillna("")
metadata.shape

(8833, 8)

In [None]:
def get_genres(genres):
    """Return the 'name' of every genre record joined by single spaces."""
    names = []
    for entry in genres:
        names.append(entry['name'])
    return ' '.join(names)
def get_cast(cast):
    """Return the names of at most the first seven cast records.

    Spaces, hyphens, dots and apostrophes are removed from each name so that
    a person becomes a single token (e.g. "Tom Hanks" -> "TomHanks").
    """
    strip_table = str.maketrans('', '', " -.'")
    billed = cast[:7]
    return [member['name'].translate(strip_table) for member in billed]
def get_director(crew):
    """Return a one-element list with the first crew member whose job is 'Director'.

    The name has spaces, hyphens, dots and apostrophes removed. An empty list
    is returned when no crew record has that job.
    """
    first_director = next((member for member in crew if member['job'] == 'Director'), None)
    if first_director is None:
        return []
    return [first_director['name'].translate(str.maketrans('', '', " -.'"))]
def get_people(x):
    """Build the space-separated 'people' string for one movie row.

    Parameters
    ----------
    x : mapping with keys 'cast' (list of names), 'genres' (string) and
        'director' (list of names).

    Returns
    -------
    str
        The cast names (only the first three when 'Animation' occurs in the
        genres string) followed by any director not already listed.

    The cast list is copied before directors are appended, so the caller's
    row is never mutated.
    """
    cast = x['cast']
    is_animation = 'Animation' in x['genres']
    # list(...) guarantees a fresh list in both branches; previously the
    # non-animation branch aliased x['cast'] and append() modified it in place.
    people = list(cast[:3]) if is_animation else list(cast)
    for d in x['director']:
        if d not in people:
            people.append(d)
    return ' '.join(people)
def get_keywords(keywords):
    """Return the 'name' of every keyword record joined by single spaces."""
    return ' '.join(record['name'] for record in keywords)

In [None]:
# Each raw column holds a Python-literal string; parse it and reduce it to the
# compact representation in one pass per column.
# NOTE: this cell is not re-runnable as-is, because the parsed columns are
# written back over the raw strings.
metadata['genres'] = metadata['genres'].apply(lambda raw: get_genres(literal_eval(raw)))
credits['cast'] = credits['cast'].apply(lambda raw: get_cast(literal_eval(raw)))
credits['director'] = credits['crew'].apply(lambda raw: get_director(literal_eval(raw)))
keywords['keywords'] = keywords['keywords'].apply(lambda raw: get_keywords(literal_eval(raw)))

In [None]:
# Attach cast, director and keywords to the metadata rows (inner joins on 'id').
info = credits[['id', 'cast', 'director']].merge(keywords, how='inner', on='id')
metadata = metadata.merge(info, how='inner', on='id')

# Free-text field: title, overview, tagline and keywords separated by spaces.
metadata['keys'] = metadata.apply(
    lambda row: " ".join([row['title'], row['overview'], row['tagline'], row['keywords']]),
    axis=1,
)
metadata['people'] = metadata.apply(get_people, axis=1)

# Keep only the columns used downstream.
kept_columns = ['id', 'title', 'people', 'genres', 'keys', 'popularity', 'release_date', 'vote_average']
metadata = metadata[kept_columns].copy()
metadata.head(2)

Unnamed: 0,id,title,people,genres,keys,popularity,release_date,vote_average
0,862,Toy Story,TomHanks TimAllen DonRickles JohnLasseter,Animation Comedy Family,"Toy Story Led by Woody, Andy's toys live happi...",21.946943,1995,7.7
1,8844,Jumanji,RobinWilliams JonathanHyde KirstenDunst Bradle...,Adventure Fantasy Family,Jumanji When siblings Judy and Peter discover ...,17.015539,1995,6.9


In [None]:
stemmer = SnowballStemmer(language='english')
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
# Set copy of the stop-word list for constant-time membership tests.
stopword_set = frozenset(stopwords)
def get_keys(x):
    """Strip punctuation, stem every whitespace-separated token and drop stop words.

    A token is dropped when its *stem* is in the English stop-word list.
    Each token is stemmed exactly once; the surviving stems are returned
    joined by single spaces.
    """
    x = x.translate(str.maketrans('', '', string.punctuation))
    stems = (stemmer.stem(w) for w in x.split())
    return ' '.join(stem for stem in stems if stem not in stopword_set)
metadata['keys'] = metadata['keys'].apply(get_keys)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
transformer = QuantileTransformer(output_distribution='uniform')
# Map popularity and release_date onto a uniform scale. The transformer is
# refit for each column. The flattened arrays are assigned directly, so the
# result does not depend on the frame's index lining up with a fresh RangeIndex.
metadata['popularity'] = transformer.fit_transform(metadata[['popularity']]).ravel()
metadata['release_date'] = transformer.fit_transform(metadata[['release_date']]).ravel()

# A vote_average of 0 is treated as "no votes": replace it with the mean of
# the remaining values, then rescale by dividing by 10.
metadata['vote_average'] = metadata['vote_average'].mask(metadata['vote_average'] == 0)
mean_average = metadata['vote_average'].mean()
# Fill only this column; a frame-wide fillna would write the vote mean into
# any other column that happened to contain a NaN.
metadata['vote_average'] = metadata['vote_average'].fillna(mean_average) / 10
metadata.head(2)

Unnamed: 0,id,title,people,genres,keys,popularity,release_date,vote_average
0,862,Toy Story,TomHanks TimAllen DonRickles JohnLasseter,Animation Comedy Family,toy stori led woodi andi toy live happili room...,0.98031,1995,0.77
1,8844,Jumanji,RobinWilliams JonathanHyde KirstenDunst Bradle...,Adventure Fantasy Family,jumanji sibl judi peter discov enchant board g...,0.956981,1995,0.69


In [None]:
# Lookup table: the metadata index next to the movie id and title.
row_numbers = pd.Series(metadata.index, name='index')
movies = pd.concat([row_numbers, metadata[['id', 'title']]], axis=1)

# One vectorizer per text field.
obj_cast = CountVectorizer(lowercase=True, analyzer='word', min_df=5)
obj_genre = TfidfVectorizer()
obj_tfidf = TfidfVectorizer(min_df=50)

# People: raw token counts. Genres: tf-idf capped at 0.5 then doubled.
# Keys: tf-idf capped at 0.25 then multiplied by 6.
cast_val = obj_cast.fit_transform(metadata['people']).toarray()
genre_val = np.minimum(0.5, obj_genre.fit_transform(metadata['genres']).toarray()) * 2
tfidf_val = np.minimum(0.25, obj_tfidf.fit_transform(metadata['keys']).toarray()) * 6

# Column labels and the feature matrix share the order cast | genre | keys.
features_col = [
    *obj_cast.get_feature_names_out(),
    *obj_genre.get_feature_names_out(),
    *obj_tfidf.get_feature_names_out(),
]
features = np.concatenate((cast_val, genre_val, tfidf_val), axis=1)
movies.shape, cast_val.shape, genre_val.shape, tfidf_val.shape, features.shape

((8833, 3), (8833, 3044), (8833, 22), (8833, 1379), (8833, 4445))

In [None]:
# Join ratings to the links table on 'movieId' and keep only the user id,
# the TMDB id and the rating.
# NOTE: re-running this cell fails, because 'movieId' is dropped from ratings here.
ratings = ratings.merge(links, how='inner', on='movieId')[['userId', 'tmdbId', 'rating']]

In [None]:
userId = 1
# Ratings of the chosen user, joined to the movie lookup table via the TMDB id.
user_data = movies.merge(
    ratings[ratings['userId'] == userId],
    how='inner',
    left_on='id',
    right_on='tmdbId',
)
# Centre ratings on 3 and halve them, i.e. (rating - 3) / 2.
user_data['rating'] = user_data['rating'].sub(3).div(2)
user_data[['index', 'id', 'title', 'rating']].head()

Unnamed: 0,index,id,title,rating
0,0,862,Toy Story,0.5
1,2,15602,Grumpier Old Men,0.5
2,5,949,Heat,0.5
3,43,807,Se7en,1.0
4,46,629,The Usual Suspects,1.0


In [None]:
# Build the user's feature-weight profile from a training split of their
# ratings, then score every movie against it.

# 80/20 split of the user's rated movies with a fixed seed.
x_train, x_test = train_test_split(user_data, test_size=0.2, random_state= 100)
# x: feature rows of the training movies; y: their ratings as a column vector.
x = features[x_train['index'].values]
y = x_train[['rating']].values
# Per-feature sum of (rating * feature value) over the training movies.
usermul = np.matmul(np.transpose(y), x).reshape(-1)
# Per-feature total weight, and number of training movies where the feature is positive.
feature_sum = np.sum(x, axis=0)
feature_cnt = np.sum(x>0, axis=0)
# u1: feature-weighted mean rating per feature; stays 0 where the feature sum is 0.
u1 = np.divide(usermul, feature_sum, out=np.zeros(usermul.shape), where = feature_sum!=0)
# u2: support term, starting as log(1 + count).
u2 = np.log(1+feature_cnt)
# Mask over feature columns: False for the cast block, True for genre + keys blocks.
temp = np.concatenate((np.zeros(cast_val.shape[1],dtype=bool), np.ones(genre_val.shape[1]+tfidf_val.shape[1],dtype=bool)))
# In-place, order-dependent: genre/keys columns become log(1 + u2) ...
np.log(1+u2, out = u2, where = temp)
# ... and cast columns become sqrt(u2).
np.sqrt(u2, out = u2, where = (~temp))
user_profile = u1*u2
# Score each movie: dot product with the profile divided by the movie's
# feature sum; stays 0 where that sum is 0.
matmul = np.dot(features, user_profile)
mv_sum = np.sum(features, axis=1)
pred_rating = np.divide(matmul, mv_sum, out = np.zeros(matmul.shape), where = mv_sum!=0)
pred = pd.concat([movies, pd.Series(pred_rating, name='pred_rating')],axis=1)
# Only training rows receive a rating and the watched flag at this point.
pred.loc[x_train['index'].values, 'rating'] = x_train['rating'].values
pred.loc[x_train['index'].values, 'watched'] = 1
# Mean predicted score per actual rating value (training rows only, since
# no other row has a rating yet).
pred[['rating', 'pred_rating']].groupby('rating').mean().reset_index()

Unnamed: 0,rating,pred_rating
0,-0.5,0.333847
1,0.0,0.532494
2,0.5,0.663316
3,1.0,0.80827


In [None]:
# Largest profile weight inside each feature block (cast | genre | keys).
# Block boundaries come from the arrays themselves, so they stay correct when
# the vectorizer vocabularies change size.
cast_end = cast_val.shape[1]
genre_end = cast_end + genre_val.shape[1]
user_profile[:cast_end].max(), user_profile[cast_end:genre_end].max(), user_profile[genre_end:].max()

(1.2686362411795196, 1.1740086736720143, 1.1829343104176024)

In [None]:
# Largest support term (u2) inside each feature block (cast | genre | keys),
# with block boundaries taken from the arrays instead of hard-coded offsets.
cast_end = cast_val.shape[1]
genre_end = cast_end + genre_val.shape[1]
u2[:cast_end].max(), u2[cast_end:genre_end].max(), u2[genre_end:].max()

(1.6437209301888343, 1.6466671712706678, 1.5341940066777084)

In [None]:
# (column, weight) pairs of the profile, heaviest first; show the top 40
# together with their feature labels.
user_features = sorted(enumerate(user_profile), key=lambda pair: pair[1], reverse=True)
[(col, features_col[col], weight) for col, weight in user_features[:40]]

[(1664, 'kevinspacey', 1.2686362411795196),
 (3010, 'wolfgangreitherman', 1.2686362411795196),
 (2723, 'stevebuscemi', 1.2047095791412654),
 (4391, 'way', 1.185639047955683),
 (1395, 'johncleese', 1.1774100225154747),
 (3046, 'animation', 1.1740086736720143),
 (3128, 'anim', 1.1644710863810457),
 (3531, 'fight', 1.163065230548284),
 (3050, 'drama', 1.1401820385397592),
 (2738, 'stevenspielberg', 1.133021125186408),
 (3510, 'famili', 1.118548568475036),
 (3783, 'life', 1.1174378487310943),
 (1054, 'harrisonford', 1.1100567110320796),
 (2005, 'michaeldouglas', 1.1100567110320796),
 (2618, 'seanconnery', 1.1100567110320796),
 (3535, 'film', 1.1083118248547639),
 (3048, 'crime', 1.1052277688397656),
 (4428, 'world', 1.0995217130690467),
 (4385, 'war', 1.0843453133307492),
 (3816, 'man', 1.0825968808551008),
 (4221, 'stop', 1.0810314794004443),
 (3624, 'happen', 1.0804178182729878),
 (3925, 'one', 1.0780484604223295),
 (3581, 'gang', 1.0742422652806207),
 (3992, 'power', 1.0730461226755268)

In [None]:
# Inspect the prediction row at position 1993.
pred.iloc[1993]

index                  1993
id                      861
title          Total Recall
pred_rating        0.476866
rating                  0.5
watched                 NaN
Name: 1993, dtype: object

In [None]:
# Attach the held-out ratings, then rank every movie that was not flagged as
# watched by its predicted score.
pred.loc[x_test['index'].to_numpy(), 'rating'] = x_test['rating'].to_numpy()
unwatched = pred[pred['watched'].isna()]
unwatched.sort_values(by='pred_rating', ascending=False).head(100)

Unnamed: 0,index,id,title,pred_rating,rating,watched
8539,8539,44238,World of Glory,0.976608,,
6844,6844,36540,Winnie the Pooh and the Honey Tree,0.922953,,
3439,3439,35651,Time Out,0.912886,,
8818,8818,79701,Gena the Crocodile,0.888640,,
5224,5224,13409,State Property 2,0.876663,,
...,...,...,...,...,...,...
634,634,12632,Kaspar Hauser,0.766646,,
2924,2924,29937,Divided We Fall,0.766003,,
5999,5999,13054,Rise of the Footsoldier,0.765687,,
2437,2437,23531,Whipped,0.765681,,


In [None]:
def why_this_movie(user_profile, movie_index, top_n=10):
    """List the features that contribute most to one movie's score.

    Parameters
    ----------
    user_profile : array of per-feature weights, aligned with the columns of
        the module-level ``features`` matrix.
    movie_index : row of ``features`` to explain.
    top_n : number of entries to return (default 10, the previous fixed value).

    Returns
    -------
    list of (column, feature label, contribution) tuples, largest contribution
    first, where contribution = profile weight * the movie's feature value.
    Features the movie does not have appear with contribution 0.0 when fewer
    than ``top_n`` features are positive.
    """
    contributions = user_profile * features[movie_index, :]
    ranked = sorted(enumerate(contributions), key=lambda pair: pair[1], reverse=True)
    return [(col, features_col[col], weight) for col, weight in ranked[:top_n]]

In [None]:
# Explain the score of the movie at row 8818 of the feature matrix.
why_this_movie(user_profile, 8818)

[(3128, 'anim', 1.7467066295715685),
 (3541, 'first', 1.371662302392499),
 (3046, 'animation', 1.1740086736720143),
 (3051, 'family', 1.0394622831127582),
 (0, '50cent', 0.0),
 (1, 'aaroneckhart', 0.0),
 (2, 'aaronpaul', 0.0),
 (3, 'aarontaylorjohnson', 0.0),
 (4, 'aasifmandvi', 0.0),
 (5, 'abbiecornish', 0.0)]

In [None]:
# Feature value and profile weight of the 'animation' genre for the movie at
# row 8818. The genre block starts right after the cast block, so the column
# is the cast width plus the term's index in the genre vocabulary.
animation_col = cast_val.shape[1] + obj_genre.vocabulary_['animation']
features[8818, animation_col], user_profile[animation_col]

(1.0, 1.1740086736720143)

In [None]:
# Show the processed metadata of the movie at position 8818.
metadata.iloc[8818]

id                                                    79701
title                                    Gena the Crocodile
people          VasiliyLivanov KlaraRumyanova RomanKachanov
genres                                     Animation Family
keys              gena crocodil first anim gena cheburashka
popularity                                         0.059272
release_date                                       0.039039
vote_average                                           0.67
Name: 8818, dtype: object

In [None]:
# Length of the profile vector (one weight per feature column).
user_profile.shape

(7764,)

In [None]:
# NOTE(review): this cell previously read `().shape`, which raises
# AttributeError because a tuple has no `.shape`. The operand appears to have
# been lost; the profile vector is used here as a stand-in. Confirm the
# intended expression.
user_profile.shape

(7764,)