In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

In [None]:
# Print the directory tree below the current working directory: each folder
# is prefixed with one '---' per nesting level, its files with one more.
for folder, _subdirs, filenames in os.walk("."):
    depth = folder.count(os.sep)
    print(depth * '---', os.path.basename(folder))
    for filename in filenames:
        print((depth + 1) * '---', filename)

In [None]:
# Movie metadata taken from a Kaggle dataset.
# low_memory=False: let pandas read the file in one pass instead of chunk by chunk.
metadata_csv = './datasets/movies_metadata.csv'
metadt = pd.read_csv(metadata_csv, low_memory=False)
metadt.head(3)

# Recommender system using weighted rating

Penggunaan rating mentah untuk rekomendasi memiliki kekurangan:  
* Rating tidak memberikan gambaran popularitas produk. Misalnya, produk A memiliki rating 9.5 dari 20 voters, sedangkan produk B memiliki rating 8.7 dari 1000 voters. Mana yang lebih baik? Tentu saja produk B, karena rating yang diberikan oleh lebih banyak user lebih tepercaya dibandingkan rating tinggi dari sedikit user.
  
Oleh karena itu, perlu dilakukan pembobotan rating sebagaimana rumus di bawah ini:  
\begin{equation}
\text{Weighted Rating } (\mathbf{WR}) = \left(\frac{\mathbf{v}}{\mathbf{v} + \mathbf{m}} \cdot R\right) + \left(\frac{\mathbf{m}}{\mathbf{v} + \mathbf{m}} \cdot C\right)
\end{equation}  
Keterangan:  
v = jumlah vote (voters) untuk film tersebut  
m = jumlah vote minimum yang dibutuhkan untuk masuk dalam daftar  
R = rata-rata rating film tersebut  
C = rata-rata rating seluruh film dalam dataset

In [None]:
# C: mean of the per-movie average rating over the whole dataset
C = metadt['vote_average'].mean()
# m: minimum number of votes needed to qualify, taken as the 90th percentile
# of vote_count
m = metadt['vote_count'].quantile(0.9)

print(C)
print(m)

In [None]:
# Keep only the movies with strictly more than m votes. Filter first and copy
# afterwards, so only the surviving rows are duplicated instead of copying the
# entire frame and then throwing most of it away. The copy keeps later column
# assignments on top_movies from touching metadt.
top_movies = metadt.loc[metadt['vote_count'] > m].copy()
print('shape:', metadt.shape)
print('shape:', top_movies.shape)

In [None]:
def weighted_rating(data, m=m, C=C):
    """Return the weighted rating WR = v/(v+m) * R + m/(v+m) * C.

    data: row (or frame) exposing 'vote_count' (v) and 'vote_average' (R).
    m: minimum-votes threshold; defaults to the module-level ``m`` as it was
        when this cell ran.
    C: dataset-wide mean rating; defaults to the module-level ``C`` likewise.
    """
    votes, rating = data['vote_count'], data['vote_average']
    total = votes + m
    return (votes / total * rating) + (m / total * C)

In [None]:
# weighted_rating only does column look-ups and arithmetic, so it can be
# evaluated on the whole frame at once instead of row by row via apply(axis=1).
top_movies['weighted_rating'] = weighted_rating(top_movies)

In [None]:
# Rank the qualifying movies from highest to lowest weighted rating
top_movies = top_movies.sort_values(by='weighted_rating', ascending=False)
print('shape:', top_movies.shape)

ranking_columns = ['title', 'vote_count', 'vote_average', 'weighted_rating']
top_movies[ranking_columns].head(10)

# Content-based Recommender
Natural Language Processing (TF-IDF) using **overview** feature on dataset

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing overviews become empty strings so the vectorizer only receives text
metadt['overview'] = metadt['overview'].fillna('')

# TF-IDF vectorizer that ignores English stop words ('and', 'or', 'the', ...)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(metadt['overview'])

# (number of overviews, vocabulary size)
tfidf_matrix.shape

Selanjutnya, hitung nilai similarity. Skor similarity dapat ditentukan menggunakan manhattan distance, euclidean distance, korelasi Pearson, atau cosine similarity. Berikut merupakan rumus untuk menghitung cosine similarity:
<img src='https://res.cloudinary.com/dyd911kmh/image/upload/f_auto,q_auto:best/v1590782185/cos_aalkpq.png'/>
  
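Kita akan menggunakan **linear_kernel()** dari sklearn karena lebih cepat dibandingkan **cosine_similarity()**. Hasilnya tetap berupa cosine similarity, sebab vektor TF-IDF yang dihasilkan `TfidfVectorizer` secara default sudah dinormalisasi (norm L2), sehingga dot product antar-vektor sama dengan cosine similarity-nya.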

In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all overview vectors; the second argument
# defaults to the first, so this compares tfidf_matrix against itself.
cos_sim = linear_kernel(tfidf_matrix)
print(cos_sim.shape)
cos_sim[1]

In [None]:
# Reverse mapping used by the recommender function: movie title -> index
# label of its row in metadt.
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it never removed repeated titles. De-duplicate on the
# title index instead, keeping the first occurrence, so that indices[title]
# always yields a single number.
indices = pd.Series(metadt.index, index=metadt['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def get_recommendation(title, cos_sim=cos_sim):
    """Return the titles of the 10 movies most similar to ``title``.

    title: movie title; looked up in the module-level ``indices`` mapping
        (an unknown title raises KeyError).
    cos_sim: square similarity matrix whose rows line up with the rows of
        ``metadt``; defaults to the ``cos_sim`` that existed when this cell ran.
    Returns a Series of titles ordered from most to least similar.
    """
    ind = indices[title]  # row number of the requested title
    if isinstance(ind, pd.Series):
        # several movies share this title: fall back to the first one
        ind = ind.iloc[0]
    # (row, score) pairs ordered from the highest to the lowest similarity
    sim = sorted(enumerate(cos_sim[ind]), key=lambda x: x[1], reverse=True)
    # Drop the movie itself explicitly (a tie could place another row first),
    # then keep 10 neighbours -- the former slice [1:10] returned only 9.
    movie_indices = [row for row, _ in sim if row != ind][:10]
    return metadt['title'].loc[movie_indices]

In [None]:
# Example: movies whose overview text is most similar to this title
# (uses the default overview-based cos_sim matrix)
get_recommendation('The Shawshank Redemption')

## Add more features to the recommender

In [None]:
# Extra tables that are merged into metadt on 'id' further below
credits = pd.read_csv('./datasets/credits.csv')
keywords = pd.read_csv('./datasets/keywords.csv')

In [None]:
def checkInteger(data):
    """Return True when ``data`` can NOT be converted with ``int()``.

    Despite its name the function flags bad values: True means "not an
    integer", False means the conversion succeeded.
    """
    try:
        int(data)
    except (ValueError, TypeError):
        # ValueError: malformed text or NaN; TypeError: None and other
        # objects that int() does not accept
        return True
    # explicit False instead of falling through with an implicit None
    return False

# Collect every id flagged by checkInteger and drop the matching rows in place
bad_id = list(filter(checkInteger, metadt['id']))
print(bad_id)
index_id = metadt.index[metadt['id'].isin(bad_id)]
metadt.drop(index=index_id, inplace=True)

In [None]:
# Give the join key the same integer dtype in all three tables
for frame in (keywords, credits, metadt):
    frame['id'] = frame['id'].astype('int')

# Join keywords and then credits onto metadt by movie id
metadt = metadt.merge(keywords, on='id').merge(credits, on='id')
metadt.head(3)

In [None]:
# These columns hold Python literals stored as text (stringified lists);
# literal_eval turns each string back into the object it describes.
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    metadt[feature] = metadt[feature].apply(literal_eval)

In [None]:
import numpy as np

In [None]:
def get_director(x):
    """Return the 'name' of the first entry in ``x`` whose 'job' is 'Director'.

    x: iterable of dicts that carry 'job' and 'name' keys.
    Returns np.nan when no entry is a director.
    """
    directors = (member['name'] for member in x if member['job'] == 'Director')
    return next(directors, np.nan)

In [None]:
# keep at most the first 3 names of a list-valued column (cast, keywords, genres)
def get_list(x):
    """Return up to the first three 'name' values from a list of dicts.

    Anything that is not a list yields an empty list.
    """
    if not isinstance(x, list):
        return []
    return [entry['name'] for entry in x][:3]

In [None]:
# The director is extracted from the crew list
metadt['director'] = metadt['crew'].apply(get_director)

# Reduce each list-valued column to its leading names
features = ['cast', 'keywords', 'genres']
for feature in features:
    metadt[feature] = metadt[feature].apply(get_list)

preview_columns = ['title', 'cast', 'director', 'keywords', 'genres']
metadt[preview_columns].head(3)

In [None]:
# remove every space and lower-case all strings
def clean_data(x):
    """Normalise names into lower-case tokens that contain no spaces.

    A list is cleaned element by element, a string is cleaned directly and
    any other value (e.g. NaN) becomes ''.
    """
    if isinstance(x, list):
        return [item.replace(' ', '').lower() for item in x]
    if isinstance(x, str):
        return x.replace(' ', '').lower()
    return ''

In [None]:
# Normalise the text features with clean_data: spaces are removed and the text
# is lower-cased, so a multi-word name becomes a single token
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    metadt[feature] = metadt[feature].apply(clean_data)

In [None]:
# build one space-separated string from all the selected features
def create_soup(x):
    """Join keywords, cast, director and genres (in that order) with spaces."""
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)

In [None]:
# Build the 'soup' column row by row with create_soup, then preview it
metadt['soup'] = metadt.apply(create_soup, axis=1)
metadt[['soup']].head(3)

In [None]:
# Recommending section.
# Plain term counts are used instead of TF-IDF here: per the original note,
# a director, genre or cast member should not be down-weighted merely for
# appearing in relatively many movies.
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadt['soup'])
count_matrix.shape

In [None]:
# Pairwise cosine similarity between the 'soup' count vectors of all rows
from sklearn.metrics.pairwise import cosine_similarity

cos_sim2 = cosine_similarity(count_matrix, count_matrix)

In [None]:
# Rebuild the title -> row-number mapping for the merged frame.
# drop=True discards the old index instead of inserting it as an extra
# 'index' column, which also keeps this cell re-runnable.
metadt = metadt.reset_index(drop=True)
indices = pd.Series(metadt.index, index=metadt['title'])
# Keep only the first row of every title so that indices[title] is a single
# number even when a title occurs more than once.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# Get recommendations from the metadata-based similarity (cos_sim2).
# The earlier cos_sim matrix was built from the overviews of the frame as it
# was before rows were dropped and the tables merged, so its rows no longer
# line up with the current metadt / indices.
get_recommendation('Toy Story', cos_sim2)