In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import os

In [30]:
# Print the working directory as an indented tree: each directory name is
# prefixed with one '---' per nesting level, and the files it contains are
# printed one level deeper.
for root, dirs, files in os.walk("."):
    path = root.split(os.sep)
    depth = len(path)
    print('---' * (depth - 1), os.path.basename(root))
    for file in files:
        print('---' * depth, file)

 .
--- movies_metadata.csv
--- Simple-Reccomender-System.ipynb
--- .ipynb_checkpoints
------ Simple-Reccomender-System-checkpoint.ipynb


In [31]:
# Load the Kaggle movies metadata CSV from the notebook's working directory.
# low_memory=False has pandas parse the file in one piece instead of in
# chunks, which avoids mixed-type inference within a column.
metadt = pd.read_csv(filepath_or_buffer='movies_metadata.csv', low_memory=False)

# Preview the first three rows.
metadt.head(n=3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


Penggunaan rating untuk rekomendasi memiliki kekurangan:  
* Rating tidak memberikan gambaran popularitas produk. Misalnya, produk A memiliki rating 9.5 dari 20 voters, sedangkan produk B memiliki rating 8.7 dari 1000 voters. Mana yang lebih baik? Tentu saja produk B: rating yang diberikan oleh lebih banyak user lebih terpercaya dibandingkan rating tinggi dari sedikit user.
  
Oleh karena itu, perlu dilakukan pembobotan rating sebagaimana rumus di bawah ini:  
\begin{equation}
\text{Weighted Rating } (\mathbf{WR}) = \left(\frac{v}{v + m} \cdot R\right) + \left(\frac{m}{v + m} \cdot C\right)
\end{equation}  
Keterangan:  
v = jumlah voters untuk film tersebut (`vote_count`)  
m = minimum votes yang dibutuhkan untuk masuk dalam list  
R = rata-rata rating film tersebut (`vote_average`)  
C = mean atau rata-rata `vote_average` seluruh film dalam dataset

In [32]:
# C: mean of vote_average over every movie in the dataset.
C = metadt['vote_average'].mean()
print(C)

# m: vote-count cutoff for the chart, taken here as the 90th percentile
# of vote_count across the dataset.
m = metadt['vote_count'].quantile(q=0.9)
print(m)

5.618207215133889
160.0


In [33]:
# Keep only the movies whose vote_count is strictly greater than m.
# Filter first and copy afterwards: only the surviving rows are duplicated
# (instead of deep-copying the whole frame and then discarding most of it),
# and top_movies is an independent frame that later cells can add columns to.
# NOTE(review): the strict '>' leaves out movies with exactly m votes; if m is
# meant as an inclusive minimum this should be '>=' — confirm the intent.
top_movies = metadt.loc[metadt['vote_count'] > m].copy()
print('shape:', metadt.shape)
print('shape:', top_movies.shape)

shape: (45466, 24)
shape: (4538, 24)


In [34]:
def weighted_rating(data, m=m, C=C):
    """Return the weighted rating WR = v/(v+m) * R + m/(v+m) * C.

    Parameters
    ----------
    data
        Object indexable by column name that provides 'vote_count' (v)
        and 'vote_average' (R).
    m
        Vote-count term of the formula. Defaults to the module-level ``m``
        as it was when this function was defined; re-run the definition
        after recomputing ``m`` to pick up a new value.
    C
        Rating term weighted by m/(v+m). Defaults to the module-level ``C``
        as it was when this function was defined.

    Returns
    -------
    The weighted rating computed from the values in ``data``.
    """
    votes = data['vote_count']
    rating = data['vote_average']
    total = votes + m  # shared denominator of both weights
    return (votes / total * rating) + (m / total * C)

In [35]:
# Score every movie in one vectorized call. weighted_rating only indexes
# 'vote_count' and 'vote_average' and does arithmetic on them, so it accepts
# the whole DataFrame and returns a Series aligned on the index; a per-row
# Python call via apply(axis=1) is not needed.
top_movies['weighted_rating'] = weighted_rating(top_movies)

In [36]:
# Rank the movies by weighted rating, best first, then show the ten
# highest-scoring titles with the columns that feed the score.
top_movies = top_movies.sort_values(by='weighted_rating', ascending=False)
top_movies.head(10)[['title', 'vote_count', 'vote_average', 'weighted_rating']]

Unnamed: 0,title,vote_count,vote_average,weighted_rating
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171
