In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
ratings = pd.read_csv('data/ratings.dat', sep='::',
                      names=['user_id', 'movie_id', 'rating', 'rating_timestamp'],engine="python"
                      ).sort_values("rating_timestamp") # sort by datetime
ratings.describe()

Unnamed: 0,user_id,movie_id,rating,rating_timestamp
count,888452.0,888452.0,888452.0,888452.0
mean,34879.738435,2187479.0,7.316825,1459300000.0
std,20100.224164,2025072.0,1.853552,69025600.0
min,1.0,8.0,0.0,1362062000.0
25%,17775.0,765443.0,6.0,1396187000.0
50%,34764.5,1714206.0,8.0,1450370000.0
75%,51869.0,2883512.0,9.0,1513955000.0
max,69324.0,12920710.0,10.0,1600911000.0


In [3]:
movies = pd.read_csv('data/movies.dat', sep='::',
                      names=['movie_id','movie_title_year','genres'],engine="python"
                      ).sort_values("movie_title_year") # sort by movie title
movies.describe()
movies.head(10)

Unnamed: 0,movie_id,movie_title_year,genres
3975,68152,$ (1971),Comedy|Crime|Drama
11124,212555,$30 (1999),Comedy|Short
16621,1024733,$5 a Day (2008),Comedy|Drama
22588,2106284,$50K and a Call Girl: A Love Story (2014),Action|Adventure|Comedy|Drama|Romance
23431,2258233,$ellebrity (2012),Documentary
23837,2332503,&amp;Me (2013),Romance
25147,2614684,'71 (2014),Action|Drama|Thriller|War
5807,85127,'A' gai wak (1983),Action|Comedy
6642,92501,'A' gai wak 2 (1987),Action|Comedy|Crime
5259,80310,'Breaker' Morant (1980),Drama|History|War


In [4]:
print('''Join the ratings with movies''')
movie_ratings = (ratings
                  .set_index("movie_id")
                  .join(movies.set_index("movie_id"),
                        how="left")
                 )

movie_ratings['rating_year'] = movie_ratings['rating_timestamp'].apply(lambda ts: pd.to_datetime(ts, unit='s').year)
movie_ratings.head(5)

Join the ratings with movies


Unnamed: 0_level_0,user_id,rating,rating_timestamp,movie_title_year,genres,rating_year
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8,41985,5,1396981211,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short,2014
10,69122,10,1412878553,La sortie des usines Lumière (1895),Documentary|Short,2014
12,68097,10,1439248579,The Arrival of a Train (1896),Documentary|Short,2015
25,36818,8,1488189899,The Oxford and Cambridge University Boat Race ...,,2017
91,5692,6,1385233195,Le manoir du diable (1896),Short|Horror,2013


In [5]:
movie_ratings['rating_year'].describe()

count    890064.000000
mean       2015.773470
std           2.213189
min        2013.000000
25%        2014.000000
50%        2015.000000
75%        2017.000000
max        2020.000000
Name: rating_year, dtype: float64

In [6]:
print('''Separate the genres string into individual category indicators through get_dummies()''')

categories = movie_ratings['genres'].str.get_dummies()
categories.head(5)

tidy_movie_ratings = (pd.concat([movie_ratings, categories], axis=1)
                       .drop(["rating_timestamp"], axis=1)
                )

tidy_movie_ratings["production_year"] = tidy_movie_ratings["movie_title_year"].str[-5:-1]
tidy_movie_ratings["movie_title"] = tidy_movie_ratings["movie_title_year"].str[:-7]

tidy_movie_ratings=tidy_movie_ratings.drop(["movie_title_year"], axis=1)

tidy_movie_ratings.reset_index(inplace=True)

tidy_movie_ratings.head(2)


Separate the genres string into individual category indicators through get_dummies()


Unnamed: 0,movie_id,user_id,rating,genres,rating_year,Action,Adult,Adventure,Animation,Biography,...,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,production_year,movie_title
0,8,41985,5,Documentary|Short,2014,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1894,Edison Kinetoscopic Record of a Sneeze
1,10,69122,10,Documentary|Short,2014,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1895,La sortie des usines Lumière


In [7]:
print('''Add a weighted popularity column for movie, which uses the following factors:
1. # of ratings 6 or higher; 2. Year of production: higher (weak) weight to recent year if available; 
3. Year of rating; recent year get higher rating  
Formula: popularity = (rating-5)*10 + (130-year_production)/5 + (mean(year_rating-2013))*9''')

#tidy_movie_ratings["popularity"]
tidy_movie_ratings.groupby("movie_id").sum().head(5)
tidy_movie_ratings['movie_id'][tidy_movie_ratings['rating']>5].value_counts()

Add a weighted popularity column for movie, which uses the following factors:
1. # of ratings 6 or higher; 2. Year of production: higher (weak) weight to recent year if available; 
3. Year of rating; recent year get higher rating  
Formula: popularity = (rating-5)*10 + (130-year_production)/5 + (mean(year_rating-2013))*9


1454468     2938
816692      2769
8579674     2734
993846      2724
7286456     2644
            ... 
2563562        1
3456414        1
1312137        1
220514         1
11558924       1
Name: movie_id, Length: 31148, dtype: int64

In [8]:
# Calculate aggregtes so we can calculate combined rating as described above
grouped_ratings=tidy_movie_ratings.groupby(
   ['movie_id']).agg(
    {
        'rating': lambda x: (x>5).sum()/10,    # conditional count
         'rating_year': lambda x: ((x-2013).mean()*8),  # recent year gets bigger value
         'production_year': lambda x: (x.astype(int)-1800).mean()/30  # really small weight
    })

grouped_ratings['combinedRating'] = grouped_ratings['rating']+grouped_ratings['rating_year']+grouped_ratings['production_year']
grouped_ratings.sort_values('combinedRating',ascending=False).head(5)


Unnamed: 0_level_0,rating,rating_year,production_year,combinedRating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8579674,273.4,56.0,7.3,336.7
7286456,264.4,48.962801,7.3,320.662801
1454468,293.8,5.029014,7.1,305.929014
816692,276.9,18.352781,7.133333,302.386114
993846,272.4,11.309271,7.1,290.809271


In [9]:
grouped_ratings.describe(include='all')

Unnamed: 0,rating,rating_year,production_year,combinedRating
count,36380.0,36380.0,36380.0,36380.0
mean,2.092809,22.135042,6.678555,30.906407
std,10.430799,15.765317,0.702539,19.061504
min,0.0,0.0,2.6,3.366667
25%,0.1,8.0,6.433333,16.066667
50%,0.2,19.636364,7.0,28.390326
75%,0.6,32.0,7.166667,41.1
max,293.8,56.0,7.366667,336.7


In [10]:
cols_to_add = tidy_movie_ratings.columns.difference(grouped_ratings.columns)
grouped_ratings_merged =  pd.merge(grouped_ratings, tidy_movie_ratings, 
                     left_index = True, 
                     right_on = 'movie_id', 
                     how='left',
                     suffixes=(None,"_y")).sort_values('combinedRating',ascending=False).drop_duplicates(subset=['movie_id'])

genre_column_names=movie_ratings['genres'].str.get_dummies().columns

grouped_ratings_merged.head(5)

Unnamed: 0,rating,rating_year,production_year,combinedRating,movie_id,user_id,rating_y,genres,rating_year_y,Action,...,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,production_year_y,movie_title
877692,273.4,56.0,7.3,336.7,8579674,42014,9,Drama|War,2020,0,...,0,0,0,0,0,0,1,0,2019,1917
858090,264.4,48.962801,7.3,320.662801,7286456,10605,8,Crime|Drama|Thriller,2019,0,...,0,0,0,0,0,1,0,0,2019,Joker
371863,293.8,5.029014,7.1,305.929014,1454468,33294,8,Drama|Sci-Fi|Thriller,2013,0,...,0,1,0,0,0,1,0,0,2013,Gravity
236588,276.9,18.352781,7.133333,302.386114,816692,59922,9,Adventure|Drama|Sci-Fi,2014,0,...,0,1,0,0,0,0,0,0,2014,Interstellar
263509,272.4,11.309271,7.1,290.809271,993846,22210,8,Biography|Crime|Drama,2014,0,...,0,0,0,0,0,0,0,0,2013,The Wolf of Wall Street


In [11]:
# Generate global recommendation chart:
# movie title, year of release, Genres,  Popularity Score (proprietary score)
global_chart=grouped_ratings_merged[["movie_title","production_year","genres", "combinedRating"]].head(10)
print(global_chart)
global_chart.to_csv('charts/global_ratings.csv',index=False)
#grouped_ratings_merged.columns

                    movie_title  production_year  \
877692                     1917         7.300000   
858090                    Joker         7.300000   
371863                  Gravity         7.100000   
236588             Interstellar         7.133333   
263509  The Wolf of Wall Street         7.100000   
222862             Man of Steel         7.100000   
577561                Gone Girl         7.133333   
846948             Gisaengchung         7.300000   
326660           Iron Man Three         7.100000   
239710              World War Z         7.100000   

                                         genres  combinedRating  
877692                                Drama|War      336.700000  
858090                     Crime|Drama|Thriller      320.662801  
371863                    Drama|Sci-Fi|Thriller      305.929014  
236588                   Adventure|Drama|Sci-Fi      302.386114  
263509                    Biography|Crime|Drama      290.809271  
222862                  Action|

In [12]:
# Generate global chart for each genre
for genre in genre_column_names:
    genre_chart=grouped_ratings_merged[grouped_ratings_merged[genre]>0][["movie_title","production_year","genres", "combinedRating"]].head(10)
    print(genre_chart)
    genre_chart.to_csv('charts/'+genre+'.csv',index=False)

                       movie_title  production_year  \
222862                Man of Steel         7.100000   
326660              Iron Man Three         7.100000   
239710                 World War Z         7.100000   
427754                The Revenant         7.166667   
364701                    Deadpool         7.200000   
350583          Mad Max: Fury Road         7.166667   
796836                     Dunkirk         7.233333   
760620           Avengers: Endgame         7.300000   
487524  X-Men: Days of Future Past         7.133333   
357310     Star Trek Into Darkness         7.100000   

                                          genres  combinedRating  
222862                   Action|Adventure|Sci-Fi      248.304478  
326660                   Action|Adventure|Sci-Fi      233.280066  
239710   Action|Adventure|Horror|Sci-Fi|Thriller      230.990928  
427754  Action|Adventure|Biography|Drama|Western      224.797413  
364701            Action|Adventure|Comedy|Sci-Fi      206.6

               movie_title  production_year  \
371863             Gravity         7.100000   
236588        Interstellar         7.133333   
222862        Man of Steel         7.100000   
326660      Iron Man Three         7.100000   
239710         World War Z         7.100000   
364701            Deadpool         7.200000   
350583  Mad Max: Fury Road         7.166667   
462045                 Her         7.100000   
760620   Avengers: Endgame         7.300000   
630521             Arrival         7.200000   

                                         genres  combinedRating  
371863                    Drama|Sci-Fi|Thriller      305.929014  
236588                   Adventure|Drama|Sci-Fi      302.386114  
222862                  Action|Adventure|Sci-Fi      248.304478  
326660                  Action|Adventure|Sci-Fi      233.280066  
239710  Action|Adventure|Horror|Sci-Fi|Thriller      230.990928  
364701           Action|Adventure|Comedy|Sci-Fi      206.634754  
350583         Actio

In [14]:
print('''The above concludes the generation of chart as per the project ask. Regarding If I had more time, what would I add ? 
1. A better rating formula which will would be validated through users feedback and refined
2. Join with other datasets that can help clarify rating
3. Add images for each selected movie through use of IMDB or another site data for a richer experience
4. Use item-item association logic to give higher recommendation weight to items with stronger association with an 
existing popular item''')

The above concludes the generation of chart as per the project ask. Regarding If I had more time, what would I add ? 
1. A better rating formula which will would be validated through users feedback and refined
2. Join with other datasets that can help clarify rating
3. Add images for each selected movie through use of IMDB or another site data for a richer experience
4. Use item-item association logic to give higher recommendation weight to items with stronger association with an 
existing popular item
