In [25]:
import pandas as pd
import plotly.express as px

import os 

k_colors = ['rgb(0,127,206)','rgb(62,196,4)','rgb(255,125,16)','rgb(253,99,90)']#blue,gree,orange,red
plotly_themes = ["plotly", "plotly_white", "plotly_dark", "ggplot2", "seaborn", "simple_white", "none"]#plotly_white plotly_dark
theme = plotly_themes[1]


In [3]:
#path to data files
notebook_path = os.getcwd()
project_path = notebook_path[0:len(notebook_path)-16]+"data/"

In [4]:
movies = pd.read_csv(project_path+'movie.csv')
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
movies.shape

(27278, 3)

There are 27,278 movies with  there genre and title. We must extract and clean the year and separate the genres.

In [6]:
ratings = pd.read_csv(project_path+'rating.csv')
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [7]:
ratings.shape

(20000263, 4)

In [8]:
ratings.userId.nunique()

138493

In [9]:
ratings.movieId.nunique()

26744

There are 138k unique users that have rated movies and almost all the movies have received a rating in the time

In [10]:
print("Less recent rating by user: "+min(ratings.timestamp)+"\nMost recent rating by user:"+max(ratings.timestamp))

Less recent rating by user: 1995-01-09 11:46:44
Most recent rating by user:2015-03-31 06:40:02


## Year extraction

Let stract the year from the tittle and put it in another column

In [11]:
movies["year"] = movies["title"].str.extract('(\(\d\d\d\d\))',expand=False) #everything with parentheis and numbers (1995)
movies['year'] = movies["year"].str.extract('(\d\d\d\d)',expand=False) #only numbers
movies['title'] = movies.title.str.replace('(\(\d\d\d\d\))', '')#replace numbers and parenthesis
movies['title'] = movies['title'].apply(lambda x: x.strip())
movies.head(3)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995


Lets see the distribution of movies in the dataset. As expected, most of the movies have been released in the last 2 decades.

In [26]:
movies_year_agg = movies.groupby(["year"]).agg(count =("year","count"))\
    .reset_index()
fig = px.bar(movies_year_agg, x='year', y='count', height=400,width = 1000,
labels={'count':'Number of movies'},
template=theme)
fig.update_traces(marker_color=k_colors[0])
fig.show()


## Genre extraction

In [13]:
movies['genre_list'] = movies['genres'].str.split('|').tolist()
movies.head(3)

Unnamed: 0,movieId,title,genres,year,genre_list
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji,Adventure|Children|Fantasy,1995,"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men,Comedy|Romance,1995,"[Comedy, Romance]"


In [14]:
flat_genre = [item for sublist in movies['genre_list'] for item in sublist] # flatten the list
set_genre = set(flat_genre) # convert to a set to make unique
unique_genre = list(set_genre) # back to list
unique_genre

['Western',
 'Mystery',
 'Children',
 'Drama',
 'Animation',
 'Film-Noir',
 'Crime',
 '(no genres listed)',
 'Romance',
 'Comedy',
 'Action',
 'War',
 'Documentary',
 'Musical',
 'Thriller',
 'Sci-Fi',
 'Horror',
 'Adventure',
 'Fantasy',
 'IMAX']

In [15]:
print("Number of unique genres: "+str(len(unique_genre)-1)+" without movies with no genre listed")

Number of unique genres: 19 without movies with no genre listed


In [16]:
# remove NA
unique_genre.remove("(no genres listed)")

# create columns by each unique genre
movies = movies.reindex(movies.columns.tolist() + unique_genre, axis=1, fill_value=0)

# for each value inside column, update the dummy
for index, row in movies.iterrows():
    for val in row["genres"].split('|'):
        if val != "(no genres listed)":
            movies.loc[index, val] = 1
movies.head(5)

Unnamed: 0,movieId,title,genres,year,genre_list,Western,Mystery,Children,Drama,Animation,...,Action,War,Documentary,Musical,Thriller,Sci-Fi,Horror,Adventure,Fantasy,IMAX
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,"[Adventure, Animation, Children, Comedy, Fantasy]",0,0,1,0,1,...,0,0,0,0,0,0,0,1,1,0
1,2,Jumanji,Adventure|Children|Fantasy,1995,"[Adventure, Children, Fantasy]",0,0,1,0,0,...,0,0,0,0,0,0,0,1,1,0
2,3,Grumpier Old Men,Comedy|Romance,1995,"[Comedy, Romance]",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995,"[Comedy, Drama, Romance]",0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II,Comedy,1995,[Comedy],0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Percentage of genres in the movies

In [17]:
#agregation of the genre in the movies
movies_cat = movies.mean()
movies_cat_pd = pd.DataFrame(movies_cat,columns=['percentage']).reset_index()
movies_cat_pd.columns = ["genre","percentage"]
movies_cat_pd = movies_cat_pd.loc[1:19,].sort_values("percentage",ascending  = False)

In [27]:
#ploting the movies with most genres
fig = px.bar(movies_cat_pd, x='genre', y='percentage', height=400,width = 1000,
labels={'count':'% of genres in movies'},
template=theme)
fig.update_traces(marker_color=k_colors[1])
fig.show()

More than 70% of the movies are of Drama or Comedy genre.

## Distribution of good ratings (>=4) by year

In [19]:
ratings["year_month"] = pd.DatetimeIndex(ratings['timestamp']).year*100+\
pd.DatetimeIndex(ratings['timestamp']).month
ratings["year_rating"] = pd.DatetimeIndex(ratings['timestamp']).year
ratings['high'] = 0
ratings.loc[ratings['rating']>= 4.0,"high"]=1


In [20]:
ratings_by_year = ratings.groupby(["year_rating"]).agg({"high":"mean"})\
.reset_index()
ratings_by_year.columns = ["year","mean_rating"]

In [28]:
fig = px.line(ratings_by_year, x='year', y='mean_rating', height=400,width = 1000,
labels={'mean_rating':'Mean of high rates'},
template=theme)
fig.update_traces(marker_color=k_colors[0])
fig.show()

In the last decade of the 1900's the rating of the users that considered movies as good movies were a little higher than usual, then for the next 5 years it had a llittle decrement. Then it started to increment for 10 years and the final 2 years it has started to fall again.

Lets have a zoom on the last 2 years and have a glimpse on every month.

In [29]:

ratings_by_year_month = ratings.groupby(["year_month"]).agg({"high":"mean"})\
.reset_index()
ratings_by_year_month.columns = ["year_month","mean_rating"]
ratings_by_year_month['year_month'] = ratings_by_year_month.year_month.astype(str)

fig = px.line(ratings_by_year_month[ratings_by_year_month["year_month"]>='201301'], x='year_month', y='mean_rating', height=400,width = 1000,
labels={'mean_rating':'Mean of high rates','year_month':'Year and month'},
template=theme)
fig.update_traces(marker_color=k_colors[1])
fig.show()

As we can see, in general we have a balance dataset oscillating from .44 through .57 by reviewing the months of the last 2 years of ratings with a tendency of less high rates on the last months 

Lets check the diference between high rated movies by genre in the last 2 years:

In [23]:
last_2_years_ratings = ratings[ratings["year_month"]>=201301]
movies_genre_gather = movies[["movieId","genre_list"]].explode("genre_list")
movies_ratings_genre=last_2_years_ratings.merge(movies_genre_gather,how = "left")

genre_ratings=movies_ratings_genre.groupby(["genre_list","year_month"]).agg({"movieId":"count",
                                "high": ["mean", "median"]})\
.reset_index()
genre_ratings.columns = ["genre_list","year_month","count","mean_rating","median_rating"]
genre_ratings.head(3)

Unnamed: 0,genre_list,year_month,count,mean_rating,median_rating
0,(no genres listed),201302,1,0.0,0.0
1,(no genres listed),201305,1,1.0,1.0
2,(no genres listed),201310,1,1.0,1.0


In [30]:
genre_ratings['year_month'] = genre_ratings.year_month.astype(str)

fig = px.line(genre_ratings[genre_ratings["genre_list"]!="(no genres listed)"], x='year_month', y='mean_rating', color="genre_list",
              line_group="genre_list", hover_name="count", height=400,width = 1000,
labels={'mean_rating':'Mean of high rates','year_month':'Year and month'},
template=theme)
fig.update_traces(marker_color=k_colors[1])
fig.show()


As we can see, there are 2 genres that generally are in the top of the the users than rate them with high scores: War and Film-Noir; as in contrast, the Horror genre hast the least amount of users, in general that rate those movies with 4 or more rate.