In [3]:
import pandas as pd
import plotly.express as px

import os 

# Bar colours reused by the figures: blue, green, orange, red
k_colors = [
    'rgb(0,127,206)',
    'rgb(62,196,4)',
    'rgb(255,125,16)',
    'rgb(253,99,90)',
]
# Plotly template names to choose from; index 1 selects "plotly_white"
plotly_themes = [
    "plotly",
    "plotly_white",
    "plotly_dark",
    "ggplot2",
    "seaborn",
    "simple_white",
    "none",
]
theme = plotly_themes[1]


In [7]:
# Build the path to the data files from the current working directory.
notebook_path = os.getcwd()
# Drop the last 16 characters of the working directory and append "data/".
# NOTE(review): 16 presumably matches the length of the notebook folder name — confirm;
# the path breaks silently if the notebook is run from anywhere else.
project_path = "{}data/".format(notebook_path[: len(notebook_path) - 16])

In [9]:
# Read the movie table from the data folder and preview the first five rows
movies = pd.read_csv(f"{project_path}movie.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
# (number of rows, number of columns) of the loaded movie table
movies.shape

(27278, 3)

There are 27,278 movies, each with its title and genres. We must extract the release year from the title and split the genres into separate values.

## Year extraction

In [11]:
# Titles look like "Toy Story (1995)": move the four-digit year into its own column
# and leave only the name in `title`.
# NOTE: this cell rewrites `title` in place, so re-running it without reloading `movies`
# finds no "(dddd)" left in the titles and resets `year` to NaN.

# Raw strings keep the regex escapes intact; the capture group returns only the digits.
movies["year"] = movies["title"].str.extract(r"\((\d{4})\)", expand=False)

# regex=True must be explicit: since pandas 2.0 `str.replace` treats the pattern as a
# literal string by default, which would leave "(1995)" in every title.
movies["title"] = (
    movies["title"]
    .str.replace(r"\(\d{4}\)", "", regex=True)
    .str.strip()  # remove the whitespace left where the year used to be
)
movies.head(3)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995


In [12]:
# Number of movies per release year (rows without a year are left out by groupby)
movies_year_agg = (
    movies
    .groupby(["year"])
    .agg(count=("year", "count"))
    .reset_index()
)

# Bar chart of the yearly counts
fig = px.bar(
    movies_year_agg,
    x="year",
    y="count",
    width=1000,
    height=400,
    labels={"count": "Number of movies"},
    template=theme,
)
fig.update_traces(marker_color=k_colors[0])
fig.show()


## Genre extraction

In [13]:
# Split the pipe-separated genre string of each movie into a Python list
movies["genre_list"] = movies["genres"].str.split(pat="|")
movies.head(n=3)

Unnamed: 0,movieId,title,genres,year,genre_list
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji,Adventure|Children|Fantasy,1995,"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men,Comedy|Romance,1995,"[Comedy, Romance]"


In [14]:
# Gather every genre label of every movie into one flat list
flat_genre = []
for genres_of_movie in movies['genre_list']:
    flat_genre.extend(genres_of_movie)

# De-duplicate through a set, then go back to a list (order is whatever the set yields)
set_genre = set(flat_genre)
unique_genre = list(set_genre)
unique_genre

['IMAX',
 'Adventure',
 'Musical',
 'Documentary',
 'War',
 'Fantasy',
 'Children',
 'Film-Noir',
 'Horror',
 'Action',
 'Animation',
 'Sci-Fi',
 'Crime',
 'Western',
 'Mystery',
 '(no genres listed)',
 'Drama',
 'Comedy',
 'Romance',
 'Thriller']

In [15]:
# Count the real genres explicitly instead of subtracting 1, so the number stays right
# even when the "(no genres listed)" placeholder has already been removed from the list.
n_genres = sum(genre != "(no genres listed)" for genre in unique_genre)
print(f"Number of unique genres: {n_genres} without movies with no genre listed")

Number of unique genres: 19 without movies with no genre listed


In [16]:
# Drop the "(no genres listed)" placeholder so it does not become a genre column.
# Guarded so that re-running the cell does not raise ValueError.
if "(no genres listed)" in unique_genre:
    unique_genre.remove("(no genres listed)")

# One 0/1 indicator column per genre, built in a single vectorised pass over the
# pipe-separated `genres` string. Reindexing keeps the column order of `unique_genre`
# and discards the placeholder column that get_dummies creates.
genre_dummies = (
    movies["genres"]
    .str.get_dummies(sep="|")
    .reindex(columns=unique_genre, fill_value=0)
)

# Remove indicator columns left by an earlier run before attaching the fresh ones,
# so executing the cell twice does not duplicate columns.
movies = pd.concat(
    [movies.drop(columns=unique_genre, errors="ignore"), genre_dummies],
    axis=1,
)
movies.head(5)

Unnamed: 0,movieId,title,genres,year,genre_list,IMAX,Adventure,Musical,Documentary,War,...,Action,Animation,Sci-Fi,Crime,Western,Mystery,Drama,Comedy,Romance,Thriller
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,"[Adventure, Animation, Children, Comedy, Fantasy]",0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,2,Jumanji,Adventure|Children|Fantasy,1995,"[Adventure, Children, Fantasy]",0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,Comedy|Romance,1995,"[Comedy, Romance]",0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995,"[Comedy, Drama, Romance]",0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
4,5,Father of the Bride Part II,Comedy,1995,[Comedy],0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Percentage of genres in the movies

In [18]:
# Share of movies tagged with each genre: the mean of a 0/1 indicator column is the
# fraction of rows equal to 1, so values are in the 0-1 range despite the column name.
# The genre columns are selected explicitly: averaging the whole frame fails on
# pandas >= 2.0 because of the text/list columns, and slicing rows 1..19 by position
# silently depends on the column layout.
genre_columns = [genre for genre in unique_genre if genre in movies.columns]
movies_cat = movies[genre_columns].mean()
movies_cat_pd = (
    movies_cat
    .rename_axis("genre")
    .reset_index(name="percentage")
    .sort_values("percentage", ascending=False)
)

In [19]:
# Bar chart of the share of movies per genre, most frequent genre first
fig = px.bar(
    movies_cat_pd,
    x='genre',
    y='percentage',
    height=400,
    width=1000,
    # The key must be the plotted column name; the previous 'count' key matched
    # nothing, so the axis label was never applied.
    labels={'percentage': '% of genres in movies'},
    template=theme,
)
fig.update_traces(marker_color=k_colors[1])
fig.show()

More than 70% of the movies belong to the Drama or Comedy genres.