In [None]:
!pip install currencyconverter

In [None]:
# Import some libraries

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')
from currency_converter import CurrencyConverter
import datetime
from wordcloud import WordCloud, STOPWORDS 
import textwrap
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the three raw IMDb CSV files (movies, names, title principals) into pandas DataFrames.
# The paths are relative to the notebook's working directory.

data_imdb_movies = pd.read_csv('../input/imdb-extensive-dataset/IMDb movies.csv')
data_imdb_names = pd.read_csv('../input/imdb-extensive-dataset/IMDb names.csv')
data_imdb_title_principals = pd.read_csv('../input/imdb-extensive-dataset/IMDb title_principals.csv')

In [None]:
# Work on a copy so the raw movies frame stays unmodified, then preview the first rows
imdb_movies = data_imdb_movies.copy()
imdb_movies.head()

In [None]:
# Work on a copy so the raw names frame stays unmodified, then preview the first rows
imdb_names = data_imdb_names.copy()
imdb_names.head()

In [None]:
# Work on a copy so the raw title-principals frame stays unmodified, then preview the first rows
imdb_title_principals = data_imdb_title_principals.copy()
imdb_title_principals.head()

In [None]:
# Schema summary of each frame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" after every report.

frames_to_describe = (imdb_movies, imdb_names, imdb_title_principals)
for position, frame in enumerate(frames_to_describe):
    if position:
        # Blank separator between consecutive reports
        print('\n')
    frame.info()

# Data Preprocessing & Exploratory Data Analysis

In [None]:
# Merge name to title principals (inner join on imdb_name_id).
# A 'name' column left over from a previous run of this cell is dropped first;
# otherwise a re-run would produce name_x / name_y and the column selection
# below would fail with a KeyError.
imdb_title_principals = (
    imdb_title_principals
    .drop(columns = ['name'], errors = 'ignore')
    .merge(imdb_names[['imdb_name_id', 'name']], on = 'imdb_name_id')
)
# Ordering columns
imdb_title_principals = imdb_title_principals[['imdb_title_id', 'ordering', 'imdb_name_id', 'name', 'category', 'job', 'characters']]
imdb_title_principals.head()

In [None]:
# Add a "cinematographer" column to imdb_movies, taken from the principals table
cinematographer_name = imdb_title_principals[imdb_title_principals['category'] == 'cinematographer'].reset_index()
cinematographer_name = cinematographer_name.rename(columns = {'name' : 'cinematographer'})
imdb_movies = imdb_movies.merge(cinematographer_name[['imdb_title_id', 'cinematographer']],
                                on = 'imdb_title_id', how = 'left')

# The left join repeats a movie once per cinematographer. Collect those repeated
# rows and collapse the names of each title into one comma-separated string.
duplicated_data = imdb_movies[imdb_movies['imdb_title_id'].duplicated(keep = False)]
multiple_names_cinematographer = (
    duplicated_data.groupby('imdb_title_id')['cinematographer']
    .apply(', '.join)
    .reset_index()
)
# One row per repeated title, without the single-name column ...
duplicated_data = (
    duplicated_data
    .drop(['cinematographer'], axis = 1)
    .drop_duplicates(subset = ['imdb_title_id'])
)
# ... and with the combined names attached instead
data_multiple_names = duplicated_data.merge(multiple_names_cinematographer[['imdb_title_id', 'cinematographer']],
                                            on = 'imdb_title_id')
data_multiple_names[['imdb_title_id', 'cinematographer']].head()

In [None]:
# Replace the per-cinematographer duplicate rows with the single combined row per title.
# Rows are removed by membership in data_multiple_names (rather than "is duplicated"),
# so re-running this cell does not append the combined rows a second time.
has_combined_row = imdb_movies['imdb_title_id'].isin(data_multiple_names['imdb_title_id'])
imdb_movies = pd.concat((imdb_movies[~has_combined_row], data_multiple_names), sort = False).sort_values('imdb_title_id')

# Reorder column cinematographer: move it to position 13 *by name*.
# The merge appended it as the last column; a purely positional shuffle would
# move a different column if this cell were executed again.
cols = imdb_movies.columns.tolist()
cols.remove('cinematographer')
cols.insert(13, 'cinematographer')
imdb_movies = imdb_movies[cols]
imdb_movies.head()

In [None]:
# Filtering data (only USA or Hollywood Movies)
# Missing countries are filled by plain assignment: an inplace fillna() on a
# selected column acts on a temporary object under pandas copy-on-write and
# would leave NaN in the frame, which then breaks the boolean filter below.
imdb_movies['country'] = imdb_movies['country'].fillna('')
imdb_movies = imdb_movies[imdb_movies['country'].str.contains('USA')]

In [None]:
def _split_currency_amount(raw_money):
    """Split '<currency> <amount>' strings into a currency Series and a numeric amount Series.

    Parameters
    ----------
    raw_money : pandas.Series of str (missing values allowed)
        Text before the first space is taken as the currency, the text after it
        as the amount.

    Returns
    -------
    (currency, amount) : tuple of pandas.Series
        `currency` has every literal '$' replaced by 'USD'; `amount` is numeric,
        with NaN wherever the amount is absent or not parseable.
    """
    parts = raw_money.str.split(' ')
    # regex=False: '$' must be matched literally, not as the end-of-string anchor
    currency = parts.str[0].str.replace('$', 'USD', regex = False)
    # .str[1] yields NaN (instead of failing) when a value contains no space
    amount = pd.to_numeric(parts.str[1], errors = 'coerce')
    return currency, amount


# For each money column: create '<column>_currency' and turn the column itself
# into numeric values, ready for conversion to USD
for money_col in ['budget', 'worlwide_gross_income', 'usa_gross_income']:
    imdb_movies[money_col + '_currency'], imdb_movies[money_col] = _split_currency_amount(imdb_movies[money_col])


In [None]:
# Convert currencies into USD

c = CurrencyConverter()


def _usd_rates(currency_codes, converter):
    """Return {currency code: USD value of one unit} for every code the converter supports.

    Codes that are missing or not in `converter.currencies` are left out, so a
    later `.map()` with this dict yields NaN for them.
    """
    return {code: converter.convert(1, code, 'USD')
            for code in currency_codes.dropna().unique()
            if code in converter.currencies}


# One rate lookup per distinct currency and one vectorised multiplication per
# column. This replaces element-wise `frame[col].iloc[i] = ...` writes, which are
# chained assignments: slow, and lost entirely under pandas copy-on-write.
# Results can differ from a per-row convert() call only by float rounding.
for money_col in ['budget', 'worlwide_gross_income', 'usa_gross_income']:
    currency_col = money_col + '_currency'
    rates = _usd_rates(imdb_movies[currency_col], c)
    # Unsupported / missing currency -> NaN rate -> NaN amount
    imdb_movies[money_col] = imdb_movies[money_col] * imdb_movies[currency_col].map(rates)


## Analysis of Numeric and Categorical Data

### Analysis of Numeric Data

In [None]:
# Columns treated as numeric features in the analysis below
num_data = ['duration', 'avg_vote', 'votes', 'budget', 'usa_gross_income', 'worlwide_gross_income', 
            'metascore', 'reviews_from_users', 'reviews_from_critics']
# Summary statistics (count, mean, std, quartiles, min/max) of those columns
imdb_movies[num_data].describe()

In [None]:
# Visualize distplot and boxplot on each numerical data/columns

# The grid has one row per entry of num_data (instead of a hard-coded 9), so
# editing the column list cannot desynchronise it from the axes grid.
# squeeze=False keeps `ax` two-dimensional even for a single row.
n_rows = len(num_data)
row_height = 24 / 9   # keeps the original 14x24 figure for the 9 default columns
fig, ax = plt.subplots(n_rows, 2, figsize = (14, row_height * n_rows), squeeze = False)
fig.tight_layout(pad = 5)

for i, n in enumerate(num_data):
    values = imdb_movies[n].dropna()

    # Left panel: distribution, with the column's skewness in the legend
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases -- confirm the installed version
    sns.distplot(ax = ax[i,0], a = values, label = 'skewness : %.2f'%(imdb_movies[n].skew()))
    ax[i,0].set_title(n, fontsize = 18)
    ax[i,0].legend(loc = 'best')

    # Right panel: box plot of the same values
    sns.boxplot(ax = ax[i, 1], x = values)
    ax[i, 1].set_title(n, fontsize = 18)
plt.show()

### Analysis of Categorical Data

#### Which Decade Has Released the Most Movies and Has the Highest Average Vote (Rating)

In [None]:
# Clean the data on 'year' column: map the value 'TV Movie 2019' to 2019 and cast to int.
# Done by assignment -- an inplace replace() on a selected column acts on a
# temporary object under pandas copy-on-write and would leave the string in place.
imdb_movies['year'] = imdb_movies['year'].replace('TV Movie 2019', 2019).astype(int)

# Group the data based on Decades (explicit copy: a column is inserted below)
movies_by_decades = imdb_movies[['imdb_title_id', 'original_title', 'year','avg_vote', 'votes']].copy()
decade_start = movies_by_decades['year'] // 10 * 10
# Label such as '1990 - 1999'
decades = decade_start.astype(str) + ' - ' + (decade_start + 9).astype(str)
movies_by_decades.insert(3, 'decades', decades)
movies_by_decades.head()

In [None]:
# Visualize movie counts release based on decade with barchart

fig, ax = plt.subplots(figsize = (16,4))
# Aggregate once: the index gives the x categories, the values the bar heights
count = movies_by_decades.groupby('decades')['imdb_title_id'].count()
decades = count.index
sns.barplot(ax = ax, x = decades, y = count)
ax.set_title('Movie Counts Based on Decade', fontsize = 18)
ax.set_xlabel('Decade')
ax.set_ylabel('No. of Movies')
# Value label above every bar; the loop uses its own name so `count` keeps
# referring to the Series after the cell has run
for index, n_movies in enumerate(count.astype(int)):
    ax.text(x = index - 0.15, y = n_movies + 1, s = f"{n_movies}", fontdict = dict(fontsize = 10))
plt.show()

In [None]:
# Visualize average vote (rating) based on decade with barchart

fig, ax = plt.subplots(figsize = (16,4))
# Aggregate once: the index gives the x categories, the values the bar heights
avg_vote = movies_by_decades.groupby('decades')['avg_vote'].mean()
decades = avg_vote.index
sns.barplot(ax = ax, x = decades, y = avg_vote)
ax.set_title('Average Vote (Rating) Based on Decade', fontsize = 18)
ax.set_xlabel('Decade')
ax.set_ylabel('Average Vote')
# Rounded value label at the top of every bar; the loop uses its own name so
# `avg_vote` keeps referring to the Series after the cell has run
for index, vote in enumerate(np.round(avg_vote, 2)):
    ax.text(x = index - 0.15, y = vote, s = f"{vote}", fontdict = dict(fontsize = 10))
# Zoomed y-range: the bars do not start at zero
ax.set_ylim((5, 7))

plt.show()

#### Which Month Has Released the Most Movies

In [None]:
# Preprocess the data

# Map the value 'TV Movie 2019' to 2019 by assignment (an inplace replace() on a
# selected column acts on a temporary object under pandas copy-on-write).
imdb_movies['date_published'] = imdb_movies['date_published'].replace('TV Movie 2019', 2019)
movies_published = imdb_movies[['imdb_title_id', 'original_title', 'genre', 'date_published']].copy()

# Characters 5-6 of the date text are used as the month
# (assumes 'YYYY-MM-DD'-style strings -- TODO confirm against the CSV).
# Values too short to contain a month give '' which is replaced with NaN.
movies_published['month_published'] = (
    movies_published['date_published'].astype(str).str[5:7].replace('', np.nan)
)
movies_published.head()

In [None]:
# Visualize movie counts release based on month with barchart

fig, ax = plt.subplots(figsize = (16,4))
# Aggregate once: the index gives the x categories, the values the bar heights
count_movies = movies_published.groupby('month_published')['imdb_title_id'].count()
months_published = count_movies.index
sns.barplot(ax = ax, x = months_published, y = count_movies)
ax.set_title('Movie Counts Based on Month', fontsize = 18)
ax.set_xlabel('Month')
ax.set_ylabel('No. of Movies')
# Value label at the top of every bar (own loop name: `count_movies` stays a Series)
for index, n_movies in enumerate(count_movies):
    ax.text(x = index - 0.15, y = n_movies, s = f"{n_movies}", fontdict = dict(fontsize = 10))

# Derive each tick label from the month code actually plotted, so a month that
# is absent from the data cannot shift the remaining labels onto wrong bars.
month_names = {f"{number:02d}": name for number, name in enumerate(
    ['January', 'February', 'March', 'April', 'May', 'June',
     'July', 'August', 'September', 'October', 'November', 'December'], start = 1)}
ax.set_xticklabels([month_names.get(month, month) for month in months_published])

plt.show()

#### Word Cloud of the Genre Column


In [None]:
# Word cloud built from the text of the genre column

stop_words = set(STOPWORDS)

# Every genre entry is lower-cased token by token and followed by one space,
# then all entries are glued into a single text for the word cloud
comment_words = ''.join(
    " ".join(token.lower() for token in str(genre_value).split()) + " "
    for genre_value in imdb_movies['genre']
)

wordcloud = WordCloud(
    width = 800,
    height = 600,
    background_color = 'black',
    stopwords = stop_words,
    min_font_size = 10,
).generate(comment_words)

# Render the cloud as an image without grid lines or padding
fig, ax = plt.subplots(figsize = (8, 6))
ax.grid(False)
ax.imshow(wordcloud)
fig.tight_layout(pad=0)
plt.show()

#### Which Genre Has Released the Most Movies and Has the Highest Average Vote (Rating)

In [None]:
# Build one row per (movie, genre): the genre column stores comma-separated values

movies_genre = imdb_movies[['imdb_title_id', 'original_title', 'genre', 'avg_vote']].assign(
    genre = lambda frame: frame['genre'].astype('str'))

# Split on commas and give every piece its own row, keyed by the movie id
genre_split = (
    movies_genre.set_index('imdb_title_id')['genre']
    .str.split(',')
    .explode()
    .rename('genre_split')
    .reset_index()
)
# Bring title and rating back for every split row
movies_genre_split = genre_split.merge(
    movies_genre[['imdb_title_id', 'original_title', 'avg_vote']], on = 'imdb_title_id')
# Remove the blanks that surrounded the commas
movies_genre_split['genre_split'] = movies_genre_split['genre_split'].str.strip(' ')
movies_genre_split.head()

In [None]:
# Visualize top 5 genres based on movie counts and based on average vote (rating)

fig, ax = plt.subplots(1, 2, figsize = (16,6))
by_genre = movies_genre_split.groupby('genre_split')

# Left: share of movies among the five most frequent genres.
# Each ranking is computed once; labels come from its own index.
count_movies = by_genre['imdb_title_id'].count().sort_values(ascending = False).head(5)
ax[0].pie(x=count_movies, autopct="%.2f%%", labels=count_movies.index, pctdistance=0.5)
ax[0].set_title('Top 5 Genres Based on Movie Counts', fontsize = 18)

# Right: the five genres with the highest mean rating
avg_votes = by_genre['avg_vote'].mean().sort_values(ascending = False).head(5)
genres = avg_votes.index
sns.barplot(ax = ax[1], x = genres, y = avg_votes)
ax[1].set_title('Top 5 Genres Based on Average Vote (Rating)', fontsize = 18)
ax[1].set_xlabel('Genre')
ax[1].set_ylabel('Average Vote')
# Rounded value label per bar (own loop name: `avg_votes` stays a Series)
for index, vote in enumerate(round(avg_votes, 2)):
    ax[1].text(x = index - 0.1, y = vote, s = f"{vote}", fontdict = dict(fontsize = 10))
# Zoomed y-range: the bars do not start at zero
ax[1].set_ylim(6, 8)

plt.show()

#### Which Director Has Released the Most Movies and Has the Highest Average Vote (Rating)

In [None]:
# Build one row per (movie, director): the director column stores comma-separated names

movies_director = imdb_movies[['imdb_title_id', 'original_title', 'director', 'avg_vote']].assign(
    director = lambda frame: frame['director'].astype('str'))   # missing values become the text 'nan'

# Split on commas and give every name its own row, keyed by the movie id
director_split = (
    movies_director.set_index('imdb_title_id')['director']
    .str.split(',')
    .explode()
    .rename('director_split')
    .reset_index()
)
movies_director_split = director_split.merge(
    movies_director[['imdb_title_id', 'original_title', 'avg_vote']], on = 'imdb_title_id')
movies_director_split['director_split'] = movies_director_split['director_split'].str.strip(' ')

# Movie count and mean rating per director, without the 'nan' placeholder group
gb_director = movies_director_split.groupby('director_split').agg({ 'imdb_title_id' : ['count'], 'avg_vote': ['mean']})
gb_director = gb_director[gb_director.index != 'nan']
gb_director.head()

In [None]:
# Visualize top 10 directors based on movie counts with barchart

max_width = 15   # wrap tick labels at this many characters
fig, ax = plt.subplots(figsize = (16,4))
# Rank once; the names are the index of the ranked Series
count_movies = gb_director[('imdb_title_id', 'count')].sort_values(ascending = False).head(10)
directors = count_movies.index
sns.barplot(ax = ax, x = directors, y = count_movies)
ax.set_title('Top 10 Directors Based on Movie Counts', fontsize = 18)
ax.set_xlabel('Director')
# A list (not a generator) is passed: set_xticklabels expects a sequence
ax.set_xticklabels([textwrap.fill(tick.get_text(), max_width) for tick in ax.get_xticklabels()], fontsize = 10)
# Value label per bar (own loop name: `count_movies` stays a Series)
for index, n_movies in enumerate(count_movies):
    ax.text(x = index - 0.05, y = n_movies, s = f"{n_movies}", fontdict = dict(fontsize = 10))
ax.set_ylabel('No. of Movies')
# Zoomed y-range: the bars do not start at zero
ax.set_ylim(50, 100)
plt.show()

In [None]:
# Visualize top 10 directors based on average vote (rating) with barchart

max_width = 15   # wrap tick labels at this many characters
fig, ax = plt.subplots(figsize = (16,4))

# Specification : only directors with at least 5 movies.
# The filter uses the count column gb_director already holds, so it always has
# the same index as the frame it filters.
eligible = gb_director[gb_director[('imdb_title_id', 'count')] >= 5]
avg_vote = eligible[('avg_vote',  'mean')].sort_values(ascending = False).head(10)
directors = avg_vote.index

sns.barplot(ax = ax, x = directors, y = avg_vote)
ax.set_title('Top 10 Directors Based on Average Vote (Rating)', fontsize = 18)
ax.set_xlabel('Director')
# A list (not a generator) is passed: set_xticklabels expects a sequence
ax.set_xticklabels([textwrap.fill(tick.get_text(), max_width) for tick in ax.get_xticklabels()], fontsize = 10)
# Rounded value label per bar (own loop name: `avg_vote` stays a Series)
for index, vote in enumerate(round(avg_vote, 2)):
    ax.text(x = index - 0.1, y = vote, s = f"{vote}", fontdict = dict(fontsize = 10))
ax.set_ylabel('Average Vote')
# Zoomed y-range: the bars do not start at zero
ax.set_ylim(7.4, 8.4)
plt.show()

#### Which Writer Has Released the Most Movies and Has the Highest Average Vote (Rating)

In [None]:
# Build one row per (movie, writer): the writer column stores comma-separated names

movies_writer = imdb_movies[['imdb_title_id', 'original_title', 'writer', 'avg_vote']].assign(
    writer = lambda frame: frame['writer'].astype('str'))   # missing values become the text 'nan'

# Split on commas and give every name its own row, keyed by the movie id
writer_split = (
    movies_writer.set_index('imdb_title_id')['writer']
    .str.split(',')
    .explode()
    .rename('writer_split')
    .reset_index()
)
movies_writer_split = writer_split.merge(
    movies_writer[['imdb_title_id', 'original_title', 'avg_vote']], on = 'imdb_title_id')
movies_writer_split['writer_split'] = movies_writer_split['writer_split'].str.strip(' ')

# Movie count and mean rating per writer, without the 'nan' placeholder group
gb_writer = movies_writer_split.groupby('writer_split').agg({ 'imdb_title_id' : ['count'], 'avg_vote': ['mean']})
gb_writer = gb_writer[gb_writer.index != 'nan']
gb_writer.head()

In [None]:
# Visualize top 10 writers based on movie counts with barchart

max_width = 15   # wrap tick labels at this many characters
fig, ax = plt.subplots(figsize = (16,4))

# Rank once; the names are the index of the ranked Series
count_movies = gb_writer[('imdb_title_id', 'count')].sort_values(ascending = False).head(10)
writers = count_movies.index

sns.barplot(ax = ax, x = writers, y = count_movies)
ax.set_title('Top 10 Writers Based on Movie Counts', fontsize = 18)
ax.set_xlabel('Writer')
# Wrap long names like the other top-10 charts do (max_width was defined here but never applied)
ax.set_xticklabels([textwrap.fill(tick.get_text(), max_width) for tick in ax.get_xticklabels()], fontsize = 10)
# Value label per bar (own loop name: `count_movies` stays a Series)
for index, n_movies in enumerate(count_movies):
    ax.text(x = index - 0.05, y = n_movies, s = f"{n_movies}", fontdict = dict(fontsize = 10))
ax.set_ylabel('No. of Movies')
# Zoomed y-range: the bars do not start at zero
ax.set_ylim(30, 60)
plt.show()

In [None]:
# Visualize top 10 writers based on average vote (rating) with barchart

max_width = 15   # wrap tick labels at this many characters
fig, ax = plt.subplots(figsize = (16,4))

# Specification : only writers with at least 5 movies.
# The filter uses the count column gb_writer already holds, so it always has
# the same index as the frame it filters.
eligible = gb_writer[gb_writer[('imdb_title_id', 'count')] >= 5]
avg_vote = eligible[('avg_vote',  'mean')].sort_values(ascending = False).head(10)
writers = avg_vote.index

sns.barplot(ax = ax, x = writers, y = avg_vote)
ax.set_title('Top 10 Writers Based on Average Vote (Rating)', fontsize = 18)
ax.set_xlabel('Writer')
# A list (not a generator) is passed: set_xticklabels expects a sequence
ax.set_xticklabels([textwrap.fill(tick.get_text(), max_width) for tick in ax.get_xticklabels()], fontsize = 10)
# Rounded value label per bar (own loop name: `avg_vote` stays a Series)
for index, vote in enumerate(round(avg_vote, 2)):
    ax.text(x = index - 0.1, y = vote, s = f"{vote}", fontdict = dict(fontsize = 10))
ax.set_ylabel('Average Vote')
# Zoomed y-range: the bars do not start at zero
ax.set_ylim(7.4, 8.8)
plt.show()

#### Which Production Company Has Released the Most Movies and Has the Highest Average Vote (Rating)

In [None]:
# Preprocess production company column data

# Explicit copy: a column of this selection is overwritten on the next line
movies_productioncomp = imdb_movies[['imdb_title_id', 'original_title', 'production_company', 'avg_vote']].copy()
movies_productioncomp['production_company'] = movies_productioncomp['production_company'].astype('str')

# Movie count and mean rating per production company
gb_productioncomp = movies_productioncomp.groupby('production_company').agg({ 'imdb_title_id' : ['count'], 'avg_vote': ['mean']})
# Remove the placeholder groups: '' and the text 'nan' produced by astype('str')
# on missing values. Labels are listed explicitly because `Index | Index` is no
# longer a set union in current pandas; errors='ignore' covers absent labels.
gb_productioncomp = gb_productioncomp.drop(index = ['', 'nan'], errors = 'ignore')
gb_productioncomp.head()

In [None]:
# Visualize top 10 production companies based on movie counts with barchart

max_width = 15   # wrap tick labels at this many characters
fig, ax = plt.subplots(figsize = (16,4))

# Rank once; the names are the index of the ranked Series
count_movies = gb_productioncomp[('imdb_title_id', 'count')].sort_values(ascending = False).head(10)
productioncomp = count_movies.index

sns.barplot(ax = ax, x = productioncomp, y = count_movies)
ax.set_title('Top 10 Production Companies Based on Movie Counts', fontsize = 18)
ax.set_xlabel('Production Company')
# A list (not a generator) is passed: set_xticklabels expects a sequence
ax.set_xticklabels([textwrap.fill(tick.get_text(), max_width) for tick in ax.get_xticklabels()], fontsize = 10)
# Value label slightly above each bar (own loop name: `count_movies` stays a Series)
for index, n_movies in enumerate(count_movies):
    ax.text(x = index - 0.1, y = n_movies + 0.6, s = f"{n_movies}", fontdict = dict(fontsize = 10))
ax.set_ylabel('No. of Movies')
# Zoomed y-range: the bars do not start at zero
ax.set_ylim(100, 1400)
plt.show()

In [None]:
# Visualize top 10 production companies based on average vote (rating) with barchart

max_width = 15   # wrap tick labels at this many characters
fig, ax = plt.subplots(figsize = (16,4))

# Specification : only companies with at least 20 movies.
# The filter uses the count column gb_productioncomp already holds, so it
# always has the same index as the frame it filters.
eligible = gb_productioncomp[gb_productioncomp[('imdb_title_id', 'count')] >= 20]
avg_vote = eligible[('avg_vote',  'mean')].sort_values(ascending = False).head(10)
productioncomp = avg_vote.index

sns.barplot(ax = ax, x = productioncomp, y = avg_vote)
ax.set_title('Top 10 Production Companies Based on Average Vote (Rating)', fontsize = 18)
ax.set_xlabel('Production Company')
# A list (not a generator) is passed: set_xticklabels expects a sequence
ax.set_xticklabels([textwrap.fill(tick.get_text(), max_width) for tick in ax.get_xticklabels()], fontsize = 10)
# Rounded value label slightly above each bar (own loop name: `avg_vote` stays a Series)
for index, vote in enumerate(round(avg_vote, 2)):
    ax.text(x = index - 0.1, y = vote + 0.005, s = f"{vote}", fontdict = dict(fontsize = 10))
ax.set_ylabel('Average Vote')
# Zoomed y-range: the bars do not start at zero
ax.set_ylim(6.4, 7.2)
plt.show()

#### Which Actor Has Released the Most Movies and Has the Highest Average Vote (Rating)

In [None]:
# Build one row per (movie, actor): the actors column stores comma-separated names

movies_actor = imdb_movies[['imdb_title_id', 'original_title', 'actors', 'avg_vote']].assign(
    actors = lambda frame: frame['actors'].astype('str'))   # missing values become the text 'nan'

# Split on commas and give every name its own row, keyed by the movie id
actor_split = (
    movies_actor.set_index('imdb_title_id')['actors']
    .str.split(',')
    .explode()
    .rename('actor_split')
    .reset_index()
)
movies_actor_split = actor_split.merge(
    movies_actor[['imdb_title_id', 'original_title', 'avg_vote']], on = 'imdb_title_id')
movies_actor_split['actor_split'] = movies_actor_split['actor_split'].str.strip(' ')

# Movie count and mean rating per actor, without the 'nan' placeholder group
gb_actor = movies_actor_split.groupby('actor_split').agg({ 'imdb_title_id' : ['count'], 'avg_vote': ['mean']})
gb_actor = gb_actor[gb_actor.index != 'nan']
gb_actor.head()

In [None]:
# Visualize top 10 actors based on movie counts with barchart

max_width = 15   # wrap tick labels at this many characters
fig, ax = plt.subplots(figsize = (16,4))

# Rank once; the names are the index of the ranked Series
count_movies = gb_actor[('imdb_title_id', 'count')].sort_values(ascending = False).head(10)
actor = count_movies.index

sns.barplot(ax = ax, x = actor, y = count_movies)
# Labels corrected: "Actess" was a misspelling of "Actress"
ax.set_title('Top 10 Actors/Actress Based on Movie Counts', fontsize = 18)
ax.set_xlabel('Actor/Actress')
# A list (not a generator) is passed: set_xticklabels expects a sequence
ax.set_xticklabels([textwrap.fill(tick.get_text(), max_width) for tick in ax.get_xticklabels()], fontsize = 10)
# Value label per bar (own loop name: `count_movies` stays a Series)
for index, n_movies in enumerate(count_movies):
    ax.text(x = index - 0.1, y = n_movies, s = f"{n_movies}", fontdict = dict(fontsize = 10))
ax.set_ylabel('No. of Movies')
# Zoomed y-range: the bars do not start at zero
ax.set_ylim(90, 200)
plt.show()

In [None]:
# Visualize top 10 actors based on average vote (rating) with barchart

max_width = 15   # wrap tick labels at this many characters
fig, ax = plt.subplots(figsize = (16,4))

# Specification : only actors/actresses appearing in at least 10 movies.
# The filter uses the count column gb_actor already holds, so it always has
# the same index as the frame it filters.
eligible = gb_actor[gb_actor[('imdb_title_id', 'count')] >= 10]
avg_vote = eligible[('avg_vote',  'mean')].sort_values(ascending = False).head(10)
actor = avg_vote.index

sns.barplot(ax = ax, x = actor, y = avg_vote)
ax.set_title('Top 10 Actors/Actress Based on Average Vote (Rating)', fontsize = 18)
# Label corrected: "Actess" was a misspelling of "Actress"
ax.set_xlabel('Actor/Actress')
# A list (not a generator) is passed: set_xticklabels expects a sequence
ax.set_xticklabels([textwrap.fill(tick.get_text(), max_width) for tick in ax.get_xticklabels()], fontsize = 10)
# Rounded value label slightly above each bar (own loop name: `avg_vote` stays a Series)
for index, vote in enumerate(round(avg_vote, 2)):
    ax.text(x = index - 0.1, y = vote + 0.005, s = f"{vote}", fontdict = dict(fontsize = 10))
ax.set_ylabel('Average Vote')
# Zoomed y-range: the bars do not start at zero
ax.set_ylim(7, 7.6)
plt.show()

#### Which Cinematographer Has Released the Most Movies and Has the Highest Average Vote (Rating)

In [None]:
# Build one row per (movie, cinematographer): the column stores comma-separated names

movies_cinematographer = imdb_movies[['imdb_title_id', 'original_title', 'cinematographer', 'avg_vote']].assign(
    cinematographer = lambda frame: frame['cinematographer'].astype('str'))   # missing values become the text 'nan'

# Split on commas and give every name its own row, keyed by the movie id
cinematographer_split = (
    movies_cinematographer.set_index('imdb_title_id')['cinematographer']
    .str.split(',')
    .explode()
    .rename('cinematographer_split')
    .reset_index()
)

movies_cinematographer_split = cinematographer_split.merge(
    movies_cinematographer[['imdb_title_id', 'original_title', 'avg_vote']], on = 'imdb_title_id')
movies_cinematographer_split['cinematographer_split'] = movies_cinematographer_split['cinematographer_split'].str.strip(' ')

# Movie count and mean rating per cinematographer, without the 'nan' placeholder group
gb_cinematographer = movies_cinematographer_split.groupby('cinematographer_split').agg({ 'imdb_title_id' : ['count'], 'avg_vote': ['mean']})
gb_cinematographer = gb_cinematographer[gb_cinematographer.index != 'nan']
gb_cinematographer.head()

In [None]:
# Visualize top 10 cinematographers based on movie counts with barchart

max_width = 15   # wrap tick labels at this many characters
fig, ax = plt.subplots(figsize = (16,4))

# Rank once; the names are the index of the ranked Series
count_movies = gb_cinematographer[('imdb_title_id', 'count')].sort_values(ascending = False).head(10)
cinematographer = count_movies.index

sns.barplot(ax = ax, x = cinematographer, y = count_movies)
ax.set_title('Top 10 Cinematographers Based on Movie Counts', fontsize = 18)
ax.set_xlabel('Cinematographer')
# A list (not a generator) is passed: set_xticklabels expects a sequence
ax.set_xticklabels([textwrap.fill(tick.get_text(), max_width) for tick in ax.get_xticklabels()], fontsize = 10)
# Value label per bar (own loop name: `count_movies` stays a Series)
for index, n_movies in enumerate(count_movies):
    ax.text(x = index - 0.05, y = n_movies, s = f"{n_movies}", fontdict = dict(fontsize = 10))
ax.set_ylabel('No. of Movies')
# Zoomed y-range: the bars do not start at zero
ax.set_ylim(55, 80)
plt.show()

In [None]:
# Visualize top 10 cinematographers based on average vote (rating) with barchart

max_width = 15   # wrap tick labels at this many characters
fig, ax = plt.subplots(figsize = (16,4))

# Specification : only cinematographers credited on at least 5 movies.
# The filter uses the count column gb_cinematographer already holds, so it
# always has the same index as the frame it filters.
eligible = gb_cinematographer[gb_cinematographer[('imdb_title_id', 'count')] >= 5]
avg_vote = eligible[('avg_vote',  'mean')].sort_values(ascending = False).head(10)
cinematographer = avg_vote.index

sns.barplot(ax = ax, x = cinematographer, y = avg_vote)
ax.set_title('Top 10 Cinematographers Based on Average Vote (Rating)', fontsize = 18)
ax.set_xlabel('Cinematographer')
# A list (not a generator) is passed: set_xticklabels expects a sequence
ax.set_xticklabels([textwrap.fill(tick.get_text(), max_width) for tick in ax.get_xticklabels()], fontsize = 10)
# Rounded value label slightly above each bar (own loop name: `avg_vote` stays a Series)
for index, vote in enumerate(round(avg_vote, 2)):
    ax.text(x = index - 0.1, y = vote + 0.005, s = f"{vote}", fontdict = dict(fontsize = 10))
ax.set_ylabel('Average Vote')
# Zoomed y-range: the bars do not start at zero
ax.set_ylim(6.8, 8)
plt.show()

# Content-Based Recommender System

In [None]:
# Features used by the recommender: 'genre', 'director', 'actors', 'description';
# 'original_title' is selected too and becomes the index in the next cell.
# reset_index(drop = True) gives data_recsys a fresh 0..n-1 index, so row k of
# data_recsys is the k-th row (by position) of imdb_movies.
data_recsys=imdb_movies[['original_title', 'genre', 'director', 'actors', 'description']].reset_index(drop = True)
data_recsys.head()

In [None]:
# Prepare the list-like feature columns

# Movies are addressed by title from here on
data_recsys = data_recsys.set_index('original_title')

# genre / director / actors: empty text for missing values, lower-case,
# then split the comma-separated text into a list per movie
for list_col in ('genre', 'director', 'actors'):
    lowered = data_recsys[list_col].fillna('').astype('str').str.lower()
    data_recsys[list_col] = lowered.str.split(',')

In [None]:
# Preprocess description column data

# Lower-case the text and strip all punctuation
data_recsys['description'] = data_recsys['description'].fillna('').astype('str').str.lower()
data_recsys['description'] = data_recsys['description'].str.translate(str.maketrans('', '', string.punctuation))

listStopwords = set(stopwords.words('english'))
ps = PorterStemmer()


def _stem_description(text):
    """Return the stemmed, stopword-free tokens of one description as a list of str."""
    return [ps.stem(word) for word in text.split() if word not in listStopwords]


# The whole column is replaced in one assignment. Writing element by element via
# `data_recsys['description'][i] = ...` is a chained assignment that also uses
# an integer *position* on a title-labelled index; it does not reach the frame
# under pandas copy-on-write.
data_recsys['description'] = data_recsys['description'].apply(_stem_description)

In [None]:
# Create new column 'bunch_of_words' that contains words taken from all features columns

# Feature columns only: the output column itself is excluded, so re-running the
# cell does not feed a previous result back into the text.
feature_cols = [col for col in data_recsys.columns if col != 'bunch_of_words']

# The text is built row by row *by position* and assigned as one column.
# The index holds movie titles, which are not unique; assigning by title label
# would overwrite every movie sharing a title with the words of the last one.
data_recsys['bunch_of_words'] = [
    ''.join(' '.join(cell) + ' ' for cell in row)
    for row in data_recsys[feature_cols].itertuples(index = False, name = None)
]

In [None]:
# Preview the prepared features together with the combined 'bunch_of_words' text
data_recsys.head()

In [None]:
# Convert a collection of text documents to a vector of term/token counts (CountVectorizer).
# Each row of count_matrix corresponds to one row of data_recsys['bunch_of_words'].
# NOTE(review): counts are cast to uint8 -- presumably no token occurs more than 255 times
# in a single row; confirm, since larger counts would not fit.

count = CountVectorizer()
count_matrix = count.fit_transform(data_recsys['bunch_of_words']).astype(np.uint8)

In [None]:
# To reduce memory usage: remove the raw names / title-principals frames that were read from CSV.
# NOTE(review): running this cell a second time raises NameError because the names no longer exist.
del data_imdb_names
del data_imdb_title_principals

In [None]:
# Calculate Cosine Similarity
# Cosine similarity is a metric used to measure how similar the documents are irrespective of their size

chunk_size = 500
matrix_len = count_matrix.shape[0]   # number of rows (movies) in the count matrix


def similarity_cosine_by_chunk(start, end):
    """Cosine similarity of rows [start, end) of count_matrix against all rows.

    `end` is clipped to the number of rows. Returns an array of shape
    (end - start, matrix_len).
    """
    end = min(end, matrix_len)
    return cosine_similarity(X=count_matrix[start:end], Y=count_matrix)


# The full result is allocated once as float32 and filled chunk by chunk.
# Growing the matrix with np.concatenate would copy everything computed so far
# on every iteration and briefly hold two copies of the (large) matrix.
cosine_sim = np.empty((matrix_len, matrix_len), dtype = np.float32)
for chunk_start in range(0, matrix_len, chunk_size):
    chunk_end = min(chunk_start + chunk_size, matrix_len)
    # Assignment casts the chunk to float32
    cosine_sim[chunk_start:chunk_end] = similarity_cosine_by_chunk(chunk_start, chunk_end)

In [None]:
# Create function that return 10 recommended/similar movies based on input

# Position -> title lookup (same row order as the similarity matrix)
index_movies = pd.Series(data_recsys.index)

# Movies Recommendation function
def recommendation_movies(title, cosine_sim = cosine_sim):
    """Return the 10 movies most similar to `title`, formatted as 'Title (year)'.

    Parameters
    ----------
    title : str
        Exact 'original_title' of the query movie. If several movies share the
        title, the first one is used.
    cosine_sim : 2-D array
        Pairwise similarity matrix; row k belongs to the k-th movie. Defaults to
        the matrix that exists when this function is defined.

    Returns
    -------
    list of str
        Up to 10 entries, most similar first; the query movie is never included.

    Raises
    ------
    IndexError
        If no movie has the given title.
    """
    matches = index_movies[index_movies == title].index
    if len(matches) == 0:
        raise IndexError(f"Movie title not found: {title!r}")
    index_movie_input = matches[0]

    score_movies = pd.Series(cosine_sim[index_movie_input]).sort_values(ascending = False)
    # Drop the query movie by its index instead of assuming it sorted first:
    # another movie with identical features ties at similarity 1.0.
    top_10_index_movies = list(score_movies.drop(index_movie_input).iloc[:10].index)

    # Get movies title and year by position (relies on imdb_movies having the
    # same row order as data_recsys)
    return [imdb_movies['original_title'].iloc[i] + ' (' + str(imdb_movies['year'].iloc[i]) + ')'
            for i in top_10_index_movies]

In [None]:
# Results: the 10 titles most similar to 'The Dark Knight'
recommendation_movies('The Dark Knight')