In [None]:
#Questions:
#No of films per decade / per year
#gross revenue per decade per film
#ratings per decade / year
#us vs non-us voters for us and non-us films

# votes and ratings per genre per decade
#per genre per language

#data cleaning
#delete short movies <15 min? But be careful with old movies. No movies according to this criteria
#delete movies with less than 15 votes? No one cares about them apparently. No movies according to this criteria.
#check indian movies if they should be taken out.
#decide to include old movies or not
#only first language is considered.
#genres are not split.


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, OrderedDict
import seaborn as sns
from ipywidgets import interact

In [None]:
# Binning time intervals
def binning(num, divisor):
    """Round ``num`` down to the start of its ``divisor``-wide bin.

    The result is ``num`` minus its remainder modulo ``divisor``; for example
    1987 with a divisor of 10 gives 1980.
    """
    remainder = num % divisor
    bin_start = num - remainder
    return bin_start

# Collapse a multi-language entry into one language. The first entry is kept
# because the main language seems to be the first one of the list.
def split_lang(row):
    """Return the text before the first comma in ``row`` (all of it if there is none)."""
    first_language, _, _ = row.partition(',')
    return first_language

#Frequency: year, 5_year, Decade
#Metric: votes, avg_vote, no_of_films
def lang_graph(df,freq,metric,dim):
    """Plot ``metric`` over ``freq`` for every value of ``dim``, with and without English.

    NOTE: a later cell of this notebook defines another ``lang_graph`` taking
    ``(df, freq, metric)``; once that cell has run, it replaces this function.

    Parameters
    ----------
    df : DataFrame
        Must contain the ``dim`` and ``freq`` columns plus ``avg_vote`` and
        ``votes``.
    freq : str
        Name of the time column used for the x axis.
    metric : str
        One of the aggregated columns: ``no_of_films``, ``avg_vote``, ``votes``.
    dim : str
        Name of the column whose values become the individual lines.

    Returns
    -------
    None
        The figure is drawn on a new 1x2 grid of axes.
    """
    fig, axes = plt.subplots(1,2)
    summary = df.groupby(by=[dim,freq]).agg(
        no_of_films = (dim,'count'),
        avg_vote = ('avg_vote','mean'),
        votes = ('votes','sum')
    ).reset_index()
    without_english = summary[summary[dim]!='English']

    panels = (
        (axes[0], summary, '{} per {} across {}s'),
        (axes[1], without_english, '{} per {} (Exc. English) across {}s'),
    )
    for ax, data, title in panels:
        sns.lineplot(ax=ax,y=data[metric],x=data[freq],hue=data[dim])
        ax.set_title(title.format(metric.capitalize(), freq.capitalize(), dim.capitalize()),fontsize=24)
        # The x axis shows the time bins, so it is labelled with `freq`
        # (the previous version labelled it with the metric name).
        ax.set_xlabel('{}s'.format(freq.capitalize()),fontsize=18)
    
def top_genres(df,dim,n=10):
    """Tabulate the first ``n`` genres for every distinct value of ``dim``.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``genre`` column and the ``dim`` column. Rows are taken
        in their existing order, so sort ``df`` beforehand if "first" should
        mean "most frequent".
    dim : str
        Column whose distinct values become the columns of the result.
    n : int, default 10
        Maximum number of genres kept per ``dim`` value.

    Returns
    -------
    DataFrame
        One column per distinct ``dim`` value (in order of first appearance),
        one row per rank. Groups with fewer than ``n`` rows are padded with NaN
        instead of raising a length-mismatch ``ValueError``.
    """
    columns = {
        # Wrapping each list in a Series lets the DataFrame constructor align
        # columns of different lengths.
        value: pd.Series(df.loc[df[dim] == value, 'genre'].head(n).tolist(), dtype=object)
        for value in df[dim].unique().tolist()
    }
    return pd.DataFrame(columns)

In [None]:
# Default figure size for the plots that follow (later cells call sns.set again with other sizes).
sns.set(rc={'figure.figsize':(20,6)})

### Data Retrieval

In [None]:
# Read the two IMDb tables from the Kaggle input folder.
movies = pd.read_csv('../input/imdb-extensive-dataset/IMDb movies.csv')
ratings = pd.read_csv('../input/imdb-extensive-dataset/IMDb ratings.csv')

# Keep only the first five and the last four rating columns (chosen by position).
rat_col = list(ratings.columns)
ratings = ratings.loc[:, rat_col[:5] + rat_col[-4:]]

# Attach the selected rating columns to each film via the shared title id.
movies = pd.merge(movies, ratings, on='imdb_title_id')

In [None]:
# Overview of the merged movies frame.
movies.info()

In [None]:
# Overview of the rating columns that were kept for the merge.
ratings.info()

In [None]:
# Peek at the first two merged rows.
movies.head(2)

### Data Processing

In [None]:
# List the distinct year values to spot non-numeric entries before the cast in the next cell.
movies.year.unique()

In [None]:
# Some rows carry a non-numeric year (e.g. 'TV Movie 2019'). Coerce every
# unparsable value to NaN and drop those rows, rather than excluding a single
# hard-coded string, then store the year as int32.
movies = (
    movies
    .assign(year=pd.to_numeric(movies['year'], errors='coerce'))
    .dropna(subset=['year'])
    .astype({'year': 'int32'})
)
# Keep films released before 2020. The .copy() gives later cells an independent
# frame to add columns to (avoids SettingWithCopyWarning).
movies = movies[movies['year'] < 2020].copy()

In [None]:
# Add one binned-year column per bin width: decades first, then 5-year periods.
for bin_column, base in (('Decade', 10), ('5_year', 5)):
    movies[bin_column] = movies.year.apply(binning, divisor=base)

In [None]:
# Move the two binned-year columns right after the first four columns.
# The columns are picked by name, so re-running this cell leaves the order
# unchanged (a purely positional shuffle would reorder again on every run).
binned_cols = ['Decade', '5_year']
other_cols = [col for col in movies.columns if col not in binned_cols]
new_column_list = other_cols[:4] + binned_cols + other_cols[4:]
movies = movies.reindex(columns=new_column_list)
movies.head()

In [None]:
# Number of films per raw language string, most frequent first.
L = (
    movies.groupby('language')['language']
    .count()
    .sort_values(ascending=False)
    .to_frame()
)
L.head(10)

In [None]:
# Frequency of each raw country string.
movies.country.value_counts()

In [None]:
# Columns carried into the analysis frame, and the shorter names given to the
# US / non-US voter columns.
analysis_columns = [
    'imdb_title_id', 'year', 'Decade', '5_year', 'genre', 'language',
    'votes', 'us_voters_votes', 'non_us_voters_votes',
    'avg_vote', 'us_voters_rating', 'non_us_voters_rating',
]
voter_column_names = {
    'us_voters_rating': 'avg_vote_us',
    'non_us_voters_rating': 'avg_vote_non_us',
    'us_voters_votes': 'votes_us',
    'non_us_voters_votes': 'votes_non_us',
}

# Keep films with a known language: drop missing values and the literal 'None'.
df_movies = movies.loc[movies.language.notna(), analysis_columns].copy()
df_movies = df_movies[df_movies.language != 'None'].reset_index(drop=True)
df_movies = df_movies.rename(columns=voter_column_names)
#processing language
df_movies['language'] = df_movies['language'].apply(split_lang)
df_movies['year'] = df_movies['year'].astype('int')
df_movies.head()

In [None]:
# Overview of the analysis frame after the language clean-up.
df_movies.info()

In [None]:
# Preview the rows whose duration is below 30 or whose vote count is below 100.
movies[(movies.duration<30) | (movies.votes<100)]

In [None]:
# Drop films whose duration is below 30 or whose vote count is below 100.
# The mask is negated: without `~` this line kept ONLY those films and threw
# away everything else. Rows with a missing duration/votes value are kept.
# NOTE: df_movies was built in an earlier cell, so it is not affected by this filter.
movies = movies[~((movies.duration<30) | (movies.votes<100))]

### Language Analysis

In [None]:
#Filtering for top 10 languages
top_lang = (
    df_movies.groupby('language')
    .size()
    .sort_values(ascending=False)
    .head(10)
    .index
)
print(top_lang)
# Restrict the analysis frame to films in one of those languages.
df_movies = df_movies.loc[df_movies['language'].isin(top_lang)].reset_index(drop=True)
df_movies.head()

In [None]:
# Per-language, per-year summary: film count, mean ratings and total votes
# (overall, US voters, non-US voters).
lang_year_spec = dict(
    no_of_films=('language', 'count'),
    avg_vote=('avg_vote', 'mean'),
    avg_vote_us=('avg_vote_us', 'mean'),
    avg_vote_non_us=('avg_vote_non_us', 'mean'),
    votes=('votes', 'sum'),
    votes_us=('votes_us', 'sum'),
    votes_non_us=('votes_non_us', 'sum'),
)
df_lang = df_movies.groupby(by=['language', 'year']).agg(**lang_year_spec).reset_index()
df_lang.info()

In [None]:
# Peek at the analysis frame after restricting it to the top languages.
df_movies.head()

In [None]:
# Two-way split of the language column: 'English' stays, everything else becomes 'Not English'.
is_english = df_movies['language'] == 'English'
df_movies['lang_group'] = df_movies['language'].where(is_english, 'Not English')

In [None]:
sns.set(rc={'figure.figsize':(20,12)})
#Enter variables by order: DataFrame, frequency and metric.
#Frequency: year, 5_year, Decade
#Metric: votes, avg_vote, avg_vote_us, avg_vote_non_us, no_of_films
def _language_summary(df, by, freq):
    """Aggregate film count, mean ratings and total votes per (``by``, ``freq``) group."""
    return df.groupby(by=[by,freq]).agg(
        no_of_films = ('imdb_title_id','count'),
        avg_vote = ('avg_vote','mean'),
        avg_vote_us = ('avg_vote_us','mean'),
        avg_vote_non_us = ('avg_vote_non_us','mean'),
        votes = ('votes','sum')
    ).reset_index()


def lang_graph(df,freq,metric):
    """Draw three line charts of ``metric`` over ``freq``.

    Panels (on a 2x2 grid; the unused fourth axes is hidden):
    top-left: one line per language; top-right: English vs Not English;
    bottom-left: one line per language, English excluded.

    Parameters
    ----------
    df : DataFrame
        Must contain ``language``, ``lang_group``, ``imdb_title_id``,
        ``avg_vote``, ``avg_vote_us``, ``avg_vote_non_us``, ``votes`` and the
        ``freq`` column.
    freq : str
        Name of the time column used for the x axis.
    metric : str
        One of the aggregated columns listed in the header comment.

    Returns
    -------
    None
    """
    fig, axes = plt.subplots(2,2)

    df_lang = _language_summary(df, 'language', freq)
    df_lang_group = _language_summary(df, 'lang_group', freq)

    # Iterating a set of strings gives a different order in every Python
    # session, so colours used to change between runs; sorting fixes the
    # label -> colour mapping. 'husl' yields as many distinct colours as there
    # are labels (the 'C0'..'C9' cycle repeats after ten).
    labels = sorted(set(df_lang_group.lang_group).union(df_lang.language))
    palette = dict(zip(labels, sns.color_palette('husl', len(labels))))

    metric_label = metric.capitalize()
    freq_label = freq.capitalize()
    panels = (
        (axes[0,0], df_lang, 'language',
         '{} per Language across {}s'),
        (axes[0,1], df_lang_group, 'lang_group',
         '{} of English & Non-English Films across {}s'),
        (axes[1,0], df_lang[df_lang['language']!='English'], 'language',
         '{} per Language (exc. English) across {}s'),
    )
    for ax, data, hue_col, title in panels:
        sns.lineplot(ax=ax,y=data[metric],x=data[freq],hue=data[hue_col],palette=palette)
        ax.set_title(title.format(metric_label, freq_label),fontsize=24)
        ax.set_xlabel('{}s'.format(freq_label),fontsize=18)
        # No plural suffix here: the metric names already read as plurals
        # ('votes' used to become 'Votess').
        ax.set_ylabel(metric_label,fontsize=18)
        ax.tick_params(which='major', width=1.0,labelsize=14)

    # Only three panels are drawn; hide the empty fourth axes.
    axes[1,1].set_visible(False)
    # Lay out after plotting so titles and labels are taken into account.
    fig.tight_layout(pad=5)

In [None]:
# Number of films per language, year by year.
lang_graph(df_movies,'year','no_of_films')

The majority of films have been in English since the start. Italian cinema was the 2nd biggest in the 60s & 70s but is now below French & Spanish films, at least in terms of numbers.
The number of films produced in Spanish, Indian & Turkish languages has increased quite fast since the 2000s.

In [None]:
# Total votes per language, in 5-year bins.
lang_graph(df_movies,'5_year','votes')

In terms of popularity (total votes), English-language films are by far the first, even compared to the films of all other top languages combined.

In [None]:
# Mean rating per language, decade by decade.
lang_graph(df_movies,'Decade','avg_vote')

There is a declining trend in average ratings over time for all languages.
English films have lower scores on average, possibly because the very high number of films lowers the overall quality level.
An interesting point is the big decline in the ratings of Turkish & Russian films since the 1980s.

#### US vs Non-US Viewers

In [None]:
def _vote_summary(df, by):
    """Summarise ``df`` per ``by`` group.

    Returns one row per group with the film count, the mean rating (overall,
    US voters, non-US voters) and the total votes (overall, US, non-US).
    """
    return df.groupby(by=by).agg(
        no_of_films = ('imdb_title_id','count'),
        avg_vote = ('avg_vote','mean'),
        avg_vote_us = ('avg_vote_us','mean'),
        avg_vote_non_us = ('avg_vote_non_us','mean'),
        votes = ('votes','sum'),
        votes_us = ('votes_us','sum'),
        votes_non_us = ('votes_non_us','sum')
    ).reset_index()


# Per decade: all films, non-English films only, English films only.
us_nonus_time = _vote_summary(df_movies, ['Decade'])
us_nonus_time_no_eng = _vote_summary(df_movies[df_movies.language!='English'], ['Decade'])
us_nonus_time_only_eng = _vote_summary(df_movies[df_movies.language=='English'], ['Decade'])

# Per language, across all years.
us_nonus_lang = _vote_summary(df_movies, ['language'])

In [None]:
# Two stacked bar charts comparing US and non-US mean ratings:
# per decade (top) and per language (bottom).
sns.set(rc={'figure.figsize':(20,12)})
fig, axes = plt.subplots(2,1)
fig.tight_layout(pad=5)
colormap = ['b','r']
rating_columns = ['avg_vote_us','avg_vote_non_us']
panels = (
    (axes[0], us_nonus_time, 'Decade',
     'Average Votes per Decade - Us vs NonUS Voters'),
    (axes[1], us_nonus_lang, 'language',
     'Average Votes across Films with Different Languages - Us vs NonUS Voters'),
)
for ax, table, x_column, title in panels:
    table.plot(ax=ax, kind='bar',x=x_column,y=rating_columns,color=colormap,ylim=(5,7))
    ax.legend(title='Voter Type', bbox_to_anchor=(1.2, 1))
    ax.set_title(title,fontsize=24)

plt.show()

US voters fancy English and, to a lesser extent, Spanish films much more than non-US voters do.

In [None]:
# Three bar charts of US vs non-US mean ratings per decade:
# all films, English-language films, non-English-language films.
sns.set(rc={'figure.figsize':(20,14)})
fig, axes = plt.subplots(3,1)
fig.tight_layout(pad=5)
colormap = ['b','r']
panels = (
    # The first panel is titled as the all-films view, so it must plot
    # us_nonus_time (it previously plotted the non-English table twice).
    (us_nonus_time,
     'Average Votes per Decade - Us vs NonUS Voters'),
    (us_nonus_time_only_eng,
     'Average Votes per Decade for English Language Films - Us vs NonUS Voters'),
    (us_nonus_time_no_eng,
     'Average Votes per Decade for Non-English Language Films - Us vs NonUS Voters'),
)
for ax, (table, title) in zip(axes, panels):
    table.plot(ax=ax, kind='bar',x='Decade',y=['avg_vote_us','avg_vote_non_us'],color=colormap,ylim=(4,7.1))
    ax.legend(title='Voter Type', bbox_to_anchor=(1.2, 1),fontsize=14)
    ax.set_title(title,fontsize=24)

plt.show()

The big difference between the average votes of US & non-US voters for older films appears to stem from US voters' favorable ratings of English films.

### Top 10 Genre per Language

In [None]:
# First five distinct genre strings, to see what the raw values look like.
df_movies.genre.unique().tolist()[:5]

There are a lot of mixed genres. The combination itself carries meaning, so we can't really choose the first entry as in the Language analysis. Instead, we can use the mixed genres as they are.

In [None]:
# Peek at the first two rows of the analysis frame.
df_movies.head(2)

In [None]:
# The ten genre strings with the most films.
genre_counts = df_movies.groupby(by='genre').agg(No_of_films=('language','count'))
top10_genre = genre_counts.sort_values(by='No_of_films',ascending=False).head(10).index
top10_genre

In [None]:
# Films whose genre string is one of the ten most common.
in_top_genres = df_movies['genre'].isin(top10_genre)
df_gen = df_movies.loc[in_top_genres].reset_index(drop=True)
df_gen.info()

In [None]:
# Per-genre, per-year summary of the top-10-genre films. It is computed from
# df_movies rather than from df_gen itself, so re-running the cell gives the
# same table (aggregating the already aggregated df_gen again would silently
# turn every film count into 1).
df_gen = (
    df_movies[df_movies['genre'].isin(top10_genre)]
    .groupby(by=['genre','year','5_year','Decade'])
    .agg(
        no_of_films = ('genre','count'),
        avg_vote = ('avg_vote','mean'),
        votes = ('votes','sum')
    )
    .reset_index()
)
df_gen.info()

In [None]:
# Peek at the per-genre, per-year summary.
df_gen.head()

In [None]:
sns.set(rc={'figure.figsize':(20,6)})
def genre_graph(df,freq,metric):
    """Plot ``metric`` over ``freq`` with one line per genre.

    Parameters
    ----------
    df : DataFrame
        Pre-aggregated table with ``genre``, ``no_of_films``, ``avg_vote``,
        ``votes`` and the ``freq`` column.
    freq : str
        Name of the time column used for the x axis.
    metric : str
        ``no_of_films``, ``avg_vote`` or ``votes``.

    Returns
    -------
    None
        The chart is shown with ``plt.show()``.
    """
    # `avg_vote` in `df` is already a mean over several films, so it has to be
    # weighted by the number of films behind it; a plain mean of means would
    # give a row with 1 film the same weight as a row with 100 films.
    rated_films = df['no_of_films'].where(df['avg_vote'].notna(), 0)
    weighted = df.assign(_rated_films=rated_films,
                         _vote_total=df['avg_vote'] * rated_films)
    summary = weighted.groupby(by=['genre',freq]).agg(
        no_of_films = ('no_of_films','sum'),
        votes = ('votes','sum'),
        _rated_films = ('_rated_films','sum'),
        _vote_total = ('_vote_total','sum')
    ).reset_index()
    summary['avg_vote'] = summary['_vote_total'] / summary['_rated_films']

    #hue connects which values are going to be matched with colors.
    #palette connects these values with specific colors.
    # Sorted so that each genre gets the same colour in every session
    # (set iteration order of strings is not stable across runs).
    palette = {genre: 'C'+str(i) for i, genre in enumerate(sorted(set(summary.genre)))}

    sns.lineplot(y=summary[metric],x=summary[freq],hue=summary['genre'],palette=palette)
    plt.title('{} per Genre'.format(metric.capitalize()),fontsize=20)
    plt.xlabel(xlabel=freq.capitalize(),fontsize=16)
    plt.ylabel(ylabel=metric.capitalize(),fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.show()

In [None]:
# One chart per metric, all in 5-year bins: film count, total votes, mean rating.
genre_graph(df_gen,'5_year','no_of_films')
genre_graph(df_gen,'5_year','votes')
genre_graph(df_gen,'5_year','avg_vote')

Drama & Comedy have been the 2 most common genres since the early times. However, Romantic-Comedies have become really popular since the 1990s, getting the highest amount of total votes.
Most genres cluster around an average vote of 6 out of 10. Horror movies have by far the lowest scores.

In [None]:
# Film count for every (language, genre) pair.
xx = df_movies.groupby(by=['language','genre']).agg(no_of_films = ('genre','count')).reset_index()
# Within the top languages, order the rows by language and film count (both
# descending) so that top_genres picks the most frequent genres first.
genre_lang_table = (
    xx[xx.language.isin(top_lang)]
    .reset_index(drop=True)
    .sort_values(by=['language','no_of_films'],ascending=False)
)
genre_lang_table = top_genres(genre_lang_table,'language')
genre_lang_table

In [None]:
# Share of each top-10 genre within every top language.
# Rows: genre; columns: language, plus a 'total' column for all languages together.
lang_genre_counts = (
    xx[(xx.language.isin(top_lang)) & (xx.genre.isin(top10_genre))]
    .pivot(index='genre',columns='language',values='no_of_films')
)
lang_genre_counts['total'] = lang_genre_counts.sum(axis=1)
# Divide every column by its own sum so that each column adds up to 1.
lang_genre_shares = lang_genre_counts / lang_genre_counts.sum(axis=0)
xx2 = lang_genre_shares.sort_values(by='total',ascending=False)
genre_lang_ratios = xx2.copy()
genre_lang_ratios

In [None]:
# Column labels of the ratio table, used as x tick labels for the heatmap.
xtick = genre_lang_ratios.columns.tolist()

In [None]:
# Heatmap of the genre shares per language; the colour scale spans 0 to 0.3 (vmin/vmax).
sns.heatmap(genre_lang_ratios,vmin=0,vmax=0.3,cmap='Blues',xticklabels=xtick)
plt.show()

In almost all languages, Drama is the main genre, except for Italian. Hindi movies have frequently incorporated Action or Comedy features into their films.
Another interesting fact is the higher share of horror films among Japanese films.

In [None]:
# Stacked bars: one bar per language (plus 'total'), split by genre share.
xx3 = xx2.transpose()
ax = xx3.plot(kind='bar', stacked=True)
ax.set_title('Distribution of Most Common Film Genres across Languages')
# bbox_to_anchor positions the legend relative to the axes; x = 1.05 is just
# past the right edge, so the legend sits outside the plot area.
ax.legend(title='Genre', bbox_to_anchor=(1.05, 1))
plt.show()

It's just so interesting to see how huge Comedy is for Italy and how big Horror is for Japan.

In [None]:
# Film count for every (genre, decade) pair, most frequent first, so that
# top_genres picks the most common genres of each decade.
genre_decade_table = (
    df_movies.groupby(by=['genre','Decade'])
    .agg(no_of_films = ('genre','count'))
    .reset_index()
    .sort_values(by=['no_of_films','genre'],ascending=False,ignore_index=True)
)
# Sort the columns (axis=1) so the decades read chronologically; sorting the
# row index, as before, did nothing because it is already 0..9.
genre_decade_table = top_genres(genre_decade_table,'Decade').sort_index(axis=1)
genre_decade_table

What did not change from the 1910s to the 2010s is the top 2: Drama & Comedy.
However, we see some genres becoming very popular for some periods, such as Horror between the 60s & 80s, Western in the 50s & 60s, and Musicals in the 30s.

In [None]:
# Share of each top-10 genre within every decade.
# Rows: genre; columns: decade, plus a 'total' column for all decades together.
xx = df_movies.groupby(by=['Decade','genre']).agg(no_of_films = ('genre','count')).reset_index()
decade_genre_counts = (
    xx[(xx.genre.isin(top10_genre))]
    .pivot(index='genre',columns='Decade',values='no_of_films')
)
decade_genre_counts['total'] = decade_genre_counts.sum(axis=1)
# Divide every column by its own sum so that each column adds up to 1.
decade_genre_shares = decade_genre_counts / decade_genre_counts.sum(axis=0)
xx2 = decade_genre_shares.sort_values(by='total',ascending=False)
genre_decade_ratios = xx2.copy()
genre_decade_ratios

In [None]:
# Heatmap of the genre shares per decade. The data must be the decade table:
# this cell used to draw genre_lang_ratios (the per-language shares) under
# decade tick labels.
xtick = genre_decade_ratios.columns.tolist()
sns.heatmap(genre_decade_ratios,vmin=0,vmax=0.3,cmap='Blues',xticklabels=xtick)
plt.show()

In [None]:
# Stacked bars: one bar per decade (plus 'total'), split by genre share.
xx3 = xx2.transpose()
xx3.plot(kind='bar', stacked=True)
# This chart is per decade; the title previously said "across Languages"
# (copied from the language chart).
plt.title('Distribution of Most Common Film Genres across Decades')
# bbox_to_anchor positions the legend relative to the axes; x = 1.05 is just
# past the right edge, so the legend sits outside the plot area.
plt.legend(title='Genre', bbox_to_anchor=(1.05, 1))
plt.xlabel(xlabel='Decade',fontsize=14)
plt.show()