In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the critic reviews and the movie metadata from the same dataset folder.
dataset_dir = '../input/rotten-tomatoes-movies-and-critic-reviews-dataset'
review = pd.read_csv(f'{dataset_dir}/rotten_tomatoes_critic_reviews.csv')
film = pd.read_csv(f'{dataset_dir}/rotten_tomatoes_movies.csv')

In [None]:
# Preview characters 8-9 of the stringified release date
# (presumably the day part of a YYYY-MM-DD string -- confirm against the data).
film['original_release_date'].astype(str).str[8:10]

In [None]:
# Slice year / month / day-of-month out of the stringified release date.
# Missing dates stringify to 'nan', so their year is 'nan' and month/day are ''.
release_date_text = film['original_release_date'].astype(str)
film['original_release_year'] = release_date_text.str[:4]
film['original_release_month'] = release_date_text.str[5:7]
film['original_release_dom'] = release_date_text.str[8:10]

# Keep only the first entry of the comma-separated genre list.
film['genre_1'] = film['genres'].str.split(', ').str[0]

In [None]:
# Preview the first listed genre of each film (column 0 of the expanded split).
genre_parts = film['genres'].str.split(', ', expand=True)
genre_parts.loc[:, 0]

In [None]:
msno.matrix(film)

In [None]:
film.info()

In [None]:
film.head()

## Histogram of Rating

In [None]:
# One histogram per rating column, drawn by a single loop instead of two copies
# of the same code. Missing ratings are dropped explicitly so the bins are
# computed from real values only, and both axes are labelled.
rating_histograms = [
    ('tomatometer_rating', 'Histogram of tomatometer rating', 'lightsteelblue'),
    ('audience_rating', 'Histogram of audience rating', 'slategrey'),
]
for rating_col, hist_title, bar_color in rating_histograms:
    plt.figure(figsize=(10, 5))
    plt.title(hist_title)
    plt.hist(film[rating_col].dropna(), bins=10, histtype='bar', color=bar_color)
    plt.xlabel(rating_col)
    plt.ylabel('Number of films')
plt.show()

> Both ratings appear to have negatively skewed distributions when plotted as histograms.

## Histogram of runtime

In [None]:
# Distribution of film runtime.
# `palette` only has an effect together with `hue`, so just `color` is passed,
# and the column is referenced by name because `data=film` is already given.
plt.figure(figsize=(15, 10))
sns.histplot(data=film, x='runtime', color='lightsteelblue')
plt.title('Histogram of runtime')
plt.show()

## Count plot of Content Rating/ Tomatometer Status/ Audience Status

In [None]:
# Bars are ordered by how often each content rating occurs (value_counts order).
content_rating_order = film['content_rating'].value_counts().index
sns.countplot(
    data=film,
    x='content_rating',
    palette='Purples',
    order=content_rating_order,
)

In [None]:
# Bars are ordered by how often each tomatometer status occurs (value_counts order).
tomatometer_status_order = film['tomatometer_status'].value_counts().index
sns.countplot(
    data=film,
    x='tomatometer_status',
    palette='Purples',
    order=tomatometer_status_order,
)

In [None]:
# Bars are ordered by how often each audience status occurs (value_counts order).
audience_status_order = film['audience_status'].value_counts().index
sns.countplot(
    data=film,
    x='audience_status',
    palette='Purples',
    order=audience_status_order,
)

## Relation between Tomatometer Rating and Audience Rating 

In [None]:
# Correlate the numeric columns only. `film` also holds text columns (titles,
# genres, the derived date-part strings, ...), and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0. select_dtypes works on older pandas too.
corr = film.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True)

In [None]:
# Scatter plus linear fit of audience rating against tomatometer rating,
# with one colour (and one fitted line) per tomatometer status.
status_order = ['Rotten', 'Fresh', 'Certified-Fresh']
status_colors = ['lightgreen', 'khaki', 'salmon']
g = sns.lmplot(
    data=film,
    x='tomatometer_rating',
    y='audience_rating',
    hue="tomatometer_status",
    hue_order=status_order,
    palette=status_colors,
    height=10,
    aspect=2,
)

## Number of films over the years

In [None]:
# Count films per release year (the year is a string; missing dates show up as 'nan').
tmp = film.groupby('original_release_year').agg({'movie_title':'count'}).reset_index()

# Drop the placeholder row for missing dates once and reuse the result below.
films_per_year = tmp[tmp['original_release_year'] != 'nan']
release_years = films_per_year['original_release_year'].astype(int)

# Earliest and latest release year, used in the plot title.
a = release_years.min()
b = release_years.max()

sns.catplot(
    data=films_per_year,
    kind='bar',
    x='original_release_year',
    y='movie_title',
    aspect=5,
    palette='Purples',
)
plt.xticks(rotation=90)
plt.title(f"Number of films through out {a} : {b}")

## Which month has the most film releases?

In [None]:
# Count films per release month, then leave out the rows for missing dates
# (an empty month string, or the literal 'nan').
tmp = film.groupby('original_release_month').agg({'movie_title':'count'}).reset_index()
month_known = (tmp['original_release_month'] != 'nan') & (tmp['original_release_month'] != '')
sns.catplot(
    data=tmp[month_known],
    kind='bar',
    x='original_release_month',
    y='movie_title',
    aspect=5,
    palette='Purples',
)
plt.xticks(rotation=90)

## Which day of the month has the most film releases?

In [None]:
# Count films per release day-of-month, then leave out the rows for missing
# dates (an empty day string, or the literal 'nan').
tmp = film.groupby('original_release_dom').agg({'movie_title':'count'}).reset_index()
dom_known = (tmp['original_release_dom'] != 'nan') & (tmp['original_release_dom'] != '')
sns.catplot(
    data=tmp[dom_known],
    kind='bar',
    x='original_release_dom',
    y='movie_title',
    aspect=5,
    palette='Purples_r',
)
plt.xticks(rotation=90)

## Which film genres are the most common?

In [None]:
# Show only the ten most frequent primary genres, in value_counts order.
top_genres = film['genre_1'].value_counts().head(10).index
sns.catplot(
    kind='count',
    data=film,
    x='genre_1',
    aspect=3,
    order=top_genres,
    palette='Purples_r',
)
plt.xticks(rotation=30)

## Trend of Genre

In [None]:
# Count films per (release year, primary genre) pair.
tmp = film.groupby(['original_release_year', 'genre_1']).agg({'movie_title':'count'}).reset_index()

# Keep pairs with a known year and more than 5 films.
# NOTE: an extra `year > 5` term that used to be AND-ed into this mask was built
# from an already-filtered (differently indexed) frame and was true for every
# real year, so it never filtered anything and has been removed.
genre_trend = tmp[(tmp['original_release_year'] != 'nan') & (tmp['movie_title'] > 5)]

# Use integer years so the x-axis is numeric instead of categorical; the axis
# limits below then do not depend on which year labels survive the filter.
genre_trend = genre_trend.assign(
    original_release_year=genre_trend['original_release_year'].astype(int)
)

sns.relplot(data=genre_trend
            , kind='line', x='original_release_year', y='movie_title'
            , aspect=5, palette='RdYlBu'
            , hue='genre_1')
plt.xticks(rotation=90)
plt.axis([1930, 2020, 0, 250])
# `a` and `b` (earliest / latest release year) come from the films-per-year cell.
plt.title(f'Trend of Genre through out {a} : {b}')

## How does the month of release relate to a film's genre?

In [None]:
# Count films per (release month, primary genre) pair.
tmp = film.groupby(['original_release_month', 'genre_1']).agg({'movie_title':'count'}).reset_index()

# Keep pairs with a known month and more than 5 films.
month_known = (tmp['original_release_month'] != 'nan') & (tmp['original_release_month'] != '')
frequent_pairs = tmp[month_known & (tmp['movie_title'] > 5)]

sns.catplot(
    data=frequent_pairs,
    kind='bar',
    x='original_release_month',
    y='movie_title',
    aspect=5,
    palette='RdYlBu',
    hue='genre_1',
)
plt.xticks(rotation=90)

In [None]:
# tmp = film.groupby(['original_release_month', 'genre_1']).agg({'movie_title':'count'}).reset_index()
# sns.catplot(data=tmp[~((tmp['original_release_month'] == 'nan') | (tmp['original_release_month'] == '')) & (tmp['movie_title'] > 5)], kind='bar', x='original_release_month', y='movie_title'
#             , height = 7.5, aspect=2, palette='RdYlBu'
#             , row='genre_1')
# plt.xticks(rotation=0)
# plt.legend()

In [None]:
from matplotlib import rcParams as rcp
# One figure is opened per genre; a threshold of 0 turns off matplotlib's
# "too many open figures" warning.
rcp.update({'figure.max_open_warning': 0})
tmp = film.groupby(['original_release_month', 'genre_1']).agg({'movie_title':'count'}).reset_index()

# The month filter does not depend on the genre, so build it once outside the loop.
monthly_counts = tmp[(tmp['original_release_month'] != 'nan') & (tmp['original_release_month'] != '')]

# Skip the missing-genre entry: NaN never compares equal to anything, so it
# would select no rows and produce an empty plot titled "nan".
for i in film['genre_1'].dropna().unique():
    sns.catplot(data=monthly_counts[monthly_counts['genre_1'] == i]
                , kind='bar', x='original_release_month', y='movie_title'
                , height=7.5, aspect=2, palette='RdYlBu'
                )
    plt.xticks(rotation=0)
    plt.title(i)

## Interesting Keywords

Define helper functions that are reused by the keyword analyses in this section.

In [None]:
def keyword_month(keyword, col, regex=True, aspect=3):
    """Bar-plot, per release month, how many films have ``keyword`` in column ``col``.

    Reads the notebook-level ``film`` DataFrame and draws a seaborn bar chart
    as a side effect; nothing is returned.

    Parameters
    ----------
    keyword : str
        Text (or regular expression when ``regex`` is True) to search for.
    col : str
        Name of the ``film`` column to search; converted with ``str()``.
    regex : bool, default True
        Forwarded to ``Series.str.contains``.
    aspect : float, default 3
        Forwarded to ``sns.catplot``.
    """
    # na=False makes missing text count as "no match", so the mask is purely
    # boolean without a separate fillna step.
    matches = film[str(col)].str.contains(keyword, regex=regex, na=False)
    df = film[matches].groupby('original_release_month').agg({"movie_title":"count"}).reset_index()
    # Films without a release date carry an empty month string; leave them out.
    df_c = df[df['original_release_month'] != '']
    sns.catplot(data=df_c, kind='bar', x='original_release_month', y='movie_title'
            , aspect=aspect
            , palette='RdYlBu'
           )
    plt.xticks(rotation=0)
    plt.title(f'Number of films contain this keyword: "{keyword}" by Month of release')

In [None]:
def keyword_trend(keyword, col, regex=True, aspect=3):
    """Bar-plot, per release year, how many films have ``keyword`` in column ``col``.

    Reads the notebook-level ``film`` DataFrame and draws a seaborn bar chart
    as a side effect; nothing is returned. The title shows the earliest and
    latest release year of the whole dataset, computed here rather than read
    from variables left behind by another cell.

    Parameters
    ----------
    keyword : str
        Text (or regular expression when ``regex`` is True) to search for.
    col : str
        Name of the ``film`` column to search; converted with ``str()``.
    regex : bool, default True
        Forwarded to ``Series.str.contains``.
    aspect : float, default 3
        Forwarded to ``sns.catplot``.
    """
    # na=False makes missing text count as "no match".
    matches = film[str(col)].str.contains(keyword, regex=regex, na=False)
    df = film[matches].groupby(['original_release_year']).agg({"movie_title":"count"}).reset_index()
    # Missing release dates appear as '' or the literal 'nan'; leave them out.
    df_c = df[(df['original_release_year'] != '') & (df['original_release_year'] != 'nan')]

    # Overall year range of the dataset, for the title.
    known_years = film.loc[film['original_release_year'] != 'nan', 'original_release_year'].astype(int)
    first_year = known_years.min()
    last_year = known_years.max()

    sns.catplot(data=df_c, kind='bar', x='original_release_year', y='movie_title'
            , aspect=aspect
            , palette='Purples'
           )
    # Year labels are numerous; rotate them so they do not overlap.
    plt.xticks(rotation=90)
    plt.title(f'Trend of films contain this keyword: "{keyword}" through out {first_year} : {last_year}')

In [None]:
def keyword_rotten(keyword, col, regex=True, aspect=3):
    """Bar-plot, per tomatometer status, how many films have ``keyword`` in column ``col``.

    Reads the notebook-level ``film`` DataFrame and draws a seaborn bar chart
    as a side effect; nothing is returned.

    Parameters
    ----------
    keyword : str
        Text (or regular expression when ``regex`` is True) to search for.
    col : str
        Name of the ``film`` column to search; converted with ``str()``.
    regex : bool, default True
        Forwarded to ``Series.str.contains``.
    aspect : float, default 3
        Forwarded to ``sns.catplot``.
    """
    # na=False makes missing text count as "no match".
    matches = film[str(col)].str.contains(keyword, regex=regex, na=False)
    df = film[matches].groupby('tomatometer_status').agg({"movie_title":"count"}).reset_index()
    # Leave out rows whose status is an empty string.
    df_c = df[df['tomatometer_status'] != '']
    sns.catplot(data=df_c, kind='bar'
                , x='tomatometer_status', y='movie_title'
                , aspect=aspect
                , palette='RdYlGn'
               )
    plt.xticks(rotation=0)
    # Title the chart so it can be read on its own.
    plt.title(f'Number of films contain this keyword: "{keyword}" by Tomatometer status')

## Are Christmas movies released more often in December?

In [None]:
keyword_month('[Cc]hristmas', 'movie_info')

In [None]:
keyword_trend('[Cc]hristmas', 'movie_info')

In [None]:
keyword_trend('[Cc]hristmas', 'movie_title')

In [None]:
keyword_rotten('[Cc]hristmas', 'movie_info')

In [None]:
keyword_rotten('[Cc]hristmas', 'movie_title')

## LGBTQ Films

In [None]:
keyword_trend(r'LGBT|lgbt|Lesbian|lesbian|Gay|gay|Bisexual|bisexual|Transgender|transgender|Queer|queer|Asexual|Pansexual|Non-binary', 'movie_info',aspect=5)

## Harry Potter movie

In [None]:
keyword_month('Harry Potter', 'movie_title')

Harry Potter films tend to be released in November.

In [None]:
keyword_trend('Harry Potter', 'movie_title')

In [None]:
keyword_rotten('Harry Potter', 'movie_title')

## Star Wars

In [None]:
keyword_month('Star Wars', 'movie_title')

In [None]:
keyword_trend('Star Wars', 'movie_title')

In [None]:
keyword_rotten('Star Wars', 'movie_title')

## Meryl Streep

In [None]:
keyword_rotten('Meryl Streep', 'actors')

## Marvel Films

In [None]:
keyword_month(r'Stan Lee', 'actors')

In [None]:
# Films whose cast list mentions Stan Lee; na=False makes a missing cast count
# as "no match" without a separate fillna step.
film[film['actors'].str.contains('Stan Lee', na=False)]