In [78]:
##### IMPORT ALL PACKAGES #####
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pandasql import sqldf
%matplotlib inline
def pysqldf(q):
    """Run SQL query `q` through pandasql against the notebook's global DataFrames."""
    return sqldf(q, globals())

##### IMPORT ALL DATA #####
# One DataFrame per raw file under data/. Frames tagged PRIMARY are the ones the
# author flagged as the main inputs.

# bom.* files
df_bom_movie_gross = pd.read_csv('data/bom.movie_gross.csv') # PRIMARY

# imdb.* files
df_imdb_name_basics = pd.read_csv('data/imdb.name.basics.csv')
df_imdb_title_akas = pd.read_csv('data/imdb.title.akas.csv')
df_imdb_title_basics = pd.read_csv('data/imdb.title.basics.csv') # PRIMARY
df_imdb_title_crew = pd.read_csv('data/imdb.title.crew.csv')
df_imdb_title_principals = pd.read_csv('data/imdb.title.principals.csv')
df_imdb_title_ratings = pd.read_csv('data/imdb.title.ratings.csv') # PRIMARY

# rt.* files are tab-separated; the reviews file additionally needs an explicit encoding.
df_rt_movie_info = pd.read_csv('data/rt.movie_info.tsv', sep='\t')
df_rt_reviews = pd.read_csv('data/rt.reviews.tsv', sep='\t', encoding='unicode_escape')

# tmdb.* file: 'Unnamed: 0' is discarded right after loading
# (presumably a leftover index column from when the CSV was written - TODO confirm).
df_tmdb_movies = pd.read_csv('data/tmdb.movies.csv').drop(columns='Unnamed: 0')

# tn.* file: budgets and grosses, still stored as formatted strings at this point.
df_tn_movie_budgets = pd.read_csv('data/tn.movie_budgets.csv')

##### GENRES #####
# Genre labels exactly as they are spelled in the source data.
GENRES_OG = [
    'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Game-Show',
    'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
    'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show',
    'Thriller', 'War', 'Western',
]

# Matching snake_case column names, in the same order as GENRES_OG.
GENRES_FULL = [
    'genre_action', 'genre_adult', 'genre_adventure', 'genre_animation',
    'genre_biography', 'genre_comedy', 'genre_crime', 'genre_documentary',
    'genre_drama', 'genre_family', 'genre_fantasy', 'genre_gameshow',
    'genre_history', 'genre_horror', 'genre_music', 'genre_musical',
    'genre_mystery', 'genre_news', 'genre_reality_tv', 'genre_romance',
    'genre_sci_fi', 'genre_short', 'genre_sport', 'genre_talkshow',
    'genre_thriller', 'genre_war', 'genre_western',
]

# Column name -> original label; relies on the two lists being index-aligned.
GENRE_CONVERSION = {column: label for column, label in zip(GENRES_FULL, GENRES_OG)}

##### COMBINE AND CLEAN FINANCIAL DATA #####
def _dollars_to_float(amount):
    """Convert a formatted money string such as '$1,234,567' to a float."""
    return float(amount.replace('$', '').replace(',', ''))

# Parse the three money columns of the budgets table into plain floats.
production_budget = list(map(_dollars_to_float, df_tn_movie_budgets['production_budget']))
domestic_gross = list(map(_dollars_to_float, df_tn_movie_budgets['domestic_gross']))
worldwide_gross = list(map(_dollars_to_float, df_tn_movie_budgets['worldwide_gross']))

df_money = pd.DataFrame({
    'production_budget': production_budget,
    'domestic_gross': domestic_gross,
    'worldwide_gross': worldwide_gross,
})
# Profit here is worldwide gross minus production budget.
df_money['profit'] = df_money['worldwide_gross'] - df_money['production_budget']
# Titles are attached by index alignment with the budgets table.
df_money['movie'] = df_tn_movie_budgets['movie']

money_column_order = ['movie', 'production_budget', 'domestic_gross', 'worldwide_gross', 'profit']
df_money = df_money[money_column_order].sort_values(by='movie').reset_index(drop=True)
# Build one boolean indicator column per genre label in GENRES_OG, for every
# title that has a non-null `genres` value.
#
# IMDb stores `genres` as a comma-separated string (e.g. 'Action,Crime,Drama'),
# so each row is split into exact tokens before testing membership. A plain
# substring test (`category in genres_string`) mis-flags overlapping labels:
# 'Music' is a substring of 'Musical', so every musical would also be marked
# as Music.
df_genres = df_imdb_title_basics[df_imdb_title_basics['genres'].notna()].copy()
genre_tokens = [
    {token.strip() for token in genres.split(',')}
    for genres in df_genres['genres']
]
for category in GENRES_OG:
    df_genres[category] = [category in tokens for tokens in genre_tokens]

# Drop the descriptive columns that are not needed alongside the indicators.
# Every other column (including primary_title) is left in place.
df_genres.drop(['start_year', 'runtime_minutes', 'genres', 'original_title'], axis=1, inplace=True)

##### CREATE FULL IMDB DB #####
# Left-join the genre indicators (on tconst), the financial table (on title) and
# the TMDB table (on title) onto the IMDb title basics.
imdb_join_query = ''' SELECT * FROM df_imdb_title_basics basics
LEFT JOIN df_genres genres USING (tconst)
LEFT JOIN df_money money ON basics.primary_title == money.movie
LEFT JOIN df_tmdb_movies tmdb ON basics.primary_title == tmdb.original_title;'''
df_imdb = pysqldf(imdb_join_query)

# Columns to keep, by their labels in the joined result, followed by the genre flags.
imdb_base_columns = [
    'tconst', 'primary_title', 'original_title', 'start_year', 'runtime_minutes',
    'production_budget', 'domestic_gross', 'worldwide_gross', 'profit',
    'genre_ids', 'id', 'original_language', 'popularity',
    'release_date', 'title', 'vote_average', 'vote_count',
]
df_imdb = df_imdb[imdb_base_columns + GENRES_OG]

# NOTE: 44 labels are selected above but 46 names are assigned below. This only
# lines up because the joined frame carries two `primary_title` and two
# `original_title` columns (selecting a duplicated label returns every copy).
# The second copy of each is renamed DELETE1 / DELETE2 and then discarded.
imdb_renamed_columns = [
    'tconst', 'primary_title', 'DELETE1', 'original_title', 'DELETE2',
    'start_year', 'runtime_minutes',
    'production_budget', 'domestic_gross', 'worldwide_gross', 'profit',
    'genre_ids', 'id_tmdb', 'original_language', 'popularity',
    'release_date', 'title', 'vote_average', 'vote_count',
]
df_imdb.columns = imdb_renamed_columns + GENRES_FULL
df_imdb = df_imdb.drop(columns=['DELETE1', 'DELETE2'])

In [90]:
# Count how often each genre label occurs in the Rotten Tomatoes movie info.
# A movie's `genre` field holds several labels separated by '|'; rows with a
# missing genre are skipped.
genre_tally = {}
for genre_field in df_rt_movie_info['genre'].dropna():
    for label in genre_field.split('|'):
        genre_tally[label] = genre_tally.get(label, 0) + 1

# Most frequent label first.
genre_counts = pd.Series(genre_tally).sort_values(ascending=False)
genre_counts

Drama                          912
Comedy                         550
Action and Adventure           366
Mystery and Suspense           309
Art House and International    265
Romance                        198
Classics                       193
Science Fiction and Fantasy    172
Horror                         134
Kids and Family                 99
Musical and Performing Arts     98
Documentary                     69
Special Interest                61
Western                         48
Animation                       47
Television                      23
Faith and Spirituality          11
Sports and Fitness              10
Cult Movies                      4
Anime and Manga                  2
Gay and Lesbian                  2
dtype: int64

In [92]:
# Hand-built lookup that normalises the free-form review ratings.
#
# `ratings_dirty` is the list of distinct raw values, taken from
# df_rt_reviews.rating.unique(). `ratings_clean` is index-aligned with it and
# holds, for each raw value, one of:
#   * a number on a 0-10 scale (fractions 'x/y' are rescaled to 10 * x / y and
#     capped at 10, e.g. '3/5' -> 6, '5/4' -> 10),
#   * the letter grade unchanged ('A+', 'B-', ...),
#   * 'unclear' when the scale cannot be determined (e.g. a bare '3', 'N', '3 1/2').
#
# Fix: '2.6/6' was mapped to 6.5; by the table's own convention
# (compare '2/6' -> 3.33 and '3/6' -> 5) it is 4.33.

ratings_dirty = ['3/5', 'C', '2/5', 'B-', '2/4', 'B', '3/4', '4/5', '4/4',
       '6/10', '1/4', '8', '2.5/4', '4/10', '2.0/5', '3/10', '7/10', 'A-',
       '5/5', 'F', '3.5/4', 'D+', '1.5/4', '3.5/5', '8/10', 'B+', '9/10',
       '2.5/5', '7.5/10', '5.5/10', 'C-', '1.5/5', '1/5', '5/10', 'C+',
       '0/5', '6', '0.5/4', 'D', '3.1/5', '3/6', '4.5/5', '0/4', '2/10',
       'D-', '7', '1/10', '3', 'A+', 'A', '4.0/4', '9.5/10', '2.5',
       '2.1/2', '6.5/10', '3.7/5', '8.4/10', '9', '1', '7.2/10', '2.2/5',
       '0.5/10', '5', '0', '2', '4.5', '7.7', '5.0/5', '8.5/10', '3.0/5',
       '0.5/5', '1.5/10', '3.0/4', '2.3/10', '4.5/10', '4/6', '3.5',
       '8.6/10', '6/8', '2.0/4', '2.7', '4.2/10', '5.8', '4', '7.1/10',
       '5/4', 'N', '3.5/10', '5.8/10', 'R', '4.0/5', '0/10', '5.0/10',
       '5.9/10', '2.4/5', '1.9/5', '4.9', '7.4/10', '1.5', '2.3/4',
       '8.8/10', '4.0/10', '2.2', '3.8/10', '6.8/10', '7.3', '7.0/10',
       '3.2', '4.2', '8.4', '5.5/5', '6.3/10', '7.6/10', '8.1/10',
       '3.6/5', '2/6', '7.7/10', '1.8', '8.9/10', '8.9', '8.2/10',
       '8.3/10', '2.6/6', '4.1/10', '2.5/10', 'F+', '6.0/10', '1.0/4',
       '7.9/10', '8.7/10', '4.3/10', '9.6/10', '9.0/10', '4.0', '1.7',
       '7.9', '6.7', '8.0/10', '9.2/10', '5.2', '5.9', '3.7', '4.7',
       '6.2/10', '1/6', '8.2', '2.6/5', '3.4', '9.7', '3.3/5', '3.8/5',
       '1/2', '7.4', '4.8', '1.6/5', '2/2', '1-5', '1.0', '4.3/5', '5/6',
       '9.2', '2.7/5', '4.9/10', '3.0', '3.1', '7.8/10', 'F-', '2.3/5',
       '3.0/10', '3/2', '7.8', '4.2/5', '9.0', '7.3/10', '4.4/5',
       '6.9/10', '0/6', 'T', '6.2', '3.3', '9.8', '8.5', '1.0/5', '4.1',
       '7.1', '3 1/2']

ratings_clean = [6, 'C', 4, 'B-', 5, 'B', 7.5, 8, 10,
       6, 2.5, 8, 6.25, 4, 4, 3, 7, 'A-',
       10, 'F', 8.75, 'D+', 3.75, 7, 8, 'B+', 9,
       5, 7.5, 5.5, 'C-', 3, 2, 5, 'C+',
       0, 6, 1.25, 'D', 6.2, 5, 9, 0, 2,
       'D-', 7, 1, 'unclear', 'A+', 'A', 10, 9.5, 'unclear',
       'unclear', 6.5, 7.4, 8.4, 9, 'unclear', 7.2, 4.4,
       0.5, 'unclear', 0, 'unclear', 'unclear', 7.7, 10, 8.5, 6,
       1, 1.5, 7.5, 2.3, 4.5, 6.67, 'unclear',
       8.6, 7.5, 5, 'unclear', 4.2, 5.8, 'unclear', 7.1,
       10, 'unclear', 3.5, 5.8, 'unclear', 8, 0, 5,
       5.9, 4.8, 3.8, 'unclear', 7.4, 'unclear', 5.75,
       8.8, 4, 'unclear', 3.8, 6.8, 7.3, 7,
       'unclear', 'unclear', 8.4, 10, 6.3, 7.6, 8.1,
       7.2, 3.33, 7.7, 'unclear', 8.9, 8.9, 8.2,
       8.3, 4.33, 4.1, 2.5, 'F+', 6, 2.5,
       7.9, 8.7, 4.3, 9.6, 9, 'unclear', 'unclear',
       7.9, 6.7, 8, 9.2, 5.2, 5.9, 'unclear', 'unclear',
       6.2, 1.667, 8.2, 5.2, 'unclear', 9.7, 6.6, 7.6,
       5, 7.4, 'unclear', 3.2, 10, 'unclear', 'unclear', 8.6, 8.33,
       9.2, 5.4, 4.9, 'unclear', 'unclear', 7.8, 'F-', 4.6,
       3, 10, 7.8, 8.4, 9, 7.3, 8.8,
       6.9, 0, 'unclear', 6.2, 'unclear', 9.8, 8.5, 2, 'unclear',
       7.1, 'unclear']

# zip() silently truncates to the shorter list, so a misaligned edit to either
# table would otherwise go unnoticed.
assert len(ratings_dirty) == len(ratings_clean), "rating lookup tables are misaligned"

dict_for_cleaning = dict(zip(ratings_dirty, ratings_clean))
# Translate every raw rating through the lookup table; missing ratings (NaN)
# get the placeholder '???'. A raw value absent from the table raises KeyError.
cleaned = [
    '???' if str(raw_rating) == 'nan' else dict_for_cleaning[raw_rating]
    for raw_rating in df_rt_reviews.rating
]

# Split the translated values into two parallel columns:
#   alpha   - string results (letter grades and 'unclear'); 'NA' otherwise
#   numeric - numeric results; 'NA' otherwise
# The '???' placeholder becomes 'NA' in both.
cleaned_alpha = [
    value if isinstance(value, str) and value != '???' else 'NA'
    for value in cleaned
]
cleaned_numeric = [
    'NA' if isinstance(value, str) else value
    for value in cleaned
]

df_rt_reviews['rating_clean_alpha'] = cleaned_alpha
df_rt_reviews['rating_clean_numeric'] = cleaned_numeric

In [100]:
# Count occurrences of each value in the alphabetic rating column.
grade_tally = {}
for grade in df_rt_reviews['rating_clean_alpha']:
    grade_tally[grade] = grade_tally.get(grade, 0) + 1

# One row per distinct value, without the 'unclear' bucket, ordered
# alphabetically by rating label.
rating_counts = pd.DataFrame({
    'rating': list(grade_tally.keys()),
    'count': list(grade_tally.values()),
})
rating_counts = rating_counts[rating_counts['rating'] != 'unclear']
rating_counts = rating_counts.sort_values(by='rating', ascending=True).reset_index(drop=True)
rating_counts.head(60)

Unnamed: 0,rating,count
0,A,397
1,A+,73
2,A-,514
3,B,1163
4,B+,832
5,B-,821
6,C,779
7,C+,665
8,C-,493
9,D,324


# NEXT

Determine whether the IMDb data can be mapped to the Rotten Tomatoes data. If so, examine financial metrics for the Rotten Tomatoes movies — specifically, metrics broken down by director, actor, and production company.

In [113]:
# df_rt_movie_info
# df_rt_reviews