# Contents

1. Setup
2. Studio Analysis
3. Opus Analysis

# 1. Setup
1. Import libraries
2. Import data
3. Basic data cleaning

In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
#import numpy as np
import string
import difflib
%matplotlib inline

# Single place to point the notebook at the imported data files.
# NOTE(review): this is still an absolute, machine-specific path -- change
# this one constant when running the notebook on another machine.
DATA_DIR = '/Users/ronlodetti/Documents/Flatiron/1_phase/Project_1/Movie_Analysis_Project/data/imported/'

# CSV sources: The Numbers budgets, Box Office Mojo grosses, OpusData.
tn_df = pd.read_csv(DATA_DIR + 'tn.movie_budgets.csv.gz')
bom_df = pd.read_csv(DATA_DIR + 'bom.movie_gross.csv.gz')
opus_df = pd.read_csv(DATA_DIR + 'MovieData.csv')

# SQLite source. `conn` is not closed in this cell.
conn = sqlite3.connect(DATA_DIR + 'im.db')
q = """
SELECT primary_title AS title,
    start_year AS year,
    genres
FROM movie_basics
WHERE year <= 2018;

"""
imdb_df = pd.read_sql(q, conn)


# Imported Data

## The Numbers

In [None]:
# Column names, dtypes and non-null counts for the frame read from tn.movie_budgets.csv.gz.
tn_df.info()


## Box Office Mojo

In [None]:
# Column names, dtypes and non-null counts for the frame read from bom.movie_gross.csv.gz.
bom_df.info()

## OpusData

In [None]:
# Column names, dtypes and non-null counts for the frame read from MovieData.csv.
opus_df.info()

## IMDB

In [None]:
# Column names, dtypes and non-null counts for the movie_basics query result (title, year, genres).
imdb_df.info()

# Data Preparation

## Data Cleaning

In [28]:
def clean_titles(series):
    """Normalise a single title string so titles can be compared across sources.

    Despite the parameter name, this operates on one ``str`` (it is applied
    element-wise with ``Series.apply`` elsewhere in the notebook). Every
    character in ``string.punctuation`` and every space character is removed,
    and the result is lower-cased.
    """
    # One translation table deletes punctuation and spaces in a single pass.
    strip_table = str.maketrans('', '', string.punctuation + ' ')
    return series.translate(strip_table).lower()

def clean_currency(series):
    """Convert one currency value such as ``'$1,234,567'`` to an ``int``.

    Despite the parameter name, this operates on a single value (it is applied
    element-wise with ``Series.apply`` elsewhere in the notebook).

    Parameters
    ----------
    series : str or integer
        A string from which ``'$'``, ``','`` and space characters are removed
        before conversion, or a value that is already an integer. Integers
        are passed through so that re-running the conversion cell on an
        already-converted column does not fail.

    Returns
    -------
    int

    Raises
    ------
    ValueError
        If the stripped string is not a valid integer literal.
    TypeError
        If the value is neither a string nor an integer (e.g. NaN).
    """
    import numbers  # local import: only needed for the pass-through check

    if isinstance(series, numbers.Integral):
        return int(series)
    if not isinstance(series, str):
        raise TypeError(
            'clean_currency expects a str or an integer, got '
            + type(series).__name__
        )
    for symbol in ('$', ',', ' '):
        series = series.replace(symbol, '')
    return int(series)

def title_norm(df1,df2):
    '''
    Align near-identical titles in df1 with the titles used in df2 so the two
    frames can be merged on ('title', 'year').

    Both frames are modified in place: their 'title' column is passed through
    clean_titles and their index is reset to 0..n-1. Then, for each row of
    df1, the closest df2 title (difflib ratio >= 0.8) is looked up; if it
    differs from the df1 title and df2 contains that title with the same
    'year', the df1 row's title is replaced by the df2 title.

    Only the row being examined is rewritten; other df1 rows that share the
    same title are evaluated on their own year.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames with 'title' (str) and 'year' columns.

    Returns
    -------
    None
    '''
    df1['title'] = df1['title'].apply(clean_titles)
    df2['title'] = df2['title'].apply(clean_titles)
    df1.reset_index(drop=True,inplace=True)
    df2.reset_index(drop=True,inplace=True)

    # Index df2 once: every distinct title -> all years it appears under.
    years_by_title = {}
    for df2_title, df2_year in zip(df2['title'], df2['year']):
        years_by_title.setdefault(df2_title, []).append(df2_year)
    candidates = list(years_by_title)

    # Snapshot the df1 columns so the loop is unaffected by the writes below.
    df1_rows = list(zip(df1['title'], df1['year']))
    for i, (df1_title, df1_year) in enumerate(df1_rows):
        match = difflib.get_close_matches(df1_title, candidates, n=1, cutoff=0.8)
        if not match:
            # Nothing in df2 is similar enough; leave the title untouched.
            continue
        df2_title = match[0]
        same_year = any(df1_year == year for year in years_by_title[df2_title])
        if df1_title != df2_title and same_year:
            # Index was reset above, so label i is row i.
            df1.at[i, 'title'] = df2_title

In [29]:
# Turn the three money columns from formatted strings into integers.
for money_col in ('production_budget', 'domestic_gross', 'worldwide_gross'):
    tn_df[money_col] = tn_df[money_col].apply(clean_currency)

# Keep only the rows that have a studio.
bom_df = bom_df.dropna(subset=['studio'])

## Feature Engineering

In [30]:
# Profit = total box office minus production budget.
tn_df['profit'] = tn_df['worldwide_gross'] - tn_df['production_budget']
opus_df['profit'] = (
    opus_df['international_box_office']
    + opus_df['domestic_box_office']
    - opus_df['production_budget']
)

# Release year taken from the 'release_date' column.
tn_df['year'] = pd.to_datetime(tn_df['release_date']).dt.year

# Flag movies whose 'source' is an original screenplay.
opus_df['is_original'] = opus_df['source'] == 'Original Screenplay'

# Keep 2006-2018 releases, normalise the column names and keep only the
# columns used later. Each result is an explicit copy so the title clean-up
# below writes to an independent frame rather than to a slice of the raw
# data (which would raise SettingWithCopyWarning).
tn_df = (
    tn_df.loc[tn_df['year'].between(2006, 2018)]
    .rename(columns={'movie': 'title', 'production_budget': 'budget'})
    .loc[:, ['title', 'year', 'budget', 'profit']]
    .copy()
)
opus_df = (
    opus_df.rename(columns={'movie_name': 'title', 'production_year': 'year'})
    .loc[:, ['title', 'year', 'rating', 'genre', 'sequel', 'profit', 'is_original']]
    .copy()
)
bom_df = bom_df[['title', 'year', 'studio']].copy()

# Normalise titles the same way in every source so they can be merged on.
imdb_df['title'] = imdb_df['title'].apply(clean_titles)
bom_df['title'] = bom_df['title'].apply(clean_titles)
tn_df['title'] = tn_df['title'].apply(clean_titles)
opus_df['title'] = opus_df['title'].apply(clean_titles)

In [35]:
# Overlap check: rows shared by Box Office Mojo and Opus on (title, year),
# then the subset of those that also appears in IMDB.
merge_keys = ['title', 'year']
x = bom_df.merge(opus_df, how='inner', on=merge_keys)
y = x.merge(imdb_df, how='inner', on=merge_keys)
len(x)


524

In [36]:
# Frame for analysing studios against profit.
studio_profits = bom_df.merge(tn_df, how='inner', on=['title', 'year'])

# Frame for analysing studios against genres.
studio_genres = bom_df.merge(imdb_df, how='inner', on=['title', 'year'])

# Four independent copies of the Opus frame, one per profit analysis
# (genre, sequel, adaptation, rating).
genre_profits, genre_sequel, genre_adaptation, genre_rating = (
    opus_df.copy() for _ in range(4)
)


In [None]:
# Number of (title, year) pairs shared by Box Office Mojo and Opus.
bom_df.merge(opus_df, how='inner', on=['title', 'year']).shape[0]

# Studio Analysis

In [None]:
# Join studio data with profit on (title, year).
# Uses bom_df / tn_df, the frames prepared in the cells above; the names
# `bom` and `the_num` are not defined anywhere in this notebook.
box_office_by_studio = pd.merge(bom_df, tn_df, how='inner', on=['title', 'year'])
profit_by_studio = box_office_by_studio[['studio', 'profit']]

In [None]:
# Number of movies per studio, as columns ['studio', 'num_movies'].
# rename_axis + reset_index(name=...) yields the same column names on every
# pandas version; value_counts() names its result differently across versions,
# so renaming 'index'/'studio' afterwards is not reliable.
# Uses bom_df: the name `bom` is not defined anywhere in this notebook.
movie_counts = (
    bom_df['studio']
    .value_counts()
    .rename_axis('studio')
    .reset_index(name='num_movies')
)

# Average profit per movie for each studio, ordered by studio name
# (groupby sorts its keys, so no extra sort is needed).
avg_profit_by_studio = profit_by_studio.groupby('studio', as_index=False)['profit'].mean()

# Disabled exploratory plot of movie count vs. profit per studio.
# NOTE(review): presumably the scatter plot that the 100-movie threshold in
# the next cell refers to -- confirm before deleting.
#     fig,ax = plt.subplots()
#     x = movie_counts
#     y = profit_by_studio
#     df = pd.merge(x,y,how='inner',on='studio')
#     sns.displot(df, x="num_movies", hue="profit")
#     ax = sns.scatterplot(data=df,x='num_movies',y='profit')

In [None]:
# Studios with more than 100 movies.
big_studios = movie_counts.loc[movie_counts['num_movies'] > 100, 'studio'].tolist()

# Average profit per movie, restricted to those studios.
avg_profit_big_studio = avg_profit_by_studio[avg_profit_by_studio['studio'].isin(big_studios)]

# Rank once by average profit (highest first), then take both ends.
ranked_by_profit = avg_profit_big_studio.sort_values('profit', ascending=False)
top_studios = ranked_by_profit['studio'].head(3).tolist()
bottom_studios = ranked_by_profit['studio'].tail(3).tolist()

In [None]:
# Label each movie's studio as 'Top Studio' / 'Bottom Studio'. Studios in
# neither list map to NaN (Series.map leaves unmatched keys missing), which
# avoids depending on numpy -- `np` is not imported in this notebook.
# 'Top Studio' is applied last so it wins if a studio is in both lists.
studio_rank = {
    **dict.fromkeys(bottom_studios, 'Bottom Studio'),
    **dict.fromkeys(top_studios, 'Top Studio'),
}
box_office_by_studio['top_or_bottom'] = box_office_by_studio['studio'].map(studio_rank)

# Drop rows with a missing value in any column, which removes every movie
# from an unlabelled studio.
top_bottom_studios = box_office_by_studio.dropna()


In [None]:
# A good statistic to take away
# numeric_only=True averages just the numeric columns; the frame also holds
# text columns (e.g. 'studio') that cannot be averaged and make mean() raise
# on recent pandas versions.
top_bottom_studios.groupby('top_or_bottom').mean(numeric_only=True)

In [None]:
# Candidate figure: budget distribution, coloured by top vs. bottom studio.
sns.displot(data=top_bottom_studios, x='budget', hue='top_or_bottom');

In [None]:
# Build one row per (rank, studio, genre) for movies from top/bottom studios.
# Uses bom_df / imdb_df: the names `bom` and `imdb` are not defined anywhere
# in this notebook, and `np` is not imported, so the labels come from
# Series.map (unmatched studios become NaN).
studio_rank = {
    **dict.fromkeys(bottom_studios, 'Bottom Studio'),
    **dict.fromkeys(top_studios, 'Top Studio'),
}

# assign() works on a copy, so bom_df itself is not given an extra column.
bom_imbd = (
    bom_df.assign(top_or_bottom=bom_df['studio'].map(studio_rank))
    .merge(imdb_df, how='inner', on=['title', 'year'])
    .dropna()
    .reset_index(drop=True)
)

# 'genres' is a comma-separated string; split it and emit one row per genre.
tb_genre = (
    bom_imbd.assign(genre=bom_imbd['genres'].str.split(','))
    .explode('genre')
    .rename(columns={'top_or_bottom': 'rank'})
    .loc[:, ['rank', 'studio', 'genre']]
    .reset_index(drop=True)
)

In [None]:
# Must use visual about top genres for top or bottom studios
def _top_genre_share(rank_label, n=5):
    """Return the n most frequent genres for one rank label.

    The result has columns ['genre', 'percentage'], where 'percentage' is the
    genre's share of that rank's rows as a fraction between 0 and 1.
    """
    return (
        tb_genre.loc[tb_genre['rank'] == rank_label, 'genre']
        .value_counts(normalize=True)
        .head(n)
        # Name the columns explicitly: value_counts() labels its result
        # differently across pandas versions.
        .rename_axis('genre')
        .reset_index(name='percentage')
    )

top = _top_genre_share('Top Studio')
bottom = _top_genre_share('Bottom Studio')

fig, axes = plt.subplots(ncols=2, figsize=(10, 5))
for ax, shares, label in zip(axes, (top, bottom), ('Top Studio', 'Bottom Studio')):
    sns.barplot(ax=ax, data=shares, x='genre', y='percentage')
    ax.set_title(label)


# Data Visualizations

In [None]:
#profit v budget
# Uses tn_df, the frame with 'budget' and 'profit' prepared above; the name
# `the_num` is not defined anywhere in this notebook.
fig, ax = plt.subplots()
sns.scatterplot(data=tn_df, x='budget', y='profit', ax=ax)
ax.set(title='Profit vs. budget', xlabel='budget', ylabel='profit');

# Opus Analysis

In [None]:
# NOTE(review): the name `opus` is not defined anywhere in this notebook, and
# opus_df no longer carries 'source' after the column selection in the
# Feature Engineering cell. 'source' is therefore rebuilt from 'is_original'
# (True exactly where source was 'Original Screenplay'); every other source
# is labelled 'Adapted'. Confirm this matches the intended input frame.
new_opus = (
    opus_df[['profit', 'rating', 'genre', 'sequel']]
    .assign(source=opus_df['is_original'].map({True: 'Original Screenplay', False: 'Adapted'}))
    .loc[:, ['profit', 'rating', 'source', 'genre', 'sequel']]
)



In [None]:
# avg_profit vs rating
# numeric_only=True: new_opus also holds text columns ('source', 'genre')
# that cannot be averaged and make mean() raise on recent pandas versions.
new_opus.groupby('rating').mean(numeric_only=True).sort_values('profit',ascending=False)

 

In [None]:
# avg_profit vs original or adapted
# Collapse 'source' to two labels: 'Original' for original screenplays,
# 'Adapted' for everything else. Rows already labelled 'Original' are kept,
# so running this cell twice does not turn every row into 'Adapted'.
# assign() builds a new frame instead of writing into a possible slice.
is_original_source = new_opus['source'].isin(['Original Screenplay', 'Original'])
new_opus = new_opus.assign(
    source=is_original_source.map({True: 'Original', False: 'Adapted'})
)

#new_opus.groupby('source').agg(['mean','count'])



In [None]:
#avg_profit vs genre
# Aggregate 'profit' only: taking the mean of the text columns in new_opus
# raises on recent pandas versions.
new_opus.groupby('genre')['profit'].agg(['mean','count'])



In [None]:
#avg_profit vs sequel
# Aggregate 'profit' only: taking the mean of the text columns in new_opus
# raises on recent pandas versions.
new_opus.groupby('sequel')['profit'].agg(['mean','count'])