# Setup

Notes: What categories do I want to compare?
Title, Year, Genres, Studios, Run Time, Streaming vs Theatre, Net income

In [337]:
import pandas as pd
import requests
import json
import sqlite3
import numpy as np
from sqlalchemy import create_engine
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import difflib
import string

In [3]:
# Making the locations of all of the data files
folder = '/Users/ronlodetti/Documents/Flatiron/1_phase/Project_1/Movie_Analysis_Project/data/imported/'
bom_loc = folder + 'bom.movie_gross.csv.gz'
imdb_loc = folder + 'im.db'
opus_loc = folder + 'MovieData.csv'

In [359]:
# Box Office Mojo
bom = pd.read_csv(bom_loc)
bom['foreign_gross'] = bom['foreign_gross'].str.replace(',','').astype(float)
bom['title'] = clean_titles(bom['title'])

In [357]:
def clean_titles(series):
    series = series.str.strip()
    series = series.str.translate(str.maketrans('', '', string.punctuation))
    series = series.str.lower()
    return series

In [344]:
test.str.strip()

0                                       Toy Story 3
1                        Alice in Wonderland (2010)
2       Harry Potter and the Deathly Hallows Part 1
3                                         Inception
4                               Shrek Forever After
                           ...                     
3382                                      The Quake
3383                    Edward II (2018 re-release)
3384                                       El Pacto
3385                                       The Swan
3386                              An Actor Prepares
Name: title, Length: 3387, dtype: object

In [360]:
# Opus Data
opus = pd.read_csv(opus_loc)
opus.drop('movie_odid',axis=1,inplace=True)
opus = opus.rename(columns={'movie_name':'title','production_year':'year'})
opus['title'] = clean_titles(opus['title'])

In [361]:
# IMDB
conn = sqlite3.connect(imdb_loc)
q = """
SELECT DISTINCT(ma.movie_id) AS id,
    ma.title,
    mb.start_year AS year
FROM movie_basics AS mb
JOIN movie_akas AS ma
    USING(movie_id)
WHERE language = 'en';

"""
imdb = pd.read_sql(q, conn)
imdb['title'] = clean_titles(imdb['title'])

# Merge!

In [366]:
#len(bom) 3,387
#len(opus) 1,936
#len(imdb) 21,403


21403

In [102]:
df = pd.merge(opus, bom,  how='left', left_on=['movie_name','production_year'], right_on = ['title','year'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1936 entries, 0 to 1935
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   movie_name                1936 non-null   object 
 1   production_year           1936 non-null   int64  
 2   production_budget         1936 non-null   int64  
 3   domestic_box_office       1936 non-null   int64  
 4   international_box_office  1936 non-null   int64  
 5   rating                    1913 non-null   object 
 6   creative_type             1923 non-null   object 
 7   source                    1915 non-null   object 
 8   production_method         1925 non-null   object 
 9   genre                     1926 non-null   object 
 10  sequel                    1934 non-null   float64
 11  running_time              1822 non-null   float64
 12  title                     501 non-null    object 
 13  studio                    501 non-null    object 
 14  domestic

# Creating a cleaned Database

In [373]:
opus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1936 entries, 0 to 1935
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   title                     1936 non-null   object 
 1   year                      1936 non-null   int64  
 2   production_budget         1936 non-null   int64  
 3   domestic_box_office       1936 non-null   int64  
 4   international_box_office  1936 non-null   int64  
 5   rating                    1913 non-null   object 
 6   creative_type             1923 non-null   object 
 7   source                    1915 non-null   object 
 8   production_method         1925 non-null   object 
 9   genre                     1926 non-null   object 
 10  sequel                    1934 non-null   float64
 11  running_time              1822 non-null   float64
dtypes: float64(2), int64(4), object(6)
memory usage: 181.6+ KB


In [374]:
#creating a cleaned database
conn = sqlite3.connect('movie_data.db')
c = conn.cursor()

In [375]:
#
c.execute('''
CREATE TABLE 
    opus (
        title text,
        year int,
        production_budget int,
        domestic_box_office int,
        international_box_office int,
        rating text,
        creative_type text,
        source text,
        production_method text,
        genre text,
        sequel int,
        running_time int)
''')
opus.to_sql('opus', conn, if_exists='append', index = False)

<sqlite3.Cursor at 0x7f95b61e9c00>

In [380]:
c.execute('''
CREATE TABLE 
    opus (
        title text,
        year int,
        production_budget int,
        domestic_box_office int,
        international_box_office int,
        rating text,
        creative_type text,
        source text,
        production_method text,
        genre text,
        sequel int,
        running_time int)
''')
bom.to_sql('bom', conn, if_exists='append', index = False)

In [437]:
q = """
SELECT 
    AVG(domestic_box_office+international_box_office-production_budget) AS avg_profit,
    COUNT(*) AS num_movies,
    genre
FROM opus
GROUP BY genre
ORDER BY avg_profit DESC;

"""
pd.read_sql(q, conn)

Unnamed: 0,avg_profit,num_movies,genre
0,247383500.0,25,Musical
1,209648700.0,334,Adventure
2,191950300.0,311,Action
3,70428930.0,104,Horror
4,67738460.0,82,Romantic Comedy
5,66159610.0,231,Thriller/Suspense
6,55795640.0,318,Comedy
7,51478690.0,5,Documentary
8,48436040.0,24,Black Comedy
9,46681110.0,15,Western


In [406]:
#top_studio - top studio by average profit P/DW, BV, GrtIndia
avg_profit_sequel

In [427]:
q = """
SELECT 
    COUNT(*)
FROM opus AS o
JOIN bom as b
    USING(title)
;

"""
pd.read_sql(q, conn)

Unnamed: 0,COUNT(*)
0,75


In [425]:
#Type of movies BV studios made
for i in df:
    print(i)
    print(df[i].value_counts())
    print('\n')

rating
PG       37
PG-13    30
G         7
R         1
Name: rating, dtype: int64


creative_type
Kids Fiction            22
Super Hero              14
Fantasy                 12
Contemporary Fiction     9
Science Fiction          8
Dramatization            6
Historical Fiction       3
Factual                  1
Name: creative_type, dtype: int64


source
Original Screenplay                    22
Based on Fiction Book/Short Story      14
Based on Comic/Graphic Novel           13
Based on Real Life Events               6
Based on Folk Tale/Legend/Fairytale     5
Spin-Off                                4
Based on TV                             3
Based on Theme Park Ride                2
Based on Short Film                     2
Remake                                  1
Based on Play                           1
Based on Factual Book/Article           1
Based on Game                           1
Name: source, dtype: int64


production_method
Live Action              44
Digital Animation     

In [362]:
Replaced = 0
for i in range(0,len(bom)):
    key = bom['title'][i]
    value = difflib.get_close_matches(key, imdb['title'], n=1,cutoff=0.9)
    try:
        value = value[0]
        index = imdb[imdb['title']==value].index[0]
        if (key != value) & (bom['year'][i] == imdb['year'][index]):
            bom['title'].replace(key, value,inplace=True)
            print(f'{key}  ------->  {value}')
            Replaced +=1
        else:
            continue
    except:
        continue
Replaced

jackass 3d  ------->  jackass 3
step up 3d  ------->  step up 3
date night  ------->  date  night
waste land  ------->  wasteland
mission impossible  ghost protocol  ------->  mission impossible 4  ghost protocol
the girl with the dragon tattoo 2011  ------->  the girl with the dragon tattoo
spy kids all the time in the world  ------->  spy kids 4d all the time in the world
sex and zen 3d extreme ecstasy  ------->  3d sex and zen extreme ecstasy
the ledge  ------->  the pledge
the black power mix tape 19671975  ------->  the black power mixtape 19671975
ice age continental drift  ------->  ice age 4 continental drift
the counselor  ------->  the counsellor
one direction this is us  ------->  one direction  this is us
dallas buyers club  ------->  the dallas buyers club
tiny times 2  ------->  tiny times 20
admission  ------->  admissions
puella magi madoka magica the movie rebellion  ------->  puella magi madoka magica the movie part iii rebellion
mission impossible  rogue nation  ----

41

In [363]:
Replaced = 0
for i in range(0,len(opus)):
    key = opus['title'][i]
    value = difflib.get_close_matches(key, imdb['title'], n=1,cutoff=0.9)
    try:
        value = value[0]
        index = imdb[imdb['title']==value].index[0]
        if (key != value) & (opus['year'][i] == imdb['year'][index]):
            opus['title'].replace(key, value,inplace=True)
            print(f'{key}  ------->  {value}')
            Replaced +=1
        else:
            continue
    except:
        continue
Replaced

jackass 3d  ------->  jackass 3
step up 3d  ------->  step up 3
date night  ------->  date  night
wall street 2 money never sleeps  ------->  wall street money never sleeps
spy kids all the time in the world  ------->  spy kids 4d all the time in the world
harry potter and the deathly hallows part ii  ------->  harry potter and the deathly hallows part 2
mission impossible—ghost protocol  ------->  mission impossible 4  ghost protocol
ernest et celestine  ------->  ernest  celestine
ice age continental drift  ------->  ice age 4 continental drift
one direction this is us  ------->  one direction  this is us
the counselor  ------->  the counsellor
plastic  ------->  platic
ricki and the flash  ------->  ricki  the flash
secret in their eyes  ------->  the secret in their eyes
queen of the desert  ------->  the queen of the desert
yip man 3  ------->  ip man 3
mortdecai  ------->  mordecai
mission impossible—rogue nation  ------->  mission impossible 5  rogue nation
whiskey tango foxtrot

22