# Data Exploration and Beginning Analysis

## Importing necessary packages and libraries

In [1]:
import pandas as pd
import numpy as np
import sqlite3
import zipfile
import matplotlib.pyplot as plt

%matplotlib inline

## Opening .csv/.tsv/.db files and assigning to variables

### Box Office Mojo

In [3]:
# Box Office Mojo: per-title gross figures.
# Columns of note:
#   - studio
#   - domestic_gross / foreign_gross
# Per the .info() summary, foreign_gross loads as object (text) dtype while
# domestic_gross is float64, so foreign_gross needs converting before any
# numeric work.
box_office_df = pd.read_csv(
    filepath_or_buffer='data/bom.movie_gross.csv.gz',
)
box_office_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


### Rotten Tomatoes

In [14]:
# Rotten Tomatoes: one row per movie.
# Columns of note:
#   - (MPAA) rating
#   - synopsis (?)
#   - runtime
# studio and box_office were also flagged as interesting, but they are dropped
# below together with currency, so they are NOT available downstream.

rt_movies_df = (
    pd.read_csv('data/rt.movie_info.tsv.gz', sep="\t")
    # 'genre' is a single "|"-delimited string; keep it and add a list version.
    .assign(**{'genre list': lambda movies: movies['genre'].str.split("|")})
    .drop(columns=['currency', 'box_office', 'studio'])
)

rt_movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            1560 non-null   int64 
 1   synopsis      1498 non-null   object
 2   rating        1557 non-null   object
 3   genre         1552 non-null   object
 4   director      1361 non-null   object
 5   writer        1111 non-null   object
 6   theater_date  1201 non-null   object
 7   dvd_date      1201 non-null   object
 8   runtime       1530 non-null   object
 9   genre list    1552 non-null   object
dtypes: int64(1), object(9)
memory usage: 122.0+ KB


In [15]:
# Rotten Tomatoes: one row per critic review (keyed to a movie by `id`).
# Columns of note:
#   - rating
#   - fresh
#   - top_critic (use as filter?)
#
# NOTE(review): 'unicode_escape' also interprets backslash sequences that
# appear inside the review text; confirm this is intended rather than a plain
# byte encoding such as 'latin-1'.
rt_reviews_df = pd.read_csv(
    'data/rt.reviews.tsv.gz',
    sep='\t',
    encoding='unicode_escape',
)
rt_reviews_df.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"


### The Movie Database

In [8]:
# The Movie Database: popularity and vote statistics per title.
# Columns of note:
#   - vote_count
#   - vote_average
#   - id
#   - title
#
# index_col=0 makes the file's first column the row index instead of a
# data column.
tmdb_df = pd.read_csv(
    filepath_or_buffer='data/tmdb.movies.csv.gz',
    index_col=0,
)
tmdb_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26517 entries, 0 to 26516
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   genre_ids          26517 non-null  object 
 1   id                 26517 non-null  int64  
 2   original_language  26517 non-null  object 
 3   original_title     26517 non-null  object 
 4   popularity         26517 non-null  float64
 5   release_date       26517 non-null  object 
 6   title              26517 non-null  object 
 7   vote_average       26517 non-null  float64
 8   vote_count         26517 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 2.0+ MB


### The Numbers

In [9]:
# The Numbers: production budgets and gross revenue per movie.
# Columns of note:
#   - movie
#   - production_budget
#   - domestic_gross / worldwide_gross
#
# index_col=0 turns the first column into the index. The .info() summary
# reports 5782 rows with an index running "1 to 82", so that index is not a
# unique row key. All three money columns load as object (text) dtype.
tn_df = pd.read_csv(
    filepath_or_buffer='data/tn.movie_budgets.csv.gz',
    index_col=0,
)
tn_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5782 entries, 1 to 82
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   release_date       5782 non-null   object
 1   movie              5782 non-null   object
 2   production_budget  5782 non-null   object
 3   domestic_gross     5782 non-null   object
 4   worldwide_gross    5782 non-null   object
dtypes: object(5)
memory usage: 271.0+ KB


### IMDb (Internet Movie Database)

In [11]:
# Unpack the zipped IMDb SQLite database.
# Every member of the archive is written under data/, which is where the
# connection cell expects to find im.db.
with zipfile.ZipFile('data/im.db.zip') as imdb_archive:
    imdb_archive.extractall('data/')

In [10]:
# Open the IMDb SQLite file; `con` is the connection handle used for queries.
con = sqlite3.connect('data/im.db')

# sqlite_master holds one row per database object, including the CREATE
# statement for each table, so selecting from it shows the whole schema.
pd.read_sql(
    sql="""

SELECT *
FROM sqlite_master

""",
    con=con,
)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,movie_basics,movie_basics,2,"CREATE TABLE ""movie_basics"" (\n""movie_id"" TEXT..."
1,table,directors,directors,3,"CREATE TABLE ""directors"" (\n""movie_id"" TEXT,\n..."
2,table,known_for,known_for,4,"CREATE TABLE ""known_for"" (\n""person_id"" TEXT,\..."
3,table,movie_akas,movie_akas,5,"CREATE TABLE ""movie_akas"" (\n""movie_id"" TEXT,\..."
4,table,movie_ratings,movie_ratings,6,"CREATE TABLE ""movie_ratings"" (\n""movie_id"" TEX..."
5,table,persons,persons,7,"CREATE TABLE ""persons"" (\n""person_id"" TEXT,\n ..."
6,table,principals,principals,8,"CREATE TABLE ""principals"" (\n""movie_id"" TEXT,\..."
7,table,writers,writers,9,"CREATE TABLE ""writers"" (\n""movie_id"" TEXT,\n ..."


In [13]:
# Columns of note (movie_basics):
    # movie_id
    # primary_title / original_title
    # runtime_minutes
    # genres
    
# Columns of note (movie_ratings):
    # movie_id
    # averagerating
    # numvotes

In [None]:
# This cell is currently disabled: every statement below is commented out, so
# running it defines nothing. Uncomment to load the two IMDb tables listed in
# the "Columns of note" cell into DataFrames.
#
# imdb_basics_df = pd.read_sql("""
# SELECT *
# FROM movie_basics
# """, con)

# The second query previously passed `con1`, which is not defined anywhere in
# the visible notebook; the open connection is named `con`.
# imdb_ratings_df = pd.read_sql("""
# SELECT *
# FROM movie_ratings
# """, con)

## Data Cleaning:

In [30]:
# Convert release_date from text to datetime64 so it can be sorted and
# filtered chronologically. The column is overwritten on the existing frame.
tn_df['release_date'] = tn_df['release_date'].pipe(pd.to_datetime)

tn_df

Unnamed: 0_level_0,release_date,movie,production_budget,domestic_gross,worldwide_gross
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2009-12-18,Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
3,2019-06-07,Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
4,2015-05-01,Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


## Unaffiliated Code: Phase 1 Code Challenge

In [None]:
# 4.1 -- collect every player name.
# Iterating a dict yields its keys, so list(players) is the list of names.
# `players` is not defined in this notebook; it comes from the code challenge.
player_names = list(players)

In [None]:
# 4.2, list comprehension
# Builds a list of (player, nationality) tuples from the `players` dict.
# `players` is not defined in this notebook; it comes from the code challenge.
player_nationalities = [(player, info['nationality']) for player, info in players.items()]

# How to read the comprehension above. The pieces are shown as comments,
# because on their own they are not valid Python and would raise SyntaxError:
#
#   in players.items()
#       What we're looping through: the (key, value) pairs of `players`.
#   for player, info
#       Each pair unpacked: `player` is the key, `info` is that key's value
#       (the mapping we look 'nationality' up in).
#   (player, info['nationality'])
#       The resulting tuple, one per player.

In [None]:
# 4.2, for loop
# Same result as the comprehension version: a list of (name, nationality)
# tuples, built one entry at a time.
player_nationalities = []

for name, details in players.items():
    player_nationalities.append((name, details['nationality']))