# Data Exploration and Beginning Analysis

## Importing necessary packages and libraries

In [2]:
import pandas as pd
import numpy as np
import sqlite3
import zipfile
import matplotlib.pyplot as plt

%matplotlib inline

## Opening .csv/.tsv/.db files and assigning to variables

### Box Office Mojo

In [3]:
# Load the Box Office Mojo file into a DataFrame.
# Columns of note:
#   - studio
#   - domestic_gross / foreign_gross
box_office_df = pd.read_csv(
    filepath_or_buffer='../../data/bom.movie_gross.csv.gz',
)
box_office_df;

### Rotten Tomatoes

In [4]:
# Load the Rotten Tomatoes movie info file (tab-separated).
# Columns of note:
#   - rating (MPAA)
#   - synopsis (?)
#   - studio
#   - box_office
#   - runtime
# NOTE(review): 'studio' and 'box_office' are listed as columns of note
# but are dropped below -- confirm that is intended.
rt_movies_df = (
    pd.read_csv('../../data/rt.movie_info.tsv.gz', sep="\t")
    # 'genre' holds "|"-separated values; keep a list version alongside it
    .assign(**{'genre list': lambda frame: frame['genre'].str.split("|")})
    .drop(columns=['currency', 'box_office', 'studio'])
)

rt_movies_df;

In [5]:
# Load the Rotten Tomatoes reviews file (tab-separated).
# Columns of note:
#   - rating
#   - fresh
#   - top_critic (use as filter?)
# NOTE(review): the 'unicode_escape' codec also interprets backslash escape
# sequences found in the text -- confirm the file's actual encoding.
rt_reviews_df = pd.read_csv(
    '../../data/rt.reviews.tsv.gz',
    sep='\t',
    encoding='unicode_escape',
)
rt_reviews_df;

### The Movie Database

In [6]:
# Load The Movie Database file; the first column of the file becomes the index.
# Columns of note:
#   - vote_count
#   - vote_average
#   - id
#   - title
tmdb_df = pd.read_csv(
    filepath_or_buffer='../../data/tmdb.movies.csv.gz',
    index_col=0,
)
tmdb_df;

### The Numbers

In [7]:
# Load The Numbers budgets file; the first column of the file becomes the index.
# Columns of note:
#   - movie
#   - production_budget
#   - domestic_gross / worldwide_gross
tn_df = pd.read_csv(
    filepath_or_buffer='../../data/tn.movie_budgets.csv.gz',
    index_col=0,
)
tn_df;

### IMDb (Internet Movie Database)

In [8]:
# Unpack the zipped IMDb SQLite database.
# Every member of the archive is written into the '../../data/' directory
# (not the notebook's current working directory).
with zipfile.ZipFile('../../data/im.db.zip') as imdb_archive:
    imdb_archive.extractall(path='../../data/')

In [9]:
# Open the extracted IMDb SQLite database; `con` is reused by the queries below.
con = sqlite3.connect('../../data/im.db')

# Query sqlite_master to inspect the schema (display suppressed by the trailing `;`).
pd.read_sql_query(sql="""

SELECT *
FROM sqlite_master

""", con=con);

In [10]:
# Columns of note (movie_basics):
    # movie_id
    # primary_title / original_title
    # runtime_minutes
    # genres
    
# Columns of note (movie_ratings):
    # movie_id
    # averagerating
    # numvotes

In [11]:
# imdb_basics_df = pd.read_sql("""
# SELECT *
# FROM movie_basics
# """,con);

# imdb_ratings_df = pd.read_sql("""
# SELECT *
# FROM movie_ratings
# """,con)

## Data Cleaning:

In [12]:
# Schema overview: show every row of sqlite_master for the IMDb database.
pd.read_sql_query(
    sql="""
    SELECT *
    FROM sqlite_master
""",
    con=con,
)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,movie_basics,movie_basics,2,"CREATE TABLE ""movie_basics"" (\n""movie_id"" TEXT..."
1,table,directors,directors,3,"CREATE TABLE ""directors"" (\n""movie_id"" TEXT,\n..."
2,table,known_for,known_for,4,"CREATE TABLE ""known_for"" (\n""person_id"" TEXT,\..."
3,table,movie_akas,movie_akas,5,"CREATE TABLE ""movie_akas"" (\n""movie_id"" TEXT,\..."
4,table,movie_ratings,movie_ratings,6,"CREATE TABLE ""movie_ratings"" (\n""movie_id"" TEX..."
5,table,persons,persons,7,"CREATE TABLE ""persons"" (\n""person_id"" TEXT,\n ..."
6,table,principals,principals,8,"CREATE TABLE ""principals"" (\n""movie_id"" TEXT,\..."
7,table,writers,writers,9,"CREATE TABLE ""writers"" (\n""movie_id"" TEXT,\n ..."


In [25]:
# Preview the first five rows of the movie_ratings table.
pd.read_sql_query(
    sql="""
    SELECT *
    FROM
        movie_ratings
""",
    con=con,
).head()

Unnamed: 0,movie_id,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


In [32]:
# Thresholds for the sample, kept out of the SQL text so they are easy to
# find and tune. They are bound to the query as named parameters below.
MIN_NUM_VOTES = 100000   # keep only titles with at least this many ratings
MIN_START_YEAR = 2012    # keep only titles whose start_year is this or later

# Join basic title info to its ratings, filter by the thresholds above,
# and order the result from most-rated to least-rated.
imdb_df_sample = pd.read_sql("""
    SELECT
        -- from movie_basics
        mb.primary_title AS "Title",
        mb.start_year AS "Release Year",
        mb.genres AS "genres",
        -- from movie_ratings
        mr.averagerating AS "Avg. Rating",
        mr.numvotes AS "# of Ratings"
    FROM
        movie_basics AS mb
        JOIN movie_ratings AS mr
            ON mb.movie_id = mr.movie_id
    WHERE
        mr.numvotes >= :min_num_votes AND
        mb.start_year >= :min_start_year
    ORDER BY
        mr.numvotes DESC
""", con, params={
    'min_num_votes': MIN_NUM_VOTES,
    'min_start_year': MIN_START_YEAR,
})

# Database column 'genres' gives us multiple genres separated
# by commas -- using split() to create a LIST of genres
imdb_df_sample['genres'] = imdb_df_sample['genres'].str.split(',')

imdb_df_sample.head(10)

Unnamed: 0,Title,Release Year,genres,Avg. Rating,# of Ratings
0,The Dark Knight Rises,2012,"[Action, Thriller]",8.4,1387769
1,Interstellar,2014,"[Adventure, Drama, Sci-Fi]",8.6,1299334
2,Django Unchained,2012,"[Drama, Western]",8.4,1211405
3,The Avengers,2012,"[Action, Adventure, Sci-Fi]",8.1,1183655
4,The Wolf of Wall Street,2013,"[Biography, Crime, Drama]",8.2,1035358
5,Guardians of the Galaxy,2014,"[Action, Adventure, Comedy]",8.1,948394
6,Deadpool,2016,"[Action, Adventure, Comedy]",8.0,820847
7,The Hunger Games,2012,"[Action, Adventure, Sci-Fi]",7.2,795227
8,Star Wars: Episode VII - The Force Awakens,2015,"[Action, Adventure, Fantasy]",8.0,784780
9,Mad Max: Fury Road,2015,"[Action, Adventure, Sci-Fi]",8.1,780910


In [33]:
# Show the full '# of Ratings' column of the sample.
imdb_df_sample.loc[:, '# of Ratings']

0      1387769
1      1299334
2      1211405
3      1183655
4      1035358
        ...   
458     100650
459     100568
460     100520
461     100467
462     100318
Name: # of Ratings, Length: 463, dtype: int64

### Unaffiliated Code: Phase 1 Code Challenge

In [16]:
# 4.1
# player_names = list(players.keys())

In [17]:
# 4.2, list comprehension
# player_nationalities = [(player, info['nationality']) for player, info in players.items()]

# This is what we're looping through
# in players.items()
# These are the loop variables -- player is a key, info is its value (a dict that contains 'nationality')
# for player, info
# This is the resulting tuple
# (player, info['nationality'])

In [18]:
# 4.2, for loop
# player_nationalities = []

# for name, details in players.items():
    # nationality = details['nationality']
    # player_and_nationality = (name, nationality)
    # player_nationalities.append(player_and_nationality)