# Shelly-Ann Duncan
# 12/8/22
# Project 3 - IMDB - Part 4
# Hypothesis Testing

* The stakeholder's first question is: does the MPAA rating of a movie (G/PG/PG-13/R) affect how much revenue the movie generates?

    * They want you to perform a statistical test to get a mathematically-supported answer.
    * They want you to report if you found a significant difference between ratings.
        * If so, what was the p-value of your analysis?
        * And which rating earns the most revenue?
* They want you to prepare a visualization that supports your finding.

* It is then up to you to think of 2 additional hypotheses to test that your stakeholder may want to know.

# Import necessary libraries

In [1]:
# imports
import json, os
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook

import pymysql
pymysql.install_as_MySQLdb()

from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists

# Load data

In [2]:
# Load API credentials from the local secrets file
# NOTE(review): this is an absolute, user-specific path, so the cell only runs
# on a machine that has this exact file
with open('/Users/shell/.secret/tmdb_api.json', 'r') as creds_file:
    json_data = json.loads(creds_file.read())
json_data.keys()

dict_keys(['API Key'])

In [3]:
# Set tmdbsimple's module-level API key from the loaded credentials
tmdb.API_KEY = json_data['API Key']

In [4]:
# Folder used for saved data files: create it if it is missing, then list its contents
FOLDER = 'Data/'
os.makedirs(FOLDER, exist_ok = True)
os.listdir(FOLDER)

[' final_tmdb_data_2000.csv.gz',
 ' final_tmdb_data_2001.csv.gz',
 '.ipynb_checkpoints',
 'combined_tmdb_data (1).csv.gz',
 'title.akas (1).tsv.gz',
 'title.basics.tsv.gz',
 'title.ratings (1).tsv.gz',
 'title_akas.csv.gz',
 'title_akas_chunk_001.csv.gz',
 'title_basics.csv.gz',
 'title_basics_cleaned (1).csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json',
 'tmdb_results_combined.csv.gz']

### Extract revenue and certification

In [5]:
# define a function to get movie with certification included

def get_movie_with_rating(movie_id):
    """Fetch a movie's TMDB info and attach its US certification.

    Adapted from source = https://github.com/celioa/tmdbsimple

    Parameters
    ----------
    movie_id
        Identifier passed straight to ``tmdb.Movies``.

    Returns
    -------
    The mapping returned by ``Movies.info()``. A ``'certification'`` key is
    added when the releases listing contains an entry whose ``iso_3166_1`` is
    ``'US'``; with several US entries the last one wins, and with none the key
    is not added.
    """
    movie = tmdb.Movies(movie_id)

    # Request order is kept: general info first, then the releases listing
    info = movie.info()
    releases = movie.releases()

    # Certifications of every US entry, in the order the API lists them
    us_certifications = [
        country['certification']
        for country in releases['countries']
        if country['iso_3166_1'] == 'US'
    ]

    # The last US entry takes precedence over earlier ones
    if us_certifications:
        info['certification'] = us_certifications[-1]

    return info

In [6]:
# define a function to load/write the new json files

def write_json(new_data, filename):
    """Append new record(s) to the JSON list stored in an existing file.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Parameters
    ----------
    new_data : list or any JSON-serializable object
        When both ``new_data`` and the stored value are lists, the items of
        ``new_data`` are added one by one; otherwise ``new_data`` is appended
        as a single element.
    filename : str
        Path of an existing JSON file. It is opened in ``'r+'`` mode, so it
        must already exist and contain valid JSON.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    AttributeError
        If the stored top-level value has no ``append`` method (e.g. a dict).
    """
    with open(filename, 'r+') as file:
        # Load whatever is currently stored (expected to be a list of records)
        file_data = json.load(file)

        # Merge list into list, otherwise add the new object as one record
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)

        # Rewrite the file from the beginning
        file.seek(0)
        json.dump(file_data, file)

        # Drop leftover bytes when the new text is shorter than the old one
        # (e.g. the file was pretty-printed); otherwise the JSON is corrupted
        file.truncate()

In [7]:
# Load the cleaned title basics dataset saved in part 1
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks like a
# saved index - confirm whether it should be dropped
basics = pd.read_csv('Data/title_basics_cleaned (1).csv.gz')
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,0,2020.0,,74,"Horror,Music,Thriller"


In [8]:
# Ten consecutive release years to request: 2010 through 2019 inclusive
YEARS_TO_GET = list(range(2010, 2020))

In [9]:



# Download TMDB data (including the US certification) for every movie of each
# requested year. Results accumulate in one JSON file per year, so an interrupted
# run can be resumed: ids already present in that file are skipped next time.

# Movies whose API call or save failed, stored as [movie_id, error] for inspection
errors = []

# Outer loop: one pass per year
for YEAR in tqdm_notebook(YEARS_TO_GET, desc = 'YEARS', position = 0):

    # JSON file that stores the API results for this year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'

    # A missing or zero-byte file cannot be parsed by write_json / pd.read_json,
    # so treat both cases as "no usable file yet"
    file_exists = os.path.isfile(JSON_FILE) and os.path.getsize(JSON_FILE) > 0

    # Seed the file with a one-record placeholder list so that later reads find
    # an 'imdb_id' column and write_json has a list to append to
    if not file_exists:
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id':0}],f)

    # Rows of the basics table whose startYear is the current year
    df = basics.loc[basics['startYear'] == YEAR].copy()

    # Ids of those movies
    movie_ids = df['tconst'].copy()

    # Results already saved for this year (only the placeholder on a first run)
    previous_df = pd.read_json(JSON_FILE)

    # Skip ids that are already stored in JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    # Inner loop: one API request per remaining movie id
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                      desc = f'Movies from {YEAR}',
                                      position = 1,
                                      leave = True):
        try:
            # Retrieve the movie's info with its certification
            temp = get_movie_with_rating(movie_id)

            # Append the result to this year's JSON file
            write_json(temp, JSON_FILE)

            # Short pause between requests to go easy on the server
            time.sleep(0.02)

        except Exception as e:
            # Keep going, but remember what failed instead of discarding it
            errors.append([movie_id, e])

    # Save this year's accumulated results as a compressed csv
    final_year_df = pd.read_json(JSON_FILE)
    # NOTE(review): the space after {FOLDER} puts a leading space in the file name
    # ('Data/ final_tmdb_data_<year>.csv.gz'); kept as-is because other cells may
    # read the files under that name - confirm before renaming
    final_year_df.to_csv(f"{FOLDER} final_tmdb_data_{YEAR}.csv.gz",
                         compression = 'gzip', index = False)

YEARS:   0%|          | 0/10 [00:00<?, ?it/s]

Movies from 2010:   0%|          | 0/3749 [00:00<?, ?it/s]

ValueError: Expected object or value