## Hypothesis Testing

In [21]:
# Import Libraries:

import pandas as pd
import numpy as py
import tmdbsimple as tmdb

In [22]:
import json
with open('/Users/jnate/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
#Display the keys of the loaded dict:
login.keys()

dict_keys(['api-key'])

In [23]:
# Load API key:

tmdb.API_KEY =  login['api-key']

In [24]:
# Load JSON files:

with open('Data\\tmdb_api_results_2019.json') as y1:
    jsonfile1 = json.load(y1)

with open('Data\\tmdb_api_results_2020.json') as y2:
    jsonfile2 = json.load(y2)

In [25]:
# Convert separate years to pandas DataFrames:

year2019 = pd.DataFrame(jsonfile1)

# Reduce the DataFrame down to 4 columns:
year2019 = year2019[['imdb_id', 'revenue', 'budget', 'certification']]

# Imputation:

year2019['certification'].replace({'' : 'NR', '' : 'NR'}, inplace = True);

year2019['certification'].fillna('NR', inplace =True)

year2019['revenue'].fillna(0.0, inplace =True)

# Check:

year2019.head()

Unnamed: 0,imdb_id,revenue,budget,certification
0,0,0.0,,NR
1,tt0385887,18377736.0,26000000.0,R
2,tt0437086,404852543.0,170000000.0,PG-13
3,tt0441881,2078370.0,23934823.0,R
4,tt0448115,365971656.0,80000000.0,PG-13


In [26]:
# Convert separate years to pandas DataFrames:

year2020 = pd.DataFrame(jsonfile2)

# Reduce the DataFrame down to 4 columns:
year2020 = year2020[['imdb_id', 'revenue', 'budget', 'certification']]

# Imputation:

year2020['certification'].replace({'' : 'NR', '' : 'NR'}, inplace = True);

year2020['certification'].fillna('NR', inplace =True)

year2020['revenue'].fillna(0.0, inplace =True)

# Check:

year2020.head()

Unnamed: 0,imdb_id,revenue,budget,certification
0,0,0.0,,NR
1,tt0062336,0.0,0.0,NR
2,tt0093119,0.0,7500000.0,NR
3,tt0805647,26900000.0,0.0,PG
4,tt0920462,106045.0,0.0,R


In [27]:
# Combined DataFrame:

# Combine / Concatinate separate DataFrames: 

combined = [year2019, year2020]

df = pd.concat(combined)

df.head()

Unnamed: 0,imdb_id,revenue,budget,certification
0,0,0.0,,NR
1,tt0385887,18377736.0,26000000.0,R
2,tt0437086,404852543.0,170000000.0,PG-13
3,tt0441881,2078370.0,23934823.0,R
4,tt0448115,365971656.0,80000000.0,PG-13


In [28]:
# Removing movies with 0 revenue: 

no_revenue = df.loc[df['revenue']==0]

# df[~no_revenue]

In [29]:
# Excluding titles with a budget of 0. 

budget = df.loc[df['budget'].copy() != 0]

df = budget

# Excluding titles with 0 revenue:

revenue = df.loc[df['revenue'].copy() != 0]

df = revenue

df.head()

Unnamed: 0,imdb_id,revenue,budget,certification
1,tt0385887,18377736.0,26000000.0,R
2,tt0437086,404852543.0,170000000.0,PG-13
3,tt0441881,2078370.0,23934823.0,R
4,tt0448115,365971656.0,80000000.0,PG-13
5,tt0783640,3364426.0,20000000.0,R


In [30]:
# Exporting the final Data Frame as csv:

df.to_csv("Data/year2019_2020.csv.gz",compression='gzip',index=False)

In [31]:
df['certification'].value_counts()

NR       118
R         84
PG-13     54
PG        35
G          4
NC-17      1
Name: certification, dtype: int64

### Question: Does the MPAA rating of a movie (G/PG/PG-13/R) affect how much revenue the movie generates?

***1. State your Null Hypothesis and Alternative Hypothesis:***

$H_0$: MPAA rating does not affect a movie's revenue.

$H_A$: MPAA does affect a movie's revenue.

***2. Select the correct test according to the data type and number of samples.***

    Type of comparison: more than 2 samples

    Data type: Numeric

    Hypothesis Test: ANOVA and/or Tukey

### Assumptions of the selected test:

    No significant outliers
    Equal variance
    Normality

In [32]:
# Create dictionary for the various regions:

ratings = {}

# Loop through column to get the unique designations:

for i in df['certification'].unique():
    # Get series for group and rename:
    data = df.loc[df['certification']==i, 'revenue'].copy()
    
    # Append dictionary:
    ratings[i] = data
ratings.keys()

dict_keys(['R', 'PG-13', 'NR', 'NC-17', 'PG', 'G'])

In [33]:
# Removing certifications with n < 3 observations 

del ratings['NC-17']

ratings.keys()

dict_keys(['R', 'PG-13', 'NR', 'PG', 'G'])

In [34]:
import numpy as np
from scipy import stats

# Check for significant outliers:

for rating_name, revenue_df in ratings.items():
    # Identify outliers:
    outliers = np.abs(stats.zscore(revenue_df))>3
    
    # Outliers per region:
    print(f"There were {outliers.sum()} outliers in the {rating_name} group.")
    
    # Remove the outliers from dat and overwrite teh region dictionary:
    revenue_df = revenue_df[~outliers]
    
    ratings[rating_name] = revenue_df

There were 1 outliers in the R group.
There were 1 outliers in the PG-13 group.
There were 2 outliers in the NR group.
There were 2 outliers in the PG group.
There were 0 outliers in the G group.


In [35]:
data

1389    1.073395e+09
4079    6.163178e+07
4190    1.589437e+07
3416    1.085563e+07
Name: revenue, dtype: float64

In [36]:
# Normality Test:

# Normal test for each region to confirm that there are more than 20 samples in each group.
norm_results = [['certification', 'n', 'p', 'significance?']]

# Loop through each certification in the dictionary:

for ratings_name, temp_df in ratings.items():
    
    #Calculate normal test reuslts:
    result = stats.shapiro(temp_df)
    p = result.pvalue
    n = len(temp_df)
    
    # Appred the normal results as list:
    norm_results.append([ratings_name, n, p, p<.05])
nrdf = pd.DataFrame(norm_results[1:], columns = norm_results[0])
nrdf

Unnamed: 0,certification,n,p,significance?
0,R,83,2.167157e-12,True
1,PG-13,53,1.39788e-09,True
2,NR,116,1.363012e-15,True
3,PG,33,4.865038e-07,True
4,G,4,0.004169193,True


In [38]:
# Levene's test for equal variance:

ratingsLT = stats.levene(*ratings.values())
ratingsLT

LeveneResult(statistic=12.730268110171655, pvalue=1.5250374377409812e-09)

In [39]:
# If-else statement to interpret the p-value:

if ratingsLT.pvalue < 0.05:
    print('The groups ***DO NOT*** have equal variance')
    
else:
    print('The groups ***DO*** have equal variance')

The groups ***DO NOT*** have equal variance


### Interpretation:

We will need to use a Welch's T test because the assumption of equal variance was inaccurate.

In [None]:
import numpy as np
from scipy import stats

# Check for significant outliers:

for rating_name, revenue_df in ratings.items():
    # Identify outliers:
    outliers = np.abs(stats.zscore(revenue_df))>3
    
    # Outliers per region:
    print(f"There were {outliers.sum()} outliers in the {rating_name} group.")
    
    # Remove the outliers from dat and overwrite teh region dictionary:
    revenue_df = revenue_df[~outliers]
    
    ratings[rating_name] = revenue_df

In [49]:
# For loop for each certification: 

for rating_name, wtt_df in ratings.items():
    
    welchs = stats.ttest_ind(*wtt_df, equal_var = False)
    
    print(welchs)
    

TypeError: ttest_ind() got multiple values for argument 'equal_var'

### Question: Do some movie genres earn more revenue than others?

***1. State your Null Hypothesis and Alternative Hypothesis:***

$H_0$: genre classification does not play a significant role in move revenue.

$H_A$: genre classification has a significant influence on revenue.

***2. Select the correct test according to the data type and number of samples.***

    Type of comparison: 2 samples

    Data type: Numeric

    Hypothesis Test: 2 Sample T-Test

### Assumptions of the selected test:

    No significant outliers
    Normality
    Equal Variance

### Question: Do movies released in 2020 earn less revenue than movies released in 2019?

### How do the years compare for movie ratings?

***1. State your Null Hypothesis and Alternative Hypothesis:***

$H_0$: Movies released in 2020 do not earn less than movies released in 2019.

$H_A$: Movies released in 2020 make significantly less than movies released in 2019.

***2. Select the correct test according to the data type and number of samples.***

    Type of comparison: 2 samples

    Data type: Numeric

    Hypothesis Test: 2 Sample T-Test

### Assumptions of the selected test:

    No significant outliers
    Normality
    Equal Variance

In [40]:
# Create dictionary for the various regions:

ratings = {}

# Loop through column to get the unique designations:

for i in df['certification'].unique():
    # Get series for group and rename:
    data = df.loc[df['certification']==i, 'revenue'].copy()
    
    # Append dictionary:
    ratings[i] = data
ratings.keys()

dict_keys(['R', 'PG-13', 'NR', 'NC-17', 'PG', 'G'])

In [41]:
# Check for significant outliers:

for rating_name, revenue_df in ratings.items():
    # Identify outliers:
    outliers = np.abs(stats.zscore(revenue_df))>3
    
    # Outliers per region:
    print(f"There were {outliers.sum()} outliers in the {rating_name} group.")
    
    # Remove the outliers from dat and overwrite teh region dictionary:
    revenue_df = revenue_df[~outliers]
    
    ratings[rating_name] = revenue_df

There were 1 outliers in the R group.
There were 1 outliers in the PG-13 group.
There were 2 outliers in the NR group.
There were 0 outliers in the NC-17 group.
There were 2 outliers in the PG group.
There were 0 outliers in the G group.
