## Hypothesis Testing

In [82]:
# Import Libraries:

import pandas as pd
import numpy as py
import tmdbsimple as tmdb

In [83]:
import json
with open('/Users/jnate/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
#Display the keys of the loaded dict:
login.keys()

dict_keys(['api-key'])

In [84]:
# Load API key:

tmdb.API_KEY =  login['api-key']

In [85]:
# Load JSON files:

with open('Data\\tmdb_api_results_2019.json') as y1:
    jsonfile1 = json.load(y1)

with open('Data\\tmdb_api_results_2020.json') as y2:
    jsonfile2 = json.load(y2)

In [86]:
# Convert separate years to pandas DataFrames:

year2019 = pd.DataFrame(jsonfile1)

# Reduce the DataFrame down to 4 columns:
year2019 = year2019[['imdb_id', 'revenue', 'budget', 'certification']]

# Imputation:

year2019['certification'].replace({'' : 'NR', '' : 'NR'}, inplace = True);

year2019['certification'].fillna('NR', inplace =True)

year2019['revenue'].fillna(0.0, inplace =True)

# Check:

year2019.head()

Unnamed: 0,imdb_id,revenue,budget,certification
0,0,0.0,,NR
1,tt0385887,18377736.0,26000000.0,R
2,tt0437086,404852543.0,170000000.0,PG-13
3,tt0441881,2078370.0,23934823.0,R
4,tt0448115,365971656.0,80000000.0,PG-13


In [87]:
# Convert separate years to pandas DataFrames:

year2020 = pd.DataFrame(jsonfile2)

# Reduce the DataFrame down to 4 columns:
year2020 = year2020[['imdb_id', 'revenue', 'budget', 'certification']]

# Imputation:

year2020['certification'].replace({'' : 'NR', '' : 'NR'}, inplace = True);

year2020['certification'].fillna('NR', inplace =True)

year2020['revenue'].fillna(0.0, inplace =True)

# Check:

year2020.head()

Unnamed: 0,imdb_id,revenue,budget,certification
0,0,0.0,,NR
1,tt0062336,0.0,0.0,NR
2,tt0093119,0.0,7500000.0,NR
3,tt0805647,26900000.0,0.0,PG
4,tt0920462,106045.0,0.0,R


In [88]:
# Remvoe NaN values:

year2019.dropna(inplace = False)
year2020.dropna(inplace = False)

Unnamed: 0,imdb_id,revenue,budget,certification
1,tt0062336,0.0,0.0,NR
2,tt0093119,0.0,7500000.0,NR
3,tt0805647,26900000.0,0.0,PG
4,tt0920462,106045.0,0.0,R
5,tt0926132,0.0,0.0,NR
...,...,...,...,...
3713,tt9904004,0.0,0.0,NR
3714,tt9904802,0.0,0.0,NR
3715,tt9911196,0.0,0.0,NR
3716,tt9916190,0.0,0.0,NR


In [89]:
# Combined DataFrame:

# Combine / Concatinate separate DataFrames: 

combined = [year2019, year2020]

df = pd.concat(combined)

df.head()

Unnamed: 0,imdb_id,revenue,budget,certification
0,0,0.0,,NR
1,tt0385887,18377736.0,26000000.0,R
2,tt0437086,404852543.0,170000000.0,PG-13
3,tt0441881,2078370.0,23934823.0,R
4,tt0448115,365971656.0,80000000.0,PG-13


In [90]:
# Removing movies with 0 revenue: 

no_revenue = df.loc[df['revenue']==0]

# df[~no_revenue]

In [91]:
# Excluding titles with a budget of 0. 

budget = df.loc[df['budget'].copy() != 0]

df = budget

# Excluding titles with 0 revenue:

revenue = df.loc[df['revenue'].copy() != 0]

df = revenue

df.head()

Unnamed: 0,imdb_id,revenue,budget,certification
1,tt0385887,18377736.0,26000000.0,R
2,tt0437086,404852543.0,170000000.0,PG-13
3,tt0441881,2078370.0,23934823.0,R
4,tt0448115,365971656.0,80000000.0,PG-13
5,tt0783640,3364426.0,20000000.0,R


In [92]:
# Exporting the final Data Frame as csv:

df.to_csv("Data/year2019_2020.csv.gz",compression='gzip',index=False)

In [93]:
df['certification'].value_counts()

NR       118
R         84
PG-13     54
PG        35
G          4
NC-17      1
Name: certification, dtype: int64

### Question: Does the MPAA rating of a movie (G/PG/PG-13/R) affect how much revenue the movie generates?

***1. State your Null Hypothesis and Alternative Hypothesis:***

$H_0$: MPAA rating does not affect a movie's revenue.

$H_A$: MPAA does affect a movie's revenue.

***2. Select the correct test according to the data type and number of samples.***

    Type of comparison: more than 2 samples

    Data type: Numeric

    Hypothesis Test: ANOVA and/or Tukey

### Assumptions of the selected test:

    No significant outliers
    Equal variance
    Normality

In [94]:
# Create dictionary for the various regions:

ratings = {}

# Loop through column to get the unique designations:

for i in df['certification'].unique():
    # Get series for group and rename:
    data = df.loc[df['certification']==i, 'revenue'].copy()
    
    # Append dictionary:
    ratings[i] = data
ratings.keys()

dict_keys(['R', 'PG-13', 'NR', 'NC-17', 'PG', 'G'])

In [95]:
# # Removing certifications with n < 3 observations 

# del ratings['NC-17']

# ratings.keys()

In [96]:
import numpy as np
from scipy import stats

# Check for significant outliers:

for rating_name, revenue_df in ratings.items():
    # Identify outliers:
    outliers = np.abs(stats.zscore(revenue_df))>3
    
    # Outliers per region:
    print(f"There were {outliers.sum()} outliers in the {rating_name} group.")
    
    # Remove the outliers from dat and overwrite teh region dictionary:
    revenue_df = revenue_df[~outliers]
    
    ratings[rating_name] = revenue_df

There were 1 outliers in the R group.
There were 1 outliers in the PG-13 group.
There were 2 outliers in the NR group.
There were 0 outliers in the NC-17 group.
There were 2 outliers in the PG group.
There were 0 outliers in the G group.


In [97]:
# Certification n value:

for certification, n_value in ratings.items():
    print(certification, ":", len(n_value))

R : 83
PG-13 : 53
NR : 116
NC-17 : 1
PG : 33
G : 4


In [98]:
# Normality Test:

# Normal test for each region to confirm that there are more than 20 samples in each group.
norm_results = [['certification', 'n', 'p', 'significance?']]

# Loop through each certification in the dictionary:

for ratings_name, temp_df in ratings.items():
    n = len(temp_df)
    if n < 3:
        n = 'n/a'
    else:
        
        # Calculate normal test reuslts:
        result = stats.shapiro(temp_df)
        p = result.pvalue
        n = len(temp_df)
    
    # Appred the normal results as list:
    norm_results.append([ratings_name, n, p, p<.05])
nrdf = pd.DataFrame(norm_results[1:], columns = norm_results[0])
nrdf

Unnamed: 0,certification,n,p,significance?
0,R,83.0,2.167157e-12,True
1,PG-13,53.0,1.39788e-09,True
2,NR,116.0,1.363012e-15,True
3,NC-17,,1.363012e-15,True
4,PG,33.0,4.865038e-07,True
5,G,4.0,0.004169193,True


#### Interpretation:

p less than alpha for all categories - we do not have normal distributions. We must move on to selecting a non-parametric equivalent test. 

In [99]:
# Kruskal-Wallis Test for the various movie categories:

kw_result = stats.kruskal(*ratings.values())
kw_result

KruskalResult(statistic=73.92697971857233, pvalue=1.5577054948795198e-14)

In [100]:
# If-else statement to interpret the p-value:

if kw_result.pvalue < 0.05:
    print('p is less than alpha, reject null hypothesis')
    
else:
    print('p is greater than alpah, the null hypothesis is validated')

p is less than alpha, reject null hypothesis


### ***Interpretation:***

p = 1.558 x 10 e-14
alpha = 0.05

p is significantly smaller than alpha meaning that we can reject the null hypothesis and support the alternative hypothesis; MPAA rating does affect a movie's revenue.

### Question: Do movies released in 2020 earn less revenue than movies released in 2019?

### How do the years compare for movie ratings?

***1. State your Null Hypothesis and Alternative Hypothesis:***

$H_0$: Movies released in 2020 do not earn less than movies released in 2019.

$H_A$: Movies released in 2020 make significantly less than movies released in 2019.

***2. Select the correct test according to the data type and number of samples.***

    Type of comparison: 2 samples

    Data type: Numeric

    Hypothesis Test: 2 Sample T-Test

### Assumptions of the selected test:

    No significant outliers
    Normality
    Equal Variance

In [101]:
# Define the two groups:

# # required columns:

# requiredcols = ['revenue']

y19 = pd.DataFrame(year2019['revenue'].copy())
y20 = pd.DataFrame(year2020['revenue'].copy())

y19

Unnamed: 0,revenue
0,0.0
1,18377736.0
2,404852543.0
3,2078370.0
4,365971656.0
...,...
4672,0.0
4673,0.0
4674,0.0
4675,0.0


### Test for significant outliers

In [102]:
# Check for outliers using Z-scores:

y19_outliers = np.abs(stats.zscore(y19))>3
y20_outliers = np.abs(stats.zscore(y20))>3

y19_outliers.value_counts()

revenue
False      4645
True         32
dtype: int64

In [103]:
y20_outliers.value_counts()

revenue
False      3701
True         17
dtype: int64

In [104]:
# Removal of outliers:

y19_final = y19[-y19_outliers]
y20_final = y20[-y20_outliers]

y19_final.value_counts()

revenue    
0.0            4293
3758846.0         1
14454622.0        1
13935410.0        1
13601384.0        1
               ... 
1000000.0         1
989310.0          1
968853.0          1
929868.0          1
216601214.0       1
Length: 353, dtype: int64

### Normality Test

In [105]:
# Normality test for 2019:

NT2019 = stats.normaltest(y19_final)
NT2019

NormaltestResult(statistic=array([nan]), pvalue=array([nan]))

In [106]:
# Normality test for 2019:

NT2020 = stats.normaltest(y20_final)
NT2020

NormaltestResult(statistic=array([nan]), pvalue=array([nan]))

### Question: Do some movie genres earn more revenue than others?

***1. State your Null Hypothesis and Alternative Hypothesis:***

$H_0$: genre classification does not play a significant role in move revenue.

$H_A$: genre classification has a significant influence on revenue.

***2. Select the correct test according to the data type and number of samples.***

    Type of comparison: more than 2 samples

    Data type: Numeric

    Hypothesis Test: ANOVA and/or Tukey

### Assumptions of the selected test:

    No significant outliers
    Normality
    Equal Variance

In [107]:
# Create dictionary for the various regions:

ratings = {}

# Loop through column to get the unique designations:

for i in df['certification'].unique():
    # Get series for group and rename:
    data = df.loc[df['certification']==i, 'revenue'].copy()
    
    # Append dictionary:
    ratings[i] = data
ratings.keys()

dict_keys(['R', 'PG-13', 'NR', 'NC-17', 'PG', 'G'])

In [108]:
# Check for significant outliers:

for rating_name, revenue_df in ratings.items():
    # Identify outliers:
    outliers = np.abs(stats.zscore(revenue_df))>3
    
    # Outliers per region:
    print(f"There were {outliers.sum()} outliers in the {rating_name} group.")
    
    # Remove the outliers from dat and overwrite teh region dictionary:
    revenue_df = revenue_df[~outliers]
    
    ratings[rating_name] = revenue_df

There were 1 outliers in the R group.
There were 1 outliers in the PG-13 group.
There were 2 outliers in the NR group.
There were 0 outliers in the NC-17 group.
There were 2 outliers in the PG group.
There were 0 outliers in the G group.
