Data dictionary: https://www.imdb.com/interfaces/

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Root folder of the IMDb dataset; the CSV files loaded in the next cell are resolved against it.
data_dir = Path("/kaggle/input/imdb-extensive-dataset/")

In [None]:
# Read the four IMDb tables (principals, names, movies, ratings) from `data_dir`.
principals, names, movies, ratings = (
    pd.read_csv(data_dir / csv_name)
    for csv_name in (
        'IMDb title_principals.csv',
        'IMDb names.csv',
        'IMDb movies.csv',
        'IMDb ratings.csv',
    )
)

# Data Preparation

In [None]:
# Name ids of people whose recorded place of birth ends with "UK" / "USA".
# Rows with a missing place of birth are treated as non-matches.
birthplace = names['place_of_birth']
born_in_uk = birthplace.str.endswith("UK").fillna(False)
born_in_usa = birthplace.str.endswith("USA").fillna(False)

uk_people = names[born_in_uk]['imdb_name_id'].unique().tolist()
usa_people = names[born_in_usa]['imdb_name_id'].unique().tolist()

In [None]:
# Narrow the UK-/USA-born people down to those with at least one 'director' credit.
is_director_credit = principals['category'] == 'director'
credited_uk = principals['imdb_name_id'].isin(uk_people)
credited_usa = principals['imdb_name_id'].isin(usa_people)

uk_directors = principals[credited_uk & is_director_credit]['imdb_name_id'].unique().tolist()
usa_directors = principals[credited_usa & is_director_credit]['imdb_name_id'].unique().tolist()

In [None]:
# Titles on which a UK-/USA-born director is credited *as director*.
# The category filter is required: without it, every title on which one of these
# people appears in any other principal category would also be counted as a movie
# "made by" that director.
uk_directors_movies = principals.query(
    "imdb_name_id in @uk_directors and category == 'director'"
)['imdb_title_id'].unique().tolist()
usa_directors_movies = principals.query(
    "imdb_name_id in @usa_directors and category == 'director'"
)['imdb_title_id'].unique().tolist()

# NOTE(review): a title co-directed by a UK-born and a USA-born director lands in
# both lists - confirm whether that overlap is acceptable for the comparison.
print(len(uk_directors_movies), len(usa_directors_movies))

In [None]:
# Rebuild `year` as an integer from the last four characters of its string form.
year_text = movies['year'].astype(str)
movies['year'] = year_text.str[-4:].astype(int)

# Keep titles with 1979 < year < 2020 whose language is exactly 'English' ...
in_period = (movies['year'] > 1979) & (movies['year'] < 2020)
in_english = movies['language'] == 'English'
subset_movies = movies[in_period & in_english]

# ... and whose country field mentions "USA" (missing country counts as no match).
mentions_usa = subset_movies['country'].str.contains("USA").fillna(False)
subset_movies = subset_movies[mentions_usa]

print(subset_movies.shape)

In [None]:
# Restrict both title lists to the titles that survived the movie subset filter.
uk_in_subset = subset_movies['imdb_title_id'].isin(uk_directors_movies)
usa_in_subset = subset_movies['imdb_title_id'].isin(usa_directors_movies)

uk_directors_movies = subset_movies[uk_in_subset]['imdb_title_id'].unique().tolist()
usa_directors_movies = subset_movies[usa_in_subset]['imdb_title_id'].unique().tolist()

print(len(uk_directors_movies), len(usa_directors_movies))

In [None]:
# Columns of the ratings table carried forward into the analysis:
# the title key, the overall vote summaries, and the US / non-US voter breakdown.
rating_columns = [
    'imdb_title_id',
    'weighted_average_vote',
    'total_votes',
    'mean_vote',
    'median_vote',
    'us_voters_rating',
    'us_voters_votes',
    'non_us_voters_rating',
    'non_us_voters_votes',
]

# ratings[rating_columns]

In [None]:
# Ratings for each group's titles, tagged with the director's country of birth.
# `.assign` returns a new frame, so the label column is not written onto a
# filtered slice of `ratings` (which raises SettingWithCopyWarning).
uk_directors_ratings = (
    ratings.query("imdb_title_id in @uk_directors_movies")[rating_columns]
    .assign(director_country='UK')
)
usa_directors_ratings = (
    ratings.query("imdb_title_id in @usa_directors_movies")[rating_columns]
    .assign(director_country='USA')
)

# Stack both groups into one long frame for plotting and testing.
all_ratings = pd.concat([uk_directors_ratings, usa_directors_ratings])

In [None]:
# Kernel-density estimate of `mean_vote`, one curve per director country,
# with a rug of the individual observations along the x-axis.
sns.displot(
    data=all_ratings,
    x='mean_vote',
    hue='director_country',
    kind='kde',
    rug=True,
);

In [None]:
# fig, ax = plt.subplots(1,2)
# sns.histplot(data=uk_directors_ratings, x='mean_vote',  kde=True, ax=ax[0]);
# sns.histplot(data=usa_directors_ratings, x='mean_vote', kde=True, ax=ax[1]);
# fig.show()

# Inference

#### Q1) Write down your theoretical hypothesis.
There is no difference in the average rating garnered by movies made by British and American directors.

#### Q2) Write down which dependent variables you will measure.
The mean of ratings for movies released in the USA between 1980 and 2019, made by British versus American directors.

#### Q3) Justify your sample size.

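To achieve $\alpha$ = 0.05 and $\beta$ = 0.05 (power = 0.95) for a two-sided test at an effect size of $d$ = 0.5, $N$ = 105 per group. The cell below uses the stricter $\alpha$ = 0.01 and $\beta$ = 0.01, which requires a larger $N$ per group.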

In [None]:
from statsmodels.stats.power import tt_ind_solve_power

# A-priori sample size per group for a two-sided independent-samples t-test
# with equal group sizes (ratio=1.0).
alpha, beta, d = 0.01, 0.01, 0.5
power = 1 - beta

required_nobs = tt_ind_solve_power(
    effect_size=d,
    nobs1=None,  # the quantity being solved for
    alpha=alpha,
    power=power,
    ratio=1.0,
    alternative='two-sided',
)
# Truncate and add one to get a whole number of observations above the solved value.
N = int(required_nobs) + 1
print(N)

#### Q4) Specify the statistical test you will conduct.

Equivalence test with equivalence interval of $d$ = [-0.5, 0.5]

In [None]:
from statsmodels.stats.weightstats import ttost_ind

# Mean-vote samples for the two groups: x1 = UK directors, x2 = USA directors.
country = all_ratings['director_country']
x1 = all_ratings[country == 'UK']['mean_vote']
x2 = all_ratings[country == 'USA']['mean_vote']

print(len(x1), len(x2))

In [None]:
from statsmodels.stats.power import tt_ind_solve_power

# Power actually achieved with the observed group sizes, at the same alpha and
# effect size as the sample-size calculation.
alpha, d = 0.01, 0.5
nobs1 = len(x1)
ratio = len(x2) / nobs1  # size of group 2 relative to group 1

power = tt_ind_solve_power(
    effect_size=d,
    nobs1=nobs1,
    alpha=alpha,
    power=None,  # the quantity being solved for
    ratio=ratio,
    alternative='two-sided',
)
print(power)

In [None]:
# Two one-sided tests (TOST) for equivalence of the two group means.
# `ttost_ind` expects `low`/`upp` on the raw scale of the mean difference, while the
# planned equivalence interval is standardised (Cohen's d = +/-0.5). The bounds are
# therefore rescaled by the average standard deviation of the two groups, which is
# the standardiser matching the unequal-variance (Welch) test used here.
equivalence_d = 0.5
sd_avg = np.sqrt((x1.var(ddof=1) + x2.var(ddof=1)) / 2)
low_bound, upp_bound = -equivalence_d * sd_avg, equivalence_d * sd_avg

p, _low, _upp = ttost_ind(x1=x1, x2=x2, low=low_bound, upp=upp_bound, usevar='unequal')

if p < alpha:
    print("Ratings for British and American directors are equivalent")
else:
    # A non-significant TOST only means equivalence was not demonstrated;
    # it is not evidence that the two groups differ.
    print("Cannot conclude that ratings for British and American directors are equivalent")