In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import scipy.stats

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

## Loading Data and Checking for Null Values

In [None]:
# Read the Netflix Originals CSV into a DataFrame and preview the first rows.
netflix_csv_path = "/kaggle/input/netflix-original-films-imdb-scores/NetflixOriginals.csv"
nf = pd.read_csv(netflix_csv_path)
nf.head()

In [None]:
# Print column dtypes and non-null counts so missing values can be spotted.
nf.info()

In [None]:
# Histogram of IMDB scores with a kernel-density overlay.
score_hist_ax = sns.histplot(data=nf, x="IMDB Score", kde=True)
score_hist_ax.set_title('IMDB Score Distribution')

In [None]:
# Number of distinct genre labels, followed by the labels themselves.
genre_column = nf["Genre"]
print(genre_column.nunique())
genre_column.unique()

In [None]:
# Count the distinct genre labels that do not contain a '/' separator.
no_slash_in_genre = ~nf["Genre"].str.contains('/')
nf.loc[no_slash_in_genre, "Genre"].nunique()

In [None]:
# Number of distinct language labels, followed by the labels themselves.
language_column = nf["Language"]
print(language_column.nunique())
language_column.unique()

In [None]:
# Count the distinct language labels that do not contain a '/' separator.
no_slash_in_language = ~nf["Language"].str.contains('/')
nf.loc[no_slash_in_language, "Language"].nunique()

## Data Pre-Processing

In [None]:
def scoreing_clf(score):
    """Bucket a numeric IMDB score into a categorical score range.

    Bands:
        "Low"    -- score < 5
        "Middle" -- 5 <= score <= 7
        "High"   -- score > 7

    Parameters
    ----------
    score : float
        IMDB score of a single film.

    Returns
    -------
    str or None
        The band label, or None when the score is not comparable (e.g. NaN).
    """
    if score < 5:
        return "Low"
    # The previous condition was `score <= 5 and score <= 7`, which only
    # matched exactly 5 and left every score in (5, 7] unlabelled (None).
    if 5 <= score <= 7:
        return "Middle"
    if score > 7:
        return "High"
    # NaN fails every comparison above; leave such rows unlabelled.
    return None

# Label every film with the score band returned by scoreing_clf.
imdb_scores = nf["IMDB Score"]
nf["scoreRange"] = imdb_scores.map(scoreing_clf)

In [None]:
# Columns used by the genre analyses.
genre_score = nf[["Genre", "IMDB Score", "scoreRange"]]

# Number of non-null IMDB scores per genre, largest group first.
genre_count = (
    genre_score.groupby("Genre")["IMDB Score"]
    .count()
    .rename("count")
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
# Columns used by the language analyses.
language_score = nf[["Language", "IMDB Score", "scoreRange"]]

# Number of non-null IMDB scores per language, largest group first.
language_count = (
    language_score.groupby("Language")["IMDB Score"]
    .count()
    .rename("count")
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
# Columns used by the runtime analysis.
runtime_score = nf[["Runtime", "IMDB Score", "scoreRange"]]

## Question #1:  Is there an Association between Genre and IMDB Score Range?
**Test:** Chi-square test.           
**Null Hypothesis $H_0$:** The IMDB Score Range is independent of Genre.      
**Alternative Hypothesis $H_1$:** The IMDB Score Range is associated with Genre.    
**Use α = 0.05**

In [None]:
# Contingency table: one row per genre, one column per score band.
cont_table = pd.crosstab(
    index=genre_score["Genre"],
    columns=genre_score["scoreRange"],
)
cont_table

In [None]:
# Chi-square test of independence on the contingency table built above.
# NOTE(review): the chi-square approximation relies on reasonably large
# expected counts; inspect `expected_values` to confirm it holds here.
chi2_result = scipy.stats.chi2_contingency(cont_table, correction=True)
X2, p_value, degree_freedom, expected_values = chi2_result
print("X2: {0}, P-Value: {1}".format(X2,p_value))

**Conclusion:** Since the p-value is less than 0.05, we will reject the null hypothesis as there is significant evidence that IMDB Score Range is associated with genre.

## Question #2: Does IMDB Score for Films Differ by Genre?
**Test:** ANOVA   
**Test Statistic:** F-statistic comparing the mean IMDB scores of Genres containing more than 20 films.           
**Null Hypothesis $H_0$:** The Genres' mean scores are equal.     
**Alternative Hypothesis $H_1$:**  At least one of the Genres' mean scores differs.    
**Use α = 0.05**

In [None]:
# Show the genres whose film count exceeds 20; these are the ANOVA groups.
genre_count.query("count > 20")

In [None]:
# Test for equality of variance
scipy.stats.levene(genre_score.query("Genre=='Documentary'")["IMDB Score"],
                   genre_score.query("Genre=='Drama'")["IMDB Score"],
                   genre_score.query("Genre=='Comedy'")["IMDB Score"],
                   genre_score.query("Genre=='Romantic comedy'")["IMDB Score"],
                   genre_score.query("Genre=='Thriller'")["IMDB Score"],
                   center='mean')

Since the p-value in the LeveneResult is greater than 0.05, we fail to reject the hypothesis of equal variances, so the equal-variance assumption of ANOVA is reasonable.

In [None]:
# Run a one-way ANOVA
# One-way ANOVA on IMDB score across the selected genres.
genre_filters = [
    "Genre=='Documentary'",
    "Genre=='Drama'",
    "Genre=='Comedy'",
    "Genre=='Romantic comedy'",
    "Genre=='Thriller'",
]
# One Series of IMDB scores per genre, in the order listed above.
genre_samples = [genre_score.query(expr)["IMDB Score"] for expr in genre_filters]
f_statistic, p_value = scipy.stats.f_oneway(*genre_samples)
print("F_Statistic: {0}, P-Value: {1}".format(f_statistic,p_value))

**Conclusion:** Since the p-value is less than 0.05, we will reject the null hypothesis as there is significant evidence that at least one of the means differs.

## Question #3:  Is there an Association between Languages and IMDB Score Range?
**Test:** Chi-square test.           
**Null Hypothesis $H_0$:** The IMDB Score Range is independent of Language.      
**Alternative Hypothesis $H_1$:** The IMDB Score Range is associated with Language.    
**Use α = 0.05**

In [None]:
# Contingency table: one row per language, one column per score band.
cont_table = pd.crosstab(
    index=language_score["Language"],
    columns=language_score["scoreRange"],
)
cont_table

In [None]:
# Chi-square test of independence on the contingency table built above.
# NOTE(review): the chi-square approximation relies on reasonably large
# expected counts; inspect `expected_values` to confirm it holds here.
chi2_result = scipy.stats.chi2_contingency(cont_table, correction=True)
X2, p_value, degree_freedom, expected_values = chi2_result
print("X2: {0}, P-Value: {1}".format(X2,p_value))

**Conclusion:** Since the p-value is less than 0.05, we will reject the null hypothesis as there is significant evidence that IMDB Score Range is associated with language.

## Question #4: Does IMDB Score for Non-English Films Differ by Language?
**Test:** ANOVA   
**Test Statistic:** F-statistic comparing the mean IMDB scores of non-English Languages containing more than 10 films.           
**Null Hypothesis $H_0$:** The Languages' mean scores are equal.     
**Alternative Hypothesis $H_1$:**  At least one of the Languages' mean scores differs.    
**Use α = 0.05**

In [None]:
# Show the non-English languages whose film count exceeds 10; these are the ANOVA groups.
language_count.query("count > 10 & Language!='English'")

In [None]:
# Test for equality of variance
scipy.stats.levene(language_score.query("Language=='Hindi'")["IMDB Score"],
                   language_score.query("Language=='Spanish'")["IMDB Score"],
                   language_score.query("Language=='French'")["IMDB Score"],
                   language_score.query("Language=='Italian'")["IMDB Score"],
                   language_score.query("Language=='Portuguese'")["IMDB Score"],
                   center='mean')

Since the p-value in the LeveneResult is greater than 0.05, we fail to reject the hypothesis of equal variances, so the equal-variance assumption of ANOVA is reasonable.

In [None]:
# Run a one-way ANOVA
# One-way ANOVA on IMDB score across the selected languages.
language_filters = [
    "Language=='Hindi'",
    "Language=='Spanish'",
    "Language=='French'",
    "Language=='Italian'",
    "Language=='Portuguese'",
]
# One Series of IMDB scores per language, in the order listed above.
language_samples = [language_score.query(expr)["IMDB Score"] for expr in language_filters]
f_statistic, p_value = scipy.stats.f_oneway(*language_samples)
print("F_Statistic: {0}, P-Value: {1}".format(f_statistic,p_value))

**Conclusion:** Since the p-value is greater than 0.05, we will fail to reject the null hypothesis as there is no significant evidence that at least one of the means differs.

## Question #5: Is IMDB Score for Films correlated with Runtime?
**Test:** Pearson correlation test.      
**Null Hypothesis $H_0$:** IMDB score for films is not correlated with runtime.    
**Alternative Hypothesis $H_1$:**  IMDB score for films is correlated with runtime.    
**Use α = 0.05**

In [None]:
# Scatter plot of IMDB score against runtime, to inspect the relationship visually before testing.
ax = sns.scatterplot(x="Runtime", y="IMDB Score", data=runtime_score)

In [None]:
# Pearson correlation between film runtime and IMDB score.
runtime_values = runtime_score['Runtime']
imdb_values = runtime_score['IMDB Score']
r, p_value = scipy.stats.pearsonr(runtime_values, imdb_values)
print("Pearson’s correlation coefficient: {0}, P-Value: {1}".format(r,p_value))

**Conclusion:** Since the two-tailed p-value is greater than 0.05, we fail to reject the null hypothesis: there is no significant evidence of a linear correlation between runtime and IMDB score. 