In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
from sklearn.cluster import DBSCAN

In [2]:
df = pd.read_csv(r'movieReplicationSet-2.csv')
shrek = df['Shrek (2001)'] 
gender = df['Gender identity (1 = female; 2 = male; 3 = self-described)']
frames = [shrek, gender]

In [3]:
df.shape

(1097, 477)

In [4]:
shrek

0       3.0
1       3.0
2       3.5
3       0.5
4       3.0
       ... 
1092    3.0
1093    4.0
1094    3.5
1095    2.5
1096    2.0
Name: Shrek (2001), Length: 1097, dtype: float64

In [5]:
gender

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
1092    1.0
1093    1.0
1094    1.0
1095    1.0
1096    1.0
Name: Gender identity (1 = female; 2 = male; 3 = self-described), Length: 1097, dtype: float64

In [6]:
combined = pd.concat(frames,axis=1)
combined

Unnamed: 0,Shrek (2001),Gender identity (1 = female; 2 = male; 3 = self-described)
0,3.0,1.0
1,3.0,1.0
2,3.5,1.0
3,0.5,1.0
4,3.0,1.0
...,...,...
1092,3.0,1.0
1093,4.0,1.0
1094,3.5,1.0
1095,2.5,1.0


In [7]:
combined = combined[~np.isnan(combined).any(axis=1)] # row-wise NA removal
combined

Unnamed: 0,Shrek (2001),Gender identity (1 = female; 2 = male; 3 = self-described)
0,3.0,1.0
1,3.0,1.0
2,3.5,1.0
3,0.5,1.0
4,3.0,1.0
...,...,...
1092,3.0,1.0
1093,4.0,1.0
1094,3.5,1.0
1095,2.5,1.0


In [8]:
combined['Gender identity (1 = female; 2 = male; 3 = self-described)'].value_counts()

1.0    743
2.0    241
3.0      6
Name: Gender identity (1 = female; 2 = male; 3 = self-described), dtype: int64

In [9]:
df['Gender identity (1 = female; 2 = male; 3 = self-described)'].value_counts()

1.0    807
2.0    260
3.0      6
Name: Gender identity (1 = female; 2 = male; 3 = self-described), dtype: int64

In [10]:
df.shape

(1097, 477)

In [11]:
female = combined[combined['Gender identity (1 = female; 2 = male; 3 = self-described)']==1]
female

Unnamed: 0,Shrek (2001),Gender identity (1 = female; 2 = male; 3 = self-described)
0,3.0,1.0
1,3.0,1.0
2,3.5,1.0
3,0.5,1.0
4,3.0,1.0
...,...,...
1092,3.0,1.0
1093,4.0,1.0
1094,3.5,1.0
1095,2.5,1.0


In [12]:
male = combined[combined['Gender identity (1 = female; 2 = male; 3 = self-described)']==2]
male

Unnamed: 0,Shrek (2001),Gender identity (1 = female; 2 = male; 3 = self-described)
9,4.0,2.0
13,3.0,2.0
15,3.5,2.0
19,2.0,2.0
22,3.0,2.0
...,...,...
1082,4.0,2.0
1084,1.0,2.0
1088,2.5,2.0
1089,3.0,2.0


In [13]:
female.mean()

Shrek (2001)                                                  3.155451
Gender identity (1 = female; 2 = male; 3 = self-described)    1.000000
dtype: float64

In [14]:
male.mean()

Shrek (2001)                                                  3.082988
Gender identity (1 = female; 2 = male; 3 = self-described)    2.000000
dtype: float64

In [15]:
# find significant diff p -val 
from scipy.stats import ttest_ind

ttest_ind(female, male)

Ttest_indResult(statistic=array([1.10166997,       -inf]), pvalue=array([0.27087512, 0.        ]))

In [16]:
stats.ttest_ind(female, male, equal_var = False) # this is the one we use since diff sample sizes

Ttest_indResult(statistic=array([1.15589072,       -inf]), pvalue=array([0.24834908, 0.        ]))