# Simpson's Paradox
Use `admission_data.csv` for this exercise.

In [4]:
# Load and view first few lines of dataset
import numpy as np
import pandas as pd
# Read the admissions records into a DataFrame, then preview the first five rows.
df = pd.read_csv('admission_data.csv')
df.head(n=5)

Unnamed: 0,student_id,gender,major,admitted
0,35377,female,Chemistry,False
1,56105,male,Physics,True
2,31441,female,Chemistry,False
3,51765,male,Physics,True
4,53714,female,Physics,True


### Proportion and admission rate for each gender

In [None]:
# Share of all students whose gender is female:
# number of female rows divided by the total number of rows.
female_rows = df.loc[df['gender'] == 'female']
len(female_rows) / len(df)

In [None]:
# Proportion of students that are male.
# Computed directly from the data rather than as `1 - _`: the IPython `_`
# variable holds whatever cell output was produced last, so that shortcut gives
# a wrong answer (or fails) whenever cells are run out of order, and it also
# silently assumes every row is either female or male.
len(df[df['gender'] == 'male'])/df.shape[0]

In [None]:
# Admission rate for females: mean of the boolean `admitted` flag over female rows.
df.loc[df['gender'] == 'female', 'admitted'].mean()

In [None]:
# Admission rate for males: mean of the boolean `admitted` flag over male rows.
# Compared with this value, the admission rate for females appears to be lower.
is_male = df['gender'] == 'male'
df.loc[is_male, 'admitted'].mean()

### Proportion and admission rate for physics majors of each gender

In [None]:
# What proportion of female students are majoring in physics?

# Given that a student is female, what is the probability they major in physics?
# That is P(female and Physics) / P(female); both share the same denominator
# (the total number of students), so it reduces to a ratio of row counts.
#
# The ratio is taken as the mean of a boolean mask over the female rows.
# This replaces `.count()[0]`, which looked up a column-labelled Series by
# integer position (deprecated since pandas 2.1) and counted non-null values
# of the first column instead of counting rows.

female_majors = df.loc[df['gender'] == 'female', 'major']
(female_majors == 'Physics').mean()

In [None]:
# What proportion of male students are majoring in physics?

# P(Physics | male): among male rows only, the share whose major is Physics.
# A boolean mean replaces `.count()[0]`, whose positional integer lookup on a
# column-labelled Series is deprecated since pandas 2.1.
male_majors = df.loc[df['gender'] == 'male', 'major']
(male_majors == 'Physics').mean()  # a much larger share of males apply to physics

In [None]:
# Admission rate for female physics majors

# That is, what proportion of females who apply in physics are admitted.
# The female/Physics subset is selected once and reused. Rows are counted with
# len() instead of `.count()[0]`, whose positional integer lookup on a
# column-labelled Series is deprecated since pandas 2.1.
fem_phys_rows = df.query('gender == "female" and major == "Physics"')
fem_phys = len(fem_phys_rows)
fem_adm_phys = len(fem_phys_rows.query('admitted == True'))

fem_adm_phys/fem_phys

In [None]:
# Admission rate for male physics majors

# That is, what proportion of males who apply in physics are admitted.
# The male/Physics subset is selected once and reused. Rows are counted with
# len() instead of `.count()[0]`, whose positional integer lookup on a
# column-labelled Series is deprecated since pandas 2.1.
male_phys_rows = df.query('gender == "male" and major == "Physics"')
male_phys = len(male_phys_rows)
male_adm_phys = len(male_phys_rows.query('admitted == True'))

male_adm_phys/male_phys  # female admissions in physics are higher

### Proportion and admission rate for chemistry majors of each gender

In [None]:
# What proportion of female students are majoring in chemistry?
# P(Chemistry | female): among female rows only, the share whose major is
# Chemistry. A boolean mean replaces `.count()[0]`, whose positional integer
# lookup on a column-labelled Series is deprecated since pandas 2.1.
female_majors = df.loc[df['gender'] == 'female', 'major']
(female_majors == 'Chemistry').mean()

In [None]:
# What proportion of male students are majoring in chemistry?
# P(Chemistry | male): among male rows only, the share whose major is
# Chemistry. A boolean mean replaces `.count()[0]`, whose positional integer
# lookup on a column-labelled Series is deprecated since pandas 2.1.
male_majors = df.loc[df['gender'] == 'male', 'major']
(male_majors == 'Chemistry').mean()  # a much smaller share of males apply to chemistry

In [None]:
# Admission rate for female chemistry majors:
# of the females who applied to Chemistry, the fraction that was admitted.
# The female/Chemistry subset is selected once and reused. Rows are counted
# with len() instead of `.count()[0]`, whose positional integer lookup on a
# column-labelled Series is deprecated since pandas 2.1.
fem_chem_rows = df.query('gender == "female" and major == "Chemistry"')
fem_chem = len(fem_chem_rows)
fem_adm_chem = len(fem_chem_rows.query('admitted == True'))

fem_adm_chem/fem_chem

In [None]:
# Admission rate for male chemistry majors:
# of the males who applied to Chemistry, the fraction that was admitted.
# The male/Chemistry subset is selected once and reused. Rows are counted
# with len() instead of `.count()[0]`, whose positional integer lookup on a
# column-labelled Series is deprecated since pandas 2.1.
male_chem_rows = df.query('gender == "male" and major == "Chemistry"')
male_chem = len(male_chem_rows)
male_adm_chem = len(male_chem_rows.query('admitted == True'))

male_adm_chem/male_chem  # the male admission rate is lower in chemistry as well as in physics

### Admission rate for each major

In [None]:
# Admission rate for physics majors, regardless of gender:
# mean of the boolean `admitted` flag over all Physics rows.
is_physics = df['major'] == "Physics"
df.loc[is_physics, 'admitted'].mean()

In [None]:
# Admission rate for chemistry majors, regardless of gender:
# mean of the boolean `admitted` flag over all Chemistry rows.
is_chemistry = df['major'] == "Chemistry"
df.loc[is_chemistry, 'admitted'].mean()

Many more females applied to chemistry, which had a lower admission rate than physics. As a result, females had a lower overall admission rate, even though their admission rate was higher than that of males within both physics and chemistry. This reversal of a trend when the groups are combined is known as **Simpson's Paradox**.

## Alternative solution

Can we simplify the code above?

In [None]:
# Share of each major within each gender (every gender column sums to 1).
pd.crosstab(index=df['major'], columns=df['gender'], normalize='columns')

In [None]:
# Admission rate for every (major, gender) combination in a single table.
df.groupby(['major', 'gender'])[['admitted']].mean()

In [None]:
# Admission rate for each major, pooling both genders.
df.groupby('major')[['admitted']].mean()

# TableOne trial

In [30]:
from tableone import TableOne

  and should_run_async(code)


In [32]:
# Build a summary table over every column of df using TableOne's defaults,
# then display it.
# NOTE(review): the rendered table reports student_id as "mean (SD)"; an
# identifier is presumably not meaningful to summarise — consider excluding it.
mytable = TableOne(df)
mytable

  and should_run_async(code)


Unnamed: 0,Unnamed: 1,Missing,Overall
n,,,500
"student_id, mean (SD)",,0.0,41495.1 (10065.3)
"gender, n (%)",female,0.0,257 (51.4)
"gender, n (%)",male,,243 (48.6)
"major, n (%)",Chemistry,0.0,244 (48.8)
"major, n (%)",Physics,,256 (51.2)
"admitted, n (%)",False,0.0,308 (61.6)
"admitted, n (%)",True,,192 (38.4)


In [34]:
# Summarise gender, major and admission outcome, stratified by gender.
# This rebinds `mytable`, replacing the overall table built earlier.
gp = 'gender'
columns = [gp, 'major', 'admitted']

mytable = TableOne(df, columns=columns, groupby=gp)

  and should_run_async(code)


In [35]:
# Display the TableOne summary currently bound to `mytable`.
mytable

  and should_run_async(code)


Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by gender,Grouped by gender,Grouped by gender,Grouped by gender
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,female,male
n,,,500,257,243
"major, n (%)",Chemistry,0.0,244 (48.8),226 (87.9),18 (7.4)
"major, n (%)",Physics,,256 (51.2),31 (12.1),225 (92.6)
"admitted, n (%)",False,0.0,308 (61.6),183 (71.2),125 (51.4)
"admitted, n (%)",True,,192 (38.4),74 (28.8),118 (48.6)
