# Simpson's Paradox
Use `admission_data.csv` for this exercise.

In [11]:
# Load and view first few lines of dataset
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

my_df = pd.read_csv('admission_data.csv')
my_df.head()

Unnamed: 0,student_id,gender,major,admitted
0,35377,female,Chemistry,False
1,56105,male,Physics,True
2,31441,female,Chemistry,False
3,51765,male,Physics,True
4,53714,female,Physics,True


In [2]:
my_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
student_id    500 non-null int64
gender        500 non-null object
major         500 non-null object
admitted      500 non-null bool
dtypes: bool(1), int64(1), object(2)
memory usage: 12.3+ KB


###  Proportion and admission rate for each gender

In [7]:
gender_count = my_df['gender'].value_counts()
gender_total = my_df['gender'].count()
gender_count


female    257
male      243
Name: gender, dtype: int64

In [8]:
gender_total

500

In [9]:
# Proportion of students that are female
print("Proportion of females is {}%".format(100*gender_count[0]/gender_total))
print("Proportion of females is {}%".format(gender_count[0]/gender_total))




Proportion of females is 51.4%
Proportion of females is 0.514%


In [10]:
# Proportion of students that are male

print("Proportion of males is {}%".format(100*gender_count[1]/gender_total))
print("Proportion of males is {}%".format(gender_count[1]/gender_total))

Proportion of males is 48.6%
Proportion of males is 0.486%


In [13]:
#To view statistics based on gender, we group by gender variable
# Next we use value_counts() function to see the numbers
my_df.groupby('gender')['admitted'].value_counts()

gender  admitted
female  False       183
        True         74
male    False       125
        True        118
Name: admitted, dtype: int64

In [15]:
# count of females
# https://chrisalbon.com/python/pandas_selecting_rows_on_conditions.html
female_df = my_df['gender'] == "female"
female_total = sum(female_df)
print("Number of female students are {}".format(female_total))


Number of female students are 257


In [16]:
# Admission rate for females
female_admin_rate = 100* 74/female_total
print("Female admission rate is {}%".format(round(female_admin_rate),2))



Female admission rate is 29.0%


In [18]:
#count of males
male_df = my_df['gender'] == "male"
male_total = sum(male_df)
print("Number of male students are {}".format(male_total))

Number of male students are 243


In [19]:
# Admission rate for males
male_admin_rate = 100* 118/male_total
print("Male admission rate is {}%".format(round(male_admin_rate,2)))


Male admission rate is 48.56%


### Proportion and admission rate for physics majors of each gender

In [20]:
# group by major and gender to get counts of students 
major_gender_counts = my_df.groupby(['major', 'gender']).describe()
major_gender_counts.info()



<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4 entries, (Chemistry, female) to (Physics, male)
Data columns (total 8 columns):
(student_id, count)    4 non-null float64
(student_id, mean)     4 non-null float64
(student_id, std)      4 non-null float64
(student_id, min)      4 non-null float64
(student_id, 25%)      4 non-null float64
(student_id, 50%)      4 non-null float64
(student_id, 75%)      4 non-null float64
(student_id, max)      4 non-null float64
dtypes: float64(8)
memory usage: 405.0+ bytes


In [23]:
# What proportion of female students are majoring in physics?
# First, get count of females in Physics major 
fem_phy = len(my_df.loc[(my_df['gender'] == 'female') & (my_df['major'] == 'Physics')])
print("Number of females with Physics major is {}".format(fem_phy))

#now evaluate proportion of female candidates in Phsics
print("Proportion of female candidates in Physics is {}%".format(round((fem_phy*100/female_total),2)))

Number of females with Physics major is 31
Proportion of female candidates in Physics is 12.06%


In [24]:
# What proportion of male students are majoring in physics?
# Frst,get count of males in Physics major 
m_phy = len(my_df.loc[(my_df['gender'] == 'male') & (my_df['major'] == 'Physics')])
print("Number of males with Physics major is {}".format(m_phy))

#now evaluate proportion of male candidates in Physics
print("Proportion of male candidates in Physics is {}%".format(round((m_phy*100/male_total),2)))



Number of males with Physics major is 225
Proportion of male candidates in Physics is 92.59%


In [26]:
# Admission rate for female physics majors
# compute count of females in Physics who got admitted
fem_phy_admit = len(my_df.loc[(my_df['gender'] == 'female') 
                           & (my_df['major'] == 'Physics') 
                           & (my_df['admitted'] == True)])

#now evaluate proportion of admitted female candidates in Phsics
print("Proportion of female candidates admitted in Physics is {}%".format(round((fem_phy_admit*100/fem_phy),2)))



Proportion of female candidates admitted in Physics is 74.19%


In [27]:
# Admission rate for male physics majors
# compute count of females in Physics who got admitted
m_phy_admit = len(my_df.loc[(my_df['gender'] == 'male') 
                           & (my_df['major'] == 'Physics') 
                           & (my_df['admitted'] == True)])

#now evaluate proportion of admitted male candidates in Phsics
print("Proportion of male candidates admitted in Physics is {}%".format(round((m_phy_admit*100/m_phy),2)))



Proportion of male candidates admitted in Physics is 51.56%


### Proportion and admission rate for chemistry majors of each gender

In [28]:
# What proportion of female students are majoring in chemistry?
# First, get count of females in Chemistry major 
fem_chem = len(my_df.loc[(my_df['gender'] == 'female') & (my_df['major'] == 'Chemistry')])
print("Number of females with Chemistry major is {}".format(fem_chem))

#now evaluate proportion of female candidates in Chemistry
print("Proportion of female candidates in Chemistry is {}%".format(round((fem_chem*100/female_total),2)))


Number of females with Chemistry major is 226
Proportion of female candidates in Chemistry is 87.94%


In [29]:
# What proportion of male students are majoring in chemistry?
# First, get count of females in Chemistry major 
m_chem = len(my_df.loc[(my_df['gender'] == 'male') & (my_df['major'] == 'Chemistry')])
print("Number of males with Chemistry major is {}".format(m_chem))

#now evaluate proportion of male candidates in Chemistry
print("Proportion of male candidates in Chemistry is {}%".format(round((m_chem*100/male_total),2)))

Number of males with Chemistry major is 18
Proportion of male candidates in Chemistry is 7.41%


In [30]:
# Admission rate for female chemistry majors
# compute count of females in chemistry who got admitted
fem_chem_admit = len(my_df.loc[(my_df['gender'] == 'female') 
                           & (my_df['major'] == 'Chemistry') 
                           & (my_df['admitted'] == True)])

#now evaluate proportion of admitted female candidates in chemistry
print("Proportion of female candidates admitted in chemistry is {}%".format(round((fem_chem_admit*100/fem_chem),2)))


Proportion of female candidates admitted in chemistry is 22.57%


In [31]:
# Admission rate for male chemistry majors
# compute count of males in chemistry who got admitted
m_chem_admit = len(my_df.loc[(my_df['gender'] == 'male') 
                           & (my_df['major'] == 'Chemistry') 
                           & (my_df['admitted'] == True)])

#now evaluate proportion of admitted male candidates in chemistry
print("Proportion of male candidates admitted in chemistry is {}%".format(round((m_chem_admit*100/m_chem),2)))


Proportion of male candidates admitted in chemistry is 11.11%


### Admission rate for each major

In [32]:
# Admission rate for physics majors
# compute count of students in Physics who got admitted
phy_admit = len(my_df.loc[(my_df['major'] == 'Physics') 
                           & (my_df['admitted'] == True)])

#now evaluate proportion of admitted candidates in Phsics
print("Proportion of candidates admitted in Physics is {}%".format(round((phy_admit*100/(fem_phy+m_phy)),2)))

Proportion of candidates admitted in Physics is 54.3%


In [33]:
# Admission rate for chemistry majors
# compute count of students in Chemistry who got admitted
chem_admit = len(my_df.loc[(my_df['major'] == 'Chemistry') 
                           & (my_df['admitted'] == True)])

#now evaluate proportion of admitted candidates in Chemistry
print("Proportion of candidates admitted in Chemistry is {}%".format(round((chem_admit*100/(fem_chem+m_chem)),2)))


Proportion of candidates admitted in Chemistry is 21.72%
