In [175]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import norm, chi2_contingency

In [4]:
# Load the Stata dataset into a DataFrame via the top-level pandas reader.
data = pd.read_stata('data/us_job_market_discrimination.dta')

In [12]:
# number of callbacks for black-sounding names
# (vectorized Series.sum on the masked column instead of the Python builtin,
#  which iterates the Series one element at a time)
data.loc[data['race'] == 'b', 'call'].sum()

157.0

In [192]:
# number of callbacks for white-sounding names
# (vectorized Series.sum on the masked column instead of the Python builtin,
#  which iterates the Series one element at a time)
data.loc[data['race'] == 'w', 'call'].sum()

235.0

In [193]:
# Inspect the available column names.
data.columns

Index(['id', 'ad', 'education', 'ofjobs', 'yearsexp', 'honors', 'volunteer',
       'military', 'empholes', 'occupspecific', 'occupbroad', 'workinschool',
       'email', 'computerskills', 'specialskills', 'firstname', 'sex', 'race',
       'h', 'l', 'call', 'city', 'kind', 'adid', 'fracblack', 'fracwhite',
       'lmedhhinc', 'fracdropout', 'fraccolp', 'linc', 'col', 'expminreq',
       'schoolreq', 'eoe', 'parent_sales', 'parent_emp', 'branch_sales',
       'branch_emp', 'fed', 'fracblack_empzip', 'fracwhite_empzip',
       'lmedhhinc_empzip', 'fracdropout_empzip', 'fraccolp_empzip',
       'linc_empzip', 'manager', 'supervisor', 'secretary', 'offsupport',
       'salesrep', 'retailsales', 'req', 'expreq', 'comreq', 'educreq',
       'compreq', 'orgreq', 'manuf', 'transcom', 'bankreal', 'trade',
       'busservice', 'othservice', 'missind', 'ownership'],
      dtype='object')

# #Answer 1:
The most appropriate test is a two-sample z-test for a difference in proportions.  Since the sample is relatively large (i.e., n = 4,870) and the question specifies random assignment of names, the CLT is applicable here.

# #Answer 2:
The null hypothesis is that there is no difference between the proportion of "black people names" that got a call-back and the proportion of "white people names" that got a call-back.
The alternative hypothesis is that there is a difference in the proportion of people with black names vs. people with white names that get call-backs to job interviews.

# #Answer 3:
I've already created a function that calculates difference of proportion and calculates z-score. But I still need to find out proportions of white vs black names that got a call-back.

In [194]:
def ci_propDiff(ci,p1,p2,n1,n2):
    """
    Confidence interval and z-score for the difference of two proportions.

    INPUT:  ci=confidence level requested, in percent (e.g. 95 or 99);
               must satisfy 0 < ci < 100
            p1=proportion 1
            p2=proportion 2
            n1=sample size behind p1
            n2=sample size behind p2
    OUTPUT: tuple (difference in proportions p1-p2, lower limit,
            upper limit, z-score), in that order
    RAISES: ValueError if ci is not strictly between 0 and 100

    Both the interval and the z-score use the unpooled standard error
    sqrt(p1*(1-p1)/n1 + p2*(1-p2)/n2).
    """
    if not 0 < ci < 100:
        raise ValueError(
            "ci must be a confidence level in percent, strictly between 0 and 100; got {!r}".format(ci)
        )

    diff = p1 - p2
    std = ((p1 * (1 - p1) / n1) + (p2 * (1 - p2) / n2)) ** 0.5
    zscore = diff / std

    # Conventional rounded two-sided critical values. 95 keeps the 1.96 used
    # so far, so existing results reproduce exactly; 99 is 2.576 (it was
    # previously mistyped as 2.626, which widened every 99% interval).
    conventional_zcrit = {95: 1.96, 99: 2.576}
    zcrit = conventional_zcrit.get(ci)
    if zcrit is None:
        # Any other level: two-sided critical value from the standard normal,
        # i.e. the quantile at 1 - alpha/2 = 0.5 + ci/200.
        zcrit = norm.ppf(0.5 + ci / 200.0)

    limit = zcrit * std
    upper = diff + limit
    lower = diff - limit
    return (diff, lower, upper, zscore)

In [195]:
# Number of rows for every (race, call) combination, as a flat frame with
# columns race, call, count. size() states the row count directly, so there is
# no as_index=False that then has to be undone with reset_index.
df = (
    data.groupby(['race', 'call'])
        .size()
        .reset_index(name='count')
)

In [196]:
# Number of non-null call values per race, as a flat frame with columns
# race, count (the per-race denominator for the percentages).
df1 = data.groupby('race')['call'].count().reset_index(name='count')

In [198]:
# Line up each (race, call) count with its race total and express it as a
# percentage of that total.
df_working = (
    df.set_index('race')
      .join(df1.set_index('race'), lsuffix='_n', rsuffix='_tot')
      .assign(Percent=lambda t: 100 * t['count_n'] / t['count_tot'])
)
df_working

Unnamed: 0_level_0,call,count_n,count_tot,Percent
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
b,0.0,2278,2435,93.552361
b,1.0,157,2435,6.447639
w,0.0,2200,2435,90.349076
w,1.0,235,2435,9.650924


In [212]:
# Call-back proportions and group sizes transcribed from the df_working table
# (6.447639% of 2435 'b' rows and 9.650924% of 2435 'w' rows).
p_black, p_white = .06448, .096509
n_black = n_white = 2435
d, l, u, sc = ci_propDiff(95, p_black, p_white, n_black, n_white)

In [217]:
# Upper limit of the confidence interval unpacked from ci_propDiff.
u

-0.01677344090253731

In [215]:
# Report the difference in proportions and its 95% confidence limits,
# each rounded to two decimals.
ci_summary = "The mean difference was:{:.2f} with a 95%CI of ({:.2f},{:.2f})"
print(ci_summary.format(d, l, u))

The mean difference was:-0.03 with a 95%CI of (-0.05,-0.02)


In [199]:
# z-score unpacked from ci_propDiff.
sc

-4.115014048252158

# I can calculate the area under the standard normal curve in the tail beyond the absolute value of the z-score to get a one-tailed p-value. I will then multiply by 2 to include the other tail and get a two-tailed p-value:

In [209]:
# norm.sf gives the upper-tail area beyond |sc|; doubling it covers both tails.
upper_tail = norm.sf(abs(sc))
p_value = 2 * upper_tail

In [210]:
# Display the two-tailed p-value.
p_value

3.87155834572941e-05

In [219]:
# Build the message from the computed p-value. The previous .format(p_value)
# was applied to a string with no placeholders, so the text was hard-coded and
# would have claimed significance whatever the data showed.
p_text = "<0.0001" if p_value < 0.0001 else "{:.4f}".format(p_value)
# 0.05 matches the 95% confidence level used for the interval.
verdict = "is statistically significant" if p_value < 0.05 else "is not statistically significant"
print("The p-value was found to be {} and {}.".format(p_text, verdict))

The p-value was found to be <0.0001 and is statistically significant.
