In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import norm, chi2_contingency

In [2]:
data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')

In [3]:
# number of callbacks for black-sounding names
sum(data[data.race=='b'].call)

157.0

In [4]:
sum(data[data.race=='w'].call)

235.0

In [18]:
pd.crosstab(data.call,data.race, margins=True)

race,b,w,All
call,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,2278,2200,4478
1.0,157,235,392
All,2435,2435,4870


# #Answer 1:
Since the variables we're interested in are not continuous, the CLT is not applicable here. That is, we wouldn't expect a normal distribution.   Since we're dealing with categorical variables, a chi-square test would be appropriate.  Note that if there was no association between race-names and call-backs, then we'd expect approximately 8% (392/4870) of people with black name(i.e. 196) and 8% of white people (i.e. 196) to get call-backs. The most appropriate test would be a chi-square test of association between race-names and call-backs.  Since there are no cells with counts < 5 this should be ok.

# #Answer 2:
The null hypothesis is that there is no statistical difference between the proportion of "black people names" that got a call-back and the proportion of "white people names" that got a call-back.  That is, the difference between observed and expected results are not statistically significant.
The altenative hypothesis is that there is a significant difference in the proportion of people with black names vs people with white names that get call-backs to job interviews.  That is there is a statistical significant difference between expected and observered proprotions.

# #Answer 3:
I've already created a function that calculates difference of proportion and calculates z-score. But I still need to find out proportions of white vs black names that got a call-back.

In [5]:
def ci_propDiff(ci,p1,p2,n1,n2):
    """
    INPUT:  ci=confidence interval requested
            p1=proportion 1
            p2=proportion 2
            n1=population of p1
            n2=population of p2
    OUTPUT: difference in proportions, upper limit, lower limit, and z-score
    """
    diff=100*(p1-p2)
    std=((p1*(1-p1)/n1)+(p2*(1-p2)/n2))**(0.5)
    ppool=((p1*n1)+(p2*n2))/(n1+n2)
#    std_pooled=(2*ppool*(1-ppool)/((n1+n2)/2))**(0.5)
    zscore=diff/std
    if ci==95:
        zcrit=1.96
    elif ci==99:
        zcrit=2.626
    else:
        print("Error")
    limit=zcrit*(std)
    upper=diff+limit
    lower=diff-limit
    return(diff,lower, upper,zscore)

In [6]:
df = data.groupby(['race','call'], as_index=False)['call'].agg(['count'])
df=df.reset_index()

In [7]:
df1 = data.groupby(['race'], as_index=False)['call'].agg(['count'])
df1=df1.reset_index()

In [8]:
df_working = df.set_index('race').join(df1.set_index('race'),lsuffix='_n', rsuffix='_tot')
df_working['Percent']=100*df_working['count_n']/df_working['count_tot']
df_working

Unnamed: 0_level_0,call,count_n,count_tot,Percent
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
b,0.0,2278,2435,93.552361
b,1.0,157,2435,6.447639
w,0.0,2200,2435,90.349076
w,1.0,235,2435,9.650924


In [9]:
d,l,u,sc=ci_propDiff(95,.06448,.096509,2435,2435)

In [10]:
u

-3.1876444409025373

In [11]:
print("The mean difference was:{:.2f} with a 95%CI of ({:.2f},{:.2f})".format(d,l,u))

The mean difference was:-3.20 with a 95%CI of (-3.22,-3.19)


In [12]:
sc

-411.50140482521584

# I can calculate the area under the CDF up to the z-critical score to determine the p-value. I will then multiple by 2 to get the other half to get a two-tailed p-value:

In [13]:
p_value = 2*(norm.sf(abs(sc)))

In [14]:
p_value

0.0

In [15]:
print ("The p-value was found to be <0.0001 and is statistically significant.".format(p_value))

The p-value was found to be <0.0001 and is statistically significant.
