In [44]:
import pandas as pd
import numpy as np
from scipy import stats

admissions = pd.read_csv('admissions.csv')
admissions = admissions.dropna()
admissions.prestige = admissions.prestige.astype('int8')
admissions

Unnamed: 0,admit,gre,gpa,prestige
0,0,380.0,3.61,3
1,1,660.0,3.67,3
2,1,800.0,4.00,1
3,1,640.0,3.19,4
4,0,520.0,2.93,4
...,...,...,...,...
395,0,620.0,4.00,2
396,0,560.0,3.04,3
397,0,460.0,2.63,2
398,0,700.0,3.65,2


In [5]:
x = pd.crosstab(admissions['admit'], admissions['prestige'])
x

prestige,1.0,2.0,3.0,4.0
admit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,28,97,93,55
1,33,53,28,12


$H_{0}: \text{Prestige and admission are independent} $  
$H_{1}: \text{Prestige and admission are not independent} $

In [25]:
# Chi squared independence testing
# Chi2 independence testing works best with a categorical variable and a T/F variable 
# (need to confirm if it could be 2 categorical variables)

# This is different to ANOVA because that test uses a categorical feature and a continuous feature

chi2 = stats.chi2_contingency(x)
chi2

(25.07674731940608,
 1.4880370582972673e-05,
 3,
 array([[ 41.73684211, 102.63157895,  82.78947368,  45.84210526],
        [ 19.26315789,  47.36842105,  38.21052632,  21.15789474]]))

In [20]:
# This is what we would expect if the data was truly independent
chi2[3]

array([[ 41.73684211, 102.63157895,  82.78947368,  45.84210526],
       [ 19.26315789,  47.36842105,  38.21052632,  21.15789474]])

In [50]:
# The p-value is the probability that you would've obtained at least this data result or more extreme if the null hypothesis were true.
# For example, here we get an extremely low value, so there is a very low chance we would have got this data if the null hypthesis were true.
# This p-value is extremely low, so we reject the null hypothesis and accept the alternative hypothesis - they are not independent.
chi2[1]

1.4880370582972673e-05

In [23]:
# This test can be extremely useful in things like marketing campaigns or other A-B tests
# for an email campaign
email_campaign = pd.DataFrame({
    'email': [90,850],
    'no email': [50, 1300]
})

In [24]:
stats.chi2_contingency(email_campaign)

(32.26018157821708,
 1.3484861525617584e-08,
 1,
 array([[  57.46724891,   82.53275109],
        [ 882.53275109, 1267.46724891]]))

## Back to Logistic regression

In [37]:
data = pd.concat([admissions, pd.get_dummies(admissions['prestige'], drop_first=True, prefix='prestige')], axis=1).drop('prestige', axis=1)
data

Unnamed: 0,admit,gre,gpa,prestige_2,prestige_3,prestige_4
0,0,380.0,3.61,0,1,0
1,1,660.0,3.67,0,1,0
2,1,800.0,4.00,0,0,0
3,1,640.0,3.19,0,0,1
4,0,520.0,2.93,0,0,1
...,...,...,...,...,...,...
395,0,620.0,4.00,1,0,0
396,0,560.0,3.04,0,1,0
397,0,460.0,2.63,1,0,0
398,0,700.0,3.65,1,0,0


In [38]:
data['intercept'] = 1

In [40]:
import statsmodels.api as sm

fit_lr = sm.GLM(data['admit'], data.drop(columns='admit'), family=sm.families.Binomial()).fit()

In [42]:
fit_lr.summary()

0,1,2,3
Dep. Variable:,admit,No. Observations:,397.0
Model:,GLM,Df Residuals:,391.0
Model Family:,Binomial,Df Model:,5.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-227.82
Date:,"Thu, 01 Apr 2021",Deviance:,455.64
Time:,20:40:53,Pearson chi2:,394.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
gre,0.0022,0.001,2.028,0.043,7.44e-05,0.004
gpa,0.7793,0.333,2.344,0.019,0.128,1.431
prestige_2,-0.6801,0.317,-2.146,0.032,-1.301,-0.059
prestige_3,-1.3387,0.345,-3.882,0.000,-2.015,-0.663
prestige_4,-1.5534,0.417,-3.721,0.000,-2.372,-0.735
intercept,-3.8769,1.142,-3.393,0.001,-6.116,-1.638


In [48]:
# way to intepret this is that:
print('GPA is between (95%CI)')
print((np.exp(0.128)-1) * 100)
print((np.exp(1.431)-1) * 100)

GPA is between (95%CI)
13.655300269706029
318.2879981161389


In [54]:
y, X = data['admit'], data.drop(columns='admit')

In [59]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state=42)

In [64]:
classifier = linear_model.LogisticRegression(max_iter=1000)

In [65]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [69]:
predicted = classifier.predict(X_test)

evaluate = pd.DataFrame({
    'actual': y_test,
    'prediction': predicted
})

evaluate

Unnamed: 0,actual,prediction
114,0,0
281,0,0
240,0,0
57,0,0
72,0,0
...,...,...
286,1,0
378,0,0
256,1,0
334,1,0


In [70]:
from sklearn import metrics
metrics.accuracy_score(y_test, predicted)

0.5916666666666667