In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

import statsmodels.api as sm

In [3]:
# Load the hsb2.csv data; the first CSV column becomes the index.
df = pd.read_csv('../data/hsb2.csv', index_col=0)

# Binary target: 1 when the writing score is at least 60, otherwise 0.
# A single vectorized comparison replaces the per-row loop, which was slow and
# would raise on a duplicated index label (df.loc[key, 'write'] is a Series
# there, so the `if` test becomes ambiguous).
df['honcomp'] = (df['write'] >= 60).astype('int64')

df

Unnamed: 0_level_0,female,race,ses,schtyp,prog,read,write,math,science,socst,honcomp
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
70,0,4,1,1,1,57,52,41,47,57,0
121,1,4,2,1,3,68,59,53,63,61,0
86,0,4,3,1,1,44,33,54,58,31,0
141,0,4,3,1,3,63,44,47,53,56,0
172,0,4,2,1,2,47,52,57,53,61,0
...,...,...,...,...,...,...,...,...,...,...,...
31,1,2,2,2,1,55,59,52,42,56,0
145,1,4,2,1,3,42,46,38,36,46,0
187,1,4,2,2,1,57,41,57,55,52,0
118,1,4,2,1,1,55,62,58,58,61,1


In [4]:
# Predictors: the 'female', 'read' and 'science' columns, selected in one step.
independent_variables = df[['female', 'read', 'science']]
x = independent_variables
y = df['honcomp'].values

# C is the inverse regularization strength, so a very large value leaves the
# fit essentially unpenalized.
lr = LogisticRegression(C=1e8).fit(x, y)

# In-sample class predictions (same rows the model was fitted on).
y_pred = lr.predict(x)

print('Coefficients = ', lr.coef_)

Coefficients =  [[1.48250682 0.10353644 0.09478972]]


In [5]:
# Predicted class labels from lr.predict(x), i.e. on the rows used for fitting.
y_pred

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1])

In [6]:
# Per-row class probabilities, one column per class in lr.classes_ order;
# with labels 0 and 1 the second column is the probability of honcomp == 1.
lr.predict_proba(x)

array([[0.91837766, 0.08162234],
       [0.15218863, 0.84781137],
       [0.93841608, 0.06158392],
       [0.77391461, 0.22608539],
       [0.94720664, 0.05279336],
       [0.90463708, 0.09536292],
       [0.92933524, 0.07066476],
       [0.9961666 , 0.0038334 ],
       [0.68061514, 0.31938486],
       [0.89436617, 0.10563383],
       [0.82363302, 0.17636698],
       [0.71174419, 0.28825581],
       [0.36282276, 0.63717724],
       [0.87791126, 0.12208874],
       [0.99440114, 0.00559886],
       [0.97561698, 0.02438302],
       [0.95974761, 0.04025239],
       [0.79864047, 0.20135953],
       [0.62791608, 0.37208392],
       [0.88683784, 0.11316216],
       [0.49957061, 0.50042939],
       [0.36113164, 0.63886836],
       [0.91582621, 0.08417379],
       [0.68629195, 0.31370805],
       [0.99477755, 0.00522245],
       [0.9961666 , 0.0038334 ],
       [0.56590385, 0.43409615],
       [0.91782709, 0.08217291],
       [0.98927927, 0.01072073],
       [0.89842723, 0.10157277],
       [0.

In [7]:
# Fitted intercept (constant term) of the logistic regression.
print('Intercept = ', lr.intercept_)

Intercept =  [-12.77719968]


In [8]:
# Mean accuracy on the rows used for fitting (in-sample, not a held-out score).
train_accuracy = lr.score(x, y)
print('Accuracy {}'.format(train_accuracy))

Accuracy 0.81


In [None]:
# Rows are true labels and columns are predicted labels, both ordered [0, 1].
confusion = metrics.confusion_matrix(y, y_pred, labels=[0, 1])
print('Confusion = \n{}'.format(confusion))

In [None]:
# Text summary of precision, recall and F1 per class for the in-sample predictions.
report = classification_report(y, y_pred)
print(report)

In [None]:
# Fit the same predictors/target with statsmodels to obtain a full summary table.
x = independent_variables
y = df['honcomp']

# sm.Logit does not add an intercept by itself, so prepend a constant column.
x2 = sm.add_constant(x)

logit_model = sm.Logit(y, x2)
result = logit_model.fit()
result.summary()

In [None]:
# Score every row with the predicted probability of the positive class so the
# AUC in the legend and the plotted ROC curve come from the same continuous
# scores. Passing hard 0/1 predictions to roc_auc_score (as before) yields the
# area of a single-threshold curve, which is not the area under the line drawn.
y_score = lr.predict_proba(x)[:, 1]
logit_roc_auc = roc_auc_score(y, y_score)
fpr, tpr, thresholds = roc_curve(y, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])  # small headroom so the curve is not clipped at TPR = 1
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()