#### Logistic Regression

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the study-hours dataset (columns: Hours, Pass) straight from GitHub.
url = 'https://raw.githubusercontent.com/prasertcbs/basic-dataset/master/study_hours.csv'
df = pd.read_csv(url)
# Transposed view: the 20 rows become columns, so the whole frame fits in two wide lines.
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Hours,0.5,0.75,1.0,1.25,1.5,1.75,1.75,2.0,2.25,2.5,2.75,3.0,3.25,3.5,4.0,4.25,4.5,4.75,5.0,5.5
Pass,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0


#### sklearn: LogisticRegression

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [5]:
# Features stay 2-D (a one-column DataFrame); the target is the 1-D Pass column.
X = df[['Hours']]
y = df['Pass']

# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=7
)

In [7]:
# Double brackets select a one-column DataFrame (2-D) rather than a Series; this is what is used as X.
df[['Hours']]

Unnamed: 0,Hours
0,0.5
1,0.75
2,1.0
3,1.25
4,1.5
5,1.75
6,1.75
7,2.0
8,2.25
9,2.5


In [6]:
# Training features; the row labels are the original df indices, now in shuffled order.
X_train

Unnamed: 0,Hours
18,5.0
6,1.75
13,3.5
19,5.5
10,2.75
14,4.0
8,2.25
16,4.5
9,2.5
12,3.25


In [8]:
# Held-out features: 6 of the 20 rows (test_size=.3).
X_test

Unnamed: 0,Hours
1,0.75
17,4.75
2,1.0
5,1.75
11,3.0
0,0.5


In [9]:
# Create the classifier with scikit-learn's default settings (no hyperparameters overridden).
model = LogisticRegression()
# Bare expression: echo the estimator's repr as the cell output.
model

In [10]:
# Fit on the training split only; the test split is kept aside for evaluation.
model.fit(X_train, y_train)

In [13]:
# Mean accuracy on the training data itself.
model.score(X_train, y_train)

0.7142857142857143

In [15]:
# Fitted parameters of the linear log-odds: intercept_ + coef_ * Hours.
model.intercept_, model.coef_

(array([-2.10044528]), array([[0.92289082]]))

In [16]:
# Hard class labels (0 or 1) for the held-out rows.
predicted = model.predict(X_test)
predicted

array([0, 1, 0, 0, 1, 0], dtype=int64)

In [17]:
# True labels for the same test rows, for comparison with `predicted`.
y_test.values

array([0, 1, 0, 0, 0, 0], dtype=int64)

In [18]:
# Mean accuracy on the held-out test rows.
model.score(X_test, y_test)

0.8333333333333334

In [19]:
# Per-row class probabilities: column 0 is P(Pass=0), column 1 is P(Pass=1); each row sums to 1.
model.predict_proba(X_test)

array([[0.80349407, 0.19650593],
       [0.09251669, 0.90748331],
       [0.76450781, 0.23549219],
       [0.61901897, 0.38098103],
       [0.33889392, 0.66110608],
       [0.8373988 , 0.1626012 ]])

In [20]:
import math

In [21]:
def sp(intercept, coef, x):
    """Return the logistic (sigmoid) of the linear score ``intercept + x*coef``.

    Parameters
    ----------
    intercept : scalar or array-like
        Bias term of the linear model.
    coef : scalar or array-like
        Slope applied to ``x``.
    x : scalar or array-like
        Feature value(s).

    Returns
    -------
    Value(s) in (0, 1); the result follows NumPy broadcasting of the inputs.
    """
    linear_score = intercept + x*coef
    return 1/(1 + np.exp(-linear_score))

In [25]:
# Hand-computed P(Pass=1) for the first test row; the result is 2-D because coef_ is 2-D.
sp(model.intercept_, model.coef_,.75) # assume study hours = .75

array([[0.19650593]])

In [26]:
sp(model.intercept_, model.coef_,4.75) # assume study hours = 4.75

array([[0.90748331]])

In [27]:
from sklearn import metrics

In [28]:
# Test-set study hours as a flat NumPy array, in the same row order as the predictions.
X_test['Hours'].values

array([0.75, 4.75, 1.  , 1.75, 3.  , 0.5 ])

In [29]:
# True labels for the test rows (same order as X_test).
y_test.values

array([0, 1, 0, 0, 0, 0], dtype=int64)

In [30]:
# Predicted labels for the test rows (same call as earlier in the notebook; the result is unchanged).
predicted = model.predict(X_test)
predicted

array([0, 1, 0, 0, 1, 0], dtype=int64)

#### sklearn: confusion matrix

In [31]:
# Rows are true classes, columns are predicted classes (both ordered 0, 1).
metrics.confusion_matrix(y_test, predicted)

array([[4, 1],
       [0, 1]], dtype=int64)

In [32]:
# Flatten the 2x2 matrix row by row into (tn, fp, fn, tp).
tn, fp, fn, tp = metrics.confusion_matrix(y_test, predicted).ravel()

In [33]:
# Show each confusion-matrix cell count next to its label.
print(f'tn = {tn}')
print(f'fp = {fp}')
print(f'fn = {fn}')
print(f'tp = {tp}')

tn = 4
fp = 1
fn = 0
tp = 1


In [34]:
# Test accuracy from the estimator's score(); equals (tp+tn)/(tp+tn+fp+fn) = 5/6.
model.score(X_test, y_test)

0.8333333333333334

Accuracy = (tp+tn)/(tp+tn+fp+fn)
Precision = tp/(tp+fp)
Recall = tp/(tp+fn)
F1 = 2*(Precision*Recall)/(Precision+Recall)

In [37]:
# Accuracy = (tp + tn) / total = (1 + 4) / 6.
metrics.accuracy_score(y_test, predicted)

0.8333333333333334

In [38]:
# Precision for class 1 = tp / (tp + fp) = 1 / 2.
metrics.precision_score(y_test, predicted)

0.5

In [39]:
# Recall for class 1 = tp / (tp + fn) = 1 / 1.
metrics.recall_score(y_test, predicted)

1.0

In [41]:
# F1 = 2 * precision * recall / (precision + recall) = 2 * 0.5 * 1.0 / 1.5.
metrics.f1_score(y_test, predicted)

0.6666666666666666

In [42]:
# Per-class precision / recall / F1 plus support (count of true samples per class).
# The report is a multi-line string, so print() is used to render its line breaks.
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       1.00      0.80      0.89         5
           1       0.50      1.00      0.67         1

    accuracy                           0.83         6
   macro avg       0.75      0.90      0.78         6
weighted avg       0.92      0.83      0.85         6

