### 2024-05-16 Confusion Matrix

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [32]:
# Load the Titanic passenger table from the bz2-compressed CSV, then show its (rows, columns).
titanic = pd.read_csv("titanic.csv.bz2")
titanic.shape

(1309, 14)

In [33]:
titanic

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [34]:
titanic[["survived", "age", "sex", "pclass"]].head()

Unnamed: 0,survived,age,sex,pclass
0,1,29.0,female,1
1,1,0.9167,male,1
2,0,2.0,female,1
3,0,30.0,male,1
4,0,25.0,female,1


In [35]:
# Keep only passengers with all four modelling fields present.
required = ["survived", "sex", "age", "pclass"]
titanic = titanic.dropna(subset=required, how="any")
titanic.shape

(1046, 14)

In [36]:
# Response vector: the survival flag pulled out as a plain NumPy array.
y = titanic["survived"].to_numpy()
y

array([1, 1, 0, ..., 0, 0, 0])

In [37]:
# First attempt at a design matrix: age and pclass taken as raw numbers.
X = titanic[["age", "pclass"]].to_numpy()
X[:5]

array([[29.    ,  1.    ],
       [ 0.9167,  1.    ],
       [ 2.    ,  1.    ],
       [30.    ,  1.    ],
       [25.    ,  1.    ]])

In [38]:
### above, pclass is stored as the numbers 1, 2, 3, but it is a categorical variable, so it needs to be converted to dummy (indicator) columns

In [39]:
# Rebuild the response and the design matrix, this time expanding pclass into
# indicator columns; drop_first leaves out the first level as the baseline.
y = titanic["survived"].to_numpy()
X = pd.get_dummies(
    titanic[["age", "pclass"]],
    columns=["pclass"],
    drop_first=True,
    dtype=int,
).to_numpy()
X[:5]

array([[29.    ,  0.    ,  0.    ],
       [ 0.9167,  0.    ,  0.    ],
       [ 2.    ,  0.    ,  0.    ],
       [30.    ,  0.    ,  0.    ],
       [25.    ,  0.    ,  0.    ]])

In [40]:
from sklearn.linear_model import LogisticRegression

# fit() hands back the fitted estimator, so build and train in one expression.
m = LogisticRegression().fit(X, y)
m.coef_

array([[-0.03583153, -1.12853381, -2.11967558]])

In [41]:
# In-sample predictions: the model is scored on the same rows it was fitted on.
yhat = m.predict(X)
yhat[:10]    # predicted

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0])

In [42]:
y[:10]   # actual

array([1, 1, 0, 0, 0, 1, 1, 0, 1, 0])

In [43]:
# Cross-tabulate actual vs. predicted labels; naming the axes makes it explicit
# that rows are the true outcome and columns are the model's prediction.
pd.crosstab(y, yhat, rownames=["actual"], colnames=["predicted"])

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,518,101
1,221,206


In [44]:
from sklearn.metrics import confusion_matrix
# Same counts as the crosstab of (y, yhat): rows follow y (actual), columns follow yhat (predicted).
confusion_matrix(y, yhat)

array([[518, 101],
       [221, 206]])

In Class Exercise:

- Add sex to the model

- Compute CM

- Do you get more correct predictions than without sex?

- Extract TN, FN from the CM (get single numbers out)

In [47]:
# NOTE: only "pclass" is listed in `columns`, so "sex" passes through as raw
# strings and .values yields an object array (see the output below); the next
# cell encodes "sex" as well.
X = pd.get_dummies(titanic[["age", "pclass", "sex"]],
               	columns = ["pclass"],
              	drop_first = True, dtype = int).values
X[:5]

array([[29.0, 'female', 0, 0],
       [0.9167, 'male', 0, 0],
       [2.0, 'female', 0, 0],
       [30.0, 'male', 0, 0],
       [25.0, 'female', 0, 0]], dtype=object)

In [49]:
# Encode both categorical predictors. X is kept as a DataFrame here (no
# .values), so the indicator columns retain their names.
X = pd.get_dummies(
    titanic[["age", "pclass", "sex"]],
    columns=["pclass", "sex"],
    drop_first=True,
    dtype=int,
)
X.head()

Unnamed: 0,age,pclass_2,pclass_3,sex_male
0,29.0,0,0,0
1,0.9167,0,0,1
2,2.0,0,0,0
3,30.0,0,0,1
4,25.0,0,0,0


In [50]:
# Re-train the existing estimator on the wider feature set and show its weights.
m.fit(X, y).coef_

array([[-0.03219683, -1.14841466, -2.13088901, -2.41229273]])

In [51]:
# Predict on the training rows again and tabulate actual (rows) vs. predicted (columns).
yhat = m.predict(X)
cm = confusion_matrix(y_true=y, y_pred=yhat)
cm

array([[524,  95],
       [126, 301]])

Confusion Matrix Table
-------
| | predicted 0 | predicted 1 |
|---|---|---|
| **actual 0** | TN | FP |
| **actual 1** | FN | TP |

In [52]:
# The first column of cm holds the predicted-negative counts: TN on top, FN below.
tn, fn = cm[:, 0]
tn, fn

(524, 126)

In [53]:
cm

array([[524,  95],
       [126, 301]])

Calculate Accuracy

$\text{Accuracy} = \dfrac{TP + TN}{T}$, where $T = TP + TN + FP + FN$ is the total number of cases.

In [57]:
# Hand calculation with the four counts copied from cm above: (TN + TP) / total.
(524 + 301)/(524 + 95 + 126 + 301)   # method 1

0.7887189292543021

In [58]:
# Diagonal of cm (TN + TP) over the grand total.
np.trace(cm) / cm.sum()              # method 2

0.7887189292543021

In [60]:
# Bottom-right cell of cm is the true-positive count; tn comes from the earlier cell.
tp = cm[1, 1]
(tn + tp) / cm.sum()                 # method 3

0.7887189292543021

In [61]:
# Share of rows where the prediction matches the actual label.
(yhat == y).mean()                   # method 4

0.7887189292543021

In [62]:
from sklearn.metrics import accuracy_score

# Library helper; its result agrees with the four manual computations above.
accuracy_score(y, yhat)             # method 5

0.7887189292543021

Calculate Precision:

$\text{Precision} = \dfrac{TP}{TP + FP} = \dfrac{TP}{\hat{P}}$, where $\hat{P}$ is the number of predicted positives.

• sensitive to false positives

• good measure if false positives bad

• judges must be sure the defendant is guilty

• scientific discovery is not just a statistical blip

In [63]:
cm

array([[524,  95],
       [126, 301]])

In [64]:
# Precision = TP / (TP + FP): true positives over everything predicted positive
# (the second column of cm). Read from cm so the figure tracks the fitted model
# instead of relying on counts copied by hand from an earlier output.
cm[1, 1] / cm[:, 1].sum()

0.76010101010101

In [65]:
from sklearn.metrics import precision_score

# Library helper for the same ratio, TP / (TP + FP).
precision_score(y, yhat)

0.76010101010101

In [66]:
# Recall = TP / (TP + FN): true positives over all actual positives
# (the second row of cm). Read from cm so the figure tracks the fitted model
# instead of relying on counts copied by hand from an earlier output.
cm[1, 1] / cm[1, :].sum()

0.7049180327868853

In [67]:
from sklearn.metrics import recall_score

# Library helper for the same ratio, TP / (TP + FN).
recall_score(y, yhat)

0.7049180327868853

Calculate Recall:

$\text{Recall} = \dfrac{TP}{TP + FN} = \dfrac{TP}{P}$, where $P$ is the number of actual positives.

• sensitive to false negatives

• good measure if false negatives bad

• doctor must not miss a dangerous disease

• security must not miss a terrorist


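F Score: harmonic mean of *precision* and *recall*, $F_1 = \dfrac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}$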

In [68]:
from sklearn.metrics import f1_score

# F1 combines precision and recall into a single number (their harmonic mean).
f1_score(y, yhat)

0.7314702308626975