# STA130 LEC Week 12 (Nov 25)

## Ethics + Metrics = Methrics


1. Special Guest Dr. Steven Coyne

    1. BA Honours, University of Calgary (Philosophy)
    2. BSc, University of Calgary (Mathematics)
    3. MA, University of British Columbia (Philosophy)
    4. PhD, University of Toronto (Philosophy)


2. **Prediction**, **thresholding**, and different **Metrics**


In [None]:
import pandas as pd

# Location of the UCI "Adult" data file read below.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Column labels supplied explicitly via names=, so every row of the file is read as data.
column_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

# skipinitialspace=True drops the spaces that follow each delimiter.
data_raw = pd.read_csv(url, skipinitialspace=True, names=column_names)

# All later recoding happens on a copy, leaving data_raw exactly as loaded.
data_use = data_raw.copy()

# Optional narrowing of the feature set (currently disabled):
#data_use = data_use.drop(columns=['workclass', 'marital-status', 'occupation',
#                                  'capital-gain', 'capital-loss', 'hours-per-week',
#                                  'native-country', 'education-num', 'fnlwgt'])

# Show the first rows together with the (rows, columns) shape.
display(data_use.head(), data_use.shape)

In [None]:
# Class balance of the outcome variable
data_use["income"].value_counts()

In [None]:
# Frequency of each education level before any recoding
data_use["education"].value_counts()

In [None]:
# Merge the three lowest schooling levels into a single "<=6th" category
data_use.loc[
    data_use["education"].isin(["Preschool", "1st-4th", "5th-6th"]),
    "education",
] = "<=6th"
# Re-tabulate to confirm the merge
data_use["education"].value_counts()

In [None]:
# Frequency of each workclass level before any recoding
data_use["workclass"].value_counts()

In [None]:
# Fold the 'Without-pay' and 'Never-worked' levels into the "?" label
data_use.loc[
    data_use["workclass"].isin(["Without-pay", "Never-worked"]),
    "workclass",
] = "?"
# Re-tabulate to confirm the merge
data_use["workclass"].value_counts()

In [None]:
# Frequency of each occupation level before any recoding
data_use["occupation"].value_counts()

In [None]:
# Fold the 'Armed-Forces' level into the "?" label
data_use.loc[data_use["occupation"] == "Armed-Forces", "occupation"] = "?"
# Re-tabulate to confirm the merge
data_use["occupation"].value_counts()


In [None]:
#data_use['workclass-occupation'] = data_use.workclass + " " + data_use.occupation
#data_use['workclass-occupation'].value_counts()
#for i,k in zip(data_use['workclass-occupation'].value_counts().index,data_use['workclass-occupation'].value_counts().values):
#    print(i, k)

In [None]:
from sklearn import model_selection
import numpy as np

# Seed NumPy's global RNG immediately before splitting: train_test_split is
# called without random_state, so it draws from that global state. Keep these
# two statements together and in this order to get a reproducible split.
np.random.seed(130)
# 80% of the rows go to `train`; the remainder form `test`.
train, test = model_selection.train_test_split(data_use, train_size=0.8)

In [None]:
# Group sizes in the training split, by sex
train.sex.value_counts()

In [None]:
# Group sizes in the training split, by marital status
train.loc[:, "marital-status"].value_counts()

In [None]:
# Group sizes in the training split, by relationship
train["relationship"].value_counts()

In [None]:
# Group sizes in the training split, by race
train["race"].value_counts()

In [None]:
import statsmodels.formula.api as smf
# Logistic regression for the 0/1 indicator of income == '>50K', fit on `train`.
# Reading the formula:
#   - `*` expands to main effects plus all interactions, so scaled age, its
#     square, scaled education-num, race and sex are fully crossed.
#   - Q("...") quotes column names that contain a hyphen.
#   - C(..., Treatment(reference=...)) dummy-codes a categorical against the
#     named baseline level; C(workclass) and C(occupation) use the default
#     baseline.
#   - education, marital-status, relationship, workclass and occupation enter
#     additively (`+`), without interactions.
formula = '''
I((income=='>50K').astype(int)) ~ scale(age) * I(scale(age)**2) 
                                * scale(Q("education-num")) 
                                * C(race, Treatment(reference='White'))
                                * C(sex, Treatment(reference='Male')) 
                                + C(education, Treatment(reference='HS-grad'))
                                + C(Q("marital-status"), Treatment(reference='Married-civ-spouse')) 
                                + C(relationship, Treatment(reference='Husband'))
                                + C(workclass) + C(occupation)
'''
# Build the model specification, then estimate it; the summary table is the
# cell's displayed output.
logreg = smf.logit(formula, data=train)
logreg_fit = logreg.fit()
logreg_fit.summary()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Call a test row '>50K' when its fitted probability exceeds 0.5, then
# tabulate those calls against the observed labels.
cm_disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=(test["income"] == '>50K'),
        y_pred=(logreg_fit.predict(test) > 0.5),
        labels=[False, True],
    ),
    display_labels=['<=50K', '>50K'],
)
_ = cm_disp.plot()

#### Accuracy
Accuracy measures the proportion of true results (both true positives and true negatives) in the population.
$$\text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN}$$

#### Specificity (True Negative Rate)
 Specificity measures the proportion of actual negatives that are correctly identified.
$$\text{Specificity} = \frac{TN}{TN + FP}$$

#### Sensitivity (True Positive Rate)
Sensitivity measures the proportion of actual positives that are correctly identified.
$$\text{Sensitivity} = \frac{TP}{TP + FN}$$

#### Precision (Positive Predictive Value)
Precision measures the proportion of positive identifications that were actually correct.
$$\text{Precision} = \frac{TP}{TP + FP}$$

> - **Negative Predictive Value** is the "negative" version of **precision** $\frac{TN}{TN + FN}$
> - **False negative rates (FNR)** are defined to be the proportion of actually positive cases which are incorrectly identified (as false negatives) $FNR = FN/(FN+TP) = 1-TPR$
> - **False positive rates (FPR)** are defined to be the proportion of actually negative cases which are incorrectly identified (as false positives) $FPR = FP/(FP+TN) = 1-TNR$
 


In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Sensitivity is recall on the positive class (pos_label=True);
# specificity is recall on the negative class (pos_label=False).

# Threshold the fitted probabilities at 0.5 once per split and reuse them.
_scored_splits = [
    (split_name, frame.income == '>50K', logreg_fit.predict(frame) > 0.5)
    for split_name, frame in (
        ("In sample (training)", train),
        ("Out of sample (testing)", test),
    )
]

_metric_table = (
    ("sensitivity", lambda truth, pred: recall_score(truth, pred, pos_label=True)),
    ("specificity", lambda truth, pred: recall_score(truth, pred, pos_label=False)),
    ("precision", lambda truth, pred: precision_score(truth, pred)),
)

# Metric-major order: training then testing for each metric in turn.
for metric_name, metric_fn in _metric_table:
    for split_name, truth, pred in _scored_splits:
        print(f"{split_name} {metric_name}", metric_fn(truth, pred))

In [None]:
# One-hot encode every predictor (all columns except the last, `income`)
# for the training split; numeric columns pass through and are cast to float.
X_train = pd.get_dummies(train.iloc[:, :-1]).astype(float)

# Encode the test split independently, then align it to the training columns:
#   - dummy columns seen only in training are added and filled with 0;
#   - dummy columns seen only in testing are dropped, because the fitted
#     models have no corresponding feature and would reject the extra column.
# Reindexing does this in one step and keeps X_train's column order.
X_test_tmp = pd.get_dummies(test.iloc[:, :-1])
X_test = X_test_tmp.reindex(columns=X_train.columns, fill_value=0).astype(float)

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

# Single classification tree; the leaf-size and split-size minimums keep it
# from growing all the way to the max_depth cap on tiny groups.
clf = DecisionTreeClassifier(
    random_state=42,
    max_depth=30,
    min_samples_split=100,
    min_samples_leaf=30,
)
# Target is the 0/1 indicator of the last column being '>50K'
clf.fit(X=X_train, y=train.iloc[:, -1].eq('>50K').astype(int))

# Draw the fitted tree at high resolution
plt.figure(figsize=(10, 5), dpi=200)
plot_tree(
    clf,
    feature_names=X_train.columns.tolist(),
    class_names=['<=50k', '>50k'],
    filled=True,
    rounded=True,
)
plt.show()

In [None]:
# Workaround for plotly figures not rendering in Jupyter; see
# https://stackoverflow.com/questions/52771328/plotly-chart-not-showing-in-jupyter-notebook
import plotly.offline as pyo

# Initialise offline notebook rendering (connected=False is the default)
pyo.init_notebook_mode(connected=False)

In [None]:
# Test-set confusion matrix for the decision tree's hard class predictions
cm_disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=(test["income"] == '>50K'),
        y_pred=(clf.predict(X_test) == 1.0),
        labels=[False, True],
    ),
    display_labels=['<=50K', '>50K'],
)
_ = cm_disp.plot()

In [None]:
import plotly.express as px

# Pair each encoded feature with the tree's importance score, largest first.
# reset_index() keeps the pre-sort position as an extra 'index' column.
feature_importance_df = (
    pd.DataFrame(
        data={
            'Feature': X_train.columns.tolist(),
            'Importance': clf.feature_importances_,
        }
    )
    .sort_values('Importance', ascending=False)
    .reset_index()
)

# Bar chart of the 20 highest-ranked features
fig = px.bar(
    feature_importance_df.head(20),
    x='Importance',
    y='Feature',
    title='Feature Importance',
)
fig.show()

In [None]:
from sklearn.inspection import PartialDependenceDisplay
# Partial dependence of the tree's prediction on education-num.
# The feature is selected by column name rather than by position (it was
# hard-coded as index 2), so the plot cannot silently switch to another
# feature if the column order of X_train changes.
_ = PartialDependenceDisplay.from_estimator(clf, X_train, ['education-num'])


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 1000 trees. max_depth is not set, so tree growth is limited only
# by the min_samples_leaf / min_samples_split thresholds below.
rfc = RandomForestClassifier(
    n_estimators=1000,
    min_samples_split=30,
    min_samples_leaf=10,
    random_state=1,
)
# Target is the 0/1 indicator of the last column being '>50K'
rfc.fit(X=X_train, y=train.iloc[:, -1].eq('>50K').astype(int))

In [None]:
# Pair each encoded feature with the forest's importance score, largest first.
# reset_index() keeps the pre-sort position as an extra 'index' column.
feature_importance_df = (
    pd.DataFrame(
        data={
            'Feature': X_train.columns.tolist(),
            'Importance': rfc.feature_importances_,
        }
    )
    .sort_values('Importance', ascending=False)
    .reset_index()
)

# Tall bar chart so that the 60 highest-ranked features stay legible
fig = px.bar(
    feature_importance_df.head(60),
    x='Importance',
    y='Feature',
    title='Feature Importance',
    width=800,
    height=1200,
)
fig.show()

In [None]:
# Partial dependence of the forest's prediction on education-num, selected by
# column name rather than by the hard-coded position 2 so the plot does not
# depend on the column order of X_train.
_ = PartialDependenceDisplay.from_estimator(rfc, X_train, ['education-num'])


In [None]:
# Test-set confusion matrix for the forest's hard class predictions
cm_disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=(test["income"] == '>50K'),
        y_pred=(rfc.predict(X_test) == 1.0),
        labels=[False, True],
    ),
    display_labels=['<=50K', '>50K'],
)
_ = cm_disp.plot()

In [None]:
# Same forest, but thresholding the predicted probability of '>50K' at 0.5
cm_disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=(test["income"] == '>50K'),
        y_pred=(rfc.predict_proba(X_test)[:, 1] > 0.5),
        labels=[False, True],
    ),
    display_labels=['<=50K', '>50K'],
)
_ = cm_disp.plot()

In [None]:
# Stricter cut-off: predict '>50K' only when the probability exceeds 0.8
cm_disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=(test["income"] == '>50K'),
        y_pred=(rfc.predict_proba(X_test)[:, 1] > 0.8),
        labels=[False, True],
    ),
    display_labels=['<=50K', '>50K'],
)
_ = cm_disp.plot()

In [None]:
# Looser cut-off: predict '>50K' whenever the probability exceeds 0.2
cm_disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(
        y_true=(test["income"] == '>50K'),
        y_pred=(rfc.predict_proba(X_test)[:, 1] > 0.2),
        labels=[False, True],
    ),
    display_labels=['<=50K', '>50K'],
)
_ = cm_disp.plot()

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Observed 0/1 labels and the forest's predicted probability of '>50K',
# computed once and shared by the curve and the AUC.
_roc_truth = (test.income == '>50K').astype(int)
_roc_score = rfc.predict_proba(X_test)[:, 1]

# fpr is 1-specificity and tpr is sensitivity, one pair per threshold
fpr, tpr, thresholds = roc_curve(_roc_truth, _roc_score)
# Area under that curve
auc_score = roc_auc_score(_roc_truth, _roc_score)

roc_df = pd.DataFrame(
    {
        'False Positive Rate': fpr,
        'True Positive Rate': tpr,
        'Threshold': thresholds,
    }
)

fig = px.area(
    roc_df,
    x='False Positive Rate',
    y='True Positive Rate',
    title=f"ROC Curve (AUC = {auc_score:.2f})",
    labels={
        'False Positive Rate': 'One minus Specificity',
        'True Positive Rate': 'Sensitivity',
    },
    hover_data={'Threshold': ':.3f'},
)

# Dashed diagonal from (0, 0) to (1, 1): the reference line for a random model
fig.add_shape(
    type='line',
    x0=0, y0=0, x1=1, y1=1,
    line=dict(dash='dash', color='gray'),
)

# Centre the title
fig.update_layout(title_x=0.5)
fig.show()

In [None]:
# Fitted probabilities for the test rows (evaluated, but not the cell's last expression)
logreg_fit.predict(test)
# Observed 0/1 labels for the same rows; as the last expression this is what the cell shows
test["income"].eq('>50K').astype(int)

In [None]:
# Observed 0/1 labels, shared by every model below
_cmp_truth = (test.income == '>50K').astype(int)

# Each model's predicted probability of '>50K' on the test set, computed once
_cmp_rfc_score = rfc.predict_proba(X_test)[:, 1]
_cmp_clf_score = clf.predict_proba(X_test)[:, 1]
_cmp_logreg_score = logreg_fit.predict(test)

# ROC points and AUC: random forest
fpr_rfc, tpr_rfc, thresholds_rfc = roc_curve(_cmp_truth, _cmp_rfc_score)
auc_rfc = roc_auc_score(_cmp_truth, _cmp_rfc_score)

# ROC points and AUC: single decision tree
fpr_clf, tpr_clf, thresholds_clf = roc_curve(_cmp_truth, _cmp_clf_score)
auc_clf = roc_auc_score(_cmp_truth, _cmp_clf_score)

# ROC points and AUC: logistic regression
fpr_logreg, tpr_logreg, thresholds_logreg = roc_curve(_cmp_truth, _cmp_logreg_score)
auc_logreg = roc_auc_score(_cmp_truth, _cmp_logreg_score)

# Legend entries carry the AUC rounded to two decimals
_cmp_label_rfc = 'RFC (AUC=' + str(round(auc_rfc, 2)) + ')'
_cmp_label_clf = 'CLF (AUC=' + str(round(auc_clf, 2)) + ')'
_cmp_label_logreg = 'logreg (AUC=' + str(round(auc_logreg, 2)) + ')'

# Stack the three curves into one long table, tagged by model
roc_data = pd.DataFrame(
    {
        'False Positive Rate': [*fpr_rfc, *fpr_clf, *fpr_logreg],
        'True Positive Rate': [*tpr_rfc, *tpr_clf, *tpr_logreg],
        'Model': (
            [_cmp_label_rfc] * len(fpr_rfc)
            + [_cmp_label_clf] * len(fpr_clf)
            + [_cmp_label_logreg] * len(fpr_logreg)
        ),
    }
)

# One line per model
fig = px.line(
    roc_data,
    x='False Positive Rate',
    y='True Positive Rate',
    color='Model',
    title="ROC Curve Comparison",
    labels={
        'False Positive Rate': '1 - Specificity',
        'True Positive Rate': 'Sensitivity',
    },
    hover_data={'Model': True},
)

# Dashed diagonal from (0, 0) to (1, 1): the reference line for a random model
fig.add_shape(
    type='line',
    x0=0, y0=0, x1=1, y1=1,
    line=dict(dash='dash', color='gray'),
)

# Centre the title
fig.update_layout(title_x=0.5)
fig.show()
