# **CH 7 MODEL EVALUATION**

In [None]:
# Mount Google Drive (Colab only) so the dataset paths under /content/gdrive resolve.
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
import pandas as pd
import numpy as np 

In [None]:
# Load the tab-delimited clothing training and test sets from Drive.
# NOTE(review): neither frame is referenced again in the visible notebook — confirm they are still needed.
clothing_train = pd.read_csv("/content/gdrive/My Drive/Python Practice/Datasets/clothing_data_driven_training", delimiter='\t')
clothing_test = pd.read_csv("/content/gdrive/My Drive/Python Practice/Datasets/clothing_data_driven_test", delimiter='\t')

#clothing_train
#clothing_test

### **HANDS-ON ANALYSIS**

## 23 
*   Using the training set, create a C5.0 model (Model 1) to predict a customer's 'Income' from 'Marital Status' and 'Capital Gains and Losses'
*   Obtain Predicted Responses




In [None]:
import statsmodels.tools.tools as stattools
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Load the adult training and test sets and pull out the 'Income' target.
adult_tr = pd.read_csv("/content/gdrive/My Drive/Python Practice/Datasets/adult_ch6_training")
adult_test = pd.read_csv("/content/gdrive/My Drive/Python Practice/Datasets/adult_ch6_test")
# Double brackets keep the targets as one-column DataFrames; y_test is
# reassigned as a Series in a later cell.
y = adult_tr[['Income']]
y_test = adult_test[['Income']]

In [None]:
# One-hot encode 'Marital status' for the training set.
# statsmodels removed tools.categorical in 0.13, so pandas does the encoding;
# get_dummies orders the dummy columns by sorted category value.
mar_np = np.array(adult_tr['Marital status'])
mar_cat = pd.get_dummies(mar_np)
# Column position -> category label.
mar_cat_dict = dict(enumerate(mar_cat.columns))
# Plain float indicator matrix, one column per category.
mar_cat = mar_cat.to_numpy(dtype=float)

In [None]:
# Design matrix for Model 1: capital gains/losses plus the marital-status dummies.
mar_cat_pd = pd.DataFrame(mar_cat)
X = pd.concat((adult_tr[['Cap_Gains_Losses']], mar_cat_pd), axis=1)

X_names = ['Cap_Gains_Losses', 'Divorced', 'Married', 'Never-married', 'Separated', 'Widowed']
y_names = ['<=50K', '>50K']

# Give every column a string name: the dummy frame is labelled 0..4, and
# scikit-learn rejects inputs whose column labels mix str and int. This also
# fails loudly if the number of marital-status levels differs from X_names.
X.columns = X_names

# Entropy-based tree capped at 5 leaves; random_state fixes tie-breaking so
# a Restart & Run All reproduces the same tree.
c50_01 = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=5, random_state=0).fit(X,y)

In [None]:
# Write the fitted tree to a Graphviz .dot file on Drive, labelled with the
# feature and class names defined alongside the model.
export_graphviz(c50_01, out_file = '/content/gdrive/My Drive/Python Practice/Datasets/c50_01.dot', 
                feature_names=X_names, class_names=y_names)

In [None]:
# Predicted income class for every training record (shown as the cell output).
c50_01.predict(X)

## 24
*   Evaluate Model 1 using Test set
*   Construct Contingency table



In [None]:
# One-hot encode the test set with the SAME dummy columns as the training set.
# Encoding the two sets independently shifts or drops columns whenever a
# marital-status level is absent from one of them. statsmodels removed
# tools.categorical in 0.13, so pandas does the encoding.
mar_np_test = np.array(adult_test['Marital status'])
mar_cat_test = pd.get_dummies(mar_np_test).reindex(
    columns=pd.get_dummies(adult_tr['Marital status']).columns, fill_value=0)
# Column position -> category label.
mar_cat_dict_test = dict(enumerate(mar_cat_test.columns))
mar_cat_test = mar_cat_test.to_numpy(dtype=float)

mar_cat_pd_test = pd.DataFrame(mar_cat_test)
X_test = pd.concat((adult_test[['Cap_Gains_Losses']], mar_cat_pd_test), axis=1)
# Label the test columns exactly like the training matrix the model was fit on.
X_test.columns = X.columns

In [None]:
# Predicted income class for every test record (shown as the cell output).
c50_01.predict(X_test)

In [None]:
# Re-bind the test target as a 1-D Series (an earlier cell stored it as a
# one-column DataFrame).
y_test = adult_test['Income']

# Model 1 predictions on the test set.
pred_test = c50_01.predict(X_test)

In [None]:
# Contingency table for Model 1 with ACTUAL classes on the rows and PREDICTED
# classes on the columns, without margins. That is the layout the eval() helper
# indexes into (row sums = actual totals, column sums = predicted totals).
# The previous prediction-on-rows table with margins=True swapped
# sensitivity/precision and inflated the grand total.
crosstab_model1 = pd.crosstab(y_test, pred_test, rownames=["actual"], colnames=["prediction"])

# Display-only copy with row/column totals.
pd.crosstab(y_test, pred_test, rownames=["actual"], colnames=["prediction"], margins=True)

### 25
*   Calculate all evaluation measures


In [None]:
def eval(cross,lv):
    """Compute evaluation measures from a 2x2 contingency table.

    Note: this helper shadows the builtin ``eval``; the name is kept because
    other cells call it.

    Parameters
    ----------
    cross : pandas.DataFrame
        Contingency table with ACTUAL classes on the rows and PREDICTED
        classes on the columns. Rows/columns other than the two class labels
        (for example crosstab margins) are ignored, and a class that never
        occurs is counted as 0.
    lv : sequence of two labels
        ``lv[0]`` is the negative class label, ``lv[1]`` the positive one.

    Returns
    -------
    list
        ``[accuracy, error rate, sensitivity, specificity, precision]``, each
        rounded to 2 decimals. A ratio with a zero denominator is ``nan``.
    """
    neg, pos = lv[0], lv[1]
    # Select cells by label, not by position, and force a full 2x2 layout.
    counts = cross.reindex(index=[neg, pos], columns=[neg, pos], fill_value=0)

    TN = counts.loc[neg, neg]
    FP = counts.loc[neg, pos]
    FN = counts.loc[pos, neg]
    TP = counts.loc[pos, pos]
    GT = TN + FP + FN + TP

    def ratio(num, den):
        # Guard against empty classes instead of dividing by zero.
        return round(num / den, 2) if den else float('nan')

    acc = ratio(TN + TP, GT)        # Accuracy
    err = round(1 - acc, 2)         # Error rate (from the rounded accuracy, so the two sum to 1)
    sen = ratio(TP, TP + FN)        # Sensitivity (= Recall)
    spe = ratio(TN, TN + FP)        # Specificity
    pre = ratio(TP, TP + FP)        # Precision

    return [acc, err, sen, spe, pre]

def fscore(precision,recall, df):
    """Return the F-beta score for the given precision and recall.

    Parameters
    ----------
    precision, recall : float
        Precision and recall (recall = sensitivity, TP / actual positives).
    df : float
        The beta weight: 1 gives F1, 2 gives F2, 0.5 gives F0.5.

    Returns
    -------
    float
        ``(beta^2 + 1) * P * R / (beta^2 * P + R)``, or ``0.0`` when the
        denominator is zero (precision and recall both 0).
    """
    beta_sq = df**2
    denom = beta_sq*precision + recall
    if denom == 0:
        # The score is undefined here; report 0 instead of dividing by zero.
        return 0.0
    return ((beta_sq + 1)*precision*recall) / denom

# Unpack Model 1's measures in eval()'s return order. rec1 receives the fourth
# value, which eval computes as TN / actual negatives (specificity).
acc1, err1, sen1, rec1, pre1 = eval(crosstab_model1, y_names)

In [None]:
# F-scores combine precision with recall, and recall is sensitivity
# (TP / actual positives). rec1 holds specificity, so it must not be used here.
f1_model1 = fscore(pre1, sen1, 1)
f2_model1 = fscore(pre1, sen1, 2)
f05_model1 = fscore(pre1, sen1, 0.5)

In [None]:
# Summary table of Model 1's measures. rec1 is specificity (TN / actual
# negatives); recall is another name for sensitivity, so the row labels say so.
eval_dict = {'Eval Measures' : ['Accuracy', 'Error rate', 'Sensitivity (Recall)', 'Specificity', 'Precision', 'F1', 'F2', 'F0.5'],
             'Model 1 : Eval Values' : [acc1, err1, sen1, rec1, pre1, f1_model1, f2_model1, f05_model1]} 

model_eval = pd.DataFrame(eval_dict)
model_eval

### 26
* Interpret evaluation measures

### 27
* Create Cost matrix, 3x, FP is 4 times as bad as FN

In [None]:
#cost_mat = np.array(([0,4],[1,0]))
#cost_mat

In [None]:
# Per-record cost matrix: one row per training record, four cost columns.
# The cost-evaluation helper later multiplies columns 0..3 by FP, FN, TP and
# TN respectively, so each row [4, 1, 0, 0] prices a false positive at 4 and a
# false negative at 1.
n = len(adult_tr)
cost_list = n * [4, 1, 0, 0]
cost_mat = np.asarray(cost_list).reshape(n, 4)
cost_mat

### 28
* Build C5.0 model (Model 2) using training set, 3x cost matrix

In [None]:
#!pip install costcla

In [None]:
 import costcla.models as co

# Encode the training target as an integer array (<=50K -> 0, >50K -> 1).
# Recoding in place inside the original string array leaves an object-dtype
# array, which is what the classifier would then receive.
y_train = (np.array(y).reshape(n,) == y_names[1]).astype(int)

# Plain arrays for the training and test design matrices.
X_array = np.array(X)
X_test_array = np.array(X_test)

# Create the cost-sensitive tree (Model 2), fit it on the training set with
# the per-record cost matrix, and show its training-set predictions.
DT = co.CostSensitiveDecisionTreeClassifier()
DT_fit = DT.fit(X_array, y_train, cost_mat=cost_mat)
DT_fit.predict(X_array)

### 29
* Evaluate prediction from Model 2 using actual response from test set
* Add 'Overall Model Cost', 'Profit per Customer' to the table
* Calculate all measures from Model Evaluation Table

In [None]:
# Model 2 predictions on the test set, reshaped to one column.
pred_test = DT_fit.predict(X_test_array)
pred_test = pred_test.reshape(X_test_array.shape[0], 1)

# Map the 0/1 codes to the income labels in one vectorised step. Writing
# strings into a numeric frame in place relies on an implicit dtype upcast
# that pandas has deprecated.
y_pred_test = pd.DataFrame(np.where(pred_test == 1, y_names[1], y_names[0]))

# Actual classes on the rows, predicted classes on the columns.
crosstab_model2 = pd.crosstab(adult_test['Income'], y_pred_test.loc[:,0], rownames=['Actual'], colnames=['Predicted'])
crosstab_model2

In [None]:
def eval(cross,lv):
    """Compute evaluation measures from a 2x2 contingency table.

    Note: this helper shadows the builtin ``eval`` and repeats a definition
    from an earlier cell; the name is kept because other cells call it.

    Parameters
    ----------
    cross : pandas.DataFrame
        Contingency table with ACTUAL classes on the rows and PREDICTED
        classes on the columns. Rows/columns other than the two class labels
        (for example crosstab margins) are ignored, and a class that never
        occurs is counted as 0.
    lv : sequence of two labels
        ``lv[0]`` is the negative class label, ``lv[1]`` the positive one.

    Returns
    -------
    list
        ``[accuracy, error rate, sensitivity, specificity, precision]``, each
        rounded to 2 decimals. A ratio with a zero denominator is ``nan``.
    """
    neg, pos = lv[0], lv[1]
    # Select cells by label, not by position, and force a full 2x2 layout.
    counts = cross.reindex(index=[neg, pos], columns=[neg, pos], fill_value=0)

    TN = counts.loc[neg, neg]
    FP = counts.loc[neg, pos]
    FN = counts.loc[pos, neg]
    TP = counts.loc[pos, pos]
    GT = TN + FP + FN + TP

    def ratio(num, den):
        # Guard against empty classes instead of dividing by zero.
        return round(num / den, 2) if den else float('nan')

    acc = ratio(TN + TP, GT)        # Accuracy
    err = round(1 - acc, 2)         # Error rate (from the rounded accuracy, so the two sum to 1)
    sen = ratio(TP, TP + FN)        # Sensitivity (= Recall)
    spe = ratio(TN, TN + FP)        # Specificity
    pre = ratio(TP, TP + FP)        # Precision

    return [acc, err, sen, spe, pre]

def fscore(precision,recall, df):
    """Return the F-beta score for the given precision and recall.

    Parameters
    ----------
    precision, recall : float
        Precision and recall (recall = sensitivity, TP / actual positives).
    df : float
        The beta weight: 1 gives F1, 2 gives F2, 0.5 gives F0.5.

    Returns
    -------
    float
        ``(beta^2 + 1) * P * R / (beta^2 * P + R)``, or ``0.0`` when the
        denominator is zero (precision and recall both 0).
    """
    beta_sq = df**2
    denom = beta_sq*precision + recall
    if denom == 0:
        # The score is undefined here; report 0 instead of dividing by zero.
        return 0.0
    return ((beta_sq + 1)*precision*recall) / denom

# Unpack Model 2's measures in eval()'s return order. rec2 receives the fourth
# value, which eval computes as TN / actual negatives (specificity).
acc2, err2, sen2, rec2, pre2 = eval(crosstab_model2, y_names)

In [None]:
# F-scores combine precision with recall, and recall is sensitivity
# (TP / actual positives). rec2 holds specificity, so it must not be used here.
f1_model2 = fscore(pre2, sen2, 1)
f2_model2 = fscore(pre2, sen2, 2)
f05_model2 = fscore(pre2, sen2, 0.5)

In [None]:
# Model 2's measures, in the same row order as the Model 1 summary table.
eval_model2 = {'Model 2 : Eval Values' :  [acc2, err2, sen2, rec2, pre2, f1_model2, f2_model2, f05_model2]} 
eval_model2_summary = pd.DataFrame(eval_model2)

# Drop any Model 2 column left by a previous run of this cell, so re-running
# it does not append a duplicate column.
model_eval = pd.concat((model_eval.drop(columns=list(eval_model2), errors='ignore'), eval_model2_summary), axis=1)
model_eval

In [None]:
def eval(cross,lv,cost_mat):
    """Compute the overall model cost and the profit per customer.

    Note: this three-argument definition replaces the two-argument ``eval``
    defined in earlier cells (and shadows the builtin ``eval``).

    Parameters
    ----------
    cross : pandas.DataFrame
        Contingency table with ACTUAL classes on the rows and PREDICTED
        classes on the columns.
    lv : sequence of two labels
        ``lv[0]`` is the negative class label, ``lv[1]`` the positive one.
    cost_mat : sequence of four numbers
        Costs of a false positive, false negative, true positive and true
        negative, in that order.

    Returns
    -------
    list
        ``[total_cost, profit_per_customer]`` where profit per customer is
        ``-total_cost / number of records``.
    """
    neg, pos = lv[0], lv[1]
    # Select cells by label and force a full 2x2 layout.
    counts = cross.reindex(index=[neg, pos], columns=[neg, pos], fill_value=0)

    TN = counts.loc[neg, neg]
    FP = counts.loc[neg, pos]   # actual negative, predicted positive
    FN = counts.loc[pos, neg]   # actual positive, predicted negative
    TP = counts.loc[pos, pos]
    GT = TN + FP + FN + TP

    total_cost = FP*cost_mat[0] + FN*cost_mat[1] + TP*cost_mat[2] + TN*cost_mat[3]
    profit_per_customer = -total_cost / GT
    return [total_cost, profit_per_customer]

In [None]:
# Overall Model Cost / Profit per Customer for MODEL 2.
# cost_mat[1] is a single row of the per-record cost matrix; every row was
# built from the same four costs, so any row carries the cost vector.
cost_model2, profit_model2 = eval(crosstab_model2, y_names, cost_mat[1])
print('overall model cost: %.2f & Profit per Customer: %.3f' % (cost_model2, profit_model2))

### 30
* Compare Evaluation Measures from Model 1 and Model 2 using 3x cost matrix
* Strength and Weakness for each model

### **CH8 NAIVE BAYES CLASSIFICATION**
## 8.5.1 Naive Bayes

In [None]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
import statsmodels.tools.tools as stattools

In [None]:
# Load the wine training and test sets from Drive.
wine_tr = pd.read_csv("/content/gdrive/My Drive/Python Practice/Datasets/wine_flag_training.csv")
wine_test = pd.read_csv("/content/gdrive/My Drive/Python Practice/Datasets/wine_flag_test.csv")

In [None]:
# Contingency table of wine Type by Alcohol_flag, with row totals added as a
# 'Total' column and column totals added as a 'Total' row.
t1 = pd.crosstab(wine_tr['Type'], wine_tr['Alcohol_flag'])
t1['Total'] = t1.sum(axis=1)
t1.loc['Total'] = t1.sum()
t1 

In [None]:
# Stacked bar chart: one bar per Alcohol_flag value, split by wine Type.
t1_plot = pd.crosstab(wine_tr['Alcohol_flag'], wine_tr['Type'])
t1_plot.plot(kind='bar', stacked=True)

In [None]:
# One-hot encode the two wine flags for the training set.
# statsmodels removed tools.categorical in 0.13, so pandas does the encoding;
# each *_dict maps column position -> category label.
X_Alcohol_ind = pd.get_dummies(wine_tr['Alcohol_flag'])
X_Alcohol_ind_dict = dict(enumerate(X_Alcohol_ind.columns))
X_Alcohol_ind = pd.DataFrame(X_Alcohol_ind.to_numpy(dtype=float))

X_Sugar_ind = pd.get_dummies(wine_tr['Sugar_flag'])
X_Sugar_ind_dict = dict(enumerate(X_Sugar_ind.columns))
X_Sugar_ind = pd.DataFrame(X_Sugar_ind.to_numpy(dtype=float))

# NOTE: X is re-bound here; it no longer refers to the Chapter 7 design matrix.
X = pd.concat((X_Alcohol_ind, X_Sugar_ind), axis=1)

Y = wine_tr['Type']

In [None]:
# Fit a multinomial Naive Bayes classifier on the training indicators (X)
# to predict wine Type (Y).
nb_01 = MultinomialNB().fit(X,Y)

In [None]:
# Evaluate the Naive Bayes model on the test set.
# The test flags are encoded with the SAME dummy columns as the training set,
# so a level missing from the test data cannot shift or drop a column.
# statsmodels removed tools.categorical in 0.13, so pandas does the encoding.
X_Alcohol_ind_test = pd.get_dummies(wine_test['Alcohol_flag']).reindex(
    columns=pd.get_dummies(wine_tr['Alcohol_flag']).columns, fill_value=0)
X_Alcohol_ind_dict_test = dict(enumerate(X_Alcohol_ind_test.columns))
X_Alcohol_ind_test = pd.DataFrame(X_Alcohol_ind_test.to_numpy(dtype=float))

X_Sugar_ind_test = pd.get_dummies(wine_test['Sugar_flag']).reindex(
    columns=pd.get_dummies(wine_tr['Sugar_flag']).columns, fill_value=0)
X_Sugar_ind_dict_test = dict(enumerate(X_Sugar_ind_test.columns))
X_Sugar_ind_test = pd.DataFrame(X_Sugar_ind_test.to_numpy(dtype=float))

# NOTE: X_test is re-bound here; it no longer refers to the Chapter 7 test matrix.
X_test = pd.concat((X_Alcohol_ind_test, X_Sugar_ind_test), axis=1)

Y_predicted = nb_01.predict(X_test)
Y_predicted

In [None]:
# Actual-vs-predicted contingency table for the wine test set.
ypred = pd.crosstab(wine_test['Type'], Y_predicted, rownames=['Actual'], colnames=['Predicted'])
# Row totals need axis=1. The default sum() yields column totals, which were
# being aligned onto the rows by label.
ypred['Total'] = ypred.sum(axis=1)
# Column totals (including the new 'Total' column) become the bottom row.
ypred.loc['Total'] = ypred.sum()
ypred

### HANDS-ON ANALYSIS
### 24
* Convert all variables to factors

In [None]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
import statsmodels.tools.tools as stattools

In [None]:
# Load the Framingham training and test sets from Drive.
fram_tr = pd.read_csv("/content/gdrive/My Drive/Python Practice/Datasets/framingham_nb_training.csv")
fram_test = pd.read_csv("/content/gdrive/My Drive/Python Practice/Datasets/framingham_nb_test.csv")

In [None]:
# Convert Sex, Educ and Death to indicator (factor) columns.
# statsmodels removed tools.categorical in 0.13, so pandas does the encoding;
# each *_dict maps column position -> category value.
X_Sex_ind = pd.get_dummies(fram_tr['Sex'])
X_Sex_ind_dict = dict(enumerate(X_Sex_ind.columns))
X_Sex_ind = pd.DataFrame(X_Sex_ind.to_numpy(dtype=float))

X_Educ_ind = pd.get_dummies(fram_tr['Educ'])
X_Educ_ind_dict = dict(enumerate(X_Educ_ind.columns))
X_Educ_ind = pd.DataFrame(X_Educ_ind.to_numpy(dtype=float))

X_Death_ind = pd.get_dummies(fram_tr['Death'])
X_Death_ind_dict = dict(enumerate(X_Death_ind.columns))
X_Death_ind = pd.DataFrame(X_Death_ind.to_numpy(dtype=float))

# All three factors side by side, for inspection only.
# NOTE(review): this frame includes Death, which is the target in the later
# exercises — do not use it as a predictor matrix.
X_fram = pd.concat((X_Sex_ind, X_Educ_ind, X_Death_ind), axis=1)
X_fram

#Y_fram = fram_tr['']

### 25
Create two contingency tables: 1. Death and Sex 2. Death and Educ

In [None]:
# Contingency tables with Death on the rows: one against Sex, one against Educ.
tab_sex = pd.crosstab(fram_tr['Death'],fram_tr['Sex'], rownames=["Death"], colnames=["Sex"])
tab_educ = pd.crosstab(fram_tr['Death'],fram_tr['Educ'], rownames=["Death"], colnames=["Educ"])

print(tab_sex)
print(tab_educ)

### 26
* a. Probability a randomly selected person is alive or is dead

In [None]:
# Prior probabilities of each Death value: row total divided by grand total.
# Row label 0 is treated as alive and row label 1 as dead.
total = sum(tab_sex.sum())

prob_alive, prob_dead = (tab_sex.sum(axis=1)[code] / total for code in (0, 1))

print('Dead Probability: %.2f & Alive Probability: %.3f' % (prob_dead, prob_alive))

* b. Probability a randomly selected person is male

In [None]:
# Share of all records in the Sex == 1 column (treated as male by this notebook).
prob_male = tab_sex.sum()[1] / total
print('Male Probability: %.2f' % (prob_male))

* c. Probability a randomly selected person has an Educ value of 3

In [None]:
# Share of all records in the Educ == 3 column.
prob_educ3 = tab_educ.sum()[3] / total
print('Educ 3 Probability: %.2f' % (prob_educ3))

* d. Probability that a dead person is male with education level 1, and that a living person is male with education level 1

In [None]:
# Number of records in the Death == 0 row (shown as the cell output).
tab_sex.sum(axis=1)[0]

In [None]:
# Class sizes: row label 0 is treated as alive, row label 1 as dead.
alive_total, dead_total = (tab_sex.sum(axis=1)[code] for code in (0, 1))

# Share of each class that is male (Sex == 1) with education level 1.
dme1 = len(fram_tr.query('Death == 1 & Sex == 1 & Educ == 1')) / dead_total
ame1 = len(fram_tr.query('Death == 0 & Sex == 1 & Educ == 1')) / alive_total

print('P(X = Educ 1, Male | Y=Dead) : %.2f' % (dme1))
print('P(X = Educ 1, Male | Y=Alive) : %.2f' % (ame1))

* e. Probability that a living person is female with education level 2, and that a dead person is female with education level 2

In [None]:
# Share of each class that is female (Sex == 2) with education level 2,
# using the class sizes computed in the previous cell.
afe2 = len(fram_tr.query('Death == 0 & Sex == 2 & Educ == 2')) / alive_total
dfe2 = len(fram_tr.query('Death == 1 & Sex == 2 & Educ == 2')) / dead_total

print('P(X = Educ 2, Female | Y=Alive) : %.2f' % (afe2))
print('P(X = Educ 2, Female | Y=Dead) : %.2f' % (dfe2))

### 27
Create side-by-side bar graphs 1. Death with overlay of Sex 2. Death with overlay of Educ

In [None]:
# Death with overlay of Sex

In [None]:
# Stacked bar chart of counts: one bar per Death value, split by Sex.
tab_sex.plot(kind='bar', stacked=True)

In [None]:
# Divide each row by its total so every bar shows proportions within a Death value.
tab_sex_norm = tab_sex.div(tab_sex.sum(1), axis=0)
tab_sex_norm.plot(kind='bar', stacked=True)

In [None]:
# Death with overlay of Educ

In [None]:
# Stacked bar chart of counts: one bar per Death value, split by Educ.
tab_educ.plot(kind='bar', stacked=True)

In [None]:
# Divide each row by its total so every bar shows proportions within a Death value.
tab_educ_norm = tab_educ.div(tab_educ.sum(1), axis=0)
tab_educ_norm.plot(kind='bar', stacked=True)

### 28
* a. If we know a person is dead, are they more likely to be male or female?

In [None]:
# P(sex = ? | Y = Dead)
# Male

* b. If we know a person is alive, are they more likely to be male or female?

In [None]:
# P(sex = ? | Y = Alive)
# Female

* c. If we know a person is dead, what education level are they most likely to have?

In [None]:
# P(educ = ? | Y = Dead)
# Education level 1

* d. If we know a person is alive, what education level are they most likely to have?

In [None]:
# P(educ = ? | Y = Alive)
# Education level 1

* e. Which education levels are more prevalent for dead persons? For living persons?

In [None]:
# For Dead persons, Edu level 1 is prevalent.
# For Alive persons, Edu level 1 and 2 are prevalent.

### 29
* Compute Posterior Probability of Death = 0 (Alive) for male with education level 1
* Compute Posterior Probability of Death = 1 (Dead) for male with education level 1

In [None]:
# Class sizes: row label 0 is treated as alive, row label 1 as dead.
total = sum(tab_sex.sum())
alive_total = tab_sex.sum(axis=1)[0]
dead_total = tab_sex.sum(axis=1)[1]

# Class-conditional probabilities: P(Male | class) and P(Edu 1 | class).
am = tab_sex[1][0] / alive_total
ae1 = tab_educ[1][0] / alive_total

dm = tab_sex[1][1] / dead_total
de1 = tab_educ[1][1] / dead_total

# Class priors.
prob_dead = tab_sex.sum(axis=1)[1] / total
prob_alive = tab_sex.sum(axis=1)[0] / total

# Empirical joint P(Male, Edu 1); kept for reference only.
me1 = fram_tr.query('Sex == 1 & Educ == 1').shape[0] / total

# Naive Bayes: P(class | Male, Edu 1) is proportional to
#   P(Male | class) * P(Edu 1 | class) * P(class).
# The normalising constant must be the sum of those numerators over both
# classes. Dividing by the empirical joint me1 mixes the independence
# assumption with the raw joint, so the two posteriors would not sum to 1.
evidence_me1 = am * ae1 * prob_alive + dm * de1 * prob_dead

# P(Alive | Male, Edu 1)
post_ame1 = (am * ae1 * prob_alive) / evidence_me1

# P(Dead | Male, Edu 1)
post_dme1 = (dm * de1 * prob_dead) / evidence_me1

print('P(Alive | Male, Edu 1) : %.2f' % (post_ame1))
print('P(Dead | Male, Edu 1) : %.2f' % (post_dme1))

### 30
* Compute Posterior Probability of Death = 0 (Alive) for female with education level 2
* Compute Posterior Probability of Death = 1 (Dead) for female with education level 2

In [None]:
# Class sizes: row label 0 is treated as alive, row label 1 as dead.
total = sum(tab_sex.sum())
alive_total = tab_sex.sum(axis=1)[0]
dead_total = tab_sex.sum(axis=1)[1]

# Class-conditional probabilities: P(Female | class) and P(Edu 2 | class).
af = tab_sex[2][0] / alive_total
ae2 = tab_educ[2][0] / alive_total

df = tab_sex[2][1] / dead_total
de2 = tab_educ[2][1] / dead_total

# Class priors.
prob_dead = tab_sex.sum(axis=1)[1] / total
prob_alive = tab_sex.sum(axis=1)[0] / total

# Empirical joint P(Female, Edu 2); kept for reference only.
fe2 = fram_tr.query('Sex == 2 & Educ == 2').shape[0] / total

# Naive Bayes: P(class | Female, Edu 2) is proportional to
#   P(Female | class) * P(Edu 2 | class) * P(class).
# The normalising constant must be the sum of those numerators over both
# classes. Dividing by the empirical joint fe2 mixes the independence
# assumption with the raw joint, so the two posteriors would not sum to 1.
evidence_fe2 = af * ae2 * prob_alive + df * de2 * prob_dead

# P(Alive | Female, Edu 2)
post_afe2 = (af * ae2 * prob_alive) / evidence_fe2

# P(Dead | Female, Edu 2)
post_dfe2 = (df * de2 * prob_dead) / evidence_fe2

print('P(Alive | Female, Edu 2) : %.2f' % (post_afe2))
print('P(Dead | Female, Edu 2) : %.2f' % (post_dfe2))

### 31
Run Naive Bayes Classifier to predict Death using Sex and Education

In [None]:
# One-hot encode the two predictors (Sex, Educ) for the training set.
# statsmodels removed tools.categorical in 0.13, so pandas does the encoding;
# each *_dict maps column position -> category value.
X_Sex_ind = pd.get_dummies(fram_tr['Sex'])
X_Sex_ind_dict = dict(enumerate(X_Sex_ind.columns))
X_Sex_ind = pd.DataFrame(X_Sex_ind.to_numpy(dtype=float))

X_Educ_ind = pd.get_dummies(fram_tr['Educ'])
X_Educ_ind_dict = dict(enumerate(X_Educ_ind.columns))
X_Educ_ind = pd.DataFrame(X_Educ_ind.to_numpy(dtype=float))

X_fram_tr = pd.concat((X_Sex_ind, X_Educ_ind), axis=1)

# Target: Death.
Y_fram_tr = fram_tr['Death']

In [None]:
# Fit a multinomial Naive Bayes classifier on the Sex/Educ indicators
# to predict Death.
nb_fram = MultinomialNB().fit(X_fram_tr,Y_fram_tr)

### 32
* Evaluate Naive Bayes Model by Test set
* Display Contingency Table

In [None]:
# Evaluate the Naive Bayes model on the test set.
# The test predictors are encoded with the SAME dummy columns as the training
# set, so a level missing from the test data cannot shift or drop a column.
# statsmodels removed tools.categorical in 0.13, so pandas does the encoding.
X_Sex_ind_test = pd.get_dummies(fram_test['Sex']).reindex(
    columns=pd.get_dummies(fram_tr['Sex']).columns, fill_value=0)
X_Sex_ind_dict_test = dict(enumerate(X_Sex_ind_test.columns))
X_Sex_ind_test = pd.DataFrame(X_Sex_ind_test.to_numpy(dtype=float))

X_Educ_ind_test = pd.get_dummies(fram_test['Educ']).reindex(
    columns=pd.get_dummies(fram_tr['Educ']).columns, fill_value=0)
X_Educ_ind_dict_test = dict(enumerate(X_Educ_ind_test.columns))
X_Educ_ind_test = pd.DataFrame(X_Educ_ind_test.to_numpy(dtype=float))

X_fram_test = pd.concat((X_Sex_ind_test, X_Educ_ind_test), axis=1)

Y_fram_predicted = nb_fram.predict(X_fram_test)

In [None]:
# Actual-vs-predicted contingency table for the Framingham test set.
y_fram_pred = pd.crosstab(fram_test['Death'], Y_fram_predicted, rownames=['Actual'], colnames=['Predicted'])
# Force a full 2x2 layout (Death coded 0 = alive, 1 = dead, as the earlier
# queries assume). crosstab omits a class the model never predicts, and the
# fixed three-label relabelling below would then fail.
y_fram_pred = y_fram_pred.reindex(index=[0, 1], columns=[0, 1], fill_value=0)
y_fram_pred['Total'] = y_fram_pred.sum(axis=1)
y_fram_pred.loc['Total'] = y_fram_pred.sum()
y_fram_pred.index = ['Actual Alive', 'Actual Dead', 'Total']
y_fram_pred.columns = ['Predicted Alive', 'Predicted Dead', 'Total']

y_fram_pred

### 33 
* a. Accuracy

In [None]:
# Rebuild the contingency table without totals so it can be passed to eval().
# The reindex forces a full 2x2 layout (Death coded 0 = alive, 1 = dead, as the
# earlier queries assume) even when the model never predicts one of the classes.
fram_table = pd.crosstab(fram_test['Death'], Y_fram_predicted, rownames=['Actual'], colnames=['Predicted'])
fram_table = fram_table.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

In [None]:
def eval(cross,lv):
    """Compute evaluation measures from a 2x2 contingency table.

    Note: this helper shadows the builtin ``eval``. It is re-defined here
    because an earlier cell re-bound the name to a three-argument cost helper.

    Parameters
    ----------
    cross : pandas.DataFrame
        Contingency table with ACTUAL classes on the rows and PREDICTED
        classes on the columns. Rows/columns other than the two class labels
        (for example crosstab margins) are ignored, and a class that never
        occurs is counted as 0.
    lv : sequence of two labels
        ``lv[0]`` is the negative class label, ``lv[1]`` the positive one.

    Returns
    -------
    list
        ``[accuracy, error rate, sensitivity, specificity, precision]``, each
        rounded to 2 decimals. A ratio with a zero denominator is ``nan``.
    """
    neg, pos = lv[0], lv[1]
    # Select cells by label, not by position, and force a full 2x2 layout.
    counts = cross.reindex(index=[neg, pos], columns=[neg, pos], fill_value=0)

    TN = counts.loc[neg, neg]
    FP = counts.loc[neg, pos]
    FN = counts.loc[pos, neg]
    TP = counts.loc[pos, pos]
    GT = TN + FP + FN + TP

    def ratio(num, den):
        # Guard against empty classes instead of dividing by zero.
        return round(num / den, 2) if den else float('nan')

    acc = ratio(TN + TP, GT)        # Accuracy
    err = round(1 - acc, 2)         # Error rate (from the rounded accuracy, so the two sum to 1)
    sen = ratio(TP, TP + FN)        # Sensitivity (= Recall)
    spe = ratio(TN, TN + FP)        # Specificity
    pre = ratio(TP, TP + FP)        # Precision

    return [acc, err, sen, spe, pre]

def fscore(precision,recall, df):
    """Return the F-beta score for the given precision and recall.

    Parameters
    ----------
    precision, recall : float
        Precision and recall (recall = sensitivity, TP / actual positives).
    df : float
        The beta weight: 1 gives F1, 2 gives F2, 0.5 gives F0.5.

    Returns
    -------
    float
        ``(beta^2 + 1) * P * R / (beta^2 * P + R)``, or ``0.0`` when the
        denominator is zero (precision and recall both 0).
    """
    beta_sq = df**2
    denom = beta_sq*precision + recall
    if denom == 0:
        # The score is undefined here; report 0 instead of dividing by zero.
        return 0.0
    return ((beta_sq + 1)*precision*recall) / denom

# Unpack the Naive Bayes measures in eval()'s return order. The table's own
# column labels serve as the (negative, positive) class labels. rec_fram
# receives the fourth value, which eval computes as TN / actual negatives.
acc_fram, err_fram, sen_fram, rec_fram, pre_fram = eval(fram_table, fram_table.columns)

In [None]:
# Share of test records classified correctly.
print('Accuracy of Naive Bayes model : %.2f' %(acc_fram))

* b. Error rate

In [None]:
# Error rate, computed by eval() as 1 - accuracy.
print('Error rate of Naive Bayes model : %.2f' %(err_fram))

### 34
* a. How often it correctly classifies dead persons

In [None]:
# Sensitivity: TP / actual positives, i.e. the share of the positive class
# (Death == 1) classified correctly.
print('Sensitivity of Naive Bayes model : %.2f' %(sen_fram))

* b. How often it correctly classifies living persons

In [None]:
# rec_fram is TN / actual negatives, i.e. the share of living persons
# classified correctly. That measure is specificity; recall is another name
# for sensitivity, so the message now says "Specificity".
print('Specificity of Naive Bayes model : %.2f' %(rec_fram))