# CH8 NAIVE BAYES CLASSIFICATION


# HANDS-ON ANALYSIS

---
# Use **framingham_nb_training**, **framingham_nb_test** Dataset below.

---

#24
Convert all variables (Death, Sex, and Educ) to factors.

In [None]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
import statsmodels.tools.tools as stattools

In [None]:
# Load the training and test CSV files into DataFrames.
fram_tr = pd.read_csv(
    filepath_or_buffer="/content/gdrive/My Drive/Python Practice/Datasets/framingham_nb_training.csv"
)
fram_test = pd.read_csv(
    filepath_or_buffer="/content/gdrive/My Drive/Python Practice/Datasets/framingham_nb_test.csv"
)

In [None]:
# Build 0/1 indicator (dummy) columns for Sex, Educ and Death.
#
# `statsmodels.tools.categorical` has been deprecated and dropped from recent
# statsmodels releases, so the same outputs are rebuilt with pandas: a float
# indicator matrix whose columns are labelled by position (0..k-1), plus a
# {column position: level} dictionary for each variable.
sex_dummies = pd.get_dummies(fram_tr['Sex']).astype(float)
X_Sex_ind_dict = dict(enumerate(sex_dummies.columns))
X_Sex_ind = pd.DataFrame(sex_dummies.to_numpy())

educ_dummies = pd.get_dummies(fram_tr['Educ']).astype(float)
X_Educ_ind_dict = dict(enumerate(educ_dummies.columns))
X_Educ_ind = pd.DataFrame(educ_dummies.to_numpy())

death_dummies = pd.get_dummies(fram_tr['Death']).astype(float)
X_Death_ind_dict = dict(enumerate(death_dummies.columns))
X_Death_ind = pd.DataFrame(death_dummies.to_numpy())

# Side-by-side indicator blocks: Sex columns, then Educ, then Death.
X_fram = pd.concat((X_Sex_ind, X_Educ_ind, X_Death_ind), axis=1)
X_fram

#Y_fram = fram_tr['']

#25
Create two contingency tables, one with Death and Sex and another with Death and Educ.


In [None]:
# Cross-tabulate Death (rows) against each predictor (columns).
tab_sex = pd.crosstab(
    index=fram_tr['Death'],
    columns=fram_tr['Sex'],
    rownames=["Death"],
    colnames=["Sex"],
)
tab_educ = pd.crosstab(
    index=fram_tr['Death'],
    columns=fram_tr['Educ'],
    rownames=["Death"],
    colnames=["Educ"],
)

# Show both tables, one after the other.
print(tab_sex, tab_educ, sep='\n')

#26
Use the tables in the previous exercise to calculate:<br>
a.  The probability a randomly selected person is alive or is dead.<br>
b.  The probability a randomly selected person is a male.<br>
c.  The probability a randomly selected person has an Educ value of 3.<br>
d. The probabilities that a dead person is male with education level 1, and that a living person is male with education level 1.<br>
e.  The probabilities that a living person is female with education level 2, and that a dead person is female with education level 2.

In [None]:
#(a)

# Grand total of people counted in the Death x Sex table.
total = tab_sex.sum().sum()

# Row label 1 is Death == 1 (dead); row label 0 is Death == 0 (alive).
prob_dead = tab_sex.loc[1].sum() / total
prob_alive = tab_sex.loc[0].sum() / total

# Both probabilities are printed with the same precision so they can be
# compared (and visibly sum to 1) at a glance.
print('Dead Probability: %.2f & Alive Probability: %.2f' % (prob_dead, prob_alive))

* b. Probability a randomly selected person is male

In [None]:
# Column label 1 of the Sex table is the male count (summed over Death).
prob_male = tab_sex[1].sum() / total
print('Male Probability: %.2f' % prob_male)

* c. Probability a randomly selected person has an Educ value of 3

In [None]:
# Column label 3 of the Educ table is the count with education level 3.
prob_educ3 = tab_educ[3].sum() / total
print('Educ 3 Probability: %.2f' % prob_educ3)

* d. Probability that a dead person is male with education level 1, and that a living person is male with education level 1

In [None]:
# Number of people with Death == 0 (row label 0 summed across Sex).
tab_sex.loc[0].sum()

In [None]:
# Class sizes: row label 0 is alive, row label 1 is dead.
alive_total = tab_sex.loc[0].sum()
dead_total = tab_sex.loc[1].sum()

# Share of each class that is male (Sex == 1) with education level 1,
# counted directly from the training rows.
dme1 = len(fram_tr.query('Death == 1 & Sex == 1 & Educ == 1')) / dead_total
ame1 = len(fram_tr.query('Death == 0 & Sex == 1 & Educ == 1')) / alive_total

print('P(X = Educ 1, Male | Y=Dead) : %.2f' % dme1)
print('P(X = Educ 1, Male | Y=Alive) : %.2f' % ame1)

* e. Probability that a living person is female with education level 2, and that a dead person is female with education level 2

In [None]:
# Share of each class that is female (Sex == 2) with education level 2,
# counted directly from the training rows.
afe2 = len(fram_tr.query('Death == 0 & Sex == 2 & Educ == 2')) / alive_total
dfe2 = len(fram_tr.query('Death == 1 & Sex == 2 & Educ == 2')) / dead_total

print('P(X = Educ 2, Female | Y=Alive) : %.2f' % afe2)
print('P(X = Educ 2, Female | Y=Dead) : %.2f' % dfe2)

#27
Create side‐by‐side bar graphs for Death, one with an overlay of Sex and the other with an overlay of Educ.

In [None]:
# Death with overlay of Sex

In [None]:
# Stacked bar chart of Death counts, split by Sex.
tab_sex.plot.bar(stacked=True)

In [None]:
# Divide each row by its own total so every bar shows proportions within a
# Death class rather than raw counts.
tab_sex_norm = tab_sex.divide(tab_sex.sum(axis=1), axis='index')
tab_sex_norm.plot.bar(stacked=True)

In [None]:
# Death with overlay of Educ

In [None]:
# Stacked bar chart of Death counts, split by education level.
tab_educ.plot.bar(stacked=True)

In [None]:
# Divide each row by its own total so every bar shows proportions within a
# Death class rather than raw counts.
tab_educ_norm = tab_educ.divide(tab_educ.sum(axis=1), axis='index')
tab_educ_norm.plot.bar(stacked=True)

#28
Use the graphs from the previous exercise to answer the following questions:<br>
a.  If we know a person is dead, are they more likely to be male or female?<br>
b.  If we know a person is alive, are they more likely to be male or female?<br>
c.  If we know a person is dead, what education level are they most likely to have?<br>
d. If we know a person is alive, what education level are they most likely to have?<br>
e.  Which education levels are more prevalent for dead persons? For living persons?

In [None]:
# P(sex = ? | Y = Dead)
# Male

* b. If we know a person is alive, are they more likely to be male or female?

In [None]:
# P(sex = ? | Y = Alive)
# Female

* c. If we know a person is dead, what education level are they most likely to have?

In [None]:
# P(educ = ? | Y = Dead)
# Education level 1

* d. If we know a person is alive, what education level are they most likely to have?

In [None]:
# P(educ = ? | Y = Alive)
# Education level 1

* e. Which education levels are more prevalent for dead persons? For living persons?

In [None]:
# For dead persons, education level 1 is the most prevalent.
# For living persons, education levels 1 and 2 are the most prevalent.

#29
Compute the posterior probability of Death = 0 (person is living) for a male with education level 1.<br>
Compute the posterior probability of Death = 1 (person is dead) for a male with education level 1.

In [None]:
# Table totals: everyone, and each Death class (row 0 = alive, row 1 = dead).
total = tab_sex.sum().sum()
alive_total = tab_sex.loc[0].sum()
dead_total = tab_sex.loc[1].sum()

# Class-conditional probabilities among the living:
# P(Male | Alive) and P(Edu 1 | Alive).
am = tab_sex.loc[0, 1] / alive_total
ae1 = tab_educ.loc[0, 1] / alive_total

# Class-conditional probabilities among the dead:
# P(Male | Dead) and P(Edu 1 | Dead).
dm = tab_sex.loc[1, 1] / dead_total
de1 = tab_educ.loc[1, 1] / dead_total

# Prior probabilities of each class.
prob_dead = tab_sex.loc[1].sum() / total
prob_alive = tab_sex.loc[0].sum() / total

# Empirical joint probability P(Male, Edu 1), counted from the training rows.
me1 = len(fram_tr.query('Sex == 1 & Educ == 1')) / total

# NOTE(review): the numerators below use the naive (conditional independence)
# factorisation while the denominator is the empirical joint, so the two
# posteriors need not sum to 1 -- confirm this is the intended calculation.

# P(Alive | Male, Edu 1)
#   = P(Male, Edu 1 | Alive) P(Alive) / P(Male, Edu 1)
#   ~ P(Male | Alive) P(Edu 1 | Alive) P(Alive) / P(Male, Edu 1)
post_ame1 = (am * ae1 * prob_alive) / me1

# P(Dead | Male, Edu 1)
#   = P(Male, Edu 1 | Dead) P(Dead) / P(Male, Edu 1)
#   ~ P(Male | Dead) P(Edu 1 | Dead) P(Dead) / P(Male, Edu 1)
post_dme1 = (dm * de1 * prob_dead) / me1

print('P(Alive | Male, Edu 1) : %.2f' % post_ame1)
print('P(Dead | Male, Edu 1) : %.2f' % post_dme1)

#30
Compute the posterior probability of Death = 0 (person is living) for a female with education level 2. <br>
Compute the posterior probability of Death = 1 (person is dead) for a female with education level 2.

In [None]:
# Table totals: everyone, and each Death class (row 0 = alive, row 1 = dead).
total = tab_sex.sum().sum()
alive_total = tab_sex.loc[0].sum()
dead_total = tab_sex.loc[1].sum()

# Class-conditional probabilities among the living:
# P(Female | Alive) and P(Edu 2 | Alive).
af = tab_sex.loc[0, 2] / alive_total
ae2 = tab_educ.loc[0, 2] / alive_total

# Class-conditional probabilities among the dead:
# P(Female | Dead) and P(Edu 2 | Dead).
df = tab_sex.loc[1, 2] / dead_total
de2 = tab_educ.loc[1, 2] / dead_total

# Prior probabilities of each class.
prob_dead = tab_sex.loc[1].sum() / total
prob_alive = tab_sex.loc[0].sum() / total

# Empirical joint probability P(Female, Edu 2), counted from the training rows.
fe2 = len(fram_tr.query('Sex == 2 & Educ == 2')) / total

# NOTE(review): the numerators below use the naive (conditional independence)
# factorisation while the denominator is the empirical joint, so the two
# posteriors need not sum to 1 -- confirm this is the intended calculation.

# P(Alive | Female, Edu 2)
#   = P(Female, Edu 2 | Alive) P(Alive) / P(Female, Edu 2)
#   ~ P(Female | Alive) P(Edu 2 | Alive) P(Alive) / P(Female, Edu 2)
post_afe2 = (af * ae2 * prob_alive) / fe2

# P(Dead | Female, Edu 2)
#   = P(Female, Edu 2 | Dead) P(Dead) / P(Female, Edu 2)
#   ~ P(Female | Dead) P(Edu 2 | Dead) P(Dead) / P(Female, Edu 2)
post_dfe2 = (df * de2 * prob_dead) / fe2

print('P(Alive | Female, Edu 2) : %.2f' % post_afe2)
print('P(Dead | Female, Edu 2) : %.2f' % post_dfe2)

#31
Run the Naïve Bayes classifier to classify persons as living or dead based on sex and education.

In [None]:
# Build the training design matrix: 0/1 indicator columns for Sex and Educ.
#
# `statsmodels.tools.categorical` has been deprecated and dropped from recent
# statsmodels releases, so the same outputs are rebuilt with pandas: a float
# indicator matrix whose columns are labelled by position (0..k-1), plus a
# {column position: level} dictionary for each variable.
sex_dummies = pd.get_dummies(fram_tr['Sex']).astype(float)
X_Sex_ind_dict = dict(enumerate(sex_dummies.columns))
X_Sex_ind = pd.DataFrame(sex_dummies.to_numpy())

educ_dummies = pd.get_dummies(fram_tr['Educ']).astype(float)
X_Educ_ind_dict = dict(enumerate(educ_dummies.columns))
X_Educ_ind = pd.DataFrame(educ_dummies.to_numpy())

# Predictors: Sex indicator columns followed by Educ indicator columns.
X_fram_tr = pd.concat((X_Sex_ind, X_Educ_ind), axis=1)

# Target: the Death column of the training data.
Y_fram_tr = fram_tr['Death']

In [None]:
# Train a multinomial Naive Bayes classifier on the training indicators.
nb_fram = MultinomialNB().fit(X=X_fram_tr, y=Y_fram_tr)

#32
Evaluate the Naïve Bayes model on the framingham_nb_test data set. <br>
Display the results in a contingency table. <br>
Edit the row and column names of the table to make the table more readable.<br>
Include a total row and column.

In [None]:
# Build the test design matrix and predict with the fitted model.
#
# `statsmodels.tools.categorical` has been deprecated and dropped from recent
# statsmodels releases, so the indicator matrices are rebuilt with pandas.
# The levels are taken from the TRAINING data so the test matrix always has
# the same columns, in the same order, as the matrix the model was fitted on,
# even if some level happens not to occur in the test file.
sex_levels = sorted(fram_tr['Sex'].dropna().unique())
sex_dummies_test = pd.get_dummies(
    fram_test['Sex'].astype(pd.CategoricalDtype(categories=sex_levels))
).astype(float)
X_Sex_ind_dict_test = dict(enumerate(sex_dummies_test.columns))
X_Sex_ind_test = pd.DataFrame(sex_dummies_test.to_numpy())

educ_levels = sorted(fram_tr['Educ'].dropna().unique())
educ_dummies_test = pd.get_dummies(
    fram_test['Educ'].astype(pd.CategoricalDtype(categories=educ_levels))
).astype(float)
X_Educ_ind_dict_test = dict(enumerate(educ_dummies_test.columns))
X_Educ_ind_test = pd.DataFrame(educ_dummies_test.to_numpy())

# Predictors: Sex indicator columns followed by Educ indicator columns.
X_fram_test = pd.concat((X_Sex_ind_test, X_Educ_ind_test), axis=1)

# Predicted Death class for every test row.
Y_fram_predicted = nb_fram.predict(X_fram_test)

In [None]:
# Actual-by-predicted contingency table for the test set.
y_fram_pred = pd.crosstab(fram_test['Death'], Y_fram_predicted, rownames=['Actual'], colnames=['Predicted'])

# Force a full 2x2 layout (Death coded 0 = alive, 1 = dead). If the model
# never predicts one of the classes, crosstab omits that column and the
# fixed relabelling below would fail with a length mismatch.
y_fram_pred = y_fram_pred.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

# Add the total column first, then the total row (which also sums 'Total').
y_fram_pred['Total'] = y_fram_pred.sum(axis=1)
y_fram_pred.loc['Total'] = y_fram_pred.sum()

# Human-readable row and column names.
y_fram_pred.index = ['Actual Alive', 'Actual Dead', 'Total']
y_fram_pred.columns = ['Predicted Alive', 'Predicted Dead', 'Total']

y_fram_pred

#33
According to your table in the previous exercise, find the following values for the Naïve Bayes model:<br>
a. Accuracy<br>
b.  Error rate

In [None]:
# Create the 2x2 contingency table again, without totals, to use as input for
# the metric calculation.
fram_table = pd.crosstab(fram_test['Death'], Y_fram_predicted, rownames=['Actual'], colnames=['Predicted'])

# Guarantee both classes (0 = alive, 1 = dead) appear on both axes; crosstab
# omits a column entirely when the model never predicts that class.
fram_table = fram_table.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

In [None]:
def eval(cross, lv):
    """Compute summary metrics from a 2x2 actual-by-predicted table.

    NOTE: this name shadows Python's built-in ``eval``; it is kept so that
    existing calls keep working.

    Rows and column totals are read by position (first = negative class,
    second = positive class), so the table's labels do not have to be 0/1.

    Parameters
    ----------
    cross : pandas.DataFrame
        2x2 contingency table with actual classes as rows and predicted
        classes as columns, negative class first.
    lv : sequence
        Column labels of ``cross``: ``lv[0]`` is the predicted-negative
        column and ``lv[1]`` the predicted-positive column.

    Returns
    -------
    list
        ``[accuracy, error rate, sensitivity, specificity, precision]``,
        each rounded to two decimals. A ratio whose denominator is zero is
        returned as ``nan``.
    """
    def _ratio(num, den):
        # An undefined ratio (empty class / nothing predicted positive) is
        # reported as nan instead of raising or emitting a warning.
        return round(num / den, 2) if den != 0 else float('nan')

    predicted_neg = cross[lv[0]]
    predicted_pos = cross[lv[1]]
    actual_totals = cross.sum(axis=1)

    TN = predicted_neg.iloc[0]      # actual negative, predicted negative
    TP = predicted_pos.iloc[1]      # actual positive, predicted positive
    TAN = actual_totals.iloc[0]     # total actual negatives
    TAP = actual_totals.iloc[1]     # total actual positives
    TPP = predicted_pos.sum()       # total predicted positives
    GT = actual_totals.sum()        # grand total

    acc = _ratio(TN + TP, GT)       # Accuracy
    err = round(1 - acc, 2)         # Error rate (complements rounded accuracy)
    sen = _ratio(TP, TAP)           # Sensitivity (= recall)
    spec = _ratio(TN, TAN)          # Specificity
    pre = _ratio(TP, TPP)           # Precision

    return [acc, err, sen, spec, pre]

def fscore(precision, recall, df):
    """Return the F-score that combines precision and recall.

    Parameters
    ----------
    precision : float
        Precision of the classifier.
    recall : float
        Recall of the classifier.
    df : float
        Weight applied (squared) to precision in the denominator, i.e. the
        "beta" of the F-beta score; ``df=1`` gives the harmonic mean of
        precision and recall.

    Returns
    -------
    float
        ``(df**2 + 1) * precision * recall / (df**2 * precision + recall)``,
        or ``0.0`` when that denominator is zero (precision and recall both
        zero), where the formula is otherwise undefined.
    """
    weight = df ** 2
    denominator = weight * precision + recall
    if denominator == 0:
        return 0.0
    return ((weight + 1) * precision * recall) / denominator

# Unpack the five metrics returned for the test-set contingency table.
(acc_fram, err_fram, sen_fram,
 rec_fram, pre_fram) = eval(cross=fram_table, lv=fram_table.columns)

In [None]:
# a. Accuracy on the test set.
print('Accuracy of Naive Bayes model : %.2f' % acc_fram)

* b. Error rate

In [None]:
# b. Error rate on the test set.
print('Error rate of Naive Bayes model : %.2f' % err_fram)

#34
According to your contingency table, find the following values for the Naïve Bayes model:<br>
a. How often it correctly classifies dead persons.<br>
b.  How often it correctly classifies living persons.

In [None]:
# a. Share of actually dead persons that the model classifies as dead.
print('Sensitivity of Naive Bayes model : %.2f' % sen_fram)

* b. How often it correctly classifies living persons

In [None]:
# rec_fram holds TN / (total actual negatives), i.e. the share of living
# persons classified correctly. That quantity is the specificity; recall is
# another name for sensitivity, so the label is corrected here.
print('Specificity of Naive Bayes model : %.2f' % (rec_fram))