# Q5

In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import math

In [2]:
data =  pd.read_csv("income_Q5.csv")

##### Because about three-quarters of the available data is in Class "income = <=50K", the data is not well distributed and causes errors. To avoid this problem, we select the data in such a way that the number of two types of classes for the "income" feature is approximately equal. Once I tried with all the data, it just had good accuracy and the rest of the criteria (Precision,Recall,F1_Score) were all zero.

In [3]:
a = data['income'].unique()
df1=data[data['income'] == a[0]]
df2=data[data['income'] == a[1]]
index = list(df1.index.values.astype(int))
samples = random.sample(index,df2.shape[0])
df3 = data.loc[samples,:]
df = pd.concat([df2, df3])
df = df.sample(frac=1)
df

Unnamed: 0,age,workclass,education,educational-num,marital-status,occupation,relationship,race,gender,native-country,income
2552,<=38,Self-emp-not-inc,Bachelors,>9,Married-civ-spouse,Sales,Wife,White,Female,United-States,>50K
7961,<=38,Private,Some-college,>9,Divorced,Adm-clerical,Own-child,White,Male,United-States,<=50K
6695,>38,State-gov,Doctorate,>9,Widowed,Prof-specialty,Not-in-family,Black,Male,United-States,>50K
45683,>38,Private,Assoc-voc,>9,Widowed,Sales,Not-in-family,White,Female,United-States,<=50K
29514,>38,Federal-gov,Masters,>9,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...
34797,<=38,Private,11th,<=9,Never-married,Other-service,Own-child,White,Male,United-States,<=50K
23484,>38,Self-emp-not-inc,Prof-school,>9,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,>50K
28443,<=38,Private,HS-grad,<=9,Divorced,Exec-managerial,Unmarried,White,Female,United-States,>50K
9137,<=38,Special,Some-college,>9,Never-married,?,Own-child,White,Male,United-States,<=50K


###  NaiveBayes

In [4]:
def cross_validation(df,per):
    #first Replace them!
    data = df.sample(frac=1)
    percent = int(data.shape[0]*per/100)
    X_train = data.iloc[0:percent,:]
    X_test = data.iloc[percent:-1,:]
    return X_train,X_test
    
def NaiveBayes(data):

    df = data
    columns = list(df.columns)
    columns.pop()
    df['labels'] = -1
    tag = list(df['income'].unique())
    #one hot coded
    df.loc[df['income'] == tag[0],'labels'] = 0
    df.loc[df['income'] == tag[1],'labels'] = 1
    
    #split data
    X_train,X_test = cross_validation(df,80)
    tag = list(X_train['labels'].unique())
    X_test['y_predict'] = -1
     
    acc = 0
    for row in range(X_test.shape[0]):
        #make array of P(X1 | yi)P(X2 | yi)...P(Xn | yi) P(yi)
        probXofYs = np.zeros([1,2])
        
        for l in tag:
            probsofXi= 1
            
            #P(yi)
            probYi = X_train[X_train['labels'] == l].shape[0]/X_train.shape[0]

            for col in range(X_test.shape[1]-3):
                
                X = X_test.iloc[row,col]
                #Laplace smoothing initialize parameters
                v = len(list(df[columns[col]].unique())) 
                
                #compute each P(Xi | yi)
                probXi = (X_train[(X_train[columns[col]] == X)].shape[0]+v)/X_train.shape[0]
                probXiYi=(X_train[(X_train[columns[col]] == X)&(X_train['labels']==l)].shape[0]+1)/(probYi+v)

                #the features are independent from each other
                probsofXi = probsofXi*probXiYi
            
            probXofYs[0,l] = probsofXi*probYi
            
        X_test.iloc[row,-1] = probXofYs.argmax(axis = 1)
        #compute accuracy
        if(X_test.iloc[row,-1] == X_test.iloc[row,-2]): acc+=1
    
    
    display("Accuracy:", acc/X_test.shape[0]*100)
    Ypredict = np.array(X_test.iloc[:,-1])
    Truelabel = np.array(X_test.iloc[:,-2])
    return X_test
        
    


### Confusion_Matrix

In [5]:
def Confusion_Matrix(X_Test):
    cnt_TP = 0
    cnt_TN = 0
    cnt_FP = 0
    cnt_FN = 0
    for row in range(X_Test.shape[0]):
        if(X_Test.iloc[row,-2] == X_Test.iloc[row,-1]==1):
            cnt_TP +=1
        if(X_Test.iloc[row,-2] == X_Test.iloc[row,-1]==0):
            cnt_TN +=1
        if(X_Test.iloc[row,-2] == 1 and X_Test.iloc[row,-1]==0):
            cnt_FN +=1
        if(X_Test.iloc[row,-2] == 0 and X_Test.iloc[row,-1]==1):
            cnt_FP +=1
    cm = np.array([cnt_TP, cnt_FN, cnt_FP, cnt_TN]).reshape(2,2)
    print(cm)
    acc = (cnt_TP+cnt_TN)/X_Test.shape[0]*100
    pres = cnt_TP/(cnt_TP+cnt_FP+1)*100
    recall = cnt_TP/(cnt_TP+cnt_FN+1)*100
    F1 = 2*pres*recall/(pres+recall+1)
    classes = ['Positive','Negative']
    total_classes = len(classes)
    level_labels = [total_classes*[0], list(range(total_classes))]
    cm_frame1 = pd.DataFrame(data=cm,columns=pd.MultiIndex(levels=[['Predicted:'],\
                                                                  classes],codes=level_labels),\
                        index=pd.MultiIndex(levels=[['Actual:'], classes], codes=level_labels))
    criteria=pd.DataFrame(data={'Accuracy':acc,'Precision':pres,'Recall':recall,'F1_Score':F1},index=['0'])
    return cm_frame1 ,criteria
    

### Test Data:

In [6]:
X_test = NaiveBayes(df)

'Accuracy:'

79.0543431750107

In [7]:
cm_frame1, criteria = Confusion_Matrix(X_test)
display(cm_frame1)
display(criteria)

[[1728  588]
 [ 391 1967]]


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted:,Predicted:
Unnamed: 0_level_1,Unnamed: 1_level_1,Positive,Negative
Actual:,Positive,1728,588
Actual:,Negative,391,1967


Unnamed: 0,Accuracy,Precision,Recall,F1_Score
0,79.054343,81.509434,74.579197,77.394629


### The importance of using Laplace smoothing
###### If a given Xi and class never occur together in the training data, then the probability estimate for that Xi conditioned on that class will be zero. Recall, the probability estimate is directly proportional to the number of times a given Xi occurs. Discuss why this scenario is problematic for our classifier and how an adversarial agent (spam generator) might take advantage of it. One solution is to smooth all our word probabilities upwards by a count of 1. That is, we assume we saw each token in each Xi for each class once more than we actually did.