# Prediction sex with Naive-Bayes Classifier

In [294]:
import pandas as pd
import numpy as np
import random
import time

In [28]:
df = pd.read_csv("adult.csv")
df.head()

Unnamed: 0,Education,Status,Others,Skin Color,Sex
0,Bachelors,Never-married,Adm-clerical,White,Male
1,Bachelors,Married-civ-spouse,Exec-managerial,White,Male
2,HS-grad,Divorced,Handlers-cleaners,White,Male
3,11th,Married-civ-spouse,Handlers-cleaners,Black,Male
4,Bachelors,Married-civ-spouse,Prof-specialty,Black,Female


## Missing values handle

<hr>
This is a helper function that is used in the group method to fix missing data

In [29]:
def get_the_most_similar_sample(row, df):
    res_df = df[(df["Education"] == row[0]) & (df["Status"] == row[1]) & (df["Skin Color"] == row[3]) & (df["Sex"] == row[4])]
    res = res_df["size"].tolist()
    
    if res:
        return res_df[res_df["size"] == max(res)]["Others"].values[0]
    else:
        pass

#### Management of lost data is possible in three ways:

##### 1.Delete them

method="drop"        
<hr>

##### 2.Substitution in proportion to the number of repetitions of each value

method="possibility" 
<hr>

##### 3.Replace with the most similar row in the dataset

method="group"

In [30]:
def miss_handle(dff, col="Others", miss_val= " ?", method="drop"):
    if method == "group":
        columns_list = dff.columns.tolist()
        df_grouped = dff[dff[col] != miss_val].groupby(columns_list, as_index=False).size()
        miss_data_index = dff[dff[col] == miss_val].index.tolist() 
        for row in miss_data_index:
            most_similar = get_the_most_similar_sample(df.loc[row].values, df_grouped)
            dff.loc[row][col] = most_similar
        return dff
    if method == "drop":
        return dff[dff[col] != miss_val]
    
    if method == "possibility":
        
        res = []
        miss_size = dff[dff[col] == miss_val].shape[0] # 1383

       
        dff[col] = pd.Categorical(df[col])
        values = dff[col].cat.categories 
        
        mmax = 0
        lbl_max = ""
        for v in values:
            if v != miss_val and v != " ":
                shape = dff[dff[col] == v].shape[0] / dff.shape[0] * 100
                if shape > mmax:     #|
                    lbl_max = v  #|
                    mmax = shape     #|
                    
                if int(shape) > 0:
                    tempList = ((int(shape * miss_size / 100)) * [v])
                    res += tempList
        fin = (miss_size - len(res)) * [lbl_max]
        res += fin
        random.shuffle(res)
        for i, j in zip(dff[dff[col] == miss_val].index.tolist(), range(len(res))):
            dff.loc[i, col] =  res[j]
        
        return dff

In [31]:
df = miss_handle(df, method="drop")

## Train Test split


In [32]:
def train_test_split(df, test_size = 0.2):
    msk = np.random.rand(len(df)) < 1 - test_size
    train = df[msk]
    test = df[~msk]
    res = "train: {}/test: {}".format(train.shape, test.shape)
    print(res)
    return train, test

train, test = train_test_split(df, 0.3)

train: (24626, 5)/test: (6092, 5)


In [33]:
X_train = train[["Education", "Status", "Others", "Skin Color"]]
X_test = test[["Education", "Status", "Others", "Skin Color"]]
y_train = train[["Sex"]]
y_test = test[["Sex"]]

## Naive-Bayes class definition

Bayes Rule:

$$
P(C|X) = \frac{P(X|C)P(C)}{P(X)}
$$
<br><br>
$$
P(female|X) = \frac{P(X|female)P(female)}{P(X)}
$$
<br>
$$
P(male|X) = \frac{P(X|male)P(male)}{P(X)}
$$

In [303]:
class Naive_bayes():
    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        
    def fit(self):
        #------------------------------------------------------------------------- Creating a dataframe for save possibilities
        tbl_poss = pd.DataFrame()
        self.tbl_poss = tbl_poss
        X_train_f = self.X_train[y_train["Sex"] == " Female"]
        X_train_m = self.X_train[y_train["Sex"] == " Male"]
        #-------------------------------------------------------------------------
        #------------------------------------------------------------------------- Computing Prior
        p_male = y_train[y_train["Sex"] == " Male"].shape[0] / y_train.shape[0]
        p_female = 1 - p_male
        tbl_poss["Prior"] = [p_male, p_female]
        #------------------------------------------------------------------------- 
        #------------------------------------------------------------------------- Values extraction
        values = dict()
        self.X_train = pd.DataFrame(X_train)
        for feature in X_train.columns:
            self.X_train.loc[:, feature] = pd.Categorical(self.X_train.loc[:, feature])
            values[feature] = self.X_train[feature].cat.categories
        #------------------------------------------------------------------------- 
        #------------------------------------------------------------------------- Creating a table of possibilities
        tempMale = X_train[y_train["Sex"] == " Male"]
        tempFemale = X_train[y_train["Sex"] == " Female"]
        for f in X_train.columns:
            for v in values[f]:
                m_prob = X_train_m[X_train_m[f] == v].shape[0] / X_train_m.shape[0]
                f_prob = X_train_f[X_train_f[f] == v].shape[0] / X_train_f.shape[0]
                tbl_poss[v] = [m_prob, f_prob]
        #------------------------------------------------------------------------- 

    def predict(self, X_test, y_test):
        start_time = time.time()
        true_values = 0
        for test_index in range(y_test.shape[0]):
            print("Prediction started ... % " + str(int(test_index/y_test.shape[0]*100)), end="\r")
            values = X_test.iloc[test_index].values
            values = np.append(values, "Prior")
            male_p = np.prod(self.tbl_poss[values].iloc[0])
            female_p = np.prod(self.tbl_poss[values].iloc[1])
            predicted = ''
            if male_p > female_p:
                predicted = ' Male'
            else:
                predicted = ' Female'
            if predicted == y_test["Sex"].iloc[test_index]:
                true_values += 1
        
        print("Accuracy: ", true_values / y_test.shape[0])
        print("--- Run time (second) ---: %s" % (round(time.time() - start_time, 2)))


In [304]:
naiveBayes = Naive_bayes(X_train, y_train)

## Training

In [305]:
naiveBayes.fit()

## Prediction Test set
### Accuracy report and execution time

In [306]:
naiveBayes.predict(X_test, y_test)

Accuracy:  0.7834865397242285
--- Run time (second) ---: 8.57
