In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

Dataset from "https://archive.ics.uci.edu/ml/datasets/Adult"

Listing of attributes: 
<b><i>
- class: >50K, <=50K. 

- age: continuous. 
- workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked. 
- fnlwgt: continuous. 
- education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool. 
- education-num: continuous. 
- marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse. 
- occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces. 
- relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried. 
- race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black. 
- sex: Female, Male. 
- capital-gain: continuous. 
- capital-loss: continuous. 
- hours-per-week: continuous. 
- native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.</i></b>

In [4]:
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'class']
#train_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', sep = ',', names = cols)
#test_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test', names = cols)

train_data = pd.read_csv('adult_train.csv')
test_data = pd.read_csv('adult_test.csv')
test_data['class'] = test_data['class'].str.replace('.', '')
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In the dataset most of the columns are attributes are some unique labels. But we can't process labelled data in the algorithm. So we have to convert labeled data into numbers. Here, we have to keep track of all the number which we substituted with labels. 

In [5]:
## Converting all the class categorical labels into numbers

def preprocess(data):
    data1 = data.copy()
    label_dict = {}
    for i in data1.columns:
        if (data1[i].dtype == object):
            uniques = data1[i].str.strip().unique()
            count = 1
            map_dict = {}
            for j in uniques:
                map_dict[j] = count
                count += 1
            data1[i] = data1[i].str.strip().map(map_dict)
            
            label_dict[i] = map_dict
    return data1, label_dict

In [6]:
## Here getting all the values and 

train_process_data, labels = preprocess(train_data)
test_process_data, labels = preprocess(test_data)
train_process_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,1,77516,1,13,1,1,1,1,1,2174,0,40,1,1
1,50,2,83311,1,13,2,2,2,1,1,0,0,13,1,1
2,38,3,215646,2,9,3,3,1,1,1,0,0,40,1,1
3,53,3,234721,3,7,2,3,2,2,1,0,0,40,1,1
4,28,3,338409,1,13,2,4,3,2,2,0,0,40,2,1


In [7]:
class NaiveBayesClassifier (object):
    
    '''
    Naive Bayes classifier is the basic machine learning algorithm. 
    '''
    
    def __init__(self, data, target):
        
        self.data = data
        self.target = target
        self.weights = {}
        
    def estimate_mean_std(self, a, total_sample_size = 1000, each_sample_size = 10):
        
        '''
        Estimating natural paramter of normal distribution
        '''
        
        ele_list = np.array(a)
        random_sample = np.random.choice(ele_list, size = (total_sample_size, each_sample_size))
        
        ## Estimating the value of Mean
        estimate_mean = np.mean(random_sample, axis = 1)
        best_estimate_mean = np.mean(estimate_mean)
        
        ## Taking Best estimate of standard deviation
        estimate_std = np.std(random_sample, axis = 1)
        best_estimate_std = np.mean(estimate_std)
        
        return {'mu':best_estimate_mean, 'sigma' :best_estimate_std}
    
    def estimate_bino_pValue (self, a, unique_val, total_sample_size = 1000, each_sample_size = 10):
        
        '''
        Estimating natural parameter of binomial distribution
        '''
        prob_dict = {}
        for uni in unique_val:
            if (uni in a):
                random_sample = np.random.choice(np.array(a), size = (total_sample_size, each_sample_size))
                val = []
                for sample in random_sample:
                    val.append(np.count_nonzero(uni == sample) / each_sample_size)
                prob_dict[uni] = np.mean(val)
            else:
                prob_dict[uni] = self.laplash_smoothing(len(a), len(unique_val) )
        return prob_dict
        
    def normal_pdf(self, x, mu, sigma):
        
        '''
        Proabability distribution function for Normal Distribution
        '''
        scale_parm = 1 / (np.sqrt(2 * np.pi) * sigma)
        shift = np.e ** -(((x - mu) ** 2) / (2 * sigma ** 2))
        return scale_parm * shift
    
    def calculate_class_prob(self):
        '''
        Calculating class probability for every class
        '''
        self.unique_classes = np.unique(self.data[self.target])
        self.class_prob = {}
        for class_val in self.unique_classes:
            self.class_prob[class_val] = np.count_nonzero(self.data[self.target] == class_val) / self.data.shape[0]

    def calculate_class_based_feature_prob(self, feature):
        
        '''
        Calculating probability for particular feature on the basis of resultant classes
        '''
        val = {}
        feature_unique = np.unique(feature)
        for classes in self.unique_classes:
            mask = self.data[self.target] == classes
            feature_now = feature[mask]
            val[classes] = self.estimate_bino_pValue(feature_now, feature_unique)
        return val
    
    def calculate_class_based_normal_feature(self, feature):
        val = {}
        for classes in self.unique_classes:
            mask = self.data[self.target] == classes
            feature_now = feature[mask]
            val[classes] = self.estimate_mean_std(feature_now)
        return val
    
    
    def predict(self, feature):
        
        pred = {}
        pred_class_sum = 0
        pred_class = None
        total = 0
        for classes in self.unique_classes:
            pred[classes] = {}
            sumVal = 1
            for key, val in feature.items():
                #print(key, val)
                if key in self.weights:
                    if 'mu' in self.weights[key][classes].keys():
                        pred[classes][key] = self.normal_pdf(val, self.weights[key][classes]['mu'], self.weights[key][classes]['sigma'])
                        sumVal *= pred[classes][key]
                    else:
                        pred[classes][key] = self.weights[key][classes][val]
                        sumVal *= pred[classes][key]
            pred[classes]['sum'] = sumVal
            total += sumVal
        
        posterior_prob = {}
        posterior = 0
        classes = 0
        for i in self.unique_classes:
            posterior_prob[i] = pred[i]['sum'] * self.class_prob[i] / total
            if (posterior_prob[i] > posterior):
                posterior = posterior_prob[i]
                classes = i
        return classes
    
    
    def laplash_smoothing(self, data_len, unique_val):
        return 1 / (data_len + unique_val + 1)
    
    def fit(self):
        
        '''
        Fitting data for every feature
        '''
        self.calculate_class_prob()
        self.weights['age'] = self.calculate_class_based_normal_feature(self.data['age'])
        self.weights['workclass'] = self.calculate_class_based_feature_prob(self.data['workclass'])
        self.weights['fnlwgt'] = self.calculate_class_based_normal_feature(self.data['fnlwgt'])
        self.weights['education'] = self.calculate_class_based_feature_prob(self.data['education'])
        self.weights['education-num'] = self.calculate_class_based_normal_feature(self.data['education-num'])
        self.weights['marital-status'] = self.calculate_class_based_feature_prob(self.data['marital-status'])
        self.weights['occupation'] = self.calculate_class_based_feature_prob(self.data['occupation'])
        self.weights['relationship'] = self.calculate_class_based_feature_prob(self.data['relationship'])
        self.weights['race'] = self.calculate_class_based_feature_prob(self.data['race'])
        self.weights['sex'] = self.calculate_class_based_feature_prob(self.data['sex'])
        self.weights['capital-gain'] = self.calculate_class_based_normal_feature(self.data['capital-gain'])
        self.weights['capital-loss'] = self.calculate_class_based_normal_feature(self.data['capital-loss'])
        self.weights['hours-per-week'] = self.calculate_class_based_normal_feature(self.data['hours-per-week'])
        self.weights['native-country'] = self.calculate_class_based_feature_prob(self.data['native-country'])
    
    def predict_batch(self, data):
        val = []
        for i in range(test_data.shape[0]):
            val.append(self.predict(data.iloc[i]))
        return val
    
    def predict_batchdata(self, data):
        classes = data.apply(self.predict, axis = 1)
        return classes

In [8]:
nb = NaiveBayesClassifier(train_process_data, 'class')

In [9]:
nb.fit()

In [10]:
classes = nb.predict_batchdata(test_process_data)



In [11]:
class Score_Matrix(object):
    
    '''
    This class has all score matrix by which we can check efficiency of our algorithm on given dataset.
    '''
    
    def __init__(self, y_actual, y_pred):
        self.y_actual = y_actual
        self.y_pred = np.array(y_pred)
        
    def confusion_matrix(self):
        self.data = pd.crosstab(self.y_actual, self.y_pred, rownames= ['actual'], colnames = ['predicted'], margins = True)
        return self.data
        
    def recall(self):
        '''
        data:
            Dataframe of confusion matrix
        '''
        sumVal = 0
        self.confusion_matrix()
        for i in self.data.columns[:-1]:
            sumVal += (self.data[i][i] / self.data['All'][i])
        return np.round(sumVal / (self.data.shape[0] - 1), 3)
    
    def check_accuracy(self):
        sumVal = 0
        self.confusion_matrix()
        for i in self.data.columns[:-1]:
            sumVal += self.data[i][i]
        return np.round(sumVal / self.data['All']['All'], 3)
    
    def precision(self):
        sumVal = 0
        self.confusion_matrix()
        for i in self.data.columns[:-1]:
            sumVal += (self.data[i][i] / self.data[i]['All'])
        avg = sumVal / (self.data.shape[0]- 1)
        return np.round(avg,3)
    

In [12]:
sm = Score_Matrix(test_process_data['class'], classes)

In [13]:
sm.confusion_matrix()

predicted,0,1,2,All
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,12037,398,12435
2,3,2753,1090,3846
All,3,14790,1488,16281


In [14]:
accuracy = sm.check_accuracy()
print ("Accuracy on test dataset: {}".format(accuracy))

Accuracy on test dataset: 0.806


In [15]:
recall = sm.recall()
print ('Recall on test dataset: {}'.format(recall))

Recall on test dataset: 0.626


In [17]:
precision = sm.precision()
print ('Precision on test dataset: {}'.format(precision))

Precision on test dataset: 0.773


## Applying PCA for feature reduction
PCA (Principle Component Ananlysis) is a decomposition technique by which we can reduce a very high dimensional matrix. PCA is neccessary because as the number of dimension increase. The computational power also increase. But we may have some features are not at all participating in changing classification.

PCA could only apply 

Steps to perform in PCA:

- Check for the singular matrix. If determinant of the covariance of the matrix is zero. That matrix is signular.

In [None]:
def PCA(data):
    
    # Checking for the singular matrix.
    covarince = data.cov()
    
    ## Calculating Determinant of the Covariance Matrix
    deter = np.linalg.det(convariance)
    
    ## If determinant is zero. Then data matrix is Singular
    if (deter != 0):
        print ("Determinant is not zero")
        return 
    
    print ('Determinant is zero.')
    
    