In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Dataset: https://archive.ics.uci.edu/ml/datasets/Image+Segmentation<br>
Training data link: https://archive.ics.uci.edu/ml/machine-learning-databases/image/segmentation.data<br>
Testing data link: https://archive.ics.uci.edu/ml/machine-learning-databases/image/segmentation.test

In [2]:
train_data = pd.read_csv('.//..//Dataset//segmentation_train.csv', sep = ',')
test_data = pd.read_csv('.//..//Dataset//segmentation_test.csv', sep = ',')

In [3]:
train_data.head()

Unnamed: 0,CLASS,REGION-CENTROID-COL,REGION-CENTROID-ROW,REGION-PIXEL-COUNT,SHORT-LINE-DENSITY-5,SHORT-LINE-DENSITY-2,VEDGE-MEAN,VEDGE-SD,HEDGE-MEAN,HEDGE-SD,INTENSITY-MEAN,RAWRED-MEAN,RAWBLUE-MEAN,RAWGREEN-MEAN,EXRED-MEAN,EXBLUE-MEAN,EXGREEN-MEAN,VALUE-MEAN,SATURATION-MEAN,HUE-MEAN
0,BRICKFACE,140,125,9,0.0,0.0,0.277778,0.062963,0.666667,0.311111,6.185185,7.333334,7.666666,3.555556,3.444444,4.444445,-7.888889,7.777778,0.545635,-1.121818
1,BRICKFACE,188,133,9,0.0,0.0,0.333333,0.266667,0.5,0.077778,6.666666,8.333334,7.777778,3.888889,5.0,3.333333,-8.333333,8.444445,0.53858,-0.924817
2,BRICKFACE,105,139,9,0.0,0.0,0.277778,0.107407,0.833333,0.522222,6.111111,7.555555,7.222222,3.555556,4.333334,3.333333,-7.666666,7.555555,0.532628,-0.965946
3,BRICKFACE,34,137,9,0.0,0.0,0.5,0.166667,1.111111,0.474074,5.851852,7.777778,6.444445,3.333333,5.777778,1.777778,-7.555555,7.777778,0.573633,-0.744272
4,BRICKFACE,39,111,9,0.0,0.0,0.722222,0.374074,0.888889,0.429629,6.037037,7.0,7.666666,3.444444,2.888889,4.888889,-7.777778,7.888889,0.562919,-1.175773


In [4]:
train_data.columns

Index(['CLASS', 'REGION-CENTROID-COL', 'REGION-CENTROID-ROW',
       'REGION-PIXEL-COUNT', 'SHORT-LINE-DENSITY-5', 'SHORT-LINE-DENSITY-2',
       'VEDGE-MEAN', 'VEDGE-SD', 'HEDGE-MEAN', 'HEDGE-SD', 'INTENSITY-MEAN',
       'RAWRED-MEAN', 'RAWBLUE-MEAN', 'RAWGREEN-MEAN', 'EXRED-MEAN',
       'EXBLUE-MEAN', 'EXGREEN-MEAN', 'VALUE-MEAN', 'SATURATION-MEAN',
       'HUE-MEAN'],
      dtype='object')

In [69]:
class NaiveBayesClassifier (object):
    
    def __init__(self, data, target):
        
        self.data = data
        self.target = target
        self.weights = {}
        
    def estimate_mean_std(self, a, total_sample_size = 1000, each_sample_size = 10):
        
        '''
        Estimating natural paramter of normal distribution
        '''
        
        ele_list = np.array(a)
        random_sample = np.random.choice(ele_list, size = (total_sample_size, each_sample_size))
        
        ## Estimating the value of Mean
        estimate_mean = np.mean(random_sample, axis = 1)
        best_estimate_mean = np.mean(estimate_mean)
        
        ## Taking Best estimate of standard deviation
        estimate_std = np.std(random_sample, axis = 1)
        best_estimate_std = np.mean(estimate_std)
        
        return {'mu':best_estimate_mean, 'sigma' :best_estimate_std}
    
    def estimate_bino_pValue (self, a, unique_val, total_sample_size = 1000, each_sample_size = 10):
        
        '''
        Estimating natural parameter of binomial distribution
        '''
        prob_dict = {}
        for uni in unique_val:
            if (uni in a):
                random_sample = np.random.choice(np.array(a), size = (total_sample_size, each_sample_size))
                val = []
                for sample in random_sample:
                    val.append(np.count_nonzero(uni == sample) / each_sample_size)
                prob_dict[uni] = np.mean(val)
            else:
                prob_dict[uni] = self.laplash_smoothing(len(a), len(unique_val))
        return prob_dict
        
    def normal_pdf(self, x, mu, sigma):
        
        '''
        Proabability distribution function for Normal Distribution
        '''
        scale_parm = 1 / (np.sqrt(2 * np.pi) * sigma)
        shift = np.e ** -(((x - mu) ** 2) / (2 * sigma ** 2))
        return scale_parm * shift
    
    def calculate_class_prob(self):
        '''
        Calculating class probability for every class
        '''
        self.unique_classes = np.unique(self.data[self.target])
        self.class_prob = {}
        for class_val in self.unique_classes:
            self.class_prob[class_val] = np.count_nonzero(self.data[self.target] == class_val) / self.data.shape[0]

    def calculate_class_based_feature_prob(self, feature):
        
        '''
        Calculating probability for particular feature on the basis of resultant classes
        '''
        val = {}
        feature_unique = np.unique(feature)
        for classes in self.unique_classes:
            mask = self.data[self.target] == classes
            feature_now = feature[mask]
            val[classes] = self.estimate_bino_pValue(feature_now, feature_unique)
        return val
    
    def calculate_class_based_normal_feature(self, feature):
        val = {}
        for classes in self.unique_classes:
            mask = self.data[self.target] == classes
            feature_now = feature[mask]
            val[classes] = self.estimate_mean_std(feature_now)
        return val
    
    
    def predict(self, feature):
        
        pred = {}
        pred_class_sum = 0
        pred_class = None
        total = 0
        for classes in self.unique_classes:
            pred[classes] = {}
            sumVal = 1
            for key, val in feature.items():
                #print(key, val)
                if key in self.weights:
                    if 'mu' in self.weights[key][classes].keys():
                        pred[classes][key] = self.normal_pdf(val, self.weights[key][classes]['mu'], self.weights[key][classes]['sigma'])
                        sumVal *= pred[classes][key]
                    else:
                        if (val not in self.weights[key][classes]):
                            pred[classes][key] = self.laplash_smoothing(self.data.shape[0], len(self.weights[key][classes]))
                        else:
                            pred[classes][key] = self.weights[key][classes][val]
                            sumVal *= pred[classes][key]
                    
            pred[classes]['sum'] = sumVal
            total += sumVal
        
        posterior_prob = {}
        posterior = 0
        classes = 0
        for i in self.unique_classes:
            posterior_prob[i] = pred[i]['sum'] * self.class_prob[i] / total
            if (posterior_prob[i] > posterior):
                posterior = posterior_prob[i]
                classes = i
        return classes
    
    
    def laplash_smoothing(self, data_len, unique_val):
        return 1 / (data_len + len(self.unique_classes) + 1)
    
    
    def fit(self):
        self.calculate_class_prob()
        self.weights['REGION-CENTROID-COL'] = self.calculate_class_based_normal_feature(self.data['REGION-CENTROID-COL'])
        self.weights['REGION-CENTROID-ROW'] = self.calculate_class_based_normal_feature(self.data['REGION-CENTROID-ROW'])
        self.weights['SHORT-LINE-DENSITY-5'] = self.calculate_class_based_feature_prob(self.data['SHORT-LINE-DENSITY-5'])
        self.weights['SHORT-LINE-DENSITY-2'] = self.calculate_class_based_feature_prob(self.data['SHORT-LINE-DENSITY-2'])
        self.weights['VEDGE-MEAN'] = self.calculate_class_based_normal_feature(self.data['VEDGE-MEAN'])
        self.weights['VEDGE-SD'] = self.calculate_class_based_normal_feature(self.data['VEDGE-SD'])
        self.weights['HEDGE-MEAN'] = self.calculate_class_based_normal_feature(self.data['HEDGE-MEAN'])
        self.weights['HEDGE-SD'] = self.calculate_class_based_normal_feature(self.data['HEDGE-SD'])
        self.weights['INTENSITY-MEAN'] = self.calculate_class_based_normal_feature(self.data['INTENSITY-MEAN'])
        self.weights['RAWRED-MEAN'] = self.calculate_class_based_normal_feature(self.data['RAWRED-MEAN'])
        self.weights['RAWBLUE-MEAN'] = self.calculate_class_based_normal_feature(self.data['RAWBLUE-MEAN'])
        self.weights['RAWGREEN-MEAN'] = self.calculate_class_based_normal_feature(self.data['RAWGREEN-MEAN'])
        self.weights['EXRED-MEAN'] = self.calculate_class_based_normal_feature(self.data['EXRED-MEAN'])
        self.weights['EXBLUE-MEAN'] = self.calculate_class_based_normal_feature(self.data['EXBLUE-MEAN'])
        self.weights['EXGREEN-MEAN'] = self.calculate_class_based_normal_feature(self.data['EXGREEN-MEAN'])
        self.weights['VALUE-MEAN'] = self.calculate_class_based_normal_feature(self.data['VALUE-MEAN'])
        self.weights['SATURATION-MEAN'] = self.calculate_class_based_normal_feature(self.data['SATURATION-MEAN'])
        self.weights['HUE-MEAN'] = self.calculate_class_based_normal_feature(self.data['HUE-MEAN'])
        
    def predict_batch(self, data):
        classes = data.apply(self.predict, axis = 1)
        return classes

In [70]:
def dimension_reduction(data):
    ''' 
    Data is a DataFrame
    '''
    data1 = data
    for i in data.columns:
        if (np.unique(data[i]) == 1) or (data[i].var() == 0):
            data1 = data1.drop(i)
    return data1

In [71]:
#data = dimension_reduction(train_data)

In [72]:
nb = NaiveBayesClassifier(train_data, 'CLASS')
nb.fit()

In [73]:
train_data.shape

(210, 20)

In [74]:
test_data.shape

(2100, 20)

In [75]:
nb.predict(test_data.iloc[0].to_dict())

'GRASS'

In [76]:
classes = nb.predict_batch(test_data)

In [77]:
nb.weights

{'REGION-CENTROID-COL': {'BRICKFACE': {'mu': 83.47510000000001,
   'sigma': 58.13779301999442},
  'CEMENT': {'mu': 151.0422, 'sigma': 59.278134952571754},
  'FOLIAGE': {'mu': 76.6173, 'sigma': 56.63148274992549},
  'GRASS': {'mu': 129.8469, 'sigma': 74.52000820274426},
  'PATH': {'mu': 150.6774, 'sigma': 67.36970187807935},
  'SKY': {'mu': 116.3783, 'sigma': 53.30035936595884},
  'WINDOW': {'mu': 165.2322, 'sigma': 61.46778390554555}},
 'REGION-CENTROID-ROW': {'BRICKFACE': {'mu': 109.4233,
   'sigma': 26.47291809935071},
  'CEMENT': {'mu': 96.6555, 'sigma': 35.69301995281916},
  'FOLIAGE': {'mu': 111.54939999999999, 'sigma': 19.687993191790802},
  'GRASS': {'mu': 203.08859999999999, 'sigma': 24.498371894936753},
  'PATH': {'mu': 187.25039999999998, 'sigma': 10.253819682432894},
  'SKY': {'mu': 45.698800000000006, 'sigma': 23.13312701729746},
  'WINDOW': {'mu': 104.73219999999999, 'sigma': 35.85852812403088}},
 'SHORT-LINE-DENSITY-5': {'BRICKFACE': {0.0: 0.9688000000000001,
   0.1111111

In [78]:
class Score_Matrix(object):
    
    '''
    This class has all score matrix by which we can check efficiency of our algorithm on given dataset.
    '''
    
    def __init__(self, y_actual, y_pred):
        self.y_actual = y_actual
        self.y_pred = np.array(y_pred)
        
    def confusion_matrix(self):
        self.data = pd.crosstab(self.y_actual, self.y_pred, rownames= ['actual'], colnames = ['predicted'], margins = True)
        return self.data
        
    def recall(self):
        '''
        data:
            Dataframe of confusion matrix
        '''
        sumVal = 0
        self.confusion_matrix()
        for i in self.data.columns[:-1]:
            sumVal += (self.data[i][i] / self.data['All'][i])
        return np.round(sumVal / (self.data.shape[0] - 1), 3)
    
    def check_accuracy(self):
        sumVal = 0
        self.confusion_matrix()
        for i in self.data.columns[:-1]:
            sumVal += self.data[i][i]
        return np.round(sumVal / self.data['All']['All'], 3)
    
    def precision(self):
        sumVal = 0
        self.confusion_matrix()
        for i in self.data.columns[:-1]:
            sumVal += (self.data[i][i] / self.data[i]['All'])
        avg = sumVal / (self.data.shape[0]- 1)
        return np.round(avg,3)
    

In [79]:
sm = Score_Matrix(test_data['CLASS'], classes)

In [80]:
print ('Accuracy on test data: {}'.format(sm.check_accuracy()))

Accuracy on test data: 0.774


In [81]:
print ('Recall value on test data: {}'.format(sm.recall()))

Recall value on test data: 0.774


In [82]:
print ('Precision value on test data: {}'.format(sm.precision()))

Precision value on test data: 0.781


In [83]:
from sklearn.naive_bayes import GaussianNB

In [84]:
nb = GaussianNB()

In [85]:
x = train_data.drop('CLASS', axis = 1)
y = train_data['CLASS']

In [86]:
nb.fit(x, y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [87]:
x_test = test_data.drop('CLASS', axis = 1)
y_test = test_data['CLASS']

In [88]:
nb.predict(x_test)

array(['GRASS', 'GRASS', 'GRASS', ..., 'WINDOW', 'BRICKFACE', 'BRICKFACE'],
      dtype='<U9')

In [89]:
nb.score(x_test, y_test)

0.7952380952380952

# Applying PCA to reduce the number of features. 

In [19]:
def PCA(data, labels):
    
    ## We are taking the data columns. 
    global unique_classes
    
    cov_mat = list(map(lambda y: data[y == labels].cov(), unique_classes))

    svd_mat = list(map(lambda x: np.linalg.svd(x), cov_mat))
    
    ## Taking column number 
    col_no = np.array(list(map(lambda x: np.argmax(x[0], axis = 1), svd_mat)))
    
    
    col_val = list(map(lambda x: set(col_no[:, x]), np.arange(col_no.shape[1])))

    cols = []
    
    for i in col_val:
        if (len(i) == 1):
            cols.extend(i)
        
    return cols