In [1]:
#Import the required libraries
import numpy as np
import pandas as pd

In [2]:
#import the csv file
data = pd.read_csv('/home/srija/Downloads/Compressed/Datasets/Social_Network_Ads.csv')

In [3]:
#take a look at data
data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [4]:
#Extract the columns
x = data.iloc[:,2:4].values
y = data.iloc[:,-1].values
#Reshape 1-D array to 2-D array
y = y.reshape(len(y),1)

In [5]:
#Perform Feature Scaling on features
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = sc.fit_transform(x)

In [6]:
#Combine features and target into a single 2-D array
X = np.hstack((x,y))

In [7]:
#Split the data into training and testing samples 
from sklearn.model_selection import train_test_split
train_data,test_data = train_test_split(X,test_size = 0.2,random_state=42)

In [8]:
#Take a look at size of training samples
train_data.shape

(320, 3)

In [9]:
#Take a look at size of testing samples
test_data.shape

(80, 3)

In [10]:
#Split the dataset by classvalues,returns a dictionary
def seperate_byclass(data):
    dict = {}
    for i in range(len(data)):
        arr = data[i]
        class_name = arr[-1]
        if class_name not in dict:
            dict[class_name] = []
        dict[class_name].append(arr)
    return dict

In [11]:
#Calculate the mean for a list of numbers
def mean(num):
    return np.mean(num)

In [12]:
#calculate standard deviation for a list of numbers
def standard_deviation(num):
   return np.std(num)

In [13]:
#calculate mean,standard deviation and count for each column in dataset
def MeanAndStdDev(data):
    tuples = []
    for column in zip(*data):
        tuples.append([mean(column),standard_deviation(column),len(column)])
    del tuples[-1]
    return tuples

In [14]:
#Find Mean,standard deviation under each class
def MeanAndStdDev_forClass(data):
    dict = seperate_byclass(data)
    info = {}
    for classvalue,instances in dict.items():
        info[classvalue] = MeanAndStdDev(instances)
    return info

In [15]:
#Calculate Gaussian Probability Density function
from math import pi
from math import exp
def calculate_probability(x,mean,stdev):
    expo = exp(-(x-mean)**2/2*stdev**2)
    return (1/(np.sqrt(2*pi))*stdev)*expo

In [16]:
#Calculate the class probabilties
def probability_forClass(info,test):
    probabilities = {}
    total_rows =0
    for classvalue in info:
        total_rows += info[classvalue][0][2]
    for classvalue,classsummaries in info.items():
        probabilities[classvalue] = info[classvalue][0][2]/float(total_rows)
        for i in range(len(classsummaries)):
            mean,stdev,count = classsummaries[i]
            probabilities[classvalue] *= calculate_probability(test[i],mean,stdev)
    return probabilities

In [17]:
#Make prediction - Highest probability is the prediction
def predict(info,test):
    probabilities = probability_forClass(info,test)
    bestlabel = None
    bestproba = -1
    for classvalue,probability in probabilities.items():
        if bestlabel is None or probability > bestproba:
            bestproba = probability
            bestlabel = classvalue
    return bestlabel

In [18]:
#Returns predictions for a set of data
def getPrediction(info,test):
    predictions = []
    for i in range(len(test)):
        result = predict(info,test[i])
        predictions.append(result)
    return predictions

In [19]:
#prepare the model
info = MeanAndStdDev_forClass(train_data)

In [20]:
#test model
predictions = getPrediction(info,test_data)

In [21]:
#Actual values
test_data[:,-1]

array([0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
       0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0.,
       1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0.,
       1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1.,
       0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 1.])

In [22]:
#Predicted values
predictions = np.array(predictions)
predictions

array([0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1.,
       0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.])

In [23]:
#Accuracy Score
def accuracy(test_data,predictions):
    correct = 0
    for i in range(len(test_data)):
        if(test_data[i][-1] == predictions[i]):
            correct +=1
    return (correct/len(test_data))*100

In [24]:
#Check the accuracy of the model
accuracy(test_data,predictions)

81.25