In [1]:
import pandas as pd
import numpy as np

# load the dataset
df = pd.read_csv('train.csv')

# display the first few rows of the dataset
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# drop columns that we don't need
df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# convert categorical variables to numerical variables
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Embarked'] = le.fit_transform(df['Embarked'].astype(str))

# drop rows with missing values
df = df.dropna()


In [3]:
# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Survived'], axis=1), 
                                                    df['Survived'], 
                                                    test_size=0.2, 
                                                    random_state=42)


In [4]:
import time

start = time.time()

# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Survived'], axis=1), 
                                                    df['Survived'], 
                                                    test_size=0.2, 
                                                    random_state=42)

class NaiveBayesClassifier1:
    def __init__(self, alpha=1):
        self.alpha = alpha  # Laplace smoothing parameter
        self.classes = None  # list of class labels
        self.class_priors = None  # dictionary of class priors
        self.class_likelihoods = None  # dictionary of class conditional likelihoods
    
    def fit(self, X, y):
        self.classes = list(set(y))
        self.class_priors = {}
        self.class_likelihoods = {}
        
        # calculate class priors
        for c in self.classes:
            self.class_priors[c] = (y == c).sum() / len(y)
        
        # calculate class conditional likelihoods
        for c in self.classes:
            # get the subset of training data belonging to class c
            X_c = X[y == c]
            # calculate the mean and variance of each feature in X_c
            means = X_c.mean(axis=0)
            variances = X_c.var(axis=0) + self.alpha
            # store the mean and variance for class c
            self.class_likelihoods[c] = (means, variances)
    
    def predict(self, X):
        # calculate the log posterior probability of each class for each instance in X
        log_posteriors = []
        for x in X:
            log_p_c = {}
            for c in self.classes:
                # calculate the log prior probability of class c
                log_p_c[c] = np.log(self.class_priors[c])
                # calculate the log likelihood of x given class c
                means, variances = self.class_likelihoods[c]
                log_likelihood = np.sum(-0.5 * np.log(2 * np.pi * variances) - 0.5 * ((x - means) ** 2 / variances), axis=0)
                log_p_c[c] += log_likelihood.sum()
            log_posteriors.append(log_p_c)
        
        # predict the class with the highest log posterior probability for each instance in X
        y_pred = []
        for log_p_c in log_posteriors:
            max_log_p = -np.inf
            max_c = None
            for c, log_p in log_p_c.items():
                if log_p > max_log_p:
                    max_log_p = log_p
                    max_c = c
            y_pred.append(max_c)
        return y_pred
    
#create an instance of the NaiveBayesClassifier class
nb1 = NaiveBayesClassifier1()

#fit the training data
nb1.fit(X_train.values, y_train.values)

#make predictions on the testing data
y_pred = nb1.predict(X_test.values)

#calculate accuracy
acc = accuracy_score(y_test, y_pred)
print(f'Accuracy: {acc:.2f}')


end = time.time()

print('\n')
print("Time taken to run code in build code in seconds -- ",end - start)

print('\n')
print("Time taken to run code in Sklearn in milli seconds -- ",((end - start)*1000))


Accuracy: 0.68


Time taken to run code in build code in seconds --  0.013445138931274414


Time taken to run code in Sklearn in milli seconds --  13.445138931274414


In [5]:

import time

start = time.time()


#create an instance of the NaiveBayesClassifier class
nb1 = NaiveBayesClassifier1()

#fit the training data
nb1.fit(X_train.values, y_train.values)

end = time.time()

print('\n')
print("Time taken to run code in build code in seconds -- ",(end - start)*1000)




Time taken to run code in build code in seconds --  1.9614696502685547


In [6]:

import time

start = time.time()

class NaiveBayesClassifier:
    def __init__(self, alpha=1):
        self.alpha = alpha  # Laplace smoothing parameter
        self.classes = None  # list of class labels
        self.class_priors = None  # dictionary of class priors
        self.class_likelihoods = None  # dictionary of class conditional likelihoods
    
    def fit(self, X, y):
        self.classes = list(set(y))
        self.class_priors = {}
        self.class_likelihoods = {}
        
        # calculate class priors
        for c in self.classes:
            self.class_priors[c] = (y == c).sum() / len(y)
        
        # calculate class conditional likelihoods
        for c in self.classes:
            # get the subset of training data belonging to class c
            X_c = X[y == c]
            # calculate the mean and variance of each feature in X_c
            means = X_c.mean(axis=0)
            variances = X_c.var(axis=0) + self.alpha
            # store the mean and variance for class c
            self.class_likelihoods[c] = (means, variances)
    
    def predict(self, X):
        # calculate the log posterior probability of each class for each instance in X
        log_posteriors = list(map(lambda x: {c: np.log(self.class_priors[c]) +
                                np.sum(-0.5 * np.log(2 * np.pi * self.class_likelihoods[c][1]) -
                                       0.5 * ((x - self.class_likelihoods[c][0]) ** 2 / self.class_likelihoods[c][1]).sum())
                                for c in self.classes}, X))
        
        # predict the class with the highest log posterior probability for each instance in X
        y_pred = list(map(lambda p: max(p, key=p.get), log_posteriors))
        return y_pred

    #create an instance of the NaiveBayesClassifier class
nb = NaiveBayesClassifier()

#fit the training data
nb.fit(X_train.values, y_train.values)

#make predictions on the testing data
y_pred = nb.predict(X_test.values)

#calculate accuracy
acc = accuracy_score(y_test, y_pred)
print(f'Accuracy: {acc:.2f}')


end = time.time()

print('\n')
print("Time taken to run code in build code in seconds -- ",end - start)

print('\n')
print("Time taken to run code in Sklearn in milli seconds -- ",((end - start)*1000))


Accuracy: 0.73


Time taken to run code in build code in seconds --  0.01665043830871582


Time taken to run code in Sklearn in milli seconds --  16.65043830871582


In [7]:
import time

start = time.time()


#create an instance of the NaiveBayesClassifier class
nb = NaiveBayesClassifier1()

#fit the training data
nb.fit(X_train.values, y_train.values)

end = time.time()

print('\n')
print("Time taken to run code in build code in seconds -- ",(end - start)*1000)




Time taken to run code in build code in seconds --  3.0019283294677734


In [8]:
import time

start = time.time()

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)



from sklearn.metrics import accuracy_score, precision_score, recall_score
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print("Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f}".format(accuracy, precision, recall))


end = time.time()

print('\n')
print("Time taken to run code in Sklearn in seconds -- ",end - start)

print('\n')
print("Time taken to run code in Sklearn in milli seconds -- ",((end - start)*1000))

Accuracy: 0.77, Precision: 0.72, Recall: 0.68


Time taken to run code in Sklearn in seconds --  0.014291048049926758


Time taken to run code in Sklearn in milli seconds --  14.291048049926758


In [9]:
import time

start = time.time()

clf.fit(X_train, y_train)

end = time.time()

print('\n')
print("Time taken to run code in Sklearn in seconds -- ",(end - start)*1000)




Time taken to run code in Sklearn in seconds --  4.569292068481445


In [10]:
import pandas as pd

# code 2
result1 = [1.9614, "68 %"]
df1 = pd.DataFrame({'without using lambda ': result1})

# code 3
result2 = [3.0019, "73 %"]
df2 = pd.DataFrame({'using lambda': result2})

# code 1
result4 = [4.56929, "77 %"]
df4 = pd.DataFrame({'using sklearn': result4})

# code 4
result3 = ['Time stamp', 'accuracy']
df3 = pd.DataFrame({'Column': result3})


# display the results in a table
df = pd.concat([df3, df1, df2, df4], axis=1)

print("\n")
print(df)




       Column without using lambda  using lambda using sklearn
0  Time stamp                1.9614       3.0019       4.56929
1    accuracy                  68 %         73 %          77 %


#### * The time taken by sklearn is more than our code with or without lambda function