## Naive Bayes to a Kaggle dataset using object-oriented programming in Python. We'll use the famous "Titanic: Machine Learning from Disaster" dataset, which contains information about passengers on the Titanic and whether or not they survived

In [20]:
import pandas as pd
import numpy as np

# load the dataset
df = pd.read_csv('train.csv')

# display the first few rows of the dataset
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [21]:
df.shape

(891, 12)

In [22]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [23]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [24]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# drop columns that we don't need
df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# convert categorical variables to numerical variables
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Embarked'] = le.fit_transform(df['Embarked'].astype(str))

# drop rows with missing values
df = df.dropna()

# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Survived'], axis=1), 
                                                    df['Survived'], 
                                                    test_size=0.2, 
                                                    random_state=42)


In [25]:
class NaiveBayesClassifier:
    def __init__(self, alpha=1):
        self.alpha = alpha  # Laplace smoothing parameter
        self.classes = None  # list of class labels
        self.class_priors = None  # dictionary of class priors
        self.class_likelihoods = None  # dictionary of class conditional likelihoods
    
    def fit(self, X, y):
        self.classes = list(set(y))
        self.class_priors = {}
        self.class_likelihoods = {}
        
        # calculate class priors
        for c in self.classes:
            self.class_priors[c] = (y == c).sum() / len(y)
        
        # calculate class conditional likelihoods
        for c in self.classes:
            # get the subset of training data belonging to class c
            X_c = X[y == c]
            # calculate the mean and variance of each feature in X_c
            means = X_c.mean(axis=0)
            variances = X_c.var(axis=0) + self.alpha
            # store the mean and variance for class c
            self.class_likelihoods[c] = (means, variances)
    
    def predict(self, X):
        # calculate the log posterior probability of each class for each instance in X
        log_posteriors = []
        for x in X:
            log_p_c = {}
            for c in self.classes:
                # calculate the log prior probability of class c
                log_p_c[c] = np.log(self.class_priors[c])
                # calculate the log likelihood of x given class c
                means, variances = self.class_likelihoods[c]
                log_likelihood = np.sum(-0.5 * np.log(2 * np.pi * variances) - 0.5 * ((x - means) ** 2 / variances), axis=0)
                log_p_c[c] += log_likelihood.sum()
            log_posteriors.append(log_p_c)
        
        # predict the class with the highest log posterior probability for each instance in X
        y_pred = []
        for log_p_c in log_posteriors:
            max_log_p = -np.inf
            max_c = None
            for c, log_p in log_p_c.items():
                if log_p > max_log_p:
                    max_log_p = log_p
                    max_c = c
            y_pred.append(max_c)
        return y_pred


In [26]:
#create an instance of the NaiveBayesClassifier class
nb = NaiveBayesClassifier()

#fit the training data
nb.fit(X_train.values, y_train.values)

#make predictions on the testing data
y_pred = nb.predict(X_test.values)

#calculate accuracy
acc = accuracy_score(y_test, y_pred)
print(f'Accuracy: {acc:.2f}')

Accuracy: 0.68
