Perform Naive Bayes classification on the Titanic dataset.

In [1]:
#Import Required Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the Titanic records from the CSV file in the working directory,
# then preview the first five rows.
data = pd.read_csv('Titanic.csv')
data.head(n=5)

Unnamed: 0,Class,Gender,Age,Survived
0,3rd,Male,Child,No
1,3rd,Male,Child,No
2,3rd,Male,Child,No
3,3rd,Male,Child,No
4,3rd,Male,Child,No


In [3]:
# Display the frame's dimensions as a (rows, columns) tuple.
data.shape

(2201, 4)

In [5]:
# Count the missing values in each column.
data.isna().sum()

Class       0
Gender      0
Age         0
Survived    0
dtype: int64

In [15]:
# Convert every categorical column to integer codes using Label Encoding.
#
# Each column gets its own LabelEncoder, stored in `encoders`, so the mapping
# from integer code back to the original label (for example, which integer
# stands for 'No' in 'Survived') stays available via
# `encoders[column].classes_` or `encoders[column].inverse_transform(...)`.
# Refitting a single shared encoder on one column after another would keep
# only the mapping of the column fitted last.
encoders = {}
for column in ['Gender', 'Survived', 'Class', 'Age']:
    label_encoder = LabelEncoder()
    # The column is overwritten in `data` because later cells read it from there.
    data[column] = label_encoder.fit_transform(data[column])
    encoders[column] = label_encoder
# After the loop `label_encoder` refers to the encoder fitted on 'Age'.

In [16]:
# Separate the predictors from the label: X is every column except
# 'Survived', y is the 'Survived' column itself.
y = data['Survived']
X = data.drop(columns='Survived')

In [17]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### 1. Perform Naive Bayes Classification (Gaussian Naive Bayes):

In [18]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

In [19]:
# Create a Gaussian Naive Bayes classifier with the library's default
# settings (no constructor arguments are passed).
# NOTE(review): GaussianNB models each feature with a per-class normal
# distribution, while the features used in this notebook are integer category
# codes -- confirm this is the intended baseline.
gaussian_nb = GaussianNB()

In [20]:
# Train the Gaussian model on the training portion of the split.
gaussian_nb.fit(X=X_train, y=y_train)

In [21]:
# Predict the label of every held-out test row with the fitted Gaussian model.
y_pred = gaussian_nb.predict(X=X_test)

In [22]:
# Score the test-set predictions: overall accuracy plus a per-class text
# report. The true labels and the predictions are passed by keyword so their
# roles cannot be swapped by accident.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

In [23]:
# Print the headline accuracy first, then the per-class report.
for heading, value in (
    ("Accuracy:", accuracy),
    ("Classification Report:\n", report),
):
    print(heading, value)

Accuracy: 0.7551020408163265
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.87      0.83       300
           1       0.65      0.50      0.57       141

    accuracy                           0.76       441
   macro avg       0.72      0.69      0.70       441
weighted avg       0.75      0.76      0.75       441



#### 2. Perform Naive Bayes Classification (Multinomial Naive Bayes) - If applicable:

In [24]:
from sklearn.naive_bayes import MultinomialNB

In [25]:
# Create a Multinomial Naive Bayes classifier with the library's default
# settings (no constructor arguments are passed).
# NOTE(review): MultinomialNB is documented for count-like discrete features;
# the features used in this notebook are integer category codes, not counts.
# CategoricalNB may be the better fit -- confirm before relying on the scores.
multinomial_nb = MultinomialNB()

In [26]:
# Train the Multinomial model on the same training split.
multinomial_nb.fit(X=X_train, y=y_train)

In [27]:
# Predict the label of every held-out test row with the fitted Multinomial model.
y_pred_multinomial = multinomial_nb.predict(X=X_test)

In [28]:
# Score the Multinomial predictions against the true test labels: overall
# accuracy plus a per-class text report.
accuracy_multinomial = accuracy_score(y_true=y_test, y_pred=y_pred_multinomial)
report_multinomial = classification_report(y_true=y_test, y_pred=y_pred_multinomial)

In [30]:
# Print the Multinomial model's accuracy first, then its per-class report.
for heading, value in (
    ("Multinomial Naive Bayes Accuracy:", accuracy_multinomial),
    ("Multinomial Naive Bayes Classification Report:\n", report_multinomial),
):
    print(heading, value)

Multinomial Naive Bayes Accuracy: 0.6848072562358276
Multinomial Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.98      0.81       300
           1       0.56      0.06      0.11       141

    accuracy                           0.68       441
   macro avg       0.63      0.52      0.46       441
weighted avg       0.65      0.68      0.59       441



### Hyperparameter Tuning
#### GridSearchCV

In [31]:
from sklearn.model_selection import GridSearchCV

In [32]:
# Candidate values for GaussianNB's `var_smoothing`, one per decade from
# 1e-9 up to 1. The grid search tries each of them.
smoothing_candidates = [
    1e-9,
    1e-8,
    1e-7,
    1e-6,
    1e-5,
    1e-4,
    1e-3,
    1e-2,
    1e-1,
    1,
]
param_grid = {'var_smoothing': smoothing_candidates}

In [33]:
# Set up an exhaustive search over `param_grid` for the Gaussian model,
# scored with 5-fold cross-validation and run on all available cores.
grid_search = GridSearchCV(
    estimator=gaussian_nb,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

In [34]:
# Run the search on the training split only, so the test set stays unseen.
grid_search.fit(X=X_train, y=y_train)

In [35]:
# Pull the winning hyperparameter setting and the estimator fitted with it
# out of the completed search.
best_params, best_model = grid_search.best_params_, grid_search.best_estimator_

In [36]:
# Predict the held-out test rows with the tuned model.
# `y_pred` is rebound here, replacing any predictions stored under that name.
y_pred = best_model.predict(X=X_test)

In [37]:
# Score the tuned model's test-set predictions: overall accuracy plus a
# per-class text report.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

In [38]:
# Print the selected hyperparameters, then the tuned model's accuracy and
# per-class report.
for heading, value in (
    ("Best Parameters:", best_params),
    ("Accuracy:", accuracy),
    ("Classification Report:\n", report),
):
    print(heading, value)

Best Parameters: {'var_smoothing': 1e-09}
Accuracy: 0.7551020408163265
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.87      0.83       300
           1       0.65      0.50      0.57       141

    accuracy                           0.76       441
   macro avg       0.72      0.69      0.70       441
weighted avg       0.75      0.76      0.75       441

