In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, plot_confusion_matrix


import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Directory that holds the Titanic CSV files; both paths are derived from it.
_TITANIC_DIR = "../input/titanic"
TRAIN_DATA_PATH = _TITANIC_DIR + "/train.csv"
TEST_DATA_PATH = _TITANIC_DIR + "/test.csv"

In [None]:
data = pd.read_csv(TRAIN_DATA_PATH)
data.head()

## Exploring the Data

In [None]:
data.info()

In [None]:
# Duplicate check: True when every row carries a distinct PassengerId
# (missing ids, if any, are counted as one value).
data["PassengerId"].nunique(dropna=False) == len(data)

We can see that there are no duplicates in the data. Let us drop the PassengerId, Name, and Ticket columns from the data.

In [None]:
# Remove the identifier / free-text columns; `data` is rebound to the reduced frame.
data = data.drop(columns=["PassengerId", "Name", "Ticket"])

In [None]:
data.head()

In [None]:
# Count missing values per column once, then show them as text and as a bar chart.
missing_counts = data.isnull().sum()
print(missing_counts)
missing_counts.plot.bar()
plt.title("NaN values Plot")
plt.show()

## Univariate Analysis

In [None]:
# Class balance of the target: percentage of passengers per outcome, plus a bar chart.

counts = data["Survived"].value_counts()
diag_cols = ["Not Survived", "Survived"]
# Looked up by label (0 = not survived, 1 = survived) so the order matches diag_cols.
diag_counts = [counts[label] for label in (0, 1)]

total_passengers = sum(diag_counts)
nd = (diag_counts[0] / total_passengers) * 100
d = (diag_counts[1] / total_passengers) * 100

for outcome, percentage in (("Survived", d), ("Not Survived", nd)):
    print(f"{outcome}: {percentage}%")

print()

plt.figure(figsize=(10, 8))
sns.barplot(x=diag_cols, y=diag_counts)
plt.show()

In [None]:
data["Pclass"].unique()

In [None]:
# Passengers per ticket class, ordered from the rarest to the most common class.
pclass_counts = data["Pclass"].value_counts().sort_values()
pclass_counts.plot.bar()
plt.show()

In [None]:
data.groupby("Pclass")["Survived"].mean()

Here we can see that the survival rate was highest in 1st class, which is expected: 1st-class passengers were given priority over 2nd- and 3rd-class passengers.

In [None]:
data.head()

In [None]:
# Number of passengers per sex.
sex_counts = data["Sex"].value_counts()
sex_counts.plot.bar()
plt.show()

In [None]:
data.groupby("Sex")["Survived"].mean()

Here we can see that the survivors were mostly female, which is expected. Let us change the encoding of the Sex feature by replacing it with values generated using target-guided encoding.

In [None]:
# Binary encoding for Sex (female -> 1, male -> 0); the mapped values are
# written back as a fresh array.
sex_map = dict(female=1, male=0)
data["Sex"] = data["Sex"].map(sex_map).to_numpy(copy=True)

In [None]:
data.head()

In [None]:
print(f"Number of missing values in Age: {data['Age'].isnull().sum()}")

In [None]:
# Histogram (50 bins) with a KDE overlay of passenger ages before imputation.
fig, ax = plt.subplots(figsize=(11, 7))
sns.histplot(data["Age"], kde=True, bins=50, ax=ax)
ax.set_title("Age Distribution")
plt.show()

Using mean imputation let us fill the missing values in the age feature.

In [None]:
# Mean-impute the missing ages. The result is assigned back to the column
# instead of calling fillna(inplace=True) on `data["Age"]`: the in-place form
# acts on an intermediate object, is deprecated in pandas 2.x, and leaves `data`
# unchanged when Copy-on-Write is enabled.
data["Age"] = data["Age"].fillna(data["Age"].mean())

In [None]:
# Age histogram after imputation. The title now names the method that was
# actually applied (mean imputation), not random-sample imputation.
plt.figure(figsize = (11, 7))
sns.histplot(data["Age"], kde=True, bins = 50)
plt.title("Age Distribution after Mean Imputation")
plt.show()

In [None]:
print(f"Number of missing values in Age: {data['Age'].isnull().sum()}")

In [None]:
data.head()

In [None]:
data["SibSp"].unique()

In [None]:
# Number of passengers per SibSp value.
sibsp_counts = data["SibSp"].value_counts()
sibsp_counts.plot.bar()
plt.show()

In [None]:
data.groupby("SibSp")["Survived"].mean()

In [None]:
data.head()

In [None]:
data["Parch"].unique()

In [None]:
# Number of passengers per Parch value.
parch_counts = data["Parch"].value_counts()
parch_counts.plot.bar()
plt.show()

In [None]:
data.head()

In [None]:
# Histogram (50 bins) with a KDE overlay of ticket fares.
fig, ax = plt.subplots(figsize=(11, 7))
sns.histplot(data["Fare"], kde=True, bins=50, ax=ax)
ax.set_title("Fare Distribution")
plt.show()

In [None]:
data["Fare"].isnull().sum()

In [None]:
print(f"Number of missing values in Cabin: {data['Cabin'].isnull().sum()}")

In [None]:
# Cabin is removed from the feature set; `data` is rebound to the reduced frame.
data = data.drop(columns=["Cabin"])

In [None]:
data.head()

In [None]:
data["Embarked"].unique()

In [None]:
# Fill missing ports with the most frequent one. Assigned back to the column
# rather than using fillna(inplace=True) on a column selection, which is
# deprecated in pandas 2.x and does not modify `data` under Copy-on-Write.
most_common_port = data["Embarked"].mode()[0]
data["Embarked"] = data["Embarked"].fillna(most_common_port)

In [None]:
data.head()

In [None]:
data["Embarked"].unique()

In [None]:
data.groupby("Embarked")["Survived"].mean()

In [None]:
# Integer codes for the port of embarkation (S -> 0, Q -> 1, C -> 2).
embark_map = dict(S=0, Q=1, C=2)

encoded_ports = data["Embarked"].map(embark_map)
data["Embarked"] = encoded_ports.to_numpy()

In [None]:
data.head()

## Bivariate Analysis

In [None]:
continuous_data_cols = ["Age", "Fare"]
# sns.pairplot is figure-level: it creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty extra figure in the output.
# The grid is sized through `height` (inches per facet) instead:
# 2 facets x 5 in gives roughly the 10 x 10 in that was intended.
sns.pairplot(data[continuous_data_cols + ["Survived"]], hue="Survived", height=5)
plt.show()

## Cleaning the Test Data

In [None]:
# Apply the same cleaning steps to the test set as were applied to the training set.
test_data = pd.read_csv(TEST_DATA_PATH)

# PassengerId stays in the frame because the submission needs it; the other
# non-feature columns are dropped as for the training data.
test_data = test_data.drop(columns=["Name", "Cabin", "Ticket"])

print("Missing Values")
print(test_data.isnull().sum())

# Impute with statistics of the TRAINING data so that the test set goes through
# exactly the preprocessing the models were fitted on, and no statistic is
# derived from the test set itself. Results are assigned back instead of using
# fillna(inplace=True) on a column selection (deprecated in pandas 2.x, no
# effect under Copy-on-Write).
test_data["Age"] = test_data["Age"].fillna(data["Age"].mean())
test_data["Fare"] = test_data["Fare"].fillna(data["Fare"].mean())

# Reuse the encodings defined for the training data.
test_data["Sex"] = test_data["Sex"].map(sex_map)
test_data["Embarked"] = test_data["Embarked"].map(embark_map)


print()
print("Missing Values")
print(test_data.isnull().sum())

test_data.head()

## Checking for best Baseline Model

In [None]:
# Feature matrix and target. Features are chosen by name (every column except
# the target) instead of by position, so the selection no longer relies on
# "Survived" happening to be the first column of `data`.
all_columns = list(data.columns)
feature_columns = [column for column in all_columns if column != "Survived"]
X = data[feature_columns]
y = data["Survived"]

# Standardise every feature; the fitted scaler is reused later for the test set.
# NOTE(review): the scaler is fitted on all rows before the train/test split and
# before cross-validation, so the held-out scores below may be slightly optimistic.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Candidate baselines, each scored with 5-fold cross-validation on the full
# (scaled) training data.
all_models = {
    "xgb_model": XGBClassifier(eval_metric="logloss", random_state=18),
    "rf_model": RandomForestClassifier(random_state=18),
    "logistic_model": LogisticRegression(),
    "svm_model": SVC(),
    # AdaBoost is seeded as well: it re-seeds its base estimators from its own
    # random state, so without a seed the scores change from run to run.
    "ada_model": AdaBoostClassifier(
        RandomForestClassifier(random_state=18), random_state=18
    ),
}

for model_name, model in all_models.items():
    print(f"Model Name: {model_name}")
    cv_score = cross_val_score(model, X, y, cv=5)
    print(cv_score)
    print(f"Mean Score: {np.mean(cv_score)}")
    print()

In [None]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=24
)
for split_label, features, target in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(f"{split_label} Data: {features.shape}. {target.shape}")

## SVM Model

In [None]:
# Baseline SVM with scikit-learn's default hyperparameters.
svm_model = SVC()
svm_model.fit(X_train, y_train)

# The same report is produced for both splits: held-out data first, then the
# training data.
metric_fns = {
    "Accuracy": accuracy_score,
    "F1 Score": f1_score,
    "Precision": precision_score,
    "Recall": recall_score,
}
eval_splits = [("Test", X_test, y_test), ("Train", X_train, y_train)]

for position, (split_name, features, labels) in enumerate(eval_splits):
    if position > 0:
        print()  # blank separator between the two reports
    print(f"On {split_name} Data")
    predictions = svm_model.predict(features)
    for metric_name, metric_fn in metric_fns.items():
        print(f"{metric_name}: {metric_fn(labels, predictions)}")
    plot_confusion_matrix(svm_model, features, labels)
    plt.show()

## Hyperparameter Tuning for SVM Model

In [None]:
# Exhaustive search over C and gamma for an RBF-kernel SVM; the best
# configuration is refitted on the whole training split (refit=True).
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=["rbf"],
)

grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=0)
grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)
print("Best Estimator", grid.best_estimator_)

In [None]:
# SVM with fixed C / gamma.
# NOTE(review): presumably the values reported by the grid search above -- they
# are hard-coded here, so confirm they still match after re-running the search.
svm_model = SVC(C=1000, gamma=0.01)
svm_model.fit(X_train, y_train)

# Held-out report first, then the training report.
report_metrics = (
    ("Accuracy", accuracy_score),
    ("F1 Score", f1_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
)

for position, (split_name, features, labels) in enumerate(
    (("Test", X_test, y_test), ("Train", X_train, y_train))
):
    if position > 0:
        print()  # blank separator between the two reports
    print(f"On {split_name} Data")
    predictions = svm_model.predict(features)
    for metric_name, metric_fn in report_metrics:
        print(f"{metric_name}: {metric_fn(labels, predictions)}")
    plot_confusion_matrix(svm_model, features, labels)
    plt.show()

## RandomForest Model

In [None]:
# Baseline random forest (default hyperparameters, fixed seed).
rf_model = RandomForestClassifier(random_state=18)
rf_model.fit(X_train, y_train)

# Identical report for the held-out split and then the training split.
metric_fns = {
    "Accuracy": accuracy_score,
    "F1 Score": f1_score,
    "Precision": precision_score,
    "Recall": recall_score,
}
eval_splits = [("Test", X_test, y_test), ("Train", X_train, y_train)]

for position, (split_name, features, labels) in enumerate(eval_splits):
    if position > 0:
        print()  # blank separator between the two reports
    print(f"On {split_name} Data")
    predictions = rf_model.predict(features)
    for metric_name, metric_fn in metric_fns.items():
        print(f"{metric_name}: {metric_fn(labels, predictions)}")
    plot_confusion_matrix(rf_model, features, labels)
    plt.show()

## Hyperparameter Tuning for Random Forest Model

In [None]:
# Randomised hyperparameter search for the random forest
# (100 sampled configurations, 5-fold CV each, all cores).
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# 'auto' was dropped from the candidates: for classifiers it was only an alias
# of 'sqrt', it is deprecated since scikit-learn 1.1 and rejected from 1.3 on.
# 'log2' takes its place so the grid keeps two distinct, valid options.
max_features = ['sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)  # None = grow each tree until its leaves are pure
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf = RandomForestClassifier(random_state = 24)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=24, n_jobs = -1)
rf_random.fit(X_train, y_train)

print("Best Params:",rf_random.best_params_)
print("Best Estimator", rf_random.best_estimator_)

In [None]:
# Random forest with fixed hyperparameters.
# NOTE(review): presumably copied from the randomised search output -- confirm
# they still match after re-running the search.
rf_params = dict(
    bootstrap=False,
    max_depth=80,
    min_samples_leaf=2,
    min_samples_split=5,
    n_estimators=600,
    random_state=24,
)
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

# Held-out report first, then the training report.
report_metrics = (
    ("Accuracy", accuracy_score),
    ("F1 Score", f1_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
)

for position, (split_name, features, labels) in enumerate(
    (("Test", X_test, y_test), ("Train", X_train, y_train))
):
    if position > 0:
        print()  # blank separator between the two reports
    print(f"On {split_name} Data")
    predictions = rf_model.predict(features)
    for metric_name, metric_fn in report_metrics:
        print(f"{metric_name}: {metric_fn(labels, predictions)}")
    plot_confusion_matrix(rf_model, features, labels)
    plt.show()

## XGBoost Model

In [None]:
# Baseline XGBoost classifier. eval_metric is set explicitly, as for every
# other XGBClassifier in this notebook, so all of them are configured alike.
xgb_model = XGBClassifier(eval_metric="logloss", random_state=18)
xgb_model.fit(X_train, y_train)

# Identical report for the held-out split and then the training split.
metric_fns = {
    "Accuracy": accuracy_score,
    "F1 Score": f1_score,
    "Precision": precision_score,
    "Recall": recall_score,
}
eval_splits = [("Test", X_test, y_test), ("Train", X_train, y_train)]

for position, (split_name, features, labels) in enumerate(eval_splits):
    if position > 0:
        print()  # blank separator between the two reports
    print(f"On {split_name} Data")
    predictions = xgb_model.predict(features)
    for metric_name, metric_fn in metric_fns.items():
        print(f"{metric_name}: {metric_fn(labels, predictions)}")
    plot_confusion_matrix(xgb_model, features, labels)
    plt.show()

## Adaboost Model

In [None]:
# AdaBoost with its default base estimator. A seed is set, like for the other
# tree-based models in this notebook, so repeated runs give the same fit.
ada_model = AdaBoostClassifier(random_state=18)
ada_model.fit(X_train, y_train)

# Identical report for the held-out split and then the training split.
metric_fns = {
    "Accuracy": accuracy_score,
    "F1 Score": f1_score,
    "Precision": precision_score,
    "Recall": recall_score,
}
eval_splits = [("Test", X_test, y_test), ("Train", X_train, y_train)]

for position, (split_name, features, labels) in enumerate(eval_splits):
    if position > 0:
        print()  # blank separator between the two reports
    print(f"On {split_name} Data")
    predictions = ada_model.predict(features)
    for metric_name, metric_fn in metric_fns.items():
        print(f"{metric_name}: {metric_fn(labels, predictions)}")
    plot_confusion_matrix(ada_model, features, labels)
    plt.show()

## Voting Classifier (RandomForest + XGBoost + SVM)

In [None]:
# Voting ensemble of the fixed-parameter random forest, XGBoost and SVM
# (VotingClassifier is left on its default voting mode).
ensemble_members = [
    (
        "rf_model",
        RandomForestClassifier(
            bootstrap=False,
            max_depth=80,
            min_samples_leaf=2,
            min_samples_split=5,
            n_estimators=600,
            random_state=24,
        ),
    ),
    ("xgb_model", XGBClassifier(eval_metric="logloss", random_state=18)),
    ("svm_model", SVC(C=1000, gamma=0.01)),
]
voting_model = VotingClassifier(ensemble_members)

voting_model.fit(X_train, y_train)

# Held-out report first, then the training report.
report_metrics = (
    ("Accuracy", accuracy_score),
    ("F1 Score", f1_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
)

for position, (split_name, features, labels) in enumerate(
    (("Test", X_test, y_test), ("Train", X_train, y_train))
):
    if position > 0:
        print()  # blank separator between the two reports
    print(f"On {split_name} Data")
    predictions = voting_model.predict(features)
    for metric_name, metric_fn in report_metrics:
        print(f"{metric_name}: {metric_fn(labels, predictions)}")
    plot_confusion_matrix(voting_model, features, labels)
    plt.show()

## Voting Classifier (RandomForest + XGBoost + AdaBoost)

In [None]:
# Voting ensemble of the fixed-parameter random forest, XGBoost and AdaBoost
# (VotingClassifier is left on its default voting mode).
voting_model = VotingClassifier(
    [
        ("rf_model", RandomForestClassifier(bootstrap=False, max_depth=80, min_samples_leaf=2,
                       min_samples_split=5, n_estimators=600, random_state=24)),
        ("xgb_model", XGBClassifier(eval_metric="logloss", random_state=18)),
        # Seeded like the other two members so the whole ensemble is repeatable.
        ("ada_model", AdaBoostClassifier(random_state=18)),
    ]
)

voting_model.fit(X_train, y_train)

# Identical report for the held-out split and then the training split.
metric_fns = {
    "Accuracy": accuracy_score,
    "F1 Score": f1_score,
    "Precision": precision_score,
    "Recall": recall_score,
}
eval_splits = [("Test", X_test, y_test), ("Train", X_train, y_train)]

for position, (split_name, features, labels) in enumerate(eval_splits):
    if position > 0:
        print()  # blank separator between the two reports
    print(f"On {split_name} Data")
    predictions = voting_model.predict(features)
    for metric_name, metric_fn in metric_fns.items():
        print(f"{metric_name}: {metric_fn(labels, predictions)}")
    plot_confusion_matrix(voting_model, features, labels)
    plt.show()

In [None]:
test_data.head()

In [None]:
# Final ensemble (random forest + XGBoost + SVM), fitted on ALL labelled rows
# rather than only the training split.
final_members = [
    (
        "rf_model",
        RandomForestClassifier(
            bootstrap=False,
            max_depth=80,
            min_samples_leaf=2,
            min_samples_split=5,
            n_estimators=600,
            random_state=24,
        ),
    ),
    ("xgb_model", XGBClassifier(eval_metric="logloss", random_state=18)),
    ("svm_model", SVC(C=1000, gamma=0.01)),
]
final_voting_model = VotingClassifier(final_members)

final_voting_model.fit(X, y)

In [None]:
# Build the test feature matrix by dropping the identifier BY NAME (not by
# position), so the selection does not depend on PassengerId being the first
# column. Then apply the scaler that was fitted on the training features.
test_X = test_data.drop(columns=["PassengerId"])
test_X = scaler.transform(test_X)
test_predictions = final_voting_model.predict(test_X)

In [None]:
# Pair each test passenger id with its predicted label.
passenger_ids = test_data["PassengerId"].to_numpy()
submission = pd.DataFrame(
    {"PassengerId": passenger_ids, "Survived": test_predictions}
)
submission.head()