In [None]:
# Setting auto reloading for imported modules
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
from auxiliary_functions import Utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
utils = Utils()

In [None]:
df = pd.read_csv('data/train.csv')
df.head()

In [None]:
df.dtypes

**Data preprocessing**
<br>
*Duplicates*

In [None]:
df.shape

In [None]:
df.describe(include='all')

In [None]:
def check_duplicates(df: pd.DataFrame) -> None:
    """Print a short report about fully duplicated rows in ``df``.

    A row is reported when at least one other row has identical values in
    every column; all members of such a group are listed (``keep=False``),
    not only the repeats.

    Args:
        df: Frame to inspect. It is not modified.

    Returns:
        None. The report is written to standard output.
    """
    repeated_rows = df[df.duplicated(keep=False)]

    # Guard clause: nothing to list when no row is repeated.
    if repeated_rows.empty:
        print('There are no duplicate entries in this dataset.')
        return

    print(f'Number of duplicate entries in this dataset: {repeated_rows.shape[0]}')
    print(f'Duplicated rows:\n {repeated_rows}')

check_duplicates(df)

**Data preprocessing**
<br>
*Handle Missing Values*

In [None]:
msno.matrix(df);

In [None]:
utils.missing_values_percentage(df)

In [None]:
df.Cabin.value_counts(dropna=False)

In [None]:
df[df.Cabin.notna()]

In [None]:
# Derive two features from the raw 'Cabin' string before discarding the column:
#   cabin_letter_code - first character of the cabin string, 'None' when missing
#   no_of_cabins      - number of whitespace-separated entries, 0 when missing
# The vectorised `.str` accessor propagates NaN, so no per-row lambda is needed.
df['cabin_letter_code'] = df['Cabin'].str[0].fillna('None')
df['no_of_cabins'] = df['Cabin'].str.split().str.len().fillna(0).astype('int64')
# `columns=` already selects the axis, so the extra `axis=1` was redundant.
# NOTE: this cell is not re-runnable on its own, because 'Cabin' is gone afterwards.
df = df.drop(columns=['Cabin'])
df['cabin_letter_code'].value_counts()

In [None]:
utils.missing_values_percentage(df)

In [None]:
df['Age'].value_counts(dropna=False)

In [None]:
# Imputation Based on Segmentation
# Compare the Age distribution across candidate grouping columns, and print the
# rounded group means, to decide which segments to use when filling missing ages.
sns.boxplot(x='Sex', y='Age', data=df)
plt.title('Age vs Gender')
plt.show()
mean_age_by_gender = df.groupby('Sex')['Age'].mean().round(0)
print(f"Mean age for gender groups:\n{mean_age_by_gender}")

sns.boxplot(x='Pclass', y='Age', data=df)
plt.title('Age vs Pclass')
plt.show()
mean_age_by_pclass = df.groupby('Pclass')['Age'].mean().round(0)
# The label used to run straight into the Series repr; it now ends with ':\n'
# like the gender message above.
print(f"Mean age for passenger class groups:\n{mean_age_by_pclass}")

In [None]:
# Rounded mean Age for every (Sex, Pclass) combination, the same segmentation
# the imputation step uses. It has its own name so it no longer overwrites the
# by-gender Series computed earlier with differently shaped data.
mean_age_by_sex_pclass = df.groupby(['Sex', 'Pclass'])['Age'].mean().round(0)
print(mean_age_by_sex_pclass)

In [None]:
# Fill each missing Age with the mean Age of the passenger's (Sex, Pclass) group;
# non-missing ages are left unchanged.
# NOTE(review): the group means are computed on the full frame, before the
# train/test split further down, so test rows contribute to the imputed values.
df['Age'] = df.groupby(['Sex', 'Pclass'])['Age'].transform(lambda x: x.fillna(x.mean()))
# Summary statistics of Age after imputation.
df.Age.describe()

In [None]:
utils.missing_values_percentage(df)

In [None]:
df['Embarked'].value_counts(dropna=False)

In [None]:
# Replace missing 'Embarked' entries with the most frequent value of the column
# (the first one when several values tie), then re-check the missing-value report.
embarked_mode = df['Embarked'].mode().iloc[0]
df.loc[df['Embarked'].isna(), 'Embarked'] = embarked_mode
utils.missing_values_percentage(df)

In [None]:
df.dtypes

**Data preprocessing**
<br>
*Data Cleaning/Outliers*

In [None]:
df.head()

In [None]:
# Box plot of the 'Fare' column to eyeball outliers.
# NOTE(review): the Series is passed positionally; consider an explicit keyword
# (x=... or y=...) so the plot orientation is unambiguous. Confirm against the
# installed seaborn version.
sns.boxplot(df['Fare'])
plt.show()

In [None]:
# Ask the project helper for the rows it flags as 'Fare' outliers, together with
# the boundaries it used (presumably IQR-based fences, going by the helper's
# name; TODO confirm the exact rule in auxiliary_functions).
outliers_fare, fare_boundaries = utils.find_outliers_iqr(df=df, column='Fare')
# How the flagged rows split across the two 'Survived' classes.
print(outliers_fare['Survived'].value_counts())
outliers_fare

Outliers in the 'Fare' column should be kept, as they are correlated with survival.

In [None]:
# Box plot of the 'Age' column to eyeball outliers.
# NOTE(review): the Series is passed positionally; consider an explicit keyword
# (x=... or y=...) so the plot orientation is unambiguous. Confirm against the
# installed seaborn version.
sns.boxplot(df['Age'])
plt.show()

In [None]:
# Ask the project helper for the rows it flags as 'Age' outliers and for the
# boundaries it used. `age_boundaries[1]` is used as the upper cap in the
# capping cell that follows (presumably the upper IQR fence; TODO confirm).
age_outliers, age_boundaries = utils.find_outliers_iqr(df=df, column='Age')
# How the flagged rows split across the two 'Survived' classes.
print(age_outliers['Survived'].value_counts())
age_outliers

Outliers in the 'Age' column should be capped, as they are only weakly correlated with survival.

In [None]:
# Cap 'Age' at the upper boundary: values above it are replaced by the boundary,
# everything else (including values below the lower boundary) is left unchanged.
df['Age'] = df['Age'].clip(upper=age_boundaries[1])

**Exploratory Data Analysis**

In [None]:
df.head()

In [None]:
# Column groups that drive the EDA plots in this section.
# 'SibSp' and 'Parch' are numeric counts but are plotted as categories here.
categorical_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'cabin_letter_code']
# Columns plotted as continuous distributions (violin plots).
continuous_features = ['Age', 'Fare']
# Label column predicted in the modeling section.
target_feature = 'Survived'

Univariate Analysis of the Target Feature

In [None]:
utils.create_bar_plot(df=df, column=target_feature)

Univariate Analysis of Other Features

In [None]:
# One bar plot per categorical feature, drawn by the project helper.
for feature in categorical_features:
    utils.create_bar_plot(df=df, column=feature)

Bivariate Analysis Between Features and Target Feature

In [None]:
# One plot per categorical feature against the target column, drawn by the
# project helper.
for feature in categorical_features:
    utils.create_bivar_bar_plot(df=df, column=feature, target_feature=target_feature)

In [None]:
# One plot per continuous feature against the target column, drawn by the
# project helper.
for feature in continuous_features:
    utils.create_bivar_violin_plot(column=feature, target_feature=target_feature, df=df)

**Feature Engineering**
<br>
*New Feature Creation*

In [None]:
df.head()

In [None]:
# Family size = SibSp + Parch + 1 (the + 1 presumably counts the passenger
# themself; confirm against the dataset description).
df['family_size'] = df['SibSp'] + df['Parch'] + 1

In [None]:
# Binary flag: 1 when family_size equals 1, otherwise 0.
df['is_alone'] = (df['family_size'] == 1).astype(int)

In [None]:
# title: the first run of letters that follows a space and is immediately
# followed by a '.', e.g. 'Mr' in 'Braund, Mr. Owen Harris'. NaN when no match.
df['title'] = df['Name'].str.extract(' ([A-Za-z]+)\\.', expand=False)
# name_length: character count of the full name. The vectorised `.str.len()`
# replaces `.apply(len)` and yields NaN rather than raising on a missing name.
df['name_length'] = df['Name'].str.len()

In [None]:
# ticket_prefix: the first run of ASCII letters found anywhere in the ticket
# string (the pattern is not anchored to the start); 'None' for tickets that
# contain no letters.
df['ticket_prefix'] = df['Ticket'].str.extract('([A-Za-z]+)', expand=False).fillna('None')
# ticket_length: character count of the raw ticket string. The vectorised
# `.str.len()` replaces `.apply(len)` and yields NaN rather than raising on a
# missing ticket.
df['ticket_length'] = df['Ticket'].str.len()

In [None]:
# Bucket 'Age' into labelled bands. pd.cut uses right-closed intervals by
# default, so the bands are (0, 12], (12, 18], (18, 25], (25, 35], (35, 60]
# and (60, 100]; ages outside (0, 100] become NaN.
df['age_category'] = pd.cut(
    df['Age'],
    bins=[0, 12, 18, 25, 35, 60, 100],
    labels=['child', 'teen', 'young adult', 'adult', 'middle aged', 'senior'],
)

In [None]:
# Columns that have already been mined for features above ('Name', 'Ticket')
# or are dropped as an identifier ('PassengerId').
cols_to_drop = ['PassengerId', 'Name', 'Ticket']
df = df.drop(columns=cols_to_drop)

*Feature Transformation*

In [None]:
# Collect every object/category column currently in the frame.
# NOTE: this rebinds `categorical_features`, which was a hand-written list in the
# EDA section, to a pandas Index of column names.
categorical_features = df.select_dtypes(include=['object', 'category']).columns

# Pass each of those columns through the project helper and keep the frame it
# returns (presumably a one-hot encoding, going by the helper's name; TODO
# confirm whether the original column is dropped).
for feature in categorical_features:
    df = utils.categorical_feature_ohe(df=df, column=feature)    

In [None]:
# Numeric columns handed to the project helper for scaling.
numerical_features = ['Age', 'Fare', 'name_length', 'ticket_length']

# Replace the frame with the helper's output for each column (presumably
# standardisation, going by the helper's name; TODO confirm).
# NOTE(review): this runs on the full frame before the train/test split, so any
# statistics the helper fits would include test rows. Confirm this is intended.
for feature in numerical_features:
    df = utils.numerical_feature_std(df=df, column=feature)

In [None]:
df.head()

**Modeling**

In [None]:
# Separate the label column from the feature matrix.
y = df[target_feature]
X = df.drop(columns=[target_feature])

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable.
# NOTE(review): the split is not stratified on y; consider `stratify=y` if the
# classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Candidate classifiers, keyed by display name, handed to the model-selection
# helper. The randomised estimators get a fixed seed (the same value as the
# train/test split) so that Restart & Run All reproduces the same scores and the
# same "best" model.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Support Vector Machine': SVC(),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

In [None]:
# Let the project helper compare the candidate models on the training split only.
# It returns the selected model and the per-model results (the selection metric
# and the exact return types are defined in auxiliary_functions; TODO confirm).
best_model, results = utils.select_best_model(models=models, x=X_train, y=y_train)

In [None]:
# Hyper-parameter search space. Every key is an SVC constructor argument, so the
# grid is only valid when the selected model is an SVC.
# NOTE(review): `best_model` is chosen dynamically by the selection step above.
# Confirm it is an SVC, or pick the grid per model type.
# NOTE(review): 'gamma', 'degree' and 'coef0' have no effect for some kernels
# (e.g. 'linear'), so a full grid evaluates many equivalent combinations.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'degree': [2, 3, 4],
    'coef0': [0.0, 0.1, 0.5]
}

# Tune on the training split only. Searching over the full X / y would let the
# held-out test rows influence the chosen hyper-parameters and inflate the test
# scores reported afterwards.
best_params = utils.hyper_params_tuning(model=best_model, param_grid=param_grid, x=X_train, y=y_train)

In [None]:
# Refit an SVC with the tuned hyper-parameters on the training split.
# NOTE(review): SVC is hard-coded here although `best_model` is selected
# dynamically earlier; `best_params` must therefore contain only valid SVC
# keyword arguments.
svc = SVC(**best_params)
svc.fit(X_train, y_train)

# Score the fitted model on the held-out test split.
predictions = svc.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
test_f1 = f1_score(y_test, predictions)

print(f'Test Accuracy: {test_accuracy:.4f}')
print(f'Test F1: {test_f1:.4f}')