# Titanic Prediction/Classification

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import shapiro
from scipy.stats import randint
from imblearn.over_sampling import SMOTE

In [1]:
data = pd.read_csv("../input/titanic/train.csv")
data.head()

In [1]:
data.describe()

In [1]:
data.info()

In [1]:
data.isnull().sum()

# Data Preprocessing

In [1]:
def data_pipeline(data):
    """Clean and numerically encode a raw Titanic passenger frame.

    The input frame is never modified; a new frame is returned, so the cell
    that calls this function can be re-run against the raw data safely.

    Steps:
      * drop the free-text ``Name`` and ``Ticket`` columns,
      * fill missing ``Age`` and ``Fare`` with the column mean,
      * encode ``Sex`` (female=1, male=0) and ``Embarked`` (Q=1, C=2, S=3),
      * fill missing ``Embarked`` with the most frequent port code,
      * derive ``Deck`` from the first letter found in ``Cabin`` (A=1 ... G=7,
        T=8), fill missing decks with the median code, then drop ``Cabin``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the columns Name, Ticket, Sex, Age, Fare, Cabin
        and Embarked.

    Returns
    -------
    pandas.DataFrame
        New frame without Name/Ticket/Cabin and with an added ``Deck`` column.
        ``Deck`` stays NaN if no row has a recognisable cabin letter.
    """
    sex_codes = {"female": 1, "male": 0}
    embarked_codes = {"Q": 1, "C": 2, "S": 3}
    deck_codes = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "T": 8}

    # drop() returns a new frame; every later step assigns columns on that new
    # frame instead of using chained inplace=True calls, which mutate the
    # caller's data (or do nothing at all under pandas copy-on-write).
    data = data.drop(["Name", "Ticket"], axis=1)

    data["Age"] = data["Age"].fillna(data["Age"].mean())
    data["Sex"] = data["Sex"].map(sex_codes)

    # Embarked is a category: averaging its codes would invent a non-existent
    # port (e.g. 2.5), so missing values get the most frequent code instead.
    embarked = data["Embarked"].map(embarked_codes)
    most_common_port = embarked.mode()
    if not most_common_port.empty:
        embarked = embarked.fillna(most_common_port.iloc[0])
    data["Embarked"] = embarked

    # The first letter found in the cabin string is treated as the deck.
    deck = data["Cabin"].str.extract("([A-Za-z])", expand=False).map(deck_codes)
    data["Deck"] = deck.fillna(deck.median())
    data = data.drop(["Cabin"], axis=1)

    data["Fare"] = data["Fare"].fillna(data["Fare"].mean())

    return data

In [1]:
data = data_pipeline(data)

In [1]:
data

# Data Interpretation and Visualization

### Outlier Visualization, Interpretation and Handling

In [1]:
# Reshape to long form (one row per column/value pair) once, so the box plot
# and the overlaid strip plot are drawn from the same melted frame.
long_form = pd.melt(data)

plt.figure(figsize=(15, 10))
sns.boxplot(data=long_form, x="variable", y="value")
sns.stripplot(data=long_form, x="variable", y="value", color="orange", jitter=0.2, size=2.5)
plt.grid()

In [1]:
import warnings
# Blanket suppression kept from the original notebook; note that it also hides
# warnings raised by every later cell.
warnings.filterwarnings("ignore")

# One density histogram with a KDE overlay per column. histplot replaces the
# deprecated sns.distplot, and the grid height is derived from the number of
# columns so no panel index can run past a hard-coded 4x3 layout.
n_cols = 3
n_rows = -(-len(data.columns) // n_cols)  # ceiling division

plt.figure(figsize=(20, 10))
for a, i in enumerate(data.columns, start=1):
    plt.subplot(n_rows, n_cols, a)
    sns.histplot(data[i], kde=True, stat="density")
plt.show()

#### Hypothesis 1: None of the columns is normally distributed

This will be evaluated by using the p-value measurement:

$p = 2\min\{\Pr(T\geq t\mid H_{0}),\Pr(T\leq t\mid H_{0})\}$ for a two-sided test. If the distribution of $T$ is symmetric about zero, then $p=\Pr(|T|\geq |t|\mid H_{0})$.

In [1]:
# Shapiro-Wilk test per column.
# H0: the column's values were drawn from a normal distribution.
alpha = 0.05  # significance level used to interpret p
for col in data.columns:
    stat, p = shapiro(data[col])
    if p > alpha:
        verdict = 'Sample looks Gaussian (fail to reject H0)'
    else:
        verdict = 'Sample does not look Gaussian (reject H0)'
    print("----------------------------------------------")
    print(col)
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    print(verdict)

#### Based on the test results above, the data in the columns is not normally distributed.

### Outlier handling
We do not handle any outliers for the following reason:
- A 70-year-old woman in 3rd class had a much harder time than a 30-year-old man in 1st class, so extreme-looking values carry real information rather than noise.

This statement will be proven in the section visualizations...

In [1]:
# Pairwise Pearson correlation between all columns, drawn as an annotated
# heatmap (exploratory only; nothing downstream depends on it).
correlations = data.corr(method="pearson")

plt.figure(figsize=(10, 8))
sns.heatmap(
    correlations,
    annot=True,
    cmap="coolwarm",
    vmin=-1,
)

### Visualization

#### More Men than Women died:

In [1]:
fig, (ax_total, ax_by_sex) = plt.subplots(1, 2, figsize=(20, 8))

# Left panel: overall number of passengers per outcome.
sns.countplot(data=data, x="Survived", ax=ax_total)
ax_total.set_ylim(0, 600)

# Right panel: outcome split by sex. The pipeline encodes male=0 and female=1,
# and countplot orders the bars by that code, hence the tick label order.
fig_2 = sns.countplot(data=data, x="Sex", hue="Survived", ax=ax_by_sex)
fig_2.set_xticklabels(["male", "female"])
fig_2.set_ylim(0, 600)

# Annotate every bar with its count. The original passed the bar containers to
# plt.show(), whose only argument is `block`, so no labels were ever drawn.
for bars in fig_2.containers:
    fig_2.bar_label(bars)

plt.show()


#### Passengers around age 30 died the most in absolute numbers, but not as a percentage:

In [1]:
# Age distribution per outcome, drawn as step histograms with KDE curves.
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(data=data, x="Age", hue="Survived", kde=True, element="step", ax=ax)

#### Passenger classes 1-3 (1 = 1st class (best class) -> upper levels of the ship, 2 = 2nd class -> middle levels of the ship, 3 = 3rd class -> lower levels of the ship): passengers in 3rd class had a harder time reaching the top of the ship than 1st class passengers:

In [1]:
fig, (ax_total, ax_by_outcome) = plt.subplots(1, 2, figsize=(20, 8))

# Left panel: number of passengers per class, each bar labelled with its count.
fig_2 = sns.countplot(data=data, x="Pclass", ax=ax_total)
fig_2.set_ylim(0, 550)
for bars in fig_2.containers:
    fig_2.bar_label(bars)

# Right panel: the same classes split by outcome.
fig_2 = sns.countplot(data=data, x="Pclass", hue="Survived", ax=ax_by_outcome)
fig_2.set_xticklabels(["1", "2", "3"])
fig_2.set_ylim(0, 550)

# The original passed the bar containers to plt.show(), whose only argument is
# `block`, so these bars were never labelled; bar_label is the intended call.
for bars in fig_2.containers:
    fig_2.bar_label(bars)

plt.show()

In [1]:
fig, (ax_total, ax_by_outcome) = plt.subplots(1, 2, figsize=(20, 8))

# Left panel: passengers per number of siblings/spouses aboard.
sns.countplot(data=data, x="SibSp", ax=ax_total)
ax_total.set_ylim(0, 650)

# Right panel: the same counts split by outcome.
fig_4 = sns.countplot(data=data, x="SibSp", hue="Survived", ax=ax_by_outcome)
fig_4.set_ylim(0, 650)

# Label each bar with its count. The original passed the bar containers to
# plt.show(), whose only argument is `block`, so no labels were ever drawn.
for bars in fig_4.containers:
    fig_4.bar_label(bars)

plt.show()


In [1]:
fig, (ax_total, ax_by_outcome) = plt.subplots(1, 2, figsize=(20, 8))

# Left panel: passengers per number of parents/children aboard.
sns.countplot(data=data, x="Parch", ax=ax_total)
ax_total.set_ylim(0, 700)

# Right panel: the same counts split by outcome.
fig_4 = sns.countplot(data=data, x="Parch", hue="Survived", ax=ax_by_outcome)
fig_4.set_ylim(0, 700)

# Label each bar with its count. The original passed the bar containers to
# plt.show(), whose only argument is `block`, so no labels were ever drawn.
for bars in fig_4.containers:
    fig_4.bar_label(bars)

plt.show()


# Preprocessing and Classification

#### First of all we split the Survived column from the other columns and use SMOTE to make the "Survived" column distribution equal

In [1]:
# Separate the target column from the feature columns.
# NOTE(review): PassengerId stays in x and is therefore used as a feature;
# the prediction cell further down relies on the same column set.
y = data["Survived"]
x = data.drop("Survived", axis=1)

# Class balance of the target before any resampling.
y.value_counts()

In [1]:
from sklearn.model_selection import train_test_split

# Start from the cleaned frame rather than from x / y so this cell gives the
# same result every time it is run (x and y are reassigned at the bottom).
features = data.drop(columns=["Survived"])
target = data["Survived"]

# Hold out the test rows BEFORE oversampling. SMOTE interpolates between
# neighbouring minority samples, so resampling first would place synthetic
# points derived from test rows into the training set and inflate test scores.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=14
)

# Balance only the training portion; the test set keeps real passengers only.
sm = SMOTE(random_state=42)
x_train, y_train = sm.fit_resample(x_train, y_train)

# The final model further down is refit on (x, y); keep those balanced as
# before, now that no evaluation depends on them.
x, y = sm.fit_resample(features, target)

y_train.value_counts()

### Prepare the models for testing

In [1]:
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier , ExtraTreesClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
%matplotlib inline

# Create empty list and append each model to list
models = []
models.append(("SVC", SVC(random_state=14)))
models.append(("SVM", LinearSVC(random_state=14)))
models.append(("LOGR", LogisticRegression(solver="liblinear", random_state=14)))
models.append(("LDA", LinearDiscriminantAnalysis()))
models.append(("KNN", KNeighborsClassifier()))
models.append(("CART", DecisionTreeClassifier(random_state=14)))
models.append(("NB", GaussianNB()))
models.append(("DT", DecisionTreeClassifier(random_state=14)))
models.append(("RF", RandomForestClassifier(random_state=14)))
models.append(("ET", ExtraTreesClassifier(random_state=14)))
models.append(("GB", GradientBoostingClassifier(random_state=14)))
models.append(("BC", BaggingClassifier(random_state=14)))

# Empty list for results of the evaluation
model_results = []

In [1]:
# Fit and evaluate every candidate model; returns one result row per model.
def train_all_models(models):
    """Fit each candidate on the training split and evaluate it on the test split.

    For every ``(label, estimator)`` pair the estimator is fitted on the
    notebook-level ``x_train`` / ``y_train``, used to predict ``x_test``, and
    scored with the F1 score against ``y_test``. A confusion-matrix heatmap is
    drawn per model in a four-column grid sized to the number of models.

    Parameters
    ----------
    models : iterable of (str, estimator)
        Label and unfitted scikit-learn classifier for each candidate.

    Returns
    -------
    pandas.DataFrame
        One row per model: column 0 holds the label, column 1 the F1 score.

    Side effects
    ------------
    Fits the passed estimators in place and replaces the contents of the
    notebook-level ``model_results`` list, so repeated calls do not accumulate
    duplicate rows.
    """
    # Local import so the function works with only the notebook's existing
    # import cells in place.
    from sklearn.metrics import f1_score

    models = list(models)
    n_cols = 4
    n_rows = max(1, -(-len(models) // n_cols))  # ceiling division
    results = []

    plt.figure(figsize=(25, 15))
    for position, (method, model) in enumerate(models, start=1):
        model.fit(x_train, y_train)
        test_pred = model.predict(x_test)

        # model.score() is plain accuracy; the notebook reports an F-score,
        # so compute that metric explicitly.
        f_score = f1_score(y_test, test_pred)
        results.append((method, f_score))

        plt.subplot(n_rows, n_cols, position)
        # confusion_matrix puts true labels on rows and predictions on columns;
        # fmt="d" prints the cell counts as plain integers.
        sns.heatmap(confusion_matrix(y_test, test_pred), annot=True, fmt="d", cmap="Greens")
        plt.title(str(model), fontsize=14)
        plt.xlabel('Predicted', fontsize=12)
        plt.ylabel('Actual', fontsize=12)

    # Show confusion matrix for each trained model
    plt.subplots_adjust(hspace=0.3, wspace=0.3)
    plt.show()

    model_results[:] = results
    return pd.DataFrame(results)

In [1]:
# Evaluate every candidate, then rank the result rows best-first
# (column 1 holds the score) for the table and chart below.
best_models = train_all_models(models).sort_values(by=1, ascending=False)

In [1]:
best_models

In [1]:
# Bar chart of the ranked scores: column 0 holds the model labels,
# column 1 the matching scores.
y_pos = np.arange(len(best_models[0]))

plt.figure(figsize=(10, 6))
plt.bar(y_pos, best_models[1], color=(0.2, 0.4, 0.6, 0.6))
plt.xticks(y_pos, best_models[0])
plt.xlabel('Model Type')
plt.ylabel('F-Score')
plt.title('F-Score of all trained models')
plt.show()

### Hyperparameter Tuning of best 3 models

In [1]:
# Take top 3 models and define new -> for randomized search cv
# Seeded like the candidate list above: without random_state the tree
# ensembles differ from run to run, so neither these baseline fits nor the
# searches built on these estimators would be reproducible.
top3_RF = RandomForestClassifier(random_state=14)
top3_ET = ExtraTreesClassifier(random_state=14)
top3_GB = GradientBoostingClassifier(random_state=14)

# Baseline fits with default hyperparameters.
top3_RF.fit(x_train, y_train)
top3_ET.fit(x_train, y_train)
top3_GB.fit(x_train, y_train)

In [1]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyperparameter search for RandomForestClassifier.
# scipy's randint(low, high) draws integers from low up to, but excluding, high.
grid_param_RF = dict(
    n_estimators=randint(low=1, high=100),
    max_depth=randint(low=10, high=100),
    max_features=randint(low=1, high=4),
)

RF_grid_search = RandomizedSearchCV(
    estimator=top3_RF,
    param_distributions=grid_param_RF,
    cv=10,
    verbose=1,
    random_state=14,
)
RF_grid_search.fit(x_train, y_train)

# Best estimator found and its mean cross-validated score.
RF_best_grid = RF_grid_search.best_estimator_
print(RF_best_grid)
print(RF_grid_search.best_score_)

In [1]:
# Randomized hyperparameter search for ExtraTreesClassifier.
# scipy's randint(low, high) draws integers from low up to, but excluding, high.
grid_param_ET = dict(
    n_estimators=randint(low=1, high=100),
    max_depth=randint(low=10, high=100),
    max_features=randint(low=1, high=4),
)

ET_grid_search = RandomizedSearchCV(
    estimator=top3_ET,
    param_distributions=grid_param_ET,
    cv=10,
    verbose=1,
    random_state=14,
)
ET_grid_search.fit(x_train, y_train)

# Best estimator found and its mean cross-validated score.
ET_best_grid = ET_grid_search.best_estimator_
print(ET_best_grid)
print(ET_grid_search.best_score_)

In [1]:
# Randomized hyperparameter search for GradientBoostingClassifier.
# scipy's randint(low, high) draws integers from low up to, but excluding, high.
grid_param_GB = dict(
    n_estimators=randint(low=1, high=100),
    max_depth=randint(low=10, high=100),
    max_features=randint(low=1, high=4),
)

GB_grid_search = RandomizedSearchCV(
    estimator=top3_GB,
    param_distributions=grid_param_GB,
    cv=10,
    verbose=1,
    random_state=14,
)
GB_grid_search.fit(x_train, y_train)

# Best estimator found and its mean cross-validated score.
GB_best_grid = GB_grid_search.best_estimator_
print(GB_best_grid)
print(GB_grid_search.best_score_)

In [1]:
# Print each tuned estimator next to the best score found by its search.
separator = "\n ------------------------------------"
for tuned_model, search in (
    (RF_best_grid, RF_grid_search),
    (ET_best_grid, ET_grid_search),
    (GB_best_grid, GB_grid_search),
):
    print(tuned_model, "\n Score: ", search.best_score_, separator)

# Take the best model and train it again with the best hyperparameters

In [1]:
from sklearn.base import clone

# Select the winner from the three randomized searches instead of hard-coding
# one. The original comment named ExtraTrees while the code built a
# RandomForest with hand-copied, unseeded hyperparameters, so the two could
# silently drift away from the actual search results.
searches = {
    "RandomForest": RF_grid_search,
    "ExtraTrees": ET_grid_search,
    "GradientBoosting": GB_grid_search,
}
best_name = max(searches, key=lambda name: searches[name].best_score_)
print(best_name, searches[best_name].best_params_)

# clone() returns an unfitted estimator with the tuned hyperparameters, which
# is then trained on the complete (x, y) data.
best_model = clone(searches[best_name].best_estimator_)
best_model.fit(x, y)

# Using the best_model for test.csv

In [1]:
test_data = pd.read_csv("../input/titanic/test.csv")

In [1]:
test_data = data_pipeline(test_data)

test_data.head()

In [1]:
test_data.isnull().sum()

In [1]:
# Predict from exactly the feature columns the model was trained on, in the
# same order. Passing the whole frame breaks on a second run of this cell,
# because by then test_data already contains the added "Survived" column.
feature_columns = list(x.columns)
test_data["Survived"] = best_model.predict(test_data[feature_columns])

In [1]:
test_data[['PassengerId', 'Survived']].to_csv('submission.csv', index=False)

In [1]:
submission = pd.read_csv("./submission.csv")
submission.shape