In [950]:
import os

# For data manipulation and management
import numpy as np
import pandas as pd

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Models I'm going to test for predictions
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [951]:
# Load the Kaggle Titanic training and test splits into DataFrames
train_data, test_data = (
    pd.read_csv('/kaggle/input/titanic/train.csv'),
    pd.read_csv('/kaggle/input/titanic/test.csv'),
)

In [952]:
# Preview the first rows of the raw training data
train_data.head()

# Get information about the datasets

In [953]:
# Print column dtypes and non-null counts for both splits, each under a banner
print('-'*40, 'Training dataset info', '-'*40, sep='\n')
train_data.info()
print('-'*40, 'Testing dataset info', '-'*40, sep='\n')
test_data.info()

In [954]:
# Get basic descriptive statistics of the training dataset
# (pandas `describe` with its default settings)
train_data.describe()

In [955]:
# Mean of Survived (i.e. survival rate) per passenger class, best class first
(
    train_data[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [956]:
# Mean of Survived (i.e. survival rate) per sex, highest first
(
    train_data[["Sex", "Survived"]]
    .groupby(['Sex'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [957]:
# One Age histogram (20 bins) per value of Survived, laid out side by side
g = sns.FacetGrid(data=train_data, col='Survived')
g.map(plt.hist, 'Age', bins=20)

In [958]:
# Age histograms (20 bins) in a grid: one row per Pclass, one column per Survived.
# `height` is the current name of the per-facet size argument; the older `size`
# keyword was renamed in seaborn and is no longer accepted by recent releases.
grid = sns.FacetGrid(train_data, col='Survived', row='Pclass', height=2.5, aspect=1.5)
grid.map(plt.hist, 'Age', bins=20)
grid.add_legend();

In [959]:
# Remove the Ticket and Cabin columns from both splits; they are not used below
train_data = train_data.drop(columns=['Ticket', 'Cabin'])
test_data = test_data.drop(columns=['Ticket', 'Cabin'])

## Titles
Let's see what titles the passengers had and who survived.

In [960]:
datasets = [train_data, test_data]
for dataset in datasets:
    # The title is the word that directly precedes a '.' in Name (e.g. "Mr.").
    # A raw string is used so that `\.` reaches the regex engine as an escaped
    # dot instead of being an invalid Python string escape.
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Encode Sex numerically: female -> 1, male -> 0.
    # NOTE: not idempotent - re-running this cell maps the already-numeric
    # values to NaN and the int cast then fails.
    dataset['Sex'] = dataset['Sex'].map({'female':1, 'male':0}).astype(int)

# Count of each title split by the encoded Sex
pd.crosstab(train_data['Title'], train_data['Sex'])

In [961]:
# Mean of Survived (i.e. survival rate) for every extracted title
(
    train_data[['Title', 'Survived']]
    .groupby(['Title'], as_index=False)
    .mean()
)

In [962]:
# Name is no longer needed once Title has been extracted from it
train_data = train_data.drop(columns=['Name'])
test_data = test_data.drop(columns=['Name'])

In [963]:
# Preview the training data after dropping Ticket, Cabin and Name
train_data.head()

In [964]:
# Per-column missing-value counts for both splits, each under a banner
print(
    'Train data NaN counts',
    '-'*40,
    train_data.isna().sum(),
    '-'*40,
    'Test data NaN counts',
    '-'*40,
    test_data.isna().sum(),
    sep='\n',
)

## Addressing NaN values
There are many NaN values in the Age column. We need to address this, because we can't drop that many rows.
There are multiple ways to fill in the missing Age values. One of the better ways is to predict them from other features.
First we need to figure out which other features to use. Sex and Pclass are good choices: they have no NaN values and both are clearly related to Age.

In [965]:
# Relationship between Age, Sex and Pclass: one Age histogram (20 bins)
# per (Pclass row, Sex column) combination.
# `height` is the current name of the per-facet size argument; the older `size`
# keyword was renamed in seaborn and is no longer accepted by recent releases.
grid = sns.FacetGrid(train_data, row='Pclass', col='Sex', height=2.5, aspect=1.5)
grid.map(plt.hist, 'Age', bins=20)
grid.add_legend();

In [966]:
datasets = [train_data, test_data]
# ages[sex, pclass - 1] holds the age guess for that group; it is recomputed
# for each dataset from that dataset's own known ages.
ages = np.zeros((2,3))

for dataset in datasets:
    # Pass 1: median known Age per (Sex, Pclass) group, rounded to the nearest 0.5
    for sex_code in range(2):
        for class_idx in range(3):
            in_group = (dataset['Sex'] == sex_code) & (dataset['Pclass'] == class_idx + 1)
            median_age = dataset.loc[in_group, 'Age'].dropna().median()
            ages[sex_code, class_idx] = int(median_age/0.5 + 0.5)*0.5

    # Pass 2: write the group guess into every row of that group whose Age is missing.
    # The groups are disjoint, so the missing-mask can be computed once up front.
    age_missing = dataset['Age'].isnull()
    for sex_code in range(2):
        for class_idx in range(3):
            in_group = (dataset['Sex'] == sex_code) & (dataset['Pclass'] == class_idx + 1)
            dataset.loc[age_missing & in_group, 'Age'] = ages[sex_code, class_idx]

    # With no NaN left, Age can be stored as whole years (fractions are truncated)
    dataset['Age'] = dataset['Age'].astype(int)

In [967]:
# Re-check per-column missing-value counts now that Age has been filled in
print(
    'Train data NaN counts',
    '-'*40,
    train_data.isna().sum(),
    '-'*40,
    'Test data NaN counts',
    '-'*40,
    test_data.isna().sum(),
    sep='\n',
)

train_data.head()

## Grouping ages

In [968]:
# Split Age into 5 equal-width intervals and show the survival rate per interval
train_data['AgeGroup'] = pd.cut(train_data['Age'], bins=5)
(
    train_data[['AgeGroup', 'Survived']]
    .groupby(['AgeGroup'], as_index=False)
    .mean()
    .sort_values(by='AgeGroup', ascending=True)
)

In [969]:
# Replace Age with an ordinal band code 0-4 using the cut points 16 / 32 / 48 / 64.
# The statements run from the youngest band upwards; the codes written by one
# statement (0-4) never fall inside the range tested by a later statement.
# NOTE: not idempotent - re-running this cell would re-band the codes themselves.
for dataset in datasets:
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    # Previously this line only selected the rows and assigned nothing, leaving
    # passengers older than 64 with their raw age instead of a band code.
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4

# AgeGroup was only needed to inspect the bands. It is dropped in place so that
# `datasets` keeps pointing at the same DataFrame object as `train_data`.
train_data.drop(['AgeGroup'], axis=1, inplace=True)

train_data.head()

## Grouping Fare

In [970]:
# Fill missing Fare values in the test split with the column median
# (`median` skips NaN by default). The result is assigned back to the column:
# calling `fillna(inplace=True)` on `test_data['Fare']` is chained assignment,
# which warns on pandas 2.x and does not modify `test_data` under Copy-on-Write.
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].median())

# Replace Fare with an ordinal band code 0-3 using the cut points 7.91 / 14.454 / 31.
# NOTE: not idempotent - re-running this cell would re-band the codes themselves.
for dataset in datasets:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

datasets = [train_data, test_data]

# Show which band codes actually occur in the training data
train_data['Fare'].unique()

In [971]:
# Preview the training data after Fare has been banded
train_data.head()

## Grouping Embarked

In [972]:
# List the distinct Embarked values (including NaN, if any) before encoding them
train_data['Embarked'].unique()

In [973]:
# Encode the port of embarkation as an integer code: S -> 0, C -> 1, Q -> 2.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}

for dataset in datasets:
    # Embarked is a nominal category, so missing values are filled with the most
    # frequent port (the mode) of that dataset; a median of the numeric codes has
    # no meaning for unordered categories and can land between two codes.
    most_common_port = dataset['Embarked'].mode().iloc[0]
    # Fill first, then map, and assign the result back to the column. Chained
    # `fillna(inplace=True)` on a selected column does not update the frame under
    # pandas Copy-on-Write. The int cast gives the models a numeric column
    # instead of an object column.
    dataset['Embarked'] = (
        dataset['Embarked']
        .fillna(most_common_port)
        .map(embarked_codes)
        .astype(int)
    )

datasets = [train_data, test_data]

## Grouping Titles

In [974]:
# Evaluated but not displayed: only the last expression of a cell is rendered
train_data['Title'].unique()

# Integer code for each title that remains after the normalisation below
categories_map = {'Mr':0, 'Mrs':1, 'Miss':2, 'Master':3, 'Dr':4, 'Other':5}

for dataset in datasets:
    normalised_title = (
        dataset['Title']
        # Collapse the listed titles into a single 'Other' bucket
        .replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')
        # Fold the alternative spellings into Miss / Mrs
        .replace('Mlle', 'Miss')
        .replace('Ms', 'Miss')
        .replace('Mme', 'Mrs')
    )
    # Any title missing from categories_map becomes NaN here and makes the int cast fail
    dataset['Title'] = normalised_title.map(categories_map).astype(int)

datasets = [train_data, test_data]

In [975]:
# Final check of per-column missing-value counts before modelling
print(
    'Train data NaN counts',
    '-'*40,
    train_data.isna().sum(),
    '-'*40,
    'Test data NaN counts',
    '-'*40,
    test_data.isna().sum(),
    sep='\n',
)

train_data.head()

## Train model and predict solution
Now that there are no NaN values in the dataset we are ready to train a model.

Task: Predict which passengers survived the Titanic shipwreck. This is a binary classification problem: each passenger either survived or did not.

In [976]:
# Feature matrix / target for training, and the feature matrix to predict on.
# PassengerId is an identifier, not a feature, so it is excluded from both matrices.
Y_train = train_data['Survived']
X_train = train_data.drop(columns=['Survived', 'PassengerId'])
X_test = test_data.drop(columns=['PassengerId']).copy()

X_train.shape, Y_train.shape, X_test.shape

In [977]:
# Fit each candidate classifier on the full training set, predict the test set,
# and record its accuracy.
# NOTE: every `accuracy_*` value is measured on the same data the model was
# fitted on (X_train / Y_train), so it is a training accuracy, not an estimate
# of performance on unseen passengers.
# NOTE(review): no random_state is passed to the tree-based models, so their
# results can differ between runs.

# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
accuracy_logreg = logreg.score(X_train, Y_train)

# K-Nearest Neighbours
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
accuracy_knn = knn.score(X_train, Y_train)

# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
accuracy_decision_tree = decision_tree.score(X_train, Y_train)

# Random Forest
# The fitted model is named `rnd_forest`; the previous code called the undefined
# name `random_forest` here, which raised a NameError.
rnd_forest = RandomForestClassifier(n_estimators=100)
rnd_forest.fit(X_train, Y_train)
# Y_pred is overwritten by every model above, so after this line it holds the
# random forest predictions.
Y_pred = rnd_forest.predict(X_test)
accuracy_rnd_forest = rnd_forest.score(X_train, Y_train)

Sex has the strongest correlation with survival, followed by fare and title.
We can check this by looking at the coefficients of the logistic regression model.

In [978]:
# Pair every logistic-regression coefficient with the feature it belongs to.
# The names are taken from X_train, the exact frame `logreg` was fitted on, so
# names and coefficients line up one-to-one. (train_data still contains both
# PassengerId and Survived, so deriving names from it by deleting only the first
# column gave 9 names for 8 coefficients and shifted every label by one.)
# The column keeps the name 'Correlation', but the values are model coefficients.
coeff_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Correlation': logreg.coef_[0],
})

coeff_df.sort_values(by='Correlation', ascending=False)

## Evaluating the models
Now we can see which model performs best. Note that these scores are accuracies measured on the training data.
As we can see, the random forest and the decision tree have the highest accuracy. A random forest would probably perform better on a larger dataset, since it handles overfitting and outliers better.

In [979]:
# One row per model with its recorded accuracy, ranked best first
models = pd.DataFrame(
    [
        ('KNN', accuracy_knn),
        ('Logistic Regression', accuracy_logreg),
        ('Random Forest', accuracy_rnd_forest),
        ('Decision Tree', accuracy_decision_tree),
    ],
    columns=['Model', 'Score'],
)
models.sort_values(by='Score', ascending=False)

In [980]:
# Build the two-column submission (PassengerId, Survived) and write it to the
# Kaggle working directory. Y_pred is whatever the most recent assignment in an
# earlier cell left in it.
submission = test_data[["PassengerId"]].assign(Survived=Y_pred)
submission.to_csv('/kaggle/working/submission.csv', index=False)

## References
This notebook has been created based on these great sources:
* [A Journey through Titanic](https://www.kaggle.com/code/omarelgabry/a-journey-through-titanic/notebook)
* [Titanic best working Classifier](https://www.kaggle.com/code/sinakhorami/titanic-best-working-classifier/notebook)
* [Titanic Data Science Solutions](https://www.kaggle.com/code/startupsci/titanic-data-science-solutions)