In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Read CSV

In [2]:
df_train_o = pd.read_csv('/kaggle/input/titanic/train.csv')
df_test_o = pd.read_csv('/kaggle/input/titanic/test.csv')

In [3]:
df_train_o

### Next, we have a small EDA

#### 1. Get the number of NULL in dataset

In [4]:
df_train_o.isnull().sum(axis = 0)

In [5]:
df_test_o.isnull().sum(axis = 0)

#### Notice that Cabin has many null values, so it can be dropped.
#### Besides, Ticket and Name can be transformed into a more useful form.

In [9]:
import seaborn as sns
import matplotlib.pyplot as plt

def cat_plot(feature, df_train):
    """Plot per-category passenger counts of ``feature`` split by ``Survived``.

    Draws a seaborn count plot with one bar group per value of ``feature`` and
    one bar per ``Survived`` value, shows it, and then prints the value counts
    of ``feature``.

    Parameters
    ----------
    feature : str
        Name of the column in ``df_train`` to plot on the x axis.
    df_train : pandas.DataFrame
        Frame containing ``feature`` and a ``Survived`` column.
    """
    _, ax = plt.subplots(figsize=(14, 8))
    sns.countplot(x=feature, hue='Survived', data=df_train, ax=ax)

    ax.set_xlabel(feature, size=15, labelpad=5)
    ax.set_ylabel('Passengers', size=15, labelpad=15)
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=15)
    # Labels are positional: they are applied to the hue levels in plotting order.
    ax.legend(['Not Survived', 'Survived'], loc='upper center', prop={'size': 14})
    ax.set_title('Survival Count in {}'.format(feature), size=15)
    plt.show()

    print(df_train[feature].value_counts())

#### 2. Take a look at the distribution of survival across categorical features

In [11]:
cat_plot('Sex', df_train_o)

**The survival rate of female passengers is higher than that of male passengers.**

In [12]:
cat_plot('SibSp', df_train_o)

In [13]:
cat_plot('Parch', df_train_o)

#### 3. Look for the distribution of numerical features

In [15]:
sns.displot(df_train_o['Fare'])

In [16]:
sns.displot(df_train_o['Age'])

In [17]:
sns.displot(df_train_o['Parch'])

In [18]:
sns.displot(df_train_o['SibSp'])

#### Extreme values exist in Fare (over 500) and Parch, SibSp (8). They can be removed from the training set

#### And we try to retrieve the prefix of Ticket

In [20]:
# Count tickets that are purely numeric and collect the leading token
# (prefix) of every ticket that is not.
num_values = 0
ticket_set = set()
for ticket in df_train_o['Ticket']:
    try:
        int(ticket)
    except ValueError:
        # Only a failed integer parse means "has a prefix"; any other error
        # should surface instead of being silently swallowed.
        ticket_set.add(ticket.split()[0])
    else:
        num_values += 1
# Use the actual row count rather than a hard-coded 891 so the percentage
# stays correct if the input data changes.
print(f"{num_values*100/len(df_train_o)}% of them are numbers")
print(ticket_set)

However, it is hard for me to extract useful information from these prefixes because I lack the domain knowledge. Therefore, Ticket is skipped and dropped.

### Feature engineering

Dropping PassengerId, Cabin, Ticket and Name.

In [21]:
df_train = df_train_o.drop(columns=['PassengerId', 'Cabin','Name', 'Ticket'])
df_train

Do the same for the test data.

In [25]:
df_test = df_test_o.drop(columns=['PassengerId', 'Cabin','Name', 'Ticket'])
df_test

#### Find the mode, mean and median to replace null values in remaining features

In [22]:
df_train.mode()

In [23]:
# df_train still contains string columns (Sex, Embarked); restrict the
# reduction to numeric columns so it does not raise on pandas >= 2.0.
df_train.mean(numeric_only=True)

In [24]:
# df_train still contains string columns (Sex, Embarked); restrict the
# reduction to numeric columns so it does not raise on pandas >= 2.0.
df_train.median(numeric_only=True)

Fill null values: Age with its median and Embarked with its mode (the most frequent port).

In [26]:
# Derive the imputation values from the training data instead of hard-coding
# them, so they stay correct if the data changes. The same `values` dict is
# reused for the test set so that no test-set statistics leak into the features.
values = {
    "Embarked": df_train["Embarked"].mode().iloc[0],  # most frequent port
    "Age": df_train["Age"].median(),
}
df_train2 = df_train.fillna(value=values)

In [27]:
df_train2.isnull().sum(axis=0)

Repeat for test data

In [28]:
# Reuse the training imputation values and additionally fill Fare.
# NOTE(review): 7.89 is a hard-coded constant; how it was chosen is not shown
# in this notebook - confirm before changing.
test_fill_values = {**values, "Fare": 7.89}
df_test2 = df_test.fillna(value=test_fill_values)
df_test2.isnull().sum(axis=0)

#### Extract Title from name and extract isMs
Reference: https://www.kaggle.com/code/sergioortiz/titanic-competition-data-exploration-1/notebook


In [31]:
def multipleReplace(text, wordDic):
    """Return the replacement for ``text`` from ``wordDic``, ignoring case.

    Keys of ``wordDic`` are compared to ``text`` case-insensitively and the
    value of the first matching key (in dict order) is returned. When no key
    matches, ``text`` is returned unchanged.
    """
    matches = (
        replacement
        for candidate, replacement in wordDic.items()
        if text.lower() == candidate.lower()
    )
    return next(matches, text)

def normaliseTitle(title):
    """Collapse a raw honorific into one of the notebook's title groups.

    Known variants are mapped (case-insensitively, via ``multipleReplace``) to
    'Miss', 'Mrs', 'Mr' or 'IMP'. A title that is not in the table is returned
    unchanged.
    """
    # Titles that are all folded into the single 'IMP' group.
    imp_titles = (
        'Master', 'Lady', 'Countess', 'Capt', 'Col',
        'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer',
    )
    mapping = {
        'Mlle': 'Miss',
        'Dona': 'Miss',
        'Ms': 'Mrs',
        'Mrs': 'Mrs',
        'Mme': 'Mrs',
        'Don': 'Mr',
    }
    mapping.update(dict.fromkeys(imp_titles, 'IMP'))
    return multipleReplace(title, mapping)

def extractTitleFromName(name):
    """Extract and normalise the honorific from a passenger name.

    The title is taken to be the last space-separated word before the first
    '.' in ``name``; it is then passed through ``normaliseTitle``.

    Parameters
    ----------
    name : str
        Full passenger name, e.g. ``"Braund, Mr. Owen Harris"``.

    Returns
    -------
    str
        The normalised title, or ``""`` when ``name`` contains no '.'.
    """
    pos_point = name.find('.')
    if pos_point == -1:
        return ""
    # str.split with an explicit separator always yields at least one element
    # (possibly ''), so indexing the last item is safe and no emptiness check
    # is needed.
    title = name[:pos_point].split(" ")[-1]
    return normaliseTitle(title)

# Normalised title of every training passenger, derived from the raw Name column
# (Name was already dropped from df_train2, so read it from the original frame).
titleList = df_train_o['Name'].map(extractTitleFromName)
df_train2['Title'] = titleList

#### Then, Name is transformed into Title with 4 categories: Mr, Mrs, Miss, IMP (Important)
IsMs will be 1 if the passenger's title is Mrs or Miss, otherwise 0.

In [32]:
# 1 when the normalised title is 'Mrs' or 'Miss', otherwise 0.
df_train2['IsMs'] = df_train2['Title'].isin(['Mrs', 'Miss']).astype('int64')

In [33]:
df_train2

In [34]:
cat_plot('Title', df_train2)

#### Replace male, female with 1, 0, and Class 1, 2, 3 with A, B, C

In [39]:
# Assign the recoded columns back explicitly. Calling
# `df.col.replace(..., inplace=True)` operates on a temporary Series and leaves
# the frame unchanged under pandas Copy-on-Write.
# `replace` leaves already-recoded values untouched, so this cell can be re-run.
df_train2['Pclass'] = df_train2['Pclass'].replace({1: 'A', 2: 'B', 3: 'C'})
# Cast explicitly so Sex is an integer column regardless of how `replace`
# handles dtype inference in the installed pandas version.
df_train2['Sex'] = df_train2['Sex'].replace({'male': 1, 'female': 0}).astype(int)
df_train2

**Do so for test data**

In [38]:
# Apply the same Title / IsMs / Pclass / Sex transformations to the test set.
titleList = df_test_o['Name'].apply(extractTitleFromName)
df_test2['Title'] = titleList
df_test2['IsMs'] = df_test2['Title'].apply(lambda x: 1 if x == 'Mrs' or x == 'Miss' else 0)
# Assign the recoded columns back explicitly. Calling
# `df.col.replace(..., inplace=True)` operates on a temporary Series and leaves
# the frame unchanged under pandas Copy-on-Write.
df_test2['Pclass'] = df_test2['Pclass'].replace({1: 'A', 2: 'B', 3: 'C'})
# Cast explicitly so Sex is an integer column regardless of how `replace`
# handles dtype inference in the installed pandas version.
df_test2['Sex'] = df_test2['Sex'].replace({'male': 1, 'female': 0}).astype(int)

df_test2

#### Add new feature: IsAlone to represent if the passenger is alone

In [40]:
# IsAlone is 1 when the passenger has no siblings/spouse and no parents/children
# aboard (SibSp + Parch is not greater than 0), otherwise 0.
for frame in (df_train2, df_test2):
    relatives = frame['SibSp'] + frame['Parch']
    frame['IsAlone'] = (~relatives.gt(0)).astype('int64')

#### Remove outliers

In [41]:
# Keep only rows with Fare below 250. Take an explicit copy because new columns
# are added to df_train3 later; writing into a boolean-mask slice triggers
# SettingWithCopyWarning.
df_train3 = df_train2[df_train2['Fare'] < 250].copy()
df_train3

#### Find the correlation between features and survival

In [42]:
def printHeatmap(df):
    """Draw an annotated heatmap of the pairwise correlations in ``df``.

    Only numeric columns are correlated: string-valued columns (such as the
    recoded ``Pclass``, ``Embarked`` or ``Title``) make ``DataFrame.corr``
    raise on pandas >= 2.0 unless they are excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are correlated; values are rounded to two
        decimals before plotting.
    """
    matrix = df.corr(numeric_only=True).round(2)
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(matrix, annot=True, ax=ax)
    plt.show()

In [43]:
printHeatmap(df_train3)

**Age, SibSp and Parch have low correlation with Survival rate, so they can be transformed.**

#### Transforming SibSp and Parch into hasSibSp and hasParch

In [44]:
# Build the two flags with `assign`, which returns a new frame, instead of
# writing columns into df_train3 directly: df_train3 is a filtered slice of
# df_train2 and direct column assignment triggers SettingWithCopyWarning.
# NOTE(review): the encoding is inverted relative to the column names - the
# flag is 0 when the count is > 0 and 1 otherwise. It is kept as-is here so it
# matches the encoding applied to the test set.
df_train3 = df_train3.assign(
    hasSibSp=df_train3["SibSp"].apply(lambda x: 0 if x > 0 else 1),
    hasParch=df_train3["Parch"].apply(lambda x: 0 if x > 0 else 1),
)
df_train3.head()

In [45]:
df_train4 = df_train3.drop(columns=['SibSp', 'Parch'])

In [48]:
df_train4.head()

Do so for test data

In [46]:
# Same binary flags as for the training set: 0 when the source count is > 0,
# otherwise 1. The raw count columns are dropped afterwards.
for source_col, flag_col in (('SibSp', 'hasSibSp'), ('Parch', 'hasParch')):
    df_test2[flag_col] = (~df_test2[source_col].gt(0)).astype('int64')
df_test4 = df_test2.drop(columns=['SibSp', 'Parch'])

In [47]:
df_test4.head()

#### One-hot encoding for categorical features

In [49]:
cat_cols = ['Pclass', 'Embarked', 'Title']
df_train4 = pd.get_dummies(df_train4 , columns=cat_cols)
df_train4

In [50]:
df_test4 = pd.get_dummies(df_test4, columns=cat_cols)
# One-hot encoding each split separately only yields identical columns when both
# splits contain exactly the same categories. Align the test frame to the
# training feature columns (same set, same order): a category missing from the
# test set becomes an all-zero column, and one unseen in training is dropped.
train_feature_cols = df_train4.drop(columns=['Survived']).columns
df_test4 = df_test4.reindex(columns=train_feature_cols, fill_value=0)
df_test4

#### Scaling the data with MinMaxScaler

In [52]:
from sklearn.preprocessing import MinMaxScaler

In [53]:
# Fit the scaler on the training features only (target column excluded) and
# scale them in the same step; the fitted scaler is reused for the test set.
train_features = df_train4.drop(columns=['Survived'])
minmaxScaler = MinMaxScaler()
arr_train_scaled = minmaxScaler.fit_transform(train_features)
arr_train_scaled.shape

Scale test data with the same scaler

In [54]:
arr_test_scaled = minmaxScaler.transform(df_test4)
arr_test_scaled.shape

In [55]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


from sklearn.model_selection import cross_val_score

In [56]:
# Candidate classifiers. Only construction happens here; fitting and scoring
# are done in later cells.

# Tree-based models (shared seed).
dtc = DecisionTreeClassifier(random_state=12)
rfc = RandomForestClassifier(random_state=12)

# Small neural network: one hidden layer with 15 units.
nn = MLPClassifier(
    hidden_layer_sizes=(15,),
    solver='lbfgs',
    alpha=1,
    random_state=1,
)

# Linear / kernel models.
logR = LogisticRegression(max_iter=30000)
svc = SVC(C=4)

# Gaussian process with a scaled RBF kernel.
kernel = 1.0 * RBF(1.0)
gpc = GaussianProcessClassifier(kernel=kernel, random_state=0)

In [57]:
y = df_train4['Survived']
X = arr_train_scaled

In [58]:
print(X.shape)
print(y.shape)

#### Logistic Regression

In [62]:
cross_val_score(logR, X, y, cv=4)

#### Random Forest Classifier

In [60]:
cross_val_score(rfc, X, y, cv=4)

#### Neural Network

In [61]:
cross_val_score(nn, X, y, cv=4)

#### Gaussian Process Classifier

In [63]:
cross_val_score(gpc, X, y, cv=4)

#### Define predictor and CSV helper functions

In [64]:
def predictor(model, X, y, test_data):
    """Fit ``model`` on ``(X, y)`` and return its predictions for ``test_data``.

    ``model`` is fitted in place via ``model.fit`` and must also expose
    ``predict``; whatever ``model.predict(test_data)`` returns is passed
    through unchanged.
    """
    model.fit(X, y)
    return model.predict(test_data)
    
def csvTransformer(df_test_o, y_pred, filename):
    """Write a two-column submission CSV and print a confirmation.

    The output has the ``PassengerId`` column taken from ``df_test_o`` and a
    ``Survived`` column holding ``y_pred``; it is written to ``filename``
    without the index. Returns ``None``.
    """
    submission = pd.DataFrame({'PassengerId': df_test_o['PassengerId']})
    submission = submission.assign(Survived=y_pred)
    submission.to_csv(filename, index=False)
    print('Saved')

In [70]:
# Fit logistic regression on the full scaled training set, predict the scaled
# test set, and write the predictions to the Kaggle working directory.
y_pred = predictor(model=logR, X=X, y=y, test_data=arr_test_scaled)
csvTransformer(
    df_test_o=df_test_o,
    y_pred=y_pred,
    filename='/kaggle/working/prediction.csv',
)