In [None]:
import numpy as np
import pandas as pd
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier

# Features

Load the data and select only the features that are not redundant, don't have too many missing values and affect the output significantly.

In [None]:
# Read the training and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv', '/kaggle/input/titanic/test.csv')
)

In [None]:
# Print each column's dtype and non-null count to spot features with missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


PassengerId is just a serial number and Cabin has too many missing values. The name of a person only gives information about gender (through honorifics - Mr, Mrs, Miss, etc.) and family size (through the surname); however, we already know both of these through other features (Sex, and SibSp and Parch). Ticket only gives information about the group size of a person, obtained by counting the number of people with the same ticket, but in most cases it will be equal to the family size.

The other features might affect the survivability significantly. Let's check:

In [None]:
# Sex

# Percentage of survivors among the rows of each 'Sex' value.
sex = train_data['Sex']
women = train_data.loc[sex == 'female', 'Survived']
women_rate = sum(women) / len(women)
print("Women: ", women_rate * 100)
men = train_data.loc[sex == 'male', 'Survived']
men_rate = sum(men) / len(men)
print("Men: ", men_rate * 100)

Women:  74.20382165605095
Men:  18.890814558058924


In [None]:
# Embarked

# Percentage of survivors for each 'Embarked' code (S, C, Q).
port = train_data['Embarked']
southampton = train_data.loc[port == 'S', 'Survived']
southampton_rate = sum(southampton) / len(southampton)
print("Southampton: ", southampton_rate * 100)
cherbourg = train_data.loc[port == 'C', 'Survived']
cherbourg_rate = sum(cherbourg) / len(cherbourg)
print("Cherbourg: ", cherbourg_rate * 100)
queenstown = train_data.loc[port == 'Q', 'Survived']
queenstown_rate = sum(queenstown) / len(queenstown)
print("Queenstown: ", queenstown_rate * 100)

Southampton:  33.69565217391305
Cherbourg:  55.35714285714286
Queenstown:  38.961038961038966


In [None]:
# Pclass

# Percentage of survivors for each 'Pclass' value (1, 2, 3).
pclass = train_data['Pclass']
class_1 = train_data.loc[pclass == 1, 'Survived']
class_1_rate = sum(class_1) / len(class_1)
print("Class 1: ", class_1_rate * 100)
class_2 = train_data.loc[pclass == 2, 'Survived']
class_2_rate = sum(class_2) / len(class_2)
print("Class 2: ", class_2_rate * 100)
class_3 = train_data.loc[pclass == 3, 'Survived']
class_3_rate = sum(class_3) / len(class_3)
print("Class 3: ", class_3_rate * 100)

Class 1:  62.96296296296296
Class 2:  47.28260869565217
Class 3:  24.236252545824847


In [None]:
# Age

# Percentage of survivors in three age bands: under 15, 15 to 60 inclusive, over 60.
# Rows with a missing Age fail every comparison and so fall into no band.
age = train_data['Age']
young = train_data.loc[age < 15, 'Survived']
young_rate = sum(young) / len(young)
print("Young: ", young_rate * 100)
middle = train_data.loc[(age <= 60) & (age >= 15), 'Survived']
middle_rate = sum(middle) / len(middle)
print("Middle: ", middle_rate * 100)
old = train_data.loc[age > 60, 'Survived']
old_rate = sum(old) / len(old)
print("Old: ", old_rate * 100)

Young:  57.692307692307686
Middle:  39.08794788273616
Old:  22.727272727272727


In [None]:
# SibSp

# Percentage of survivors for each distinct 'SibSp' value.
# sorted() gives a deterministic ascending order; iterating a set gives no
# ordering guarantee, and unique() already removes duplicates.
sibsp = train_data['SibSp']
for num in sorted(sibsp.unique()):
    survived = train_data.loc[sibsp == num, 'Survived']
    print(f"{num}:", sum(survived)/len(survived)*100)

0: 34.53947368421053
1: 53.588516746411486
2: 46.42857142857143
3: 25.0
4: 16.666666666666664
5: 0.0
8: 0.0


In [None]:
# Parch

# Percentage of survivors for each distinct 'Parch' value.
# sorted() gives a deterministic ascending order; iterating a set gives no
# ordering guarantee, and unique() already removes duplicates.
parch = train_data['Parch']
for num in sorted(parch.unique()):
    survived = train_data.loc[parch == num, 'Survived']
    print(f"{num}:", sum(survived)/len(survived)*100)

0: 34.365781710914455
1: 55.08474576271186
2: 50.0
3: 60.0
4: 0.0
5: 20.0
6: 0.0


In [None]:
# Fare

# Percentage of survivors in three fare bands: up to 14, above 14 up to 31, above 31.
fare = train_data['Fare']
poor = train_data.loc[fare <= 14, 'Survived']
poor_rate = sum(poor) / len(poor)
print("Poor: ", poor_rate * 100)
middle = train_data.loc[(fare <= 31) & (fare > 14), 'Survived']
middle_rate = sum(middle) / len(middle)
print("Middle: ", middle_rate * 100)
rich = train_data.loc[fare > 31, 'Survived']
rich_rate = sum(rich) / len(rich)
print("Rich: ", rich_rate * 100)

Poor:  25.400457665903893
Middle:  43.96551724137931
Rich:  58.108108108108105


As is clear, the features that need to be selected are Sex, Pclass, Age, SibSp, Parch and Fare. Embarked needs to be dropped.

Sex: 74% of females survived while only 19% males survived. This makes sense - women are given higher priority in such situations.

Pclass: 63% Class 1 passengers survived and the survivability went down with Class 2 and 3. This makes sense - the affluent are given higher priority in such situations.

Age: 58% of children (under 15) survived, with survivability decreasing as age increases. This makes sense - children are given higher priority in such situations.

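SibSp & Parch: Survivability dropped sharply for large families (3 or more siblings/spouses, or 4 or more parents/children), although passengers travelling with no relatives at all also fared worse than those with one or two. This makes sense - people with large families will prioritise their family members, especially children.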

Fare: The rich (people who bought more expensive tickets) had better survivability. Again, this correlates with Pclass.

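Embarked: The port where a passenger boarded - Southampton (England), Cherbourg (France) or Queenstown (Ireland, not New Zealand) - is at best a rough proxy for nationality, and it doesn't seem to affect survivability as consistently as the other features.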

And that's all for the features. Don't try to make this too complicated with sophisticated feature engineering - this problem is one of the simplest on this site and is meant to be dealt with in a simple manner. Feature engineering works but a lot of it increases the score only a little and may even decrease the score if it isn't done properly.

In [None]:
# Keep only the chosen feature columns; the training frame also keeps the target
# as its first column.
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
train_data = train_data[['Survived'] + feature_columns]
test_data = test_data[feature_columns]

# Preprocessing

Split into x_train and y_train, and initialize the preprocessors. The categorical feature - Sex - is One Hot Encoded and one of the resulting columns is dropped to prevent linear dependence of features (by looking at whether someone is male or not, you can determine whether he/she is female or not - it doesn't give any extra information). The feature with missing values - Age - is imputed in a principled manner with Iterative Imputation. Feature scaling isn't required since I will be using a tree-based method - Random Forest.

It's always a good idea to do all preprocessing through a pipeline - it reduces the amount of code, makes it easier to understand and also provides the option of tuning the hyperparameters of all preprocessors and the estimator at once.

Once again, don't try to make this too complicated. No need to remove outliers, transform features to uniform/normal distribution or handle class rebalancing with oversampling. Such things make the dataset too complicated to make a good estimation.

In [None]:
# Separate the target (first column) from the feature columns.
x_train = train_data.iloc[:, 1:]
y_train = train_data.iloc[:, 0]
x_test = test_data.iloc[:, :]

# One-hot encode the object-typed columns, dropping the first category of each;
# all remaining columns pass through unchanged.
# Dense output is requested with the column transformer's sparse_threshold=0
# instead of OneHotEncoder's `sparse` flag: newer scikit-learn releases renamed
# that flag to `sparse_output` and later removed it, so `sparse=False` fails there.
categorical_columns = x_train.select_dtypes(include='object').columns
encoder = make_column_transformer(
    (OneHotEncoder(drop='first'), categorical_columns),
    remainder='passthrough',
    sparse_threshold=0,
    verbose_feature_names_out=False,
)

# Encode first, then impute the remaining missing values.
pipe = make_pipeline(encoder, IterativeImputer(random_state=42))

In [None]:
# Fit the pipeline on the training features, then reuse the fitted steps on the
# test features. The column names come from the encoder once it has been fitted.
train_values = pipe.fit_transform(x_train)
x_train = pd.DataFrame(train_values, columns=encoder.get_feature_names_out())
test_values = pipe.transform(x_test)
x_test = pd.DataFrame(test_values, columns=encoder.get_feature_names_out())

In [None]:
# Print dtypes and non-null counts of the preprocessed training features.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Sex_male  891 non-null    float64
 1   Pclass    891 non-null    float64
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    float64
 4   Parch     891 non-null    float64
 5   Fare      891 non-null    float64
dtypes: float64(6)
memory usage: 41.9 KB


# Estimation

As has been said before, keep it simple. Don't use something like XGBoost or Neural Networks (Multi-layer Perceptron). Something simple like Random Forest or SVC will be more than enough. The tutorial used Random Forest and that's what I will be using too.

First of all, do some hyperparameter tuning with GridSearchCV and use the best estimator.

In [None]:
# rf = RandomForestClassifier(random_state=1)
# params = {'n_estimators': [50, 100, 150, 200],
#           'max_depth': [2, 3, 4, 5]}
# search = GridSearchCV(rf, params)
# search.fit(x_train, y_train)
# search.best_params_

# max_depth = 5, n_estimators = 150

In [None]:
# rf = RandomForestClassifier(n_estimators=150, max_depth=5, random_state=1)
# rf.fit(x_train, y_train)
# y_pred = rf.predict(x_test)

# score = 0.78468

Unfortunately, the above 'best' estimator didn't give the best results. The best results were actually obtained by the estimator used in the tutorial. This is because of the difference in training and evaluation sets.

In [None]:
# Train a random forest (100 trees, depth capped at 5, fixed seed) and predict
# the labels for the test features. fit() returns the estimator itself.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=1,
).fit(x_train, y_train)
y_pred = rf.predict(x_test)

# score = 0.78708

The score of 0.799 was obtained in a crude manner - during preprocessing, both features for Sex_male and Sex_female were retained and missing values were all imputed with -1. It gave the best score but didn't make statistical sense.

# Submission

Load the sample submission and build your submission file in the same format.

In [None]:
# Display the sample submission to see the expected column layout.
pd.read_csv('/kaggle/input/titanic/gender_submission.csv')

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
# Take the passenger ids from the test file instead of hard-coding their range,
# so each prediction stays paired with the row it was made for even if the test
# set changes. The file is re-read here because the feature selection earlier
# dropped the PassengerId column from test_data.
passenger_ids = pd.read_csv(
    '/kaggle/input/titanic/test.csv', usecols=['PassengerId']
)['PassengerId']

# Two columns in the sample-submission order: PassengerId, then Survived.
df = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': y_pred})
df.to_csv('submission.csv', index=False)