# Titanic Prediction with EDA and Random Forest Hyper-parameter tuning



In [None]:
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.ensemble  import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the Kaggle Titanic train and test CSV files into DataFrames.
train_path = "/kaggle/input/titanic/train.csv"
test_path = "/kaggle/input/titanic/test.csv"
train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# Print the column dtypes and non-null counts of the training data.
train_data.info()

### Information about the data
PassengerID - Passenger's identification \
Survived - Whether the passenger survived the disaster \
Pclass - Ticket class \
Name - Name of passenger \
Sex - Gender of passenger \
Age - Age of passenger at the time \
SibSp - Number of siblings or spouses aboard with the passenger \
Parch - Number of parents or children aboard with the passenger \
Ticket - Ticket number \
Fare - Price paid for ticket \
Cabin - Cabin number \
Embarked - Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

# Analysis of Different Categorical Variables and their impact on Survival Rate

In [None]:
# Draw one seaborn bar plot of 'Survived' against each categorical feature
# of the training data, laid out on a shared 2x4 subplot grid.
columns = ["Pclass", "Sex", "SibSp", "Parch", 'Embarked']

plt.figure(figsize=(20, 10))

# enumerate(start=1) yields the 1-based subplot position directly, replacing
# the hand-maintained counter.
for count, col in enumerate(columns, start=1):
    plt.subplot(2, 4, count)
    sns.barplot(data=train_data, x=col, y='Survived')

# Age Analysis

In [None]:
# Box plot of Age, followed by a distribution plot of Age with a KDE overlay.
sns.boxplot(x='Age', data = train_data)
sns.displot(data=train_data, x='Age', kde=True)

## Age analysis 

#### The Age column in the train data has 177 null values, and the test data also contains null values. So we will do the following:
* First, separate the data for men and women.
* Then, for men and for women, separate the data by Passenger Class.
* For each Passenger Class for **Men**, calculate the mean, median, mode, and standard deviation of the Age data.
* Do the same for **Women**.
* Define a Gaussian (normal) distribution for each of the 6 (mean, standard deviation) pairs, i.e., one per Passenger Class for Men and for Women.
* Replace each NaN value with a random value drawn from the corresponding Gaussian distribution.
* After filling in the NaN values, divide the Age data into different Age Groups.

In [None]:
# Columns carried over when the training passengers are split by sex.
sex_split_cols = ['PassengerId', 'Survived', 'Pclass', 'Age', "SibSp", "Parch", "Fare", "Embarked"]

is_male = train_data.Sex == 'male'
is_female = train_data.Sex == 'female'

# Row mask and column list applied in a single .loc call per sex.
men = train_data.loc[is_male, sex_split_cols]
women = train_data.loc[is_female, sex_split_cols]

# women.head()

In [None]:
# Split men and women by passenger class, then print the mean, median, mode
# and standard deviation of Age for every (sex, class) combination.
class_cols = ['PassengerId', 'Survived', 'Age', "SibSp", "Parch", "Fare", "Embarked"]

menP1, menP2, menP3 = (men.loc[men.Pclass == pclass, class_cols] for pclass in (1, 2, 3))
womenP1, womenP2, womenP3 = (women.loc[women.Pclass == pclass, class_cols] for pclass in (1, 2, 3))

# One output template per passenger class, in class order.
row_templates = (
    "1\t {:.2f}\t {:.2f}\t {:.2f}\t {:.2f}",
    "2\t {:.2f}\t {:.2f}\t {:.2f}\t {:.2f}",
    "3\t {:.2f}\t {:.2f}\t {:.2f}\t {:.2f}",
)


def _print_age_table(class_frames):
    """Print the table header and one Age-statistics row per class frame."""
    print("Pclass\t Mean\t Median\t Mode\t Std_deviation")
    for template, frame in zip(row_templates, class_frames):
        ages = frame.Age
        # mode() returns a Series; [0] takes its first entry.
        print(template.format(ages.mean(), ages.median(), ages.mode()[0], ages.std()))


print("For Men:\n")
_print_age_table((menP1, menP2, menP3))

print("\n----------------------------------------")

print("For Women:\n")
_print_age_table((womenP1, womenP2, womenP3))

In [None]:
# menP1_randAge = np.random.normal(41.28, 15.14, size=menP1["Age"].isnull().sum())

# menP1_randAge = np.random.normal(41.28, 15.14)
# menP2_randAge = np.random.normal(30.74, 14.79)
# menP3_randAge = np.random.normal(26.51, 12.16)


# womenP1_randAge = np.random.normal(34.61, 13.61)
# womenP2_randAge = np.random.normal(28.72, 12.87)
# womenP3_randAge = np.random.normal(21.75, 12.73)

### Replace NaN value of Age in TRAIN data using Gaussian Kernel

In [None]:
# Impute the missing Age values of the TRAIN data. Every missing age is drawn
# from a normal distribution whose (mean, std) is looked up by the passenger's
# sex and passenger class.
age_params = {
    ("male", 1): (41.28, 15.14),
    ("male", 2): (30.74, 14.79),
    ("male", 3): (26.51, 12.16),
    ("female", 1): (34.61, 13.61),
    ("female", 2): (28.72, 12.87),
    ("female", 3): (21.75, 12.73),
}

# Computed once, before any value is filled in.
age_missing = train_data["Age"].isnull()
index_NaN_age = list(train_data.index[age_missing])

for (sex, pclass), (mean_age, std_age) in age_params.items():
    # Label-based boolean mask: correct for any index, unlike mixing
    # positional .iloc reads with label-based .loc writes.
    group = age_missing & (train_data["Sex"] == sex) & (train_data["Pclass"] == pclass)
    draws = np.random.normal(mean_age, std_age, size=int(group.sum()))
    # A Gaussian draw can be negative, which is not a valid age; clamp at 0.
    train_data.loc[group, "Age"] = np.clip(draws, 0, None)

### Replace NaN value of Age in TEST data using Gaussian Kernel

In [None]:
# Impute the missing Age values of the TEST data. Every missing age is drawn
# from a normal distribution whose (mean, std) is looked up by the passenger's
# sex and passenger class.
age_params = {
    ("male", 1): (41.28, 15.14),
    ("male", 2): (30.74, 14.79),
    ("male", 3): (26.51, 12.16),
    ("female", 1): (34.61, 13.61),
    ("female", 2): (28.72, 12.87),
    ("female", 3): (21.75, 12.73),
}

# Computed once, before any value is filled in.
age_missing = test_data["Age"].isnull()
index_NaN_age = list(test_data.index[age_missing])

for (sex, pclass), (mean_age, std_age) in age_params.items():
    # Label-based boolean mask: correct for any index, unlike mixing
    # positional .iloc reads with label-based .loc writes.
    group = age_missing & (test_data["Sex"] == sex) & (test_data["Pclass"] == pclass)
    draws = np.random.normal(mean_age, std_age, size=int(group.sum()))
    # A Gaussian draw can be negative, which is not a valid age; clamp at 0.
    test_data.loc[group, "Age"] = np.clip(draws, 0, None)
            
# test_data.info()

### Group TRAIN and TEST data into Age Groups

##### We will divide the ages into groups. Using 5 groups produced more errors, as the model could not settle on a good bias for each group, so we will use 3 groups instead, which increases the accuracy. The fewer the groups, the better the chance of predicting correctly.

In [None]:
# Bucket Age into three groups: 1 = up to 16, 2 = over 16 and up to 60, 3 = over 60.
train_data.loc[(train_data['Age'] <= 16), 'Age_group'] = 1
train_data.loc[(train_data['Age'] > 16) & (train_data['Age'] <= 60), 'Age_group'] = 2
train_data.loc[(train_data['Age'] > 60), 'Age_group'] = 3

# Select 'Survived' before aggregating: calling .mean() on the whole grouped
# frame raises TypeError on pandas >= 2.0 because of the non-numeric columns.
train_data.groupby('Age_group')['Survived'].mean()

In [None]:
# Summary statistics of 'Survived' within each age group.
train_data.groupby('Age_group')['Survived'].describe()

In [None]:
# Bucket the test-set Age into the same three groups used for the train data.
test_age = test_data['Age']
test_data.loc[test_age <= 16, 'Age_group'] = 1
test_data.loc[test_age.gt(16) & test_age.le(60), 'Age_group'] = 2
test_data.loc[test_age > 60, 'Age_group'] = 3

# test_data.groupby('Age_group').describe()

# Combine Train and Test Data

In [None]:
# Give the test rows a placeholder target so both frames share the same columns.
test_data_copy = test_data.assign(Survived=np.nan)

# Data = Train data + Test data, stacked under a fresh 0..n-1 index.
data = pd.concat([train_data, test_data_copy], ignore_index=True)

# Analysis of Fare attribute to improve Performance

In [None]:
# Distribution of the raw Fare values (train + test) with a KDE overlay.
sns.displot(data['Fare'], kde=True)

### Ticket Frequency

The Train and Test Data contain several entries with the same Ticket number. That is because people from the same family, or people travelling in a group, purchased one group ticket. That group ticket and its Fare (which is the total Fare of the group ticket) are recorded on the row of every member of that Family/Group, which distorts the data. So we will count the Ticket Frequency of each ticket, and then divide the total Fare by the Ticket Frequency.

In [None]:
# Summary statistics of the Ticket column per ticket number (first few tickets only).
data.groupby('Ticket')['Ticket'].describe().head()

In [None]:
# Number of rows that share each ticket number, mapped back onto every row.
ticket_counts = data['Ticket'].value_counts()
data['Ticket_Frequency'] = data['Ticket'].map(ticket_counts)

data.head()

### We divide the Fare with Ticket Frequency. Then Plot the data with KDE

In [None]:
# Convert the per-ticket fare to a per-passenger fare.
data["Fare"] = data["Fare"].div(data["Ticket_Frequency"])

In [None]:
# Histogram of the per-passenger Fare with a KDE overlay.
sns.histplot(data, x='Fare', kde=True)

#### Here PassengerId 1044 has a null value in the Fare attribute, so we need to fill this null value with the mean value of Fare.


In [None]:
# Print the column dtypes and non-null counts of the combined data.
data.info()

In [None]:
# Replace missing Fare values with the mean Fare of the combined data.
# (The bare row lookup that used to precede this was never displayed and had
# no effect, so it is dropped.)
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

In [None]:
# Log-transform every non-zero Fare; zero fares are left at 0 because log(0)
# is undefined.
fare_idx = data.index[data['Fare'] != 0]

# One vectorised .loc assignment instead of per-row chained indexing
# (data['Fare'][idx] = ...), which does not write back to `data` under pandas
# Copy-on-Write and previously needed a blanket warnings filter to stay quiet.
data.loc[fare_idx, 'Fare'] = np.log(data.loc[fare_idx, 'Fare'])

In [None]:
# Histogram of the log-transformed Fare with a KDE overlay.
sns.histplot(data['Fare'], kde=True)

### Divide Fare into different Fare Categories. (Preferably 3 groups).  

#### Having 5 Fare groups was bad at predicting. Tested Previously.

In [None]:
# Bucket the (log-scale) Fare into three categories:
# 1 = [0, 2.1], 2 = (2.1, 3], 3 = above 3.
fare = data['Fare']
data.loc[fare.ge(0) & fare.le(2.1), 'Fare_Category'] = 1
data.loc[fare.gt(2.1) & fare.le(3), 'Fare_Category'] = 2
data.loc[fare.gt(3), 'Fare_Category'] = 3

# Summary statistics of 'Survived' within each fare category.
data.groupby('Fare_Category')['Survived'].describe()

In [None]:
# Show the rows whose Embarked value is missing.
data[data['Embarked'].isnull()]

#### PassengerId = 62 and 829 have Embarked = NaN, and we can see that they both survived. Since, among the Embarked values (S, C & Q), C has the highest survival probability, we will replace these two missing Embarked values with C.

In [None]:
# Fill every missing port of embarkation with 'C'.
data['Embarked'] = data['Embarked'].fillna('C')

# Test the impact of Different Categorical Variables on Survival

In [None]:
# Draw one seaborn bar plot of 'Survived' against each categorical feature
# of the combined data, laid out on a shared 2x4 subplot grid.
columns = ["Pclass", "Sex", "SibSp", "Parch", "Age_group", "Fare_Category", 'Embarked']

plt.figure(figsize=(20, 10))

# enumerate(start=1) yields the 1-based subplot position directly, replacing
# the hand-maintained counter.
for count, col in enumerate(columns, start=1):
    plt.subplot(2, 4, count)
    sns.barplot(data=data, x=col, y='Survived')

# SibSp and Parch Grouping

* **SibSp has 9 Categories**
* **Parch has 10 Categories**

##### We will try to minimize the number of categories in SibSp and Parch. This will improve the performance

### SibSp Grouping

In [None]:
# Collapse SibSp into four categories: 0 -> 0, 1-2 -> 1, 3 -> 2, above 3 -> 3.
sibsp = data['SibSp']
data.loc[sibsp.eq(0), 'SibSp_Category'] = 0
data.loc[sibsp.gt(0) & sibsp.le(2), 'SibSp_Category'] = 1
data.loc[sibsp.gt(2) & sibsp.le(3), 'SibSp_Category'] = 2
data.loc[sibsp.gt(3), 'SibSp_Category'] = 3

# Summary statistics of 'Survived' within each SibSp category.
data.groupby('SibSp_Category')['Survived'].describe()

In [None]:
# Bar plot of 'Survived' against the grouped SibSp category.
sns.barplot(data = data, x = 'SibSp_Category', y = 'Survived')

### Parch Grouping

In [None]:
# Collapse Parch into three categories: 0 -> 0, 1-3 -> 1, above 3 -> 2.
parch = data['Parch']
data.loc[parch.eq(0), 'Parch_Category'] = 0
data.loc[parch.gt(0) & parch.le(3), 'Parch_Category'] = 1
data.loc[parch.gt(3), 'Parch_Category'] = 2

# Summary statistics of 'Survived' within each Parch category.
data.groupby('Parch_Category')['Survived'].describe()

In [None]:
# Bar plot of 'Survived' against the grouped Parch category.
sns.barplot(data = data, x = 'Parch_Category', y = 'Survived')

# Final Categorical Variables

## Plotting against Survival Rate

In [None]:
# Draw one seaborn bar plot of 'Survived' against each final model feature
# of the combined data, laid out on a shared 2x4 subplot grid.
columns = ["Pclass", "Sex", "SibSp_Category", "Parch_Category", "Age_group", "Fare_Category", 'Embarked']

plt.figure(figsize=(20, 10))

# enumerate(start=1) yields the 1-based subplot position directly, replacing
# the hand-maintained counter.
for count, col in enumerate(columns, start=1):
    plt.subplot(2, 4, count)
    sns.barplot(data=data, x=col, y='Survived')

## Separate Train and Test Data

In [None]:
# The first len(train_data) rows of `data` are the train rows; the rest are test rows.
n_train = len(train_data)
train_data = data.iloc[:n_train]
test_data = data.iloc[n_train:]

In [None]:
# Remove the placeholder target column from the test rows.
test_data = test_data.drop(columns=['Survived'])

# test_data.info()

# RF Model and Hyper-Parameter Tuning

In [None]:
# Tune a random forest with an exhaustive, 5-fold cross-validated grid search
# over the engineered features, timing the whole step.
start_time = datetime.now()

features = ["Pclass", "Sex", "SibSp_Category", "Parch_Category", "Age_group", "Fare_Category", "Embarked"]

# One-hot encode the feature columns that pandas treats as categorical.
X_train = pd.get_dummies(train_data[features])
Y_train = train_data["Survived"]

# NOTE: rows 800+ are also part of the data the grid search is fitted on, so
# this slice is an in-sample sanity check, not a held-out validation set.
X_valid, Y_valid = X_train[800:], Y_train[800:]

param = [
    {
        'n_estimators': [100, 200, 300, 400, 450, 500],
        'max_depth': [3, 4, 6, 8, 10, 12],
        'max_leaf_nodes': [15, 20, 25],
    },
]

rfc = RandomForestClassifier()
gs_rfc = GridSearchCV(rfc, param, cv=5, n_jobs=-1, verbose=1)
gs_rfc.fit(X_train, Y_train)

end_time = datetime.now()

# Displayed: best estimator, mean cross-validated score of the best parameter
# set (the only out-of-sample figure here), then the in-sample scores on the
# rows-800+ slice and on the full training data.
gs_rfc.best_estimator_, gs_rfc.best_score_, gs_rfc.score(X_valid, Y_valid), gs_rfc.score(X_train, Y_train)

In [None]:
# Report the elapsed time between start_time and end_time.
print("Runtime: ", (end_time - start_time))

# Prediction on Test Data

In [None]:
# Encode the test features, predict with the tuned model, and write the
# submission file.
X_test = pd.get_dummies(test_data[features])

# Align the test design matrix with the training one: a category missing from
# the test rows becomes an all-zero column, and columns are put in training order.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

predictions = gs_rfc.predict(X_test)

output4 = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions.astype(int)})

output4.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")