

# **What will we cover?**  
##  *This code will help you get started with the problem and will also focus on getting some key insights into the data*
##  *It also captures RF model with CV and submission file*

##   *Contents*
  
### 1. Data Understanding  
### 2. Data Cleaning  
### 3. Data manipulation and preprocessing  
### 4. Exploratory Data Analysis (Univariate and Bivariate analysis)  + Visualizations
### 5. Model
### 6. Submission

![](https://preview.redd.it/0izq0428pe661.jpg?width=960&format=pjpg&auto=webp&s=15022053715fc50198a17c401be035445592fee2)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing important packages

In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Read the data

In [None]:
# Load the competition's train.csv and test.csv from the Kaggle input directory
df_train = pd.read_csv("../input/tabular-playground-series-apr-2021/train.csv")
df_test = pd.read_csv("../input/tabular-playground-series-apr-2021/test.csv")

## Check the records in the data
Survived is the target variable, while other variables are the raw features in the train data.

In [None]:
# Check top 10 records of train data
df_train.head(10)

In [None]:
# Check top 10 records of test data
df_test.head(10)

## % of passengers who survived - 42.8%

In [None]:
# Share of each Survived class in the training data (class counts divided by the row count)
df_train.Survived.value_counts()/ df_train.shape[0]

### A good strategy to understand features as a whole would be to combine training and testing data and then perform univariate analysis
### So let's do it!!

In [None]:
# Placeholder target for the test rows (1 for every passenger) so that both
# frames expose the same columns; it is dropped again before prediction.
df_test['Survived'] = 1

# Tag every row with its origin so the two sets can be separated again later
df_train['Set'] = "Train"
df_test['Set'] = "Test"

# Stack train on top of test and give the result a fresh 0..n-1 index
df_comp = pd.concat([df_train, df_test], ignore_index=True)
df_comp.index.nunique()

# Check Missing Values of the features in the complete data

## 1. Age ~3.38% missing values  
## 2. Ticket ~ 4.9% missing values
## 3. Fare ~ 0.13% missing values
## 4. Cabin ~ 69.34% missing values
## 5. Embarked ~ 0.26% missing values

In [None]:
# Fraction of missing values in every column of the combined data
missing_share = df_comp.isna().mean()
print("Missing Values in the data \n", missing_share)

# Treating Missing values

### Age  
#### Treat by Median

In [None]:
# Summary statistics and histogram of Age (missing values are still present here)
age = df_comp["Age"]
print(age.describe())
sns.displot(age)

# Impute the missing ages with the median age
df_comp["Age"] = age.fillna(age.median())

In [None]:
# Create Age Bucket
# Use explicit decade edges so that each label describes the ages it really
# contains.  Passing an integer to pd.cut splits the *observed* age range into
# equal-width bins, whose edges do not fall on multiples of ten, so the labels
# "0-9", "10-19", ... would not match the rows assigned to them.
# Intervals are closed on the left: [0, 10), [10, 20), ..., [80, inf).
# The last bucket is open-ended so that no age is left unbucketed.
age_edges = list(range(0, 90, 10)) + [np.inf]
df_comp["Age_bucket"] = pd.cut(df_comp["Age"], bins=age_edges, right=False,
                                  labels=["0-9","10-19","20-29","30-39","40-49","50-59","60-69","70-79","80-89"])

### Fare
#### Multimodal skewed distribution. This could be due to the class of the passengers  
#### It is worth to check if Fare is dependent on Pclass and then do the treatment of missing values as per the passenger class
#### Also because of heavy tail and outliers, we may have to perform a log transform to control the variation in Fare
#### The Fare also has some values < 1, which means ln(Fare) can go negative. So we could do ln(1+Fare) transformation

In [None]:
# Distribution of Fare
print(df_comp.Fare.describe())
sns.displot(df_comp.Fare)

# Highly skewed distribution for Fare and seems multimodal histogram

In [None]:
# Clearly, the fare is dependent on Passenger Class
# (box plot of ln(1 + Fare) for each Pclass)
sns.boxplot(x= df_comp.Pclass, y= np.log(1+df_comp.Fare))

In [None]:
# Median of ln(1 + Fare) for each passenger class, as a {Pclass: median} dict.
# It is used to fill in missing fares class by class.
log_fare = np.log(1 + df_comp.Fare)
FareByClass = log_fare.groupby(df_comp.Pclass).median().to_dict()
FareByClass

In [None]:
df_comp.Pclass.map(FareByClass)

In [None]:
# Get a column with log of fare: ln(1 + Fare), NaN wherever Fare is missing
df_comp['LnFare'] = np.log(1+df_comp['Fare'])

# Replace missing values with the median log fare of the passenger's class.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df_comp['LnFare'] acts on a temporary selection and, with pandas
# Copy-on-Write, leaves df_comp unchanged.
df_comp['LnFare'] = df_comp['LnFare'].fillna(df_comp.Pclass.map(FareByClass))

# Validating if missing values are treated: rows where Fare was missing
df_comp.loc[df_comp.Fare.isna(), ['Fare', 'LnFare']]

## Passengers who paid more fare had higher chances of survival

In [None]:
# Bivariate: distribution of LnFare per survival outcome, training rows only
train_rows = df_comp[df_comp.Set == "Train"]
sns.boxplot(x=train_rows.Survived, y=train_rows.LnFare)

### Embarked
#### Can be replaced by mode value of embarked which is 'S'

In [None]:
# Show how often each port occurs, then fill the missing ports with 'S'
print(df_comp.Embarked.value_counts())
# Assign the filled column back: fillna(inplace=True) on an attribute/column
# selection acts on a temporary object and, with pandas Copy-on-Write, does
# not modify df_comp.
df_comp['Embarked'] = df_comp['Embarked'].fillna('S')

## Passengers who boarded the ship in 'Southampton' had the lowest chances of survival
## while those who boarded in Cherbourg had the highest chances of survival

In [None]:
# Bivariate: survival split per port of embarkation (training rows only),
# each bar normalised to 1 and the bars ordered by the share of survivors
train_rows = df_comp[df_comp.Set == "Train"]
embarked_survival = pd.crosstab(index=train_rows.Embarked,
                                columns=train_rows.Survived,
                                normalize='index')
embarked_survival.sort_values(by=1).plot.bar(figsize=(15, 7), stacked=True)

## Cabin
### Cabin is interesting with approx 70% missing values
### It may mean that 70% of the passengers did not have dedicated Cabin assigned.. maybe they were using some common/ shared rooms
### It will be interesting to see if passengers with no cabin alloted had lower chances of survival
### We also observe that the 1st letter of the cabin number may mean the Deck number (could be an important feature. Not done in this version of code)

In [None]:
# Let's create a new column CabinNotAlloted: 1 when Cabin is missing, 0 otherwise
df_comp['CabinNotAlloted'] = df_comp.Cabin.isna().astype(int)
# Share of passengers in each of the two groups
df_comp['CabinNotAlloted'].value_counts()/df_comp.shape[0]

## Passengers with no cabin alloted had relatively lower chances of survival

In [None]:
# Bivariate
# Passengers with no cabin alloted had relatively lower chances of survival
pd.crosstab(index = df_train.Cabin.isna(), columns= df_train.Survived, normalize = 'index').plot.bar(stacked = True)

In [None]:
# Let's also fetch the first char of the cabin (which could signify Deck)
df_comp['Deck'] = df_comp.Cabin.str[0:1]
print("Unique decks \n", df_comp['Deck'].unique())

# Replace missing deck by 'X'.  The filled column is assigned back because
# fillna(inplace=True) on df_comp['Deck'] acts on a temporary selection and,
# with pandas Copy-on-Write, leaves df_comp unchanged.
df_comp['Deck'] = df_comp['Deck'].fillna('X')

print("Passengers in each deck \n", df_comp['Deck'].value_counts())

## Ticket
### Some ticket numbers have some special alpha characters like A/5, CA, SC PARIS etc., while many are just numeric. May mean some categorization of Special class tickets
#### Let's call the ones with only numbers as XX tickets
### Let us try and do some text cleaning and feature extraction and then also put the missing values as 'XX'

In [None]:
# Check this out
set(df_comp.Ticket.map(lambda x:str(x).split()[0] if len(str(x).split()) > 1 else 'XX'))

In [None]:
# Derive TicketType from Ticket.  Missing tickets are first set to 'XX'; then a
# ticket made of several whitespace-separated tokens keeps its first token,
# and every single-token ticket becomes 'XX'.
def _ticket_prefix(ticket):
    """Return the first token of a multi-token ticket, otherwise 'XX'."""
    tokens = str(ticket).split()
    return tokens[0] if len(tokens) > 1 else 'XX'


df_comp['TicketType'] = df_comp.Ticket.fillna('XX').map(_ticket_prefix)
print(set(df_comp.TicketType))

In [None]:
# Let us do some text cleaning on TicketType
# Convert to lower
df_comp['TicketType'] = df_comp['TicketType'].str.lower()

# Get rid of dots and slash: drop every run of characters that is neither a
# word character nor whitespace.
import re
# The pattern is a raw string: in a normal literal "\w" and "\s" are invalid
# escape sequences, which newer Python versions warn about at compile time.
df_comp['TicketType'] = df_comp.TicketType.map(lambda x: re.sub(r"[^\w\s]+", "", x))
set(df_comp.TicketType.to_list())

In [None]:
# Let's check the freq of passengers
df_comp.TicketType.value_counts()/ df_comp.shape[0]

In [None]:
pd.crosstab(index= df_comp.TicketType, columns= df_comp.Pclass, normalize='index' )

In [None]:
### df_comp['Pclass'].value_counts()/ df_comp.shape[0]

## The ticket type is related to Pclass 
## we also observe for the ticket types with less than 20% of 1st class passengers
###  - stono, stono2, sotono2, stonoq, aq3, a and a5 have death rates >= 80%
###  - sotonoq, fa, ca, fcc, scow, caston, wc and c have death rates >=65% and <80%
###  - rest are <65% death rate (>35% survival rate)  
  
    

### ** also most of the 'pc' ticket type passengers belong to first class


In [None]:
# Share of each passenger class within every ticket type, with the bars
# ordered by the first-class share
ticket_class_share = pd.crosstab(index=df_comp.TicketType,
                                 columns=df_comp.Pclass,
                                 normalize='index')
ticket_class_share.sort_values(by=1).plot.bar(figsize=(15, 7), stacked=True)

# Reference line at a 20% share
plt.axhline(y=0.2, color='r', linestyle='-')

In [None]:
# Bivariate: survival split per ticket type (training rows only), each bar
# normalised to 1 and the bars ordered by the share of survivors
train_rows = df_comp[df_comp.Set == "Train"]
ticket_survival = pd.crosstab(index=train_rows.TicketType,
                              columns=train_rows.Survived,
                              normalize='index')
ticket_survival.sort_values(by=1).plot.bar(figsize=(15, 7), stacked=True)

# Reference lines at 0.8 (red) and 0.65 (green)
plt.axhline(y=0.8, color='r', linestyle='-')
plt.axhline(y=0.65, color='g', linestyle='-')

In [None]:
# Bucket the ticket types: 'pc' -> 0, the first group below -> 3,
# the second group -> 2, and every other ticket type -> 1
ticket_bucket_lookup = {'pc': 0}
ticket_bucket_lookup.update(dict.fromkeys(
    ['stono', 'stono2', 'sotono2', 'stonoq', 'aq3', 'a', 'a5'], 3))
ticket_bucket_lookup.update(dict.fromkeys(
    ['sotonoq', 'fa', 'ca', 'fcc', 'scow', 'caston', 'wc', 'c'], 2))

df_comp['TT_bucket'] = df_comp.TicketType.map(
    lambda ticket_type: ticket_bucket_lookup.get(ticket_type, 1))

df_comp['TT_bucket'].value_counts()

## Ticket Type Bucket shows survival rate varies by ticket types

In [None]:
# Bivariate: survival split per ticket-type bucket (training rows only),
# each bar normalised to 1 and the bars ordered by the share of survivors
train_rows = df_comp[df_comp.Set == "Train"]
bucket_survival = pd.crosstab(index=train_rows.TT_bucket,
                              columns=train_rows.Survived,
                              normalize='index')
bucket_survival.sort_values(by=1).plot.bar(figsize=(15, 7), stacked=True)

In [None]:
# Encode Sex as an integer flag: 1 for 'male', 0 for anything else
df_comp['Sex'] = (df_comp['Sex'] == 'male').astype(int)

## Survival rate for women was higher

In [None]:
# Bivariate: survival split per Sex value (training rows only), each bar
# normalised to 1 and the bars ordered by the share of survivors
train_rows = df_comp[df_comp.Set == "Train"]
sex_survival = pd.crosstab(index=train_rows.Sex,
                           columns=train_rows.Survived,
                           normalize='index')
sex_survival.sort_values(by=1).plot.bar(figsize=(15, 7), stacked=True)

In [None]:
##combine the number of SibSp & Parch +1 to be FamilySize, a new feature synthesized:
df_comp['FamilySize'] = df_comp['SibSp'] + df_comp['Parch'] + 1 

In [None]:
def family_size(x):
    """Label a family size: "alone" when it equals 1, otherwise "notalone"."""
    return "alone" if x == 1 else "notalone"

In [None]:
df_comp["Group"] = df_comp["FamilySize"].apply(family_size)

## Being Alone or not did not make much difference

In [None]:
# Bivariate: survival split for the alone / notalone groups (training rows
# only), each bar normalised to 1 and the bars ordered by the share of survivors
train_rows = df_comp[df_comp.Set == "Train"]
group_survival = pd.crosstab(index=train_rows.Group,
                             columns=train_rows.Survived,
                             normalize='index')
group_survival.sort_values(by=1).plot.bar(stacked=True)

In [None]:
# Let's create a backup for our analytical data with all features
df_copy_comp = df_comp.copy()

## Drop the unnecessary columns

In [None]:
# Check column names
df_comp.columns


In [None]:
# Drop unnecessary columns
df_comp.drop(columns = ['Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'TicketType'], inplace= True)

In [None]:
# Check data
df_comp.head()

In [None]:
# Encode Group as an integer flag: 0 for "alone", 1 for everything else
df_comp['Group'] = (df_comp['Group'] != "alone").astype(int)

In [None]:
# Get dummies for Embarked
df_comp = pd.get_dummies(df_comp, columns= ['Embarked'])

In [None]:
# Get dummies for Age bucket
df_comp = pd.get_dummies(df_comp, columns= ['Age_bucket'])

In [None]:
# Label encode deck: replace each deck letter with an integer code
# Import label encoder
from sklearn import preprocessing

# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()

# Encode labels in column 'Deck'.
df_comp['Deck']= label_encoder.fit_transform(df_comp['Deck'])

# Show the distinct codes now present in the column
df_comp['Deck'].unique()

In [None]:
df_comp.head()

In [None]:
# Model
# Split the combined frame back into its training and test rows and drop the
# bookkeeping columns; the test rows also lose the placeholder Survived column.
train_mask = df_comp.Set == "Train"
test_mask = df_comp.Set == "Test"

new_train = df_comp[train_mask].drop(columns=["Set", "PassengerId"])
new_test = df_comp[test_mask].drop(columns=["Set", "PassengerId", "Survived"])

new_train.head()

In [None]:
# Create X feature set and y target
X=new_train.drop("Survived",axis=1).values
y = new_train.Survived.values

In [None]:
print(X.shape)
print(y.shape)

## CROSS VALIDATION

In [None]:
from sklearn.model_selection import KFold

# Creating 5 folds (samples); shuffling with a fixed random_state keeps the
# fold assignment the same on every run
kf = KFold(n_splits=5,random_state=42,shuffle=True)

# Train test split: 80% of the labelled rows go to X_train / y_train (the part
# the cross-validation below iterates over), 20% are held out
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=400)
# NOTE(review): X_test / y_test are not read by any code visible in this
# notebook - confirm whether the hold-out was meant to be scored.

In [None]:
from sklearn import ensemble
from sklearn import metrics
def cross_val_fn(n_trees):
    """Return the mean validation ROC AUC of a random forest over the ``kf`` folds.

    Parameters
    ----------
    n_trees : int
        Number of trees (``n_estimators``) of the forest fitted in each fold.

    Returns
    -------
    The mean of the per-fold AUC values.

    Reads the module-level ``kf``, ``X_train`` and ``y_train``.  Prints the
    row indices of every fold and, at the end, the mean AUC.
    """
    fold_scores = []
    for fit_rows, holdout_rows in kf.split(X_train):
        print("TRAIN:", fit_rows, "TEST:", holdout_rows)
        forest = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=n_trees, random_state=400)
        forest.fit(X_train[fit_rows], y_train[fit_rows])
        # AUC is computed from column 1 of predict_proba, not from hard labels
        holdout_proba = forest.predict_proba(X_train[holdout_rows])[:, 1]
        fold_scores.append(metrics.roc_auc_score(y_train[holdout_rows], holdout_proba))
    mean_auc = np.array(fold_scores).mean()
    print("Mean AUC = ", mean_auc)
    return mean_auc

In [None]:
# Collect the cross-validated AUC for a range of forest sizes.
# Two parallel lists: the number of trees tried and the mean AUC that
# cross_val_fn returned for it.
n_estimator_dict = {'trees': [], 'AUC': []}

# Forest sizes 10, 20, 30, ..., 190 (steps of 10)
for n_trees in range(10, 200, 10):
    mean_auc = cross_val_fn(n_trees)
    n_estimator_dict['trees'].append(n_trees)
    n_estimator_dict['AUC'].append(mean_auc)

In [None]:
# Checking the outputs in dictionary
df_auc = pd.DataFrame(n_estimator_dict)

In [None]:
sns.lineplot(x = df_auc.trees, y = df_auc.AUC)

In [None]:
trees=[150,170,190,210, 230, 250]
min_samples_split=[2,4,6]

In [None]:
# Import the library for creating cross product of trees and min sample split
import itertools

In [None]:
# Create a function to run multiple ensembles by looping over cross product of multiple hyperparameters
def cross_val_fn(n_trees, min_samples=2):
    """Return the mean validation ROC AUC of a random forest over the ``kf`` folds.

    Parameters
    ----------
    n_trees : int
        Number of trees (``n_estimators``) of the forest fitted in each fold.
    min_samples : int, default 2
        Passed to the forest as ``min_samples_split``.  The default equals
        scikit-learn's own default, so ``cross_val_fn(n_trees)`` still works
        after this definition has replaced the earlier one-argument version
        (for example when the n_estimators sweep cell is run again).

    Returns
    -------
    The mean of the per-fold AUC values.

    Reads the module-level ``kf``, ``X_train`` and ``y_train``.  Prints the
    row indices of every fold and, at the end, the mean AUC.
    """
    AUC = []
    for dev_index, validation_index in kf.split(X_train):
        print("TRAIN:", dev_index, "TEST:", validation_index)
        X_dev, X_validation = X_train[dev_index], X_train[validation_index]
        y_dev, y_validation = y_train[dev_index], y_train[validation_index]
        clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=n_trees,
                                              min_samples_split=min_samples,
                                              random_state=400)
        clf.fit(X_dev, y_dev)
        # AUC is computed from column 1 of predict_proba, not from hard labels
        probs = clf.predict_proba(X_validation)[:, 1]
        AUC.append(metrics.roc_auc_score(y_validation, probs))
    mean_auc = np.array(AUC).mean()
    print("Mean AUC = ", mean_auc)
    return mean_auc

In [None]:
# Collect the cross-validated AUC for every (trees, min_samples_split)
# combination, as three parallel lists
n_estimator_dict = {'trees': [], 'Min Sample': [], 'AUC': []}

for n_trees, split_size in itertools.product(trees, min_samples_split):
    mean_auc = cross_val_fn(n_trees, split_size)
    n_estimator_dict['trees'].append(n_trees)
    n_estimator_dict['Min Sample'].append(split_size)
    n_estimator_dict['AUC'].append(mean_auc)

In [None]:
# Create dataframe of dictionary
pd.DataFrame(n_estimator_dict)

In [None]:
# Finalize RF with n_estimators = 250 and min_sample_split = 6
# Import random forest classifier 
from sklearn.ensemble import RandomForestClassifier

# Create classifier object
clf=RandomForestClassifier(n_estimators=250, min_samples_split=6 ,oob_score=True,n_jobs=-1,random_state=400)

# Fit model
clf.fit(X,y)

In [None]:
#clf.oob_score_

In [None]:
# Horizontal bar chart of the fitted forest's feature importances, labelled
# with the feature column names
feature_names = new_train.drop(columns="Survived").columns
importances = pd.Series(clf.feature_importances_, index=feature_names.tolist())
importances.sort_values(ascending=False).plot(kind='barh', figsize=(15, 7))

# Submission

In [None]:
# Predict on the underlying array: clf was fitted on X, a plain NumPy array
# without column names, so passing the DataFrame itself makes scikit-learn
# report a feature-name mismatch between fit and predict.
y_hat = clf.predict(new_test.values)

# Pair every test PassengerId with its predicted label and write the
# submission file (without the index column)
results_df = pd.DataFrame(data={'PassengerId':df_test['PassengerId'], 'Survived':y_hat})
results_df.to_csv('submission-random_forest_kfold.csv', index=False)

## I hope you like the notebook. Feel free to use it. Make sure you upvote and give credits.
### All the best 
#### -JM