In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Reading Data

In [11]:
data = pd.read_csv('/kaggle/input/titanic/train.csv')
# list(data.columns)

test_data = pd.read_csv('/kaggle/input/titanic/test.csv')
# test_data.head()

data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Preprocessing

In [12]:
#FIlling NaN values of age with the median age of people with the same salutation

median_age = data['Age'].median()
median_age_miss = data[data["Name"].str.contains('Miss.', na=False)]['Age'].median().round()
median_age_mrs = data[data["Name"].str.contains('Mrs.', na=False)]['Age'].median().round()
median_age_mr = data[data["Name"].str.contains('Mr.', na=False)]['Age'].median().round()
median_age_master = data[data["Name"].str.contains('Master.', na=False)]['Age'].median().round()


def fill_age(name_age):
    
    name = name_age[0]
    age = name_age[1]
    
    if pd.isnull(age):
        if 'Mr.' in name:
            return median_age_mr
        if 'Mrs.' in name:
            return median_age_mrs
        if 'Miss.' in name:
            return median_age_miss
        if 'Master.' in name:
            return median_age_master
        if 'Ms.' in name:
            return median_age_miss
        return median_age
    else:
        return age

#writing a function that modifies cabin to include only the first letter
def modify_cabin(cabin):
    if pd.isnull(cabin):
        return cabin
    else:
        return cabin[0]
        

In [13]:
from sklearn import preprocessing

#Modifiying cabin to include only the first letter as there are too many different values
data['Cabin']= data['Cabin'].apply(modify_cabin)
#Filling NaN values of Cabin with the most frequent value
data['Cabin'].fillna(data['Cabin'].mode()[0], inplace = True)

#Filling NaN of age 
data['Age'] = data[['Name', 'Age']].apply(fill_age,axis=1)
test_data['Age'] = test_data[['Name', 'Age']].apply(fill_age,axis=1)

#Filling NaN of Embarked with the most frequent value
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace = True)

#Filling NaN of Fare with the median
test_data['Fare'].fillna(test_data['Fare'].median(), inplace = True)


#Encoding the string labels into int
sex_le = preprocessing.LabelEncoder()
ticket_le = preprocessing.LabelEncoder()
cabin_le = preprocessing.LabelEncoder()
embarked_le = preprocessing.LabelEncoder()

data['Sex'] = sex_le.fit_transform(data['Sex'])
data['Ticket'] = ticket_le.fit_transform(data['Ticket'])
data['Cabin'] = cabin_le.fit_transform(data['Cabin'])
data['Embarked'] = embarked_le.fit_transform(data['Embarked'])

test_data['Sex'] = sex_le.fit_transform(test_data['Sex'])
test_data['Ticket'] = ticket_le.fit_transform(test_data['Ticket'])
test_data['Cabin'] = cabin_le.fit_transform(test_data['Cabin'])
test_data['Embarked'] = embarked_le.fit_transform(test_data['Embarked'])

#Splitting data into train and valid
from sklearn.model_selection import train_test_split
train_data,valid_data = train_test_split(data)

#Choosing only the requiired features (Ex. Name and passenger id are unique for everyone so we can ignore that)
X_train=train_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Ticket', 'Fare','Cabin','Parch','Embarked']] 
X_test = test_data[['Pclass', 'Sex', 'Age', 'SibSp',  'Ticket', 'Fare','Cabin','Parch','Embarked']]
y_train=train_data['Survived']
X_valid = valid_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Ticket', 'Fare','Cabin','Parch','Embarked']]
y_valid=valid_data['Survived'] 

#Scaling the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_valid = sc.transform(X_valid)

# Random Hyperparameter Grid


In [14]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

n_estimators = [100*x for x in range(1,11)]
max_features = ['auto', 'sqrt']
max_depth = [10*x for x in range(1,11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
class_weights= ["balanced","balanced_subsample",None]

#Creating a random grid with the above parameters
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
              'class_weight': class_weights}


#Creating a Random Forest Model and doing a Random Grid Search
clf=RandomForestClassifier()
clf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
clf_random.fit(X_train,y_train)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'class_weight': ['balanced',
                                                         'balanced_subsample',
                                                         None],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000]},
                   

## Analysis of features

In [15]:
#Analysing the Importance of each of the features in classification
feature_imp = pd.Series(clf_random.best_estimator_.feature_importances_, index=['Pclass', 'Sex', 'Age', 'SibSp', 'Ticket', 'Fare','Cabin','Parch','Embarked']).sort_values(ascending=False)
print(feature_imp)

Sex         0.271542
Ticket      0.197449
Fare        0.172151
Age         0.145293
Pclass      0.078264
SibSp       0.045369
Cabin       0.039244
Parch       0.026245
Embarked    0.024443
dtype: float64


# Train and Validation Accuracies

In [16]:

from sklearn.metrics import confusion_matrix,classification_report

y_train_pred = clf_random.predict(X_train)
print(classification_report(y_train,y_train_pred))

y_valid_pred = clf_random.predict(X_valid)
print(classification_report(y_valid, y_valid_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       414
           1       0.96      0.94      0.95       254

    accuracy                           0.96       668
   macro avg       0.96      0.96      0.96       668
weighted avg       0.96      0.96      0.96       668

              precision    recall  f1-score   support

           0       0.85      0.87      0.86       135
           1       0.80      0.76      0.78        88

    accuracy                           0.83       223
   macro avg       0.82      0.82      0.82       223
weighted avg       0.83      0.83      0.83       223



# Predicting on Testing Data and generating csv


In [17]:
y_test= clf_random.predict(X_test)
import csv

with open('submission.csv', mode='w') as submission_file:
    submission_writer = csv.writer(submission_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    submission_writer.writerow(['PassengerId', 'Survived'])
    for i in range(len(y_test)):
        submission_writer.writerow([test_data['PassengerId'][i],y_test[i]])