In [39]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory



# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [40]:
original_train_data = pd.read_csv("Data/train.csv")
print(original_train_data.shape)
original_train_data.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [41]:
original_test_data = pd.read_csv("Data/test.csv")
print(original_test_data.shape)
original_test_data.head()

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [42]:
# checking the datatypes of different features
original_train_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

1. PassengerID is not gonna help us in the final prediction.
2. Pclass is categorical variable
3. Name is not gonna help us in the final prediction.
4. Sex is categorical variable
5. Age is numerical variable.
6. SibSp is numerical variable.
7. Parch is numerical variable.
8. Ticket will not help in the final prediction.
9. Fare is numerical variable.
10. Cabin is categorical variable.
11. Embarked is categorical variable.

In [43]:
#extracting titles out of names
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
original_train_data['Title'] = original_train_data.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

original_train_data['Title'] = original_train_data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
original_train_data['Title'] = original_train_data['Title'].replace('Mlle','Miss')
original_train_data['Title'] = original_train_data['Title'].replace('Ms','Miss')
original_train_data['Title'] = original_train_data['Title'].replace('Mme','Mrs')
original_train_data['Title'] = original_train_data['Title'].map(titles)
original_train_data['Title'] = original_train_data['Title'].fillna(0)


In [44]:
original_test_data['Title'] = original_test_data.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

original_test_data['Title'] = original_test_data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
original_test_data['Title'] = original_test_data['Title'].replace('Mlle','Miss')
original_test_data['Title'] = original_test_data['Title'].replace('Ms','Miss')
original_test_data['Title'] = original_test_data['Title'].replace('Mme','Mrs')
original_test_data['Title'] = original_test_data['Title'].map(titles)
original_test_data['Title'] = original_test_data['Title'].fillna(0)

In [45]:
# Removing the unwanted columns

#Training data
original_train_data.drop(columns=["PassengerId","Name","Ticket"], inplace = True)
original_train_data.head()

#Test data
original_test_data.drop(columns=["PassengerId","Name","Ticket"], inplace = True)
original_test_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
0,3,male,34.5,0,0,7.8292,,Q,1
1,3,female,47.0,1,0,7.0,,S,3
2,2,male,62.0,0,0,9.6875,,Q,1
3,3,male,27.0,0,0,8.6625,,S,1
4,3,female,22.0,1,1,12.2875,,S,3


In [46]:
# Calculating the percentage of missing values for training data
(original_train_data.isnull().sum()/original_train_data.shape[0])*100

Survived     0.000000
Pclass       0.000000
Sex          0.000000
Age         19.865320
SibSp        0.000000
Parch        0.000000
Fare         0.000000
Cabin       77.104377
Embarked     0.224467
Title        0.000000
dtype: float64

In [47]:
# Calculating the percentage of missing values for test data
(original_test_data.isnull().sum()/original_test_data.shape[0])*100

Pclass       0.000000
Sex          0.000000
Age         20.574163
SibSp        0.000000
Parch        0.000000
Fare         0.239234
Cabin       78.229665
Embarked     0.000000
Title        0.000000
dtype: float64

Training Data - Age has approximately 20% missing values. Either we can impute missing values using mean or by predicting using linear regression. I will be using Linear Regression. Cabin has approximately 77% missing values. So we will drop it. Embarked has approximately 0.2% missing values. We can impute that using mode Test Data - Similar to the training data will compute missing age values using Linear Regression. Cabin has approximately using 78% missing values. So it will be dropped. There is also one missing value of fare. I will be using mean to calculate it.

In [48]:
#dropping cabin column from train data and test data
original_train_data.drop(columns=["Cabin"], inplace = True)

original_test_data.drop(columns=["Cabin"], inplace = True)

In [49]:
#imputing missing values of embarked column in train data
original_train_data['Embarked'] = original_train_data['Embarked'].fillna('S')

#Join SibSp and Parch to make one column named family
original_train_data['Family'] = original_train_data['SibSp']+original_train_data['Parch']

#drop SibSp and Parch
original_train_data.drop(columns=['SibSp','Parch'], inplace = True)

#Treating Pclass as categorical value
original_train_data['Pclass'] = original_train_data['Pclass'].astype(str)

#doing one hot encoding in categorical variable
original_train_data = pd.get_dummies(original_train_data, dtype = int)

original_train_data

Unnamed: 0,Survived,Age,Fare,Title,Family,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.2500,1,1,0,0,1,0,1,0,0,1
1,1,38.0,71.2833,3,1,1,0,0,1,0,1,0,0
2,1,26.0,7.9250,2,0,0,0,1,1,0,0,0,1
3,1,35.0,53.1000,3,1,1,0,0,1,0,0,0,1
4,0,35.0,8.0500,1,0,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,13.0000,5,0,0,1,0,0,1,0,0,1
887,1,19.0,30.0000,2,0,1,0,0,1,0,0,0,1
888,0,,23.4500,2,3,0,0,1,1,0,0,0,1
889,1,26.0,30.0000,1,0,1,0,0,0,1,1,0,0


In [50]:
#imputing missing values of fare column in test data
original_test_data['Fare'] = original_test_data['Fare'].fillna(original_test_data['Fare'].mean())

#Join SibSp and Parch to make one column named family
original_test_data['Family'] = original_test_data['SibSp']+original_test_data['Parch']

#drop SibSp and Parch
original_test_data.drop(columns=['SibSp','Parch'], inplace = True)


#Treating Pclass as categorical value
original_test_data['Pclass'] = original_test_data['Pclass'].astype(str)

#doing one hot encoding in categorical variable
original_test_data = pd.get_dummies(original_test_data, dtype = int)

original_test_data

Unnamed: 0,Age,Fare,Title,Family,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,34.5,7.8292,1,0,0,0,1,0,1,0,1,0
1,47.0,7.0000,3,1,0,0,1,1,0,0,0,1
2,62.0,9.6875,1,0,0,1,0,0,1,0,1,0
3,27.0,8.6625,1,0,0,0,1,0,1,0,0,1
4,22.0,12.2875,3,2,0,0,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
413,,8.0500,1,0,0,0,1,0,1,0,0,1
414,39.0,108.9000,5,0,1,0,0,1,0,1,0,0
415,38.5,7.2500,1,0,0,0,1,0,1,0,0,1
416,,8.0500,1,0,0,0,1,0,1,0,0,1


In [51]:
#imputing missing values of Age in training dataset using median
original_train_data['Age'] = original_train_data['Age'].fillna(original_train_data['Age'].median())

#imputing missing values of Age in test dataset using median
original_test_data['Age'] = original_test_data['Age'].fillna(original_test_data['Age'].median())



In [52]:
#Storing survived column(target) of the training data into seperate dataset
original_train_data_y = original_train_data["Survived"]
original_train_data.drop(columns="Survived", inplace = True)

#normalising the training dataset
scaler = StandardScaler()
original_train_data_norm = scaler.fit_transform(original_train_data)

#Splitting the training data into two parts
X_train, X_test, y_train, y_test = train_test_split(original_train_data_norm, original_train_data_y, test_size = 0.2, random_state = 42)



In [53]:
#Logistic Regression model
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

y_pred = logistic_model.predict(X_test)

train_acc = logistic_model.score(X_train, y_train)
test_acc = logistic_model.score(X_test, y_test)

print("Training Accuracy = ", train_acc)
print("Test Accuracy = ", test_acc)

Training Accuracy =  0.8188202247191011
Test Accuracy =  0.7821229050279329


In [54]:
#Decision Tree Model
param_grid = {
    'min_samples_split': [2, 10, 30, 50, 100, 200, 300, 700],
    'max_depth': [1, 2, 3, 4, 8, 16, 32, 64, None]
}

decision_model = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator = decision_model, param_grid=param_grid, cv = 5)

grid_search.fit(X_train, y_train)
decision_best_model = grid_search.best_estimator_

best_score = grid_search.best_score_

train_acc = decision_best_model.score(X_train, y_train)
test_acc = decision_best_model.score(X_test, y_test)


print("Best Parameters:", grid_search.best_params_)
print("Best Score:", best_score)
print("Training Accuracy =", train_acc)
print("Test Accuracy =", test_acc)

Best Parameters: {'max_depth': 3, 'min_samples_split': 2}
Best Score: 0.8187727765192554
Training Accuracy = 0.8342696629213483
Test Accuracy = 0.8156424581005587


In [55]:
#Random Forest Model
param_grid = {
    'min_samples_split': [2,3,4],
    'max_depth': [5,10,None],
    'n_estimators': [10,100,200,500]
}

forest_model = RandomForestClassifier()
grid_search = GridSearchCV(estimator=forest_model, param_grid=param_grid, cv = 5)

grid_search.fit(X_train, y_train)
forest_best_model = grid_search.best_estimator_

best_score = grid_search.best_score_

train_acc = forest_best_model.score(X_train, y_train)
test_acc = forest_best_model.score(X_test, y_test)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", best_score)
print("Training Accuracy =", train_acc)
print("Test Accuracy =", test_acc)

Best Parameters: {'max_depth': 5, 'min_samples_split': 4, 'n_estimators': 10}
Best Score: 0.8356446370530879
Training Accuracy = 0.8581460674157303
Test Accuracy = 0.8044692737430168


In [56]:
#normalising the test dataset
scaler = StandardScaler()
original_test_data_norm = scaler.fit_transform(original_test_data)

#initialising the result dataset
result = pd.DataFrame(columns=['PassengerId' , 'Survived'])

test_data_raw = pd.read_csv("Data/test.csv")
result['PassengerId'] = test_data_raw['PassengerId']

#Prediction on original test dataset
result_pred = forest_best_model.predict(original_test_data_norm)

result['Survived'] = result_pred

result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [57]:
result.to_csv('Submissions/submission_trees.csv', index=False)