In [240]:
import numpy as np
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [241]:
# Read the training set, the test set and the sample submission from the working directory
train_data, test, submision = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)

In [242]:
# Count the missing values in each column of the training data
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [243]:
# Inspect the raw training data (the rendered table is truncated to the first and last rows)
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [244]:
# 'Survived' is the target; the inputs kept are 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare' and 'Embarked'
train_x = train_data.drop(
    labels=['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'],
    axis=1,
)
train_y = train_data['Survived']

In [245]:
# Handling missing data in Embarked (filled with the most frequent class) and
# preparing the per-group median ages used to fill Age.

# For Embarked: fill with 'S', taken as the most frequent class.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# train_x.Embarked: that form works on an intermediate Series, pandas warns
# about it, and it does not update train_x once Copy-on-Write is enabled.
train_x['Embarked'] = train_x['Embarked'].fillna('S')

# For Age: median age of every (Sex, Pclass) group, computed over the whole
# training set. Selecting 'Age' before aggregating keeps median() away from the
# text columns (Name, Ticket, Cabin, Embarked), which raise a TypeError on
# pandas >= 2.0 where non-numeric columns are no longer dropped silently.
# The result has exactly the columns 'Sex', 'Pclass' and 'Age'.
grouped_median_train = (
    train_data.groupby(['Sex', 'Pclass'])['Age']
    .median()
    .reset_index()
)

def fill_age(row):
    """Return the median age of the (Sex, Pclass) group that ``row`` belongs to.

    ``row`` must provide 'Sex' and 'Pclass' entries. The medians are read from
    the module-level ``grouped_median_train`` table. An IndexError is raised
    when the table has no entry for that combination.
    """
    same_sex = grouped_median_train['Sex'] == row['Sex']
    same_class = grouped_median_train['Pclass'] == row['Pclass']
    matching_ages = grouped_median_train.loc[same_sex & same_class, 'Age']
    # the first (normally only) match is the group's median
    return matching_ages.to_numpy()[0]


def process_age():
    """Fill the missing 'Age' values of the global ``train_x`` and return it.

    Known ages are kept as they are; a missing age is replaced by the value
    ``fill_age`` returns for that passenger's row.
    """
    global train_x

    def _age_or_group_median(passenger):
        # only rows whose Age is NaN are looked up
        if np.isnan(passenger['Age']):
            return fill_age(passenger)
        return passenger['Age']

    train_x['Age'] = train_x.apply(_age_or_group_median, axis=1)
    return train_x

# Run the age imputation on the training features
train_x = process_age()


In [246]:
train_x.isna().sum()  # No missing values

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [247]:
# Feature engineering: dummy variables (separate features encoded as 0 and 1) for Embarked and Pclass.
# dtype=np.uint8 keeps the indicators as 0/1 integers on every pandas version
# (newer versions would otherwise produce booleans).
embarked_dummies = pd.get_dummies(train_x['Embarked'], prefix='Embarked', dtype=np.uint8)
class_dummies = pd.get_dummies(train_x['Pclass'], prefix='class', dtype=np.uint8)

# Feature engineering: encoding of the Sex feature (male as 0 and female as 1).
# The encoded column is assigned back rather than using replace(inplace=True)
# on train_x['Sex'], which acts on an intermediate Series and does not update
# train_x once Copy-on-Write is enabled. Any label other than 'male'/'female'
# becomes NaN.
# All changes are applied in a single expression so that train_x is only
# replaced when every step succeeded (re-running this cell fails on the
# get_dummies lines above without touching train_x).
train_x = (
    pd.concat([train_x, embarked_dummies, class_dummies], axis=1)
    .drop(columns=['Embarked', 'Pclass'])
    .assign(Sex=lambda frame: frame['Sex'].map({'male': 0, 'female': 1}))
)

# Feature engineering: merging Parch and SibSp into FamilySize and flagging it as single, small or large
def process_family():
    """Add family-size features to the global ``train_x`` and return it.

    Columns written:
      * 'FamilySize'  - Parch + SibSp + 1 (the passenger is included)
      * 'Singleton'   - 1 when FamilySize == 1, else 0
      * 'SmallFamily' - 1 when 2 <= FamilySize <= 4, else 0
      * 'LargeFamily' - 1 when FamilySize >= 5, else 0
    """
    global train_x
    # The counts are read from train_x itself (the frame being modified), the
    # same way process_family2 reads them from test, so the function no longer
    # relies on train_data being row-aligned with train_x.
    family_size = train_x['Parch'] + train_x['SibSp'] + 1
    train_x['FamilySize'] = family_size

    # indicator features based on the family size (vectorized comparisons)
    train_x['Singleton'] = (family_size == 1).astype('int64')
    train_x['SmallFamily'] = family_size.between(2, 4).astype('int64')
    train_x['LargeFamily'] = (family_size >= 5).astype('int64')

    return train_x
# Add the family-size columns to the training features
train_x = process_family()

In [251]:
# Remove the columns that are no longer needed and round Fare to 2 decimal places
train_x = (
    train_x
    .drop(['FamilySize', 'SibSp', 'Parch'], axis=1)
    .assign(Fare=lambda frame: frame['Fare'].round(2))
)

In [257]:
# Final training features (train_x); the target train_y is shown in the next cell
train_x

Unnamed: 0,Sex,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,class_1,class_2,class_3,Singleton,SmallFamily,LargeFamily
0,0,22.0,7.25,0,0,1,0,0,1,0,1,0
1,1,38.0,71.28,1,0,0,1,0,0,0,1,0
2,1,26.0,7.92,0,0,1,0,0,1,1,0,0
3,1,35.0,53.10,0,0,1,1,0,0,0,1,0
4,0,35.0,8.05,0,0,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,13.00,0,0,1,0,1,0,1,0,0
887,1,19.0,30.00,0,0,1,1,0,0,1,0,0
888,1,21.5,23.45,0,0,1,0,0,1,0,1,0
889,0,26.0,30.00,1,0,0,1,0,0,1,0,0


In [258]:
# Training target: the 'Survived' column of the training data
train_y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [259]:
# Count the missing values in each column of the test data
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

In [260]:
# Inspect the raw test data (the rendered table is truncated to the first and last rows)
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [261]:
# The input features kept are 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare' and 'Embarked'
test = test.drop(
    labels=['PassengerId', 'Name', 'Ticket', 'Cabin'],
    axis=1,
)

In [262]:
# Handling missing data in Age (median of the passenger's Sex/Pclass group).
# Embarked has no missing values in the test data, so only Age is filled here.

# For Age the same set of medians is used as for the training data
def process_age2():
    """Fill the missing 'Age' values of the global ``test`` and return it.

    Known ages are kept as they are; a missing age is replaced by the value
    ``fill_age`` returns for that passenger's row.
    """
    global test

    def _impute(passenger):
        # keep a known age, otherwise look up the median for the passenger's group
        return fill_age(passenger) if np.isnan(passenger['Age']) else passenger['Age']

    test['Age'] = test.apply(_impute, axis=1)
    return test

# Run the age imputation on the test features
test = process_age2()

In [263]:
test.isna().sum()  # No missing values

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [264]:
# Feature engineering: dummy variables (separate features encoded as 0 and 1) for Embarked and Pclass.
# dtype=np.uint8 keeps the indicators as 0/1 integers on every pandas version
# (newer versions would otherwise produce booleans).
embarked_dummies2 = pd.get_dummies(test['Embarked'], prefix='Embarked', dtype=np.uint8)
class_dummies2 = pd.get_dummies(test['Pclass'], prefix='class', dtype=np.uint8)

# Feature engineering: encoding of the Sex feature (male as 0 and female as 1).
# The encoded column is assigned back rather than using replace(inplace=True)
# on test['Sex'], which acts on an intermediate Series and does not update
# test once Copy-on-Write is enabled. Any label other than 'male'/'female'
# becomes NaN.
# All changes are applied in a single expression so that test is only replaced
# when every step succeeded (re-running this cell fails on the get_dummies
# lines above without touching test).
test = (
    pd.concat([test, embarked_dummies2, class_dummies2], axis=1)
    .drop(columns=['Embarked', 'Pclass'])
    .assign(Sex=lambda frame: frame['Sex'].map({'male': 0, 'female': 1}))
)

# Feature engineering: merging Parch and SibSp into FamilySize and flagging it as single, small or large
def process_family2():
    """Add family-size features to the global ``test`` and return it.

    Columns written:
      * 'FamilySize'  - Parch + SibSp + 1 (the passenger is included)
      * 'Singleton'   - 1 when FamilySize == 1, else 0
      * 'SmallFamily' - 1 when 2 <= FamilySize <= 4, else 0
      * 'LargeFamily' - 1 when FamilySize >= 5, else 0
    """
    global test
    test['FamilySize'] = test['Parch'] + test['SibSp'] + 1

    # 0/1 indicators derived from the family size
    members = test['FamilySize']
    test['Singleton'] = members.eq(1).astype('int64')
    test['SmallFamily'] = members.between(2, 4).astype('int64')
    test['LargeFamily'] = members.ge(5).astype('int64')
    return test
# Add the family-size columns to the test features
test = process_family2()

In [266]:
# Remove the columns that are no longer needed and round Fare to 2 decimal places
test = (
    test
    .drop(['FamilySize', 'SibSp', 'Parch'], axis=1)
    .assign(Fare=lambda frame: frame['Fare'].round(2))
)

In [267]:
# Final test features, with the same columns as train_x
test

Unnamed: 0,Sex,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,class_1,class_2,class_3,Singleton,SmallFamily,LargeFamily
0,0,34.5,7.83,0,1,0,0,0,1,1,0,0
1,1,47.0,7.00,0,0,1,0,0,1,0,1,0
2,0,62.0,9.69,0,1,0,0,1,0,1,0,0
3,0,27.0,8.66,0,0,1,0,0,1,1,0,0
4,1,22.0,12.29,0,0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
413,0,25.0,8.05,0,0,1,0,0,1,1,0,0
414,1,39.0,108.90,1,0,0,1,0,0,1,0,0
415,0,38.5,7.25,0,0,1,0,0,1,1,0,0
416,0,25.0,8.05,0,0,1,0,0,1,1,0,0


In [268]:
# Creating model. random_state is fixed so that the randomly built forest, and
# therefore the accuracy below and the submitted predictions, are the same on
# every run of the notebook.
model = RandomForestClassifier(random_state=42)
model.fit(train_x, train_y)

# Checking accuracy on the training data. It is measured on the same rows the
# model was fitted on, so it is an optimistic figure and not an estimate of the
# accuracy on unseen passengers.
predict_train = model.predict(train_x)
accuracy = accuracy_score(train_y, predict_train)
accuracy

0.9797979797979798

In [281]:
# Getting the predictions for the test data and building the submission
# (2 columns: PassengerId and Survived).
# Only the PassengerId column is taken from the sample submission: if that file
# also carries a 'Survived' column, concatenating the predictions next to the
# whole frame would produce two columns named 'Survived'.
test_predict = pd.DataFrame(model.predict(test), columns=['Survived'])
Submission = pd.concat([submision[['PassengerId']], test_predict], axis=1)

In [282]:
# Inspect the submission table (PassengerId with the predicted Survived value)
Submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [284]:
# Saving the submission to 'Submission.csv' in the working directory;
# index = False keeps the DataFrame's row index out of the file
Submission.to_csv('Submission.csv', index = False)