In [47]:
# KAGGLE TITANIC USING RANDOM FORESTS. 
# ACCURACY ON KAGGLE TRAINING SET : 0.86797.
# ACCURACY ON HELD-OUT VALIDATION SPLIT (20%, STRATIFIED) : 0.82122.
# ACCURACY ON KAGGLE TEST SET : 0.80382.
import pandas as pd
import numpy as np
import seaborn as sns
import re

from pandas import DataFrame as df
from pandas import Series as sr
from matplotlib import pyplot as plt
from scipy import misc

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.model_selection import StratifiedShuffleSplit as strat
from sklearn.preprocessing import PolynomialFeatures

import warnings
warnings.filterwarnings('ignore') # IGNORE DEPRECATION (AND OTHER WARNINGS).

In [48]:
# LOAD THE KAGGLE TRAINING SET FROM THE WORKING DIRECTORY AND PREVIEW ITS FIRST ROWS.
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [49]:
# CHECKING DATA MISSING VALUES: info() LISTS THE NON-NULL COUNT AND DTYPE OF EVERY COLUMN.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [50]:
# SUMMARY STATISTICS (COUNT, MEAN, STD, MIN, QUARTILES, MAX) OF THE NUMERIC COLUMNS.
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [51]:
# CORRELATIONS BETWEEN SURVIVAL AND ALL OTHER NUMERIC FEATURES.
# THE STRING COLUMNS (Name, Sex, Ticket, Cabin, Embarked) ARE EXCLUDED EXPLICITLY: pandas >= 2.0 NO
# LONGER DROPS THEM SILENTLY, SO A BARE data.corr() RAISES ON THEM.
corr = data.select_dtypes(include='number').corr()
corr

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [52]:
# NUMBER OF TRAINING EXAMPLES (ROWS OF THE TRAINING FRAME).
M = data.shape[0]
M

891

In [53]:
# COMBINING VALUES BECAUSE THEY BOTH MORE OR LESS AFFECT SIMILARLY: 'SibSp' NOW HOLDS SibSp + Parch.
# NOTE: 'SibSp' IS OVERWRITTEN, SO RE-RUNNING THIS CELL WITHOUT RELOADING data ADDS 'Parch' AGAIN.
data['SibSp'] = data['SibSp'] + data['Parch']

def get_title(name):
    """Extract the honorific ('Mr', 'Miss', 'Mrs', ...) from a passenger name.

    Names have the form 'Braund, Mr. Owen Harris'. The title is the first run
    of ASCII letters that is preceded by a space and followed by a period.

    Parameters
    ----------
    name : str
        Full passenger name.

    Returns
    -------
    str or None
        The title without its trailing period, or None when the name
        contains no such pattern.
    """
    # Raw string: '\.' inside a normal literal is an invalid escape sequence
    # (DeprecationWarning, and a SyntaxWarning from Python 3.12 on).
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    return title_search.group(1) if title_search else None

# NOW EVERY NAME IS 'Mr', 'Mrs', ETC.
# NOTE: 'Name' IS OVERWRITTEN, SO RE-RUNNING THIS CELL WITHOUT RELOADING data TURNS EVERY TITLE INTO None.
data['Name'] = data['Name'].apply(get_title)

# SELECTING THE MOST IMPORTANT FEATURES.
# .copy() GIVES X ITS OWN DATA, SO THE EDITS BELOW ARE NOT CHAINED ASSIGNMENTS ON A SLICE OF data
# (THOSE RAISE SettingWithCopyWarning AND ARE SILENTLY LOST UNDER pandas COPY-ON-WRITE).
X = data[['Pclass', 'Sex', 'Name', 'Age', 'SibSp', 'Fare', 'Embarked']].copy()
Y = data['Survived']

power = 2
# INCREASING DEGREE BECAUSE THIS HAS LARGER EFFECT ON SURVIVAL CHANCES.
X['Pclass'] = X['Pclass'] ** power

# DATA CLEANING: MISSING AGES AND FARES ARE REPLACED BY THE COLUMN MEDIAN.
X['Age'] = X['Age'].fillna(X['Age'].median())
X['Fare'] = X['Fare'].fillna(X['Fare'].median())

# AGE GROUPS 1..7 WITH LEFT-CLOSED BINS: <10, [10,18), [18,25), [25,35), [35,50), [50,70), >=70.
age_edges = [-np.inf, 10, 18, 25, 35, 50, 70, np.inf]
age_group = pd.cut(X['Age'], bins=age_edges, right=False, labels=list(range(1, 8)))
# STORED AS STRINGS ('1.0' ... '7.0') SO THAT get_dummies ONE-HOT ENCODES THE GROUP.
X['Age'] = age_group.astype(float).astype(str)

# ONE-HOT ENCODING FOR ALL STRING COLUMNS (NAME, SEX, AGE GROUP, EMBARKED).
X = pd.get_dummies( X )

# DO NOT DROP THESE COLUMNS BECAUSE OF THEIR CORRELATION WITH SURVIVAL.
to_not_drop = [ 'Fare', 'Name_Mr', 'Sex_female', 'Sex_male', 'Pclass', 'SibSp', 'Name_Miss',
                'Name_Mrs' ]

# DROPPING ALL COLUMNS EXCEPT 'to_not_drop' (THIS DISCARDS EVERY AGE-GROUP AND EMBARKED DUMMY).
# THE COLUMN ORDER PRODUCED BY get_dummies IS PRESERVED.
X = X.drop( [ col for col in X.columns if col not in to_not_drop ], axis = 1 )

In [54]:
# CHECK THAT ONLY THE SELECTED COLUMNS REMAIN AND THAT NONE OF THEM HAS MISSING VALUES.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Pclass        891 non-null int64
SibSp         891 non-null int64
Fare          891 non-null float64
Sex_female    891 non-null uint8
Sex_male      891 non-null uint8
Name_Miss     891 non-null uint8
Name_Mr       891 non-null uint8
Name_Mrs      891 non-null uint8
dtypes: float64(1), int64(2), uint8(5)
memory usage: 25.3 KB


In [55]:
# SUMMARY STATISTICS OF THE FINAL FEATURE MATRIX.
X.describe()

Unnamed: 0,Pclass,SibSp,Fare,Sex_female,Sex_male,Name_Miss,Name_Mr,Name_Mrs
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,6.028058,0.904602,32.204208,0.352413,0.647587,0.204265,0.580247,0.140292
std,3.443655,1.613459,49.693429,0.47799,0.47799,0.40339,0.493796,0.347485
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,0.0,7.9104,0.0,0.0,0.0,0.0,0.0
50%,9.0,0.0,14.4542,0.0,1.0,0.0,1.0,0.0
75%,9.0,1.0,31.0,1.0,1.0,0.0,1.0,0.0
max,9.0,10.0,512.3292,1.0,1.0,1.0,1.0,1.0


In [56]:
# SANITY CHECK: FEATURE MATRIX AND TARGET VECTOR MUST HAVE THE SAME NUMBER OF ROWS.
print( X.shape, Y.shape )

(891, 8) (891,)


In [57]:
# FEATURE NAMES IN COLUMN ORDER, KEPT BECAUSE THE FRAMES ARE CONVERTED TO UNLABELLED NUMPY ARRAYS LATER.
features = X.columns.tolist()

In [58]:
# SPLITTING ORIGINAL TRAINING DATA INTO TRAINING (80%) AND VALIDATION (20%) DATA IN A STRATIFIED MANNER.
split = strat(test_size=0.2, random_state=42)

# split.split() YIELDS SEVERAL PARTITIONS (n_splits IS LEFT AT ITS DEFAULT). ONLY THE LAST ONE WAS
# EVER KEPT, SO IT IS TAKEN DIRECTLY INSTEAD OF RE-SLICING THE FRAMES ON EVERY ITERATION.
train_indices, valid_indices = list( split.split(X, Y) )[ -1 ]

# THE INDICES ARE POSITIONAL, HENCE .iloc (LABEL-BASED .loc ONLY WORKS WHILE THE INDEX IS 0..N-1).
strat_train_x = X.iloc[ train_indices ]
strat_train_y = Y.iloc[ train_indices ]

strat_valid_x = X.iloc[ valid_indices ]
strat_valid_y = Y.iloc[ valid_indices ]

In [59]:
# CONVERTING ALL DATAFRAMES TO NUMPY ARRAYS FOR FAST COMPUTATIONS.
# THE VALIDATION SPLIT IS CONVERTED AS WELL, SO THAT fit() AND predict() RECEIVE THE SAME KIND OF
# INPUT AND THE ACCURACY COMPARISONS ARE ARRAY-TO-ARRAY.
strat_train_x = np.array(strat_train_x)
strat_train_y = np.array(strat_train_y)

strat_valid_x = np.array(strat_valid_x)
strat_valid_y = np.array(strat_valid_y)

In [60]:
# RANDOM FOREST CLASSIFIER.
# random_state FIXES THE FOREST'S RANDOMNESS SO THE ACCURACIES BELOW ARE REPRODUCIBLE ON RE-RUN;
# n_jobs = -1 BUILDS THE 3000 TREES ON ALL AVAILABLE CORES.
forest = rfc( n_estimators = 3000, min_samples_split = 5, criterion = 'gini', max_depth = 5,
              random_state = 42, n_jobs = -1 )
forest.fit( strat_train_x, strat_train_y )

# ACCURACY = FRACTION OF CORRECT PREDICTIONS.
pred_train = forest.predict( strat_train_x )
accuracy_train = ( pred_train == strat_train_y ).mean()

pred_valid = forest.predict( strat_valid_x )
accuracy_valid = ( pred_valid == strat_valid_y ).mean()

print(accuracy_train, accuracy_valid)

0.8525280898876404 0.8100558659217877


In [61]:
# FEATURE IMPORTANCES REPORTED BY THE FITTED FOREST, PAIRED WITH THEIR COLUMN NAMES AND LISTED
# FROM MOST TO LEAST IMPORTANT. A HIGHER VALUE MEANS THE FEATURE MATTERS MORE TO THE FITTED MODEL.
importances = list( forest.feature_importances_ )
feature_importance = sorted( zip( features, importances ),
                             key = lambda pair : pair[ 1 ],
                             reverse = True )
feature_importance

[('Name_Mr', 0.2003123935654276),
 ('Sex_male', 0.17053602584101027),
 ('Fare', 0.16304636525594715),
 ('Sex_female', 0.16232387795517578),
 ('Pclass', 0.119013339701392),
 ('SibSp', 0.10773716916155937),
 ('Name_Miss', 0.03889825877893301),
 ('Name_Mrs', 0.03813256974055508)]

In [62]:
# TRYING TO INCREASE ACCURACY.
# POLYNOMIAL FEATURES CAN BE USED TO INCREASE THE MODEL'S LEARNING POWER AND THUS ACCURACY.
poly = PolynomialFeatures(include_bias=False, degree=3)

# THE TRANSFORMER IS FITTED ON THE TRAINING SPLIT ONLY; THE VALIDATION SPLIT IS ONLY TRANSFORMED,
# NEVER USED TO RE-FIT IT. np.asarray KEEPS THE INPUT TYPE THE SAME AS THE ARRAY USED FOR FITTING.
strat_train_x_poly = poly.fit_transform(strat_train_x)
strat_valid_x_poly = poly.transform(np.asarray(strat_valid_x, dtype=float))

# random_state MAKES THE REPORTED ACCURACIES REPRODUCIBLE; n_jobs = -1 USES ALL AVAILABLE CORES.
forest_poly = rfc( n_estimators = 2000, min_samples_split = 5, criterion = 'gini', max_depth = 5,
                   random_state = 42, n_jobs = -1 )
forest_poly.fit( strat_train_x_poly, strat_train_y )

# ACCURACY = FRACTION OF CORRECT PREDICTIONS.
pred_train_poly = forest_poly.predict( strat_train_x_poly )
accuracy_train_poly = ( pred_train_poly == strat_train_y ).mean()

pred_valid_poly = forest_poly.predict( strat_valid_x_poly )
accuracy_valid_poly = ( pred_valid_poly == strat_valid_y ).mean()

print(accuracy_train_poly, accuracy_valid_poly)

0.8651685393258427 0.8212290502793296


In [63]:
# IMPORTING KAGGLE TEST SET AND APPLYING THE SAME TWO COLUMN REWRITES AS ON THE TRAINING SET:
# 'SibSp' BECOMES SibSp + Parch AND 'Name' IS REDUCED TO ITS TITLE.
data2 = pd.read_csv( 'test.csv' )
data2 = data2.assign( SibSp = data2[ 'SibSp' ] + data2[ 'Parch' ],
                      Name = data2[ 'Name' ].map( get_title ) )
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [64]:
# SAME DATA CLEANING AS ABOVE, APPLIED TO THE KAGGLE TEST SET.
# .copy() GIVES X2 ITS OWN DATA, SO THE EDITS BELOW ARE NOT CHAINED ASSIGNMENTS ON A SLICE OF data2
# (THOSE RAISE SettingWithCopyWarning AND ARE SILENTLY LOST UNDER pandas COPY-ON-WRITE).
X2 = data2[ [ 'Pclass', 'Sex', 'Name', 'Age', 'SibSp', 'Fare', 'Embarked' ] ].copy()

X2[ 'Pclass' ] = X2[ 'Pclass' ] ** power

# MISSING AGES AND FARES ARE REPLACED BY THE TEST SET'S OWN COLUMN MEDIAN (UNCHANGED BEHAVIOUR).
X2[ 'Age' ] = X2[ 'Age' ].fillna( X2[ 'Age' ].median() )
X2[ 'Fare' ] = X2[ 'Fare' ].fillna( X2[ 'Fare' ].median() )

# AGE GROUPS 1..7 WITH LEFT-CLOSED BINS: <10, [10,18), [18,25), [25,35), [35,50), [50,70), >=70.
age_edges2 = [ -np.inf, 10, 18, 25, 35, 50, 70, np.inf ]
age_group2 = pd.cut( X2[ 'Age' ], bins = age_edges2, right = False, labels = list( range( 1, 8 ) ) )
# STORED AS STRINGS ('1.0' ... '7.0') SO THAT get_dummies ONE-HOT ENCODES THE GROUP.
X2[ 'Age' ] = age_group2.astype( float ).astype( str )

X2 = pd.get_dummies( X2 )

# ALIGN WITH THE TRAINING FEATURE MATRIX: SAME COLUMNS, SAME ORDER. A DUMMY COLUMN THAT DOES NOT
# OCCUR IN THE TEST SET IS CREATED AND FILLED WITH 0; EVERY OTHER COLUMN IS DROPPED.
X2 = X2.reindex( columns = features, fill_value = 0 )

In [65]:
# CHECK THAT THE TEST FEATURE MATRIX HAS THE SAME COLUMNS AS THE TRAINING ONE AND NO MISSING VALUES.
X2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
Pclass        418 non-null int64
SibSp         418 non-null int64
Fare          418 non-null float64
Sex_female    418 non-null uint8
Sex_male      418 non-null uint8
Name_Miss     418 non-null uint8
Name_Mr       418 non-null uint8
Name_Mrs      418 non-null uint8
dtypes: float64(1), int64(2), uint8(5)
memory usage: 11.9 KB


In [66]:
# EXPAND THE TEST FEATURES WITH THE ALREADY-FITTED POLYNOMIAL TRANSFORMER (transform ONLY: THE
# TRANSFORMER MUST NOT BE RE-FITTED ON THE TEST SET) AND PREDICT WITH THE POLYNOMIAL FOREST.
X2_poly = poly.transform( np.asarray( X2, dtype = float ) )
predictions2_poly = forest_poly.predict( X2_poly )

In [67]:
# KAGGLE SUBMISSION FILE: ONE ROW PER TEST PASSENGER WITH ITS PREDICTED 'Survived' LABEL.
output = df( { 'PassengerId' : data2[ 'PassengerId' ],
               'Survived' : predictions2_poly } )
output.to_csv( 'output_titanic_poly.csv', index = False )