In [198]:
import pandas as pd
import numpy as np
import re
import datetime
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb

pd.set_option('display.max_colwidth',100)

In [199]:
data = pd.read_csv("../Downloads/ship_data.csv")
data.head(10)

Unnamed: 0,Passenger ID,Class,Name,Gender,Age,Siblings Count,Parents Count,Fare,Embarked,Survived
0,1,3,Alexander Harris,male,22.0,1,0,7250.0,New York,0
1,2,1,Frank Parsons,female,38.0,1,0,71283.3,Los Angeles,1
2,3,3,Anthony Churchill,female,26.0,0,0,7925.0,New York,1
3,4,1,Alexandra Hughes,female,35.0,1,0,53100.0,New York,1
4,5,3,Joan Fraser,male,35.0,0,0,8050.0,New York,0
5,6,3,Megan Clarkson,male,,0,0,8458.3,Chicago,0
6,7,1,Molly Bower,male,54.0,0,0,51862.5,New York,0
7,8,3,Steven Jones,male,2.0,3,1,21075.0,New York,0
8,9,3,Bernadette Vance,female,27.0,0,2,11133.3,New York,1
9,10,2,Irene Chapman,female,-20.0,1,0,30070.8,Los Angeles,1


In [200]:
data['Gender'].value_counts()

male      577
female    314
Name: Gender, dtype: int64

In [201]:
data['Class'].value_counts()

3    491
1    216
2    184
Name: Class, dtype: int64

In [202]:
child = data['Age'] < 16.0
valid_age = data['Age'] > 0.0

In [203]:
children = data[child & valid_age]
children

Unnamed: 0,Passenger ID,Class,Name,Gender,Age,Siblings Count,Parents Count,Fare,Embarked,Survived
7,8,3,Steven Jones,male,2.00,3,1,21075.0,New York,0
10,11,3,Gavin Payne,female,4.00,1,1,16700.0,New York,1
14,15,3,Evan Mackay,female,14.00,0,0,7854.2,New York,0
16,17,3,Sonia Knox,male,2.00,4,1,29125.0,Chicago,0
22,23,3,James Russell,female,15.00,0,0,8029.2,Chicago,1
24,25,3,Blake Harris,female,8.00,3,1,21075.0,New York,0
39,40,3,Rose Mathis,female,14.00,1,0,11241.7,Los Angeles,1
43,44,2,Jack Gray,female,3.00,1,2,41579.2,Los Angeles,1
50,51,3,Trevor Fisher,male,7.00,4,1,39687.5,New York,0
58,59,2,Peter Oliver,female,5.00,1,2,27750.0,New York,1


In [204]:
children.shape

(82, 10)

In [205]:
children['Class'].value_counts()

3    58
2    18
1     6
Name: Class, dtype: int64

In [206]:
data.isnull().any()

Passenger ID      False
Class             False
Name              False
Gender            False
Age                True
Siblings Count    False
Parents Count     False
Fare              False
Embarked           True
Survived          False
dtype: bool

In [207]:
data['Age'] = data['Age'].fillna(data['Age'].mean())

In [218]:
data['Embarked'] = data['Embarked'].fillna("New York")

In [220]:
data['Embarked'].value_counts()

New York       646
Los Angeles    168
Chicago         77
Name: Embarked, dtype: int64

In [222]:
data.isnull().any()

Passenger ID      False
Class             False
Name              False
Gender            False
Age               False
Siblings Count    False
Parents Count     False
Fare              False
Embarked          False
Survived          False
dtype: bool

In [223]:
data.head()

Unnamed: 0,Passenger ID,Class,Name,Gender,Age,Siblings Count,Parents Count,Fare,Embarked,Survived
0,1,3,Alexander Harris,male,22.0,1,0,7250.0,New York,0
1,2,1,Frank Parsons,female,38.0,1,0,71283.3,Los Angeles,1
2,3,3,Anthony Churchill,female,26.0,0,0,7925.0,New York,1
3,4,1,Alexandra Hughes,female,35.0,1,0,53100.0,New York,1
4,5,3,Joan Fraser,male,35.0,0,0,8050.0,New York,0


In [224]:
data['Family'] = data['Siblings Count'] + data['Parents Count']

In [225]:
data.head()

Unnamed: 0,Passenger ID,Class,Name,Gender,Age,Siblings Count,Parents Count,Fare,Embarked,Survived,Family
0,1,3,Alexander Harris,male,22.0,1,0,7250.0,New York,0,1
1,2,1,Frank Parsons,female,38.0,1,0,71283.3,Los Angeles,1,1
2,3,3,Anthony Churchill,female,26.0,0,0,7925.0,New York,1,0
3,4,1,Alexandra Hughes,female,35.0,1,0,53100.0,New York,1,1
4,5,3,Joan Fraser,male,35.0,0,0,8050.0,New York,0,0


In [226]:
alone = data[data['Family'] == 0]
with_family = data[data['Family'] != 0]

In [227]:
print "alone count:",alone.shape[0]
print "with family count:",with_family.shape[0]

alone count: 537
with family count: 354


In [228]:
data.groupby(data['Class']).Survived.value_counts(normalize = True)

Class  Survived
1      1           0.629630
       0           0.370370
2      0           0.527174
       1           0.472826
3      0           0.757637
       1           0.242363
Name: Survived, dtype: float64

In [229]:
data.groupby(data['Gender']).Survived.value_counts(normalize = True)

Gender  Survived
female  1           0.742038
        0           0.257962
male    0           0.811092
        1           0.188908
Name: Survived, dtype: float64

In [230]:
data['has_family'] = data['Family']&1
data

Unnamed: 0,Passenger ID,Class,Name,Gender,Age,Siblings Count,Parents Count,Fare,Embarked,Survived,Family,has_family
0,1,3,Alexander Harris,male,22.000000,1,0,7250.0,New York,0,1,1
1,2,1,Frank Parsons,female,38.000000,1,0,71283.3,Los Angeles,1,1,1
2,3,3,Anthony Churchill,female,26.000000,0,0,7925.0,New York,1,0,0
3,4,1,Alexandra Hughes,female,35.000000,1,0,53100.0,New York,1,1,1
4,5,3,Joan Fraser,male,35.000000,0,0,8050.0,New York,0,0,0
5,6,3,Megan Clarkson,male,30.014244,0,0,8458.3,Chicago,0,0,0
6,7,1,Molly Bower,male,54.000000,0,0,51862.5,New York,0,0,0
7,8,3,Steven Jones,male,2.000000,3,1,21075.0,New York,0,4,0
8,9,3,Bernadette Vance,female,27.000000,0,2,11133.3,New York,1,2,0
9,10,2,Irene Chapman,female,-20.000000,1,0,30070.8,Los Angeles,1,1,1


In [231]:
data.groupby(data['has_family']).Survived.value_counts(normalize = True)

has_family  Survived
0           0           0.659733
            1           0.340267
1           1           0.518349
            0           0.481651
Name: Survived, dtype: float64

In [232]:
"If a person had family survival chance is 48%, whereas if he doesn't have any family 34%"

"If a person had family survival chance is 48%, whereas if he doesn't have any family 34%"

# 10 th part starts here

In [233]:
data.head()

Unnamed: 0,Passenger ID,Class,Name,Gender,Age,Siblings Count,Parents Count,Fare,Embarked,Survived,Family,has_family
0,1,3,Alexander Harris,male,22.0,1,0,7250.0,New York,0,1,1
1,2,1,Frank Parsons,female,38.0,1,0,71283.3,Los Angeles,1,1,1
2,3,3,Anthony Churchill,female,26.0,0,0,7925.0,New York,1,0,0
3,4,1,Alexandra Hughes,female,35.0,1,0,53100.0,New York,1,1,1
4,5,3,Joan Fraser,male,35.0,0,0,8050.0,New York,0,0,0


In [234]:
data.dtypes

Passenger ID        int64
Class               int64
Name               object
Gender             object
Age               float64
Siblings Count      int64
Parents Count       int64
Fare              float64
Embarked           object
Survived            int64
Family              int64
has_family          int64
dtype: object

In [261]:
data['Gender'].value_counts()

0    577
1    314
Name: Gender, dtype: int64

In [238]:
data.loc[data.Gender == 'male','Gender'] = 0
data.loc[data.Gender == 'female','Gender'] = 1

In [262]:
data['Embarked'].value_counts()

0    646
1    168
2     77
Name: Embarked, dtype: int64

In [239]:
data.loc[data.Embarked == 'New York','Embarked'] = 0
data.loc[data.Embarked == 'Los Angeles','Embarked'] = 1
data.loc[data.Embarked == 'Chicago','Embarked'] = 2

In [240]:
data.head()

Unnamed: 0,Passenger ID,Class,Name,Gender,Age,Siblings Count,Parents Count,Fare,Embarked,Survived,Family,has_family
0,1,3,Alexander Harris,0,22.0,1,0,7250.0,0,0,1,1
1,2,1,Frank Parsons,1,38.0,1,0,71283.3,1,1,1,1
2,3,3,Anthony Churchill,1,26.0,0,0,7925.0,0,1,0,0
3,4,1,Alexandra Hughes,1,35.0,1,0,53100.0,0,1,1,1
4,5,3,Joan Fraser,0,35.0,0,0,8050.0,0,0,0,0


In [241]:
data.isnull().any()

Passenger ID      False
Class             False
Name              False
Gender            False
Age               False
Siblings Count    False
Parents Count     False
Fare              False
Embarked          False
Survived          False
Family            False
has_family        False
dtype: bool

In [242]:
data.dtypes

Passenger ID        int64
Class               int64
Name               object
Gender             object
Age               float64
Siblings Count      int64
Parents Count       int64
Fare              float64
Embarked           object
Survived            int64
Family              int64
has_family          int64
dtype: object

In [253]:
data['Gender'] = data['Gender'].convert_objects(convert_numeric=True)
data['Embarked'] = data['Embarked'].convert_objects(convert_numeric=True)

  if __name__ == '__main__':
  from ipykernel import kernelapp as app


In [255]:
features = ['Class','Age','Family','Fare','Embarked','Gender']
target = ['Survived']

In [263]:
from sklearn.model_selection import train_test_split

In [257]:
Xtrain,Xtest=train_test_split(data,test_size=0.10)    

In [265]:
from xgboost import XGBClassifier
model = XGBClassifier()
final_model = XGBClassifier()

In [266]:
model.fit(Xtrain[features],Xtrain[target])

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [267]:
print(model.score(Xtest[features],Xtest[target]))

0.833333333333


In [268]:
# training on the whole training data
final_model.fit(data[features],data[target])

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)