In [80]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import numpy as np
import pandas as pd
import re as re

train = pd.read_csv('train.csv', header = 0, dtype={'Age': np.float64})
test  = pd.read_csv('test.csv' , header = 0, dtype={'Age': np.float64})
full_data = [train, test]

print (train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB
None


# Feature Engineering


## Pclass


In [81]:
train[["Pclass", "Survived"]].groupby("Pclass").mean()


Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


## Sex

In [82]:
train[["Sex", "Survived"]].groupby("Sex").mean()


Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


## Parch and SibSp

In [83]:
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
print (train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean())


   FamilySize  Survived
0           1  0.303538
1           2  0.552795
2           3  0.578431
3           4  0.724138
4           5  0.200000
5           6  0.136364
6           7  0.333333
7           8  0.000000
8          11  0.000000


In [84]:


for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
print (train[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean())



   IsAlone  Survived
0        0  0.505650
1        1  0.303538


## Embarked

In [85]:
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
print (train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean())


  Embarked  Survived
0        C  0.553571
1        Q  0.389610
2        S  0.339009


## Fare
Fill the missing values with median

In [86]:
for dataset in full_data:
#     dataset['Fare'] = dataset['Fare'][dataset['Fare']].astype(int)
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'][train['Fare'].notnull()].median())
    
#     dataset['Fare'] = dataset['Fare'].astype('int64')
# train['CategoricalFare'] = pd.qcut(train['Fare'], 4)
# print (train[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean())

train.columns
train.info()
# train

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 14 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null object
FamilySize     891 non-null int64
IsAlone        891 non-null int64
dtypes: float64(2), int64(7), object(5)
memory usage: 104.4+ KB


## Age


In [87]:
train["Age"][train["Age"].isnull()].size

177

There are many missing values for age so we can fill it with random values

In [88]:
for dataset in full_data:
    age_avg = dataset["Age"][dataset["Age"].notnull()].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()

    age_rand_list = np.random.randint(age_avg-age_std, age_avg+age_std, size=age_null_count)
    dataset["Age"][dataset["Age"].isnull()] = age_rand_list
    dataset['Age'] = dataset['Age'].astype(int)
#     dataset["CategoricalAge"] = pd.qcut(dataset["Age"], 5)
# #     print(age_null_count)
# print (train[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean())
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 14 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null int64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null object
FamilySize     891 non-null int64
IsAlone        891 non-null int64
dtypes: float64(1), int64(8), object(5)
memory usage: 104.4+ KB


## Name


In [89]:
import re as re
def get_title(name):
	title_search = re.search(' ([A-Za-z]+)\.', name)
	# If the title exists, extract and return it.
	if title_search:
		return title_search.group(1)
	return ""

for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)

print(pd.crosstab(train['Title'], train['Sex']))
# train

Sex       female  male
Title                 
Capt           0     1
Col            0     2
Countess       1     0
Don            0     1
Dr             1     6
Jonkheer       0     1
Lady           1     0
Major          0     2
Master         0    40
Miss         182     0
Mlle           2     0
Mme            1     0
Mr             0   517
Mrs          125     0
Ms             1     0
Rev            0     6
Sir            0     1


In [90]:
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

print (train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean())
train.info()

    Title  Survived
0  Master  0.575000
1    Miss  0.702703
2      Mr  0.156673
3     Mrs  0.793651
4    Rare  0.347826
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 15 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null int64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null object
FamilySize     891 non-null int64
IsAlone        891 non-null int64
Title          891 non-null object
dtypes: float64(1), int64(8), object(6)
memory usage: 111.4+ KB


# Data cleaning


In [91]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'FamilySize', 'IsAlone', 'Title'], dtype='object')

In [92]:

for dataset in full_data:
#     # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} )

    #     # Mapping titles
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

    #     # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} )
    
#     # Mapping Fare
#     dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] 						        = 0
#     dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
#     dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
#     dataset.loc[ dataset['Fare'] > 31, 'Fare'] 							        = 3
#     dataset['Fare'] = dataset['Fare']
    
#     # Mapping Age
#     dataset.loc[ dataset['Age'] <= 16, 'Age'] 					       = 0
#     dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
#     dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
#     dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
#     dataset.loc[ dataset['Age'] > 64, 'Age']                           = 4

# Feature Selection
train.info()
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp',\
                 'Parch', 'FamilySize']
traindr = train.drop(drop_elements, axis = 1)
# train = train.drop(['CategoricalAge', 'CategoricalFare'], axis = 1)

testdr  = test.drop(drop_elements, axis = 1)

print (traindr.head(10))

traind = traindr.values
testd  = testdr.values
traindr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 15 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null int64
Age            891 non-null int64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null int64
FamilySize     891 non-null int64
IsAlone        891 non-null int64
Title          891 non-null int64
dtypes: float64(1), int64(11), object(3)
memory usage: 111.4+ KB
   Survived  Pclass  Sex  Age     Fare  Embarked  IsAlone  Title
0         0       3    1   22   7.2500         0        0      1
1         1       1    0   38  71.2833         1        0      3
2         1       3    0   26   7.9250         0        1      2
3         1       1    0   35  53.1000         0       

In [93]:
# train
X = traindr[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'IsAlone', 'Title']].values
y = traindr['Survived'].values
X_final = testdr[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'IsAlone', 'Title']].values
# from sklearn.preprocessing import StandardScaler
# ss = StandardScaler(copy=False)
# ss.fit_transform(X)


In [106]:

from sklearn.cross_validation import train_test_split
[X_train, X_test, y_train, y_test] = train_test_split(X, y, test_size=0.1, random_state=23)


print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
# out = pd.DataFrame(data=X_test)
# out.to_csv(path_or_buf='testout.csv', header=None, index=False)

(801, 7)
(801,)
(90, 7)


In [125]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


clf1 = LogisticRegression(random_state=1, n_jobs=-1)
clf2 = SVC(random_state=1, probability=True)
clf3 = KNeighborsClassifier( n_jobs=-1)
clf4 = RandomForestClassifier(n_jobs=-1, n_estimators=1000, criterion='gini', max_depth=5)

pipeline1 = Pipeline([('ssc1', StandardScaler()),
                      ('lr', clf1)])
pipeline2 = Pipeline([('ssc2', StandardScaler()),
                      ('lr', clf2)])
pipeline3 = Pipeline([('ssc3', StandardScaler()),
                      ('lr', clf3)])

estimators = [('eclf1', pipeline1), ('eclf2', pipeline2), ('eclf3', pipeline3), ('eclf4', clf4)]

vc = VotingClassifier(estimators=estimators, n_jobs=-1)

vc.fit(X_train, y_train)
vc.score(X_test, y_test)

0.87777777777777777

In [126]:

pipeline1.fit(X_train,y_train)
print(pipeline1.score(X_test, y_test))
pipeline2.fit(X_train,y_train)
print(pipeline2.score(X_test, y_test))
pipeline3.fit(X_train,y_train)
print(pipeline3.score(X_test, y_test))
clf4.fit(X_train,y_train)
print(clf4.score(X_test, y_test))

0.8
0.888888888889
0.822222222222
0.855555555556


In [127]:
vc.get_params()

{'eclf1': Pipeline(steps=[('ssc1', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
           penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False))]),
 'eclf1__lr': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
           penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 'eclf1__lr__C': 1.0,
 'eclf1__lr__class_weight': None,
 'eclf1__lr__dual': False,
 'eclf1__lr__fit_intercept': True,
 'eclf1__lr__intercept_scaling': 1,
 'eclf1__lr__max_iter': 100,
 'eclf1__lr__multi_class': 'ovr',
 'eclf1__lr__n_jobs': -1,
 'eclf1__lr__penalty': 'l2',
 'eclf1__lr__random_state': 1,
 'eclf1__lr__solver': 'liblinear',
 'eclf1__l

In [128]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'eclf1__lr__C': [ 0.01],
    'eclf1__lr__penalty': ['l2'],
    'eclf2__lr__C': [1.0],
    'voting': ['soft'],
    'eclf4__max_depth': [4,5,6,None],
    'eclf3__lr__n_neighbors':[5],
    
}

gs = GridSearchCV(estimator=vc, param_grid=param_grid, cv=10, verbose=1, n_jobs=-1)
gs.fit(X_train, y_train)
print(gs.best_score_)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   23.9s finished


0.826466916355


In [129]:
print(gs.best_score_)
print(gs.best_params_)
print(gs.best_estimator_)


0.826466916355
{'eclf1__lr__C': 0.01, 'eclf1__lr__penalty': 'l2', 'eclf2__lr__C': 1.0, 'eclf3__lr__n_neighbors': 5, 'eclf4__max_depth': None, 'voting': 'soft'}
VotingClassifier(estimators=[('eclf1', Pipeline(steps=[('ssc1', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_sta...tors=1000, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False))],
         n_jobs=-1, voting='soft', weights=None)


In [133]:
est = gs.best_estimator_
print(est.score(X_test,y_test))
est.fit(X,y)
# vc.fit(X_train, y_train)
# print(vc.score(X_test,y_test))
# vc.fit(X,y)

print(X_final.shape)
# y_pred_teat = vc.predict(X_final)
y_pred_teat = est.predict(X_final)
stest  = pd.read_csv('test.csv' , header = 0, dtype={'Age': np.float64})
s = np.vstack((stest['PassengerId'].values, y_pred_teat))
# s

submit = pd.DataFrame(data =s.transpose(), columns=("PassengerId", "Survived"))
submit
submit.to_csv(path_or_buf='ensembleGridSearch2.csv', index=False)

0.877777777778
(418, 7)
