In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import tensorflow as tf
from skimage.color import rgb2gray
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,CuDNNLSTM,MaxPooling2D,Conv2D,Flatten,Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint,Callback
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

In [2]:
# Read the training and test CSVs from the working directory.
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Separate the target column from the training features.
Y = df['Survived']
# `axis` is passed by keyword: pandas 2.0 made every argument of
# DataFrame.drop except `labels` keyword-only.
X = df.drop('Survived', axis=1)

# Stack training features on top of the test rows (row-wise concat) so
# both can be encoded in a single pass. `axis` is keyword-only in
# pandas.concat as of pandas 2.0 as well.
df_concat = pd.concat([X, df_test], axis=0)

In [3]:
def preprocess(df):
    """Convert a raw passenger frame into an all-numeric feature frame.

    Steps, in order:
      1. drop the ``Ticket``, ``Name`` and ``PassengerId`` columns;
      2. encode ``Sex`` as 1 (male) / 0 (female);
      3. reduce ``Cabin`` to the first entry of ``cabin_list`` found in it;
      4. relabel ``Pclass`` 1/2/3 as '1st'/'2nd'/'3rd' so it is one-hot
         encoded instead of being treated as a number;
      5. one-hot encode every remaining string column;
      6. replace any remaining NaN with the mean of its column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Ticket``, ``Name``, ``PassengerId``,
        ``Sex``, ``Cabin`` and ``Pclass``. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The encoded frame, one row per input row.
    """
    # `axis` must be given by keyword: pandas 2.0 made it keyword-only, so
    # the positional form `drop([...], 1)` raises TypeError there.
    df = df.drop(['Ticket', 'Name', 'PassengerId'], axis=1)
    df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})

    cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
    # astype(str) turns a missing cabin into the text 'nan'. The search is
    # case-sensitive, so 'nan' matches nothing in cabin_list (including
    # 'Unknown'); such rows map to NaN and get no Cabin_* indicator.
    # NOTE(review): 'Unknown' therefore looks unreachable - confirm intent.
    df['Cabin'] = df['Cabin'].astype(str).map(
        lambda x: substrings_in_string(x, cabin_list)
    )
    df['Pclass'] = df['Pclass'].map({1: '1st', 2: '2nd', 3: '3rd'})

    # Ask for numeric indicator columns explicitly; newer pandas defaults to
    # bool indicators, which do not combine cleanly with the float columns
    # when the frame is converted to an array.
    dummies = pd.get_dummies(df, dtype=float)
    # Mean imputation for whatever numeric gaps remain.
    return dummies.fillna(dummies.mean())


def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` contained in ``big_string``.

    Candidates are tried in the order given, so an earlier entry wins when
    several would match. ``np.nan`` is returned when none of them occurs.
    """
    hits = (candidate for candidate in substrings if candidate in big_string)
    return next(hits, np.nan)

In [6]:
# Encode the stacked train+test frame in one pass so both parts end up with
# the same feature columns.
df_dum=preprocess(df_concat)

In [7]:
# Undo the stacking: the first len(X) rows of the encoded frame are the
# training rows, everything after them belongs to the test set.
X_dummies = df_dum.iloc[:len(X)]
test = df_dum.iloc[len(X):]

In [131]:
# One-hot encode the labels.
# NOTE(review): no other cell visible in this notebook reads
# `y_hot_encoding`; the network is fitted on the integer labels `Y`.
y_hot_encoding=tf.keras.utils.to_categorical(Y)

In [14]:
# Fully connected classifier: four ReLU layers narrowing 256 -> 128 -> 64 -> 32,
# followed by a 2-unit softmax. The input width follows the encoded features.
model = Sequential([
    Dense(256, activation='relu', input_dim=X_dummies.shape[1]),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(2, activation='softmax'),
])
# The sparse loss takes integer class labels directly (no one-hot targets).
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 256)               5120      
_________________________________________________________________
dense_6 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_7 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_8 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_9 (Dense)              (None, 2)                 66        
Total params: 48,418
Trainable params: 48,418
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Show the distinct label values present in the target.
Y.unique()

array([0, 1], dtype=int64)

In [16]:
# Train on all training rows for 30 epochs with one sample per batch.
# No validation data is passed, so only training metrics are reported.
model.fit(X_dummies,Y,epochs=30,batch_size=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x25bd5541668>

In [134]:
from sklearn.ensemble import AdaBoostClassifier
# NOTE(review): make_classification is imported but not used in this cell.
from sklearn.datasets import make_classification

# AdaBoost with 100 estimators and a fixed seed, fitted on the full
# training set.
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X_dummies, Y)  

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=0)

In [135]:
# Score on the same rows the classifier was fitted on, i.e. a training-set
# figure rather than a held-out estimate.
clf.score(X_dummies,Y)

0.8473625140291807

In [167]:
from sklearn.ensemble import RandomForestClassifier

# Exhaustive 5-fold search over a random forest's tree count and depth,
# each taking the values 10, 20, ..., 200 (a 20 x 20 grid).
rfc = RandomForestClassifier(random_state=0)
parameters = {
    'n_estimators': list(range(10, 210, 10)),
    'max_depth': list(range(10, 210, 10)),
}
clf = GridSearchCV(estimator=rfc, param_grid=parameters, cv=5)
clf.fit(X_dummies, Y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [168]:
# Score the fitted search object on the same rows it was fitted on, i.e. a
# training-set figure rather than a held-out estimate.
clf.score(X_dummies,Y)

0.9259259259259259

In [173]:
# Predict labels for the encoded test rows with the current `clf`.
predict = clf.predict(test)

# Use gender_submission.csv as the output template and overwrite its
# Survived column (assumes the template already has that column and that its
# rows are in the same order as `test`).
prediction = pd.read_csv('gender_submission.csv')
prediction['Survived'] = predict
prediction.to_csv('prediction.csv', index=False)

In [181]:
# Preview 10 logarithmically spaced values between 1e-5 and 1e-1.
np.logspace(-5,-1,num=10).tolist()

[1e-05,
 2.782559402207126e-05,
 7.742636826811278e-05,
 0.00021544346900318823,
 0.0005994842503189409,
 0.0016681005372000592,
 0.004641588833612777,
 0.012915496650148827,
 0.03593813663804626,
 0.1]

In [185]:
# 5-fold grid search over an XGBoost classifier: tree count and depth each
# take 10, 60, 110, 160; the learning rate takes 5 log-spaced values from
# 1e-5 to 1e-1. The search is only configured here, not fitted.
XGB = xgb.XGBClassifier()
parameters = {
    'n_estimators': list(range(10, 210, 50)),
    'max_depth': list(range(10, 210, 50)),
    'learning_rate': np.logspace(-5, -1, num=5).tolist(),
}
clf = GridSearchCV(estimator=XGB, param_grid=parameters, cv=5)

In [186]:
# Fit the search object currently bound to `clf` on the full training set.
clf.fit(X_dummies, Y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [10, 60, 110, 160], 'max_depth': [10, 60, 110, 160], 'learning_rate': [1e-05, 0.0001, 0.001, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [187]:
# Predict labels for the encoded test rows with the current `clf`.
predict = clf.predict(test)

# Use gender_submission.csv as the output template and overwrite its
# Survived column (assumes the template already has that column and that its
# rows are in the same order as `test`). This replaces any earlier
# prediction.csv.
prediction = pd.read_csv('gender_submission.csv')
prediction['Survived'] = predict
prediction.to_csv('prediction.csv', index=False)