<a href="https://colab.research.google.com/github/sonjiwon1234/kaggle/blob/master/TITANIC_SURVIVED_GIT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# set seaborn scheme
# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in newer matplotlib
# releases, so pick whichever name this installation actually provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

# set font for graph
sns.set(font_scale=2.5)

import missingno as msno

#ignore warnings
#import warnings
#warnings.filterwarnings('ignore')

%matplotlib inline

# 1. Check data

## <h4> 1.1 read data

- using pandas to read data

In [None]:
# Directory that contains the `datasets/` folder.
#WORK_DIR = '/content'
WORK_DIR = '.'

# Read the training and test CSV files into two separate frames.
train_csv_path = WORK_DIR + '/datasets/train.csv'
test_csv_path = WORK_DIR + '/datasets/test.csv'
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [None]:
df_train.head()


Features

    Pclass : ticket class, Integer
    Age : age, Integer
    SibSp : number of siblings and spouses, Integer
    Parch : number of parents and children, Integer
    Fare : passenger fare, Float
    Survived : survived or not, Integer (target label)
    Embarked : port of departure


## <h4>1.2 check null data

In [None]:
# Report, for every training column, the percentage of rows that are NULL.
for col in df_train.columns:
    column_values = df_train[col]
    null_fraction = column_values.isnull().sum() / column_values.shape[0]
    print('column: {:>10}\t Percent of NULL value: {:.2f}%'.format(col, 100 * null_fraction))


In [None]:
# Report, for every test column, the percentage of rows that are NULL.
for col in df_test.columns:
    column_values = df_test[col]
    null_fraction = column_values.isnull().sum() / column_values.shape[0]
    print('column: {:>10}\t Percent of NULL value: {:.2f}%'.format(col, 100 * null_fraction))


# 2. Preprocessing

## <h4> 2.1 Extract title in name
- Mr, Mrs, Miss, Master

In [None]:
# Preview the title: the run of letters directly before a '.' in each name.
# Raw string so that `\.` reaches the regex engine without an invalid-escape warning.
df_train['Name'].str.extract(r'([A-Za-z]+)\.')

In [None]:
# Store the title found in each passenger's name (the letters directly before
# a '.') in a new 'Initial' column.
# The extraction is vectorised, so one assignment per frame is enough; the
# earlier per-column loop only repeated the same assignment.
# expand=False makes `extract` return a Series instead of a one-column frame.
df_train['Initial'] = df_train['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
df_test['Initial'] = df_test['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [None]:
# Count of every extracted title per sex (titles as columns), colour-graded.
title_by_sex_raw = pd.crosstab(df_train['Initial'], df_train['Sex'])
title_by_sex_raw.T.style.background_gradient(cmap='summer_r')

In [None]:
# Collapse rare titles into the four main groups (Master is left untouched).
# 'Dona' is a female honorific, so it is grouped with 'Mrs' rather than 'Mr'.
title_groups = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Lady': 'Mrs', 'Countess': 'Mrs', 'Dona': 'Mrs',
    'Dr': 'Mr', 'Major': 'Mr', 'Jonkheer': 'Mr', 'Col': 'Mr',
    'Rev': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
}

# Assign the result back instead of `df[col].replace(..., inplace=True)`:
# the in-place form works on an intermediate object and does not update the
# frame when pandas Copy-on-Write is active.
df_train['Initial'] = df_train['Initial'].replace(title_groups)
df_test['Initial'] = df_test['Initial'].replace(title_groups)

In [None]:
# Same title-per-sex table as before, shown again after the rare titles were merged.
title_by_sex_merged = pd.crosstab(df_train['Initial'], df_train['Sex'])
title_by_sex_merged.T.style.background_gradient(cmap='summer_r')

## <h4> 2.2 Change Initial to numerical value
- Master,Miss, Mr, Mrs, to 0, 1, 2, 3

In [None]:
# Integer code for each of the four title groups.
title_codes = {'Master': 0, 'Miss': 1, 'Mr': 2, 'Mrs': 3}

df_train['Initial'] = df_train['Initial'].map(title_codes)
df_test['Initial'] = df_test['Initial'].map(title_codes)

# Show which codes occur in the training set.
df_train['Initial'].unique()

## <h4> 2.3 Family size
- SibSp + Parch + 1 (including the passenger)


In [None]:
# FamilySize = siblings/spouses + parents/children + 1 (the passenger themself).
for frame in (df_train, df_test):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

## <h4> 2.4 Fill Embarked
- fill missing values with the most frequent value ('S')

In [None]:
# Fill missing embarkation ports in the training set with 'S'.
# Assign the result back: `df[col].fillna(..., inplace=True)` operates on an
# intermediate object and does not update the frame under pandas Copy-on-Write.
df_train['Embarked'] = df_train['Embarked'].fillna('S')

## <h4> 2.5 Change Sex to numerical value 
- female, male to 0, 1

In [None]:
# Encode sex as an integer: female -> 0, male -> 1.
sex_codes = {'female': 0, 'male': 1}
for frame in (df_train, df_test):
    frame['Sex'] = frame['Sex'].map(sex_codes)

## <h4> 2.6 Fare

In [None]:
# Replace NULL fares with the mean fare of the same data set, then take the
# natural log of every positive fare (non-positive fares become 0).
for frame in (df_train, df_test):
    frame['Fare'] = frame['Fare'].fillna(frame['Fare'].mean())
    frame['Fare'] = frame['Fare'].map(lambda fare: np.log(fare) if fare > 0 else 0)

# Distribution of the transformed training fare, with its skewness in the legend.
skew_label = 'Skewness : {:.2f}'.format(df_train['Fare'].skew())
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
g = sns.distplot(df_train['Fare'], color='b', label=skew_label, ax=ax)
g = g.legend(loc='best')

## <h4> 2.7 Fill Age
- regression imputation 
- Median imputation (median Age per Initial and Pclass group)

In [None]:
# regression imputation

# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer
# from sklearn.linear_model import BayesianRidge

In [None]:
# imp = IterativeImputer(max_iter=10, verbose=0)
# imp.fit(df_train_age)
# imputed_df_train = imp.transform(df_train_age)
# imputed_df_train = pd.DataFrame(imputed_df_train, columns=df_train_age.columns)

# #imp = IterativeImputer(max_iter=10, verbose=0)
# imp.fit(df_test_age)
# imputed_df_test = imp.transform(df_test_age)
# imputed_df_test = pd.DataFrame(imputed_df_test, columns=df_test_age.columns)

In [None]:
# Median imputation (median Age per Initial and Pclass group)

In [None]:
# Stack train and test rows so the age statistics use every passenger.
df_all = pd.concat((df_train, df_test), axis=0)

In [None]:
# reset_index returns a new frame, so keep it; otherwise df_all still carries
# the duplicated row labels of the two concatenated frames.
df_all = df_all.reset_index(drop=True)
df_all.head()

In [None]:
# Median Age for every (Initial, Pclass) combination; result column is named 'median'.
age_by_title_and_class = df_all.groupby(['Initial', 'Pclass'])['Age']
age_median = age_by_title_and_class.agg(['median'])

In [None]:
# Turn the (Initial, Pclass) group index into ordinary columns so that the
# imputation cells below can filter on them.
age_median = age_median.reset_index()

In [None]:
# Fill each missing training Age with the median Age of the passenger's
# (Initial, Pclass) group from `age_median`.
for index in df_train.index[df_train['Age'].isnull()]:
    # `index` is an index label, so use label-based access (.at) throughout;
    # positional .iloc only worked while labels and positions happened to match.
    same_group = ((age_median['Initial'] == df_train.at[index, 'Initial']) &
                  (age_median['Pclass'] == df_train.at[index, 'Pclass']))
    df_train.at[index, 'Age'] = age_median.loc[same_group, 'median'].values[0]

In [None]:
# Fill each missing test Age with the median Age of the passenger's
# (Initial, Pclass) group from `age_median`.
for index in df_test.index[df_test['Age'].isnull()]:
    # `index` is an index label, so use label-based access (.at) throughout;
    # positional .iloc only worked while labels and positions happened to match.
    same_group = ((age_median['Initial'] == df_test.at[index, 'Initial']) &
                  (age_median['Pclass'] == df_test.at[index, 'Pclass']))
    df_test.at[index, 'Age'] = age_median.loc[same_group, 'median'].values[0]

## <h4> 2.8 Change Age to Categorical value
- Divide into 6 sections
- 0 ~ 4 / 5 ~ 11 / 12 ~ 17 / 18 ~ 35 / 36 ~ 55 / 56 ~ 80

In [None]:
 #df_train['Age_cat'] = 0
df_train.loc[df_train['Age'] < 5, 'Age_cat'] = 0 # babies
df_train.loc[(5 <= df_train['Age']) & (df_train['Age'] < 12), 'Age_cat'] = 1 # children
df_train.loc[(12 <= df_train['Age']) & (df_train['Age'] < 18), 'Age_cat'] = 2 # teen
df_train.loc[(18 <= df_train['Age']) & (df_train['Age'] < 36), 'Age_cat'] = 3 # young adulthood
df_train.loc[(36 <= df_train['Age']) & (df_train['Age'] < 56), 'Age_cat'] = 4 # middle age
df_train.loc[56 <= df_train['Age'], 'Age_cat'] = 5 # older adulthood

#df_test['Age_cat'] = 0
df_test.loc[df_test['Age'] < 5, 'Age_cat'] = 0 # babies
df_test.loc[(5 <= df_test['Age']) & (df_test['Age'] < 12), 'Age_cat'] = 1 # children
df_test.loc[(12 <= df_test['Age']) & (df_test['Age'] < 18), 'Age_cat'] = 2 # teen
df_test.loc[(18 <= df_test['Age']) & (df_test['Age'] < 36), 'Age_cat'] = 3 # young adulthood
df_test.loc[(36 <= df_test['Age']) & (df_test['Age'] < 56), 'Age_cat'] = 4 # middle age
df_test.loc[56 <= df_test['Age'], 'Age_cat'] = 5 # older adulthood

## <h4> 2.9 one-hot encoding
using pandas.get_dummies()
- Initial 
- Embarked


In [None]:
# One-hot encode the title code; new columns are prefixed with 'Initial'.
df_train_encoding, df_test_encoding = (
    pd.get_dummies(frame, columns=['Initial'], prefix='Initial')
    for frame in (df_train, df_test)
)

df_train_encoding.head()

In [None]:
# One-hot encode the embarkation port; new columns are prefixed with 'Embarked'.
df_train_encoding, df_test_encoding = (
    pd.get_dummies(frame, columns=['Embarked'], prefix='Embarked')
    for frame in (df_train_encoding, df_test_encoding)
)

df_train_encoding.head()

## <h4> 2.10 Drop columns
Drop unnecessary columns

In [None]:
# Columns that are not fed to the models.
unused_columns = ['PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Fare']

df_train_encoding = df_train_encoding.drop(columns=unused_columns)
df_test_encoding = df_test_encoding.drop(columns=unused_columns)

# 3. Predict

## <h4>3.1 Set dataset
using train_test_split()
- split data into train, valid and test set

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 
from sklearn.model_selection import train_test_split

In [None]:
# Separate the training features from the target label.
feature_columns = df_train_encoding.columns.drop('Survived')

# Cast to float: the frame mixes bool/int/float columns, and on recent pandas
# `.values` on such a frame yields an object array that the NN cannot consume.
X_train = df_train_encoding[feature_columns].astype(float).values
target_label = df_train_encoding['Survived'].values

# Put the test columns in exactly the training order; a dummy column that is
# missing from the test set is filled with 0.
X_test = df_test_encoding.reindex(columns=feature_columns, fill_value=0).astype(float).values

In [None]:
# Hold out 20% of the labelled rows as a validation set (fixed seed for a repeatable split).
# Stratified variant, kept for reference:
#X_tr, X_vld, y_tr, y_vld = train_test_split(X_train, target_label, test_size=0.2, random_state=2018, stratify = target_label)
X_tr, X_vld, y_tr, y_vld = train_test_split(X_train, target_label, test_size=0.2, random_state=2018)

train_test_split(.., stratify = )  
- using stratify gave a good score (0.80382) on the Kaggle public leaderboard
- but a bad score on the private leaderboard

- My guess: I used stratify to avoid a class-imbalance problem, but it led to overfitting

## <h3> 3.2 Modeling
- random forest with GridSearchCV
- nn with voting

### <h4> 3.2.1 random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier  

In [None]:
# hyper-parms tuning
# Exhaustive 3-fold cross-validated search over the forest settings below.
params = dict(
    n_estimators=[10, 100, 200, 400],
    max_depth=[10, 20, 30, None],
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
)

rf_clf = RandomForestClassifier(random_state=42)
grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train, target_label)

print('best parms: ', grid_cv.best_params_)
print('best score: {:.4f}'.format(grid_cv.best_score_))

- best parms:  {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
- best score : 0.8373

In [None]:
# training
# Hyper-parameters are the best values reported by the grid search.
# random_state is fixed (same seed as the grid-search estimator) so that the
# validation accuracy and the submission are reproducible between runs.
rfmodel = RandomForestClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=10,
                                 n_estimators=100, random_state=42)
rfmodel.fit(X_tr, y_tr)

In [None]:
# predict: random-forest class predictions for the held-out validation split
rfprediction = rfmodel.predict(X_vld)

###<h4>3.2.2 NN

In [None]:
from keras.models import Sequential 
from keras.layers import Dense , Dropout, Flatten
from keras.wrappers.scikit_learn import KerasClassifier 
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score
from keras.optimizers import rmsprop
import random 

In [None]:
def create_model(lr=0.001, opt='RMSprop', init='he_normal', dr=0.2):
    """Build and compile the binary-classification network.

    Architecture: Dense(32, relu) -> Dropout(dr) -> Dense(8, relu) -> Dense(1, sigmoid),
    compiled with binary cross-entropy loss and the accuracy metric. The input
    width is taken from the notebook-level `X_train`.

    Parameters
    ----------
    lr : float
        Learning rate passed to the optimizer.
    opt : str
        Optimizer name, case-insensitive ('rmsprop', 'adam', 'Adadelta', ...).
    init : str
        Kernel initializer of the first Dense layer.
    dr : float
        Dropout rate applied after the first Dense layer.

    Returns
    -------
    The compiled Sequential model.

    Raises
    ------
    ValueError
        If `opt` does not name a supported optimizer.
    """
    # Local import so this cell does not depend on editing the import cell.
    from keras import optimizers

    # fix random seed for reproducibility
    # NOTE(review): this seeds only Python's `random`; the Keras backend most
    # likely has its own generator that would need seeding too - confirm.
    seed = 42
    random.seed(seed)

    # Honour the requested optimizer. Previously `opt` was overwritten with
    # RMSprop unconditionally, so a grid search over `opt` compared identical models.
    optimizer_classes = {
        'sgd': optimizers.SGD,
        'rmsprop': optimizers.RMSprop,
        'adagrad': optimizers.Adagrad,
        'adadelta': optimizers.Adadelta,
        'adam': optimizers.Adam,
        'adamax': optimizers.Adamax,
        'nadam': optimizers.Nadam,
    }
    optimizer_name = str(opt).lower()
    if optimizer_name not in optimizer_classes:
        raise ValueError('unknown optimizer: {!r}'.format(opt))
    optimizer = optimizer_classes[optimizer_name](lr=lr)

    model = Sequential()
    model.add(Dense(32, input_dim=X_train.shape[1], activation='relu', kernel_initializer=init))
    model.add(Dropout(dr))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Compile model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model

In [None]:
# using Grid Search to optimize hyper-parms
# Searched here: number of training epochs and mini-batch size.
from sklearn.model_selection import GridSearchCV

model = KerasClassifier(build_fn=create_model, verbose=0)

epochs = [50, 100, 150, 300, 400]
batches = [1, 16, 32, 64]

param_grid = {'epochs': epochs, 'batch_size': batches}
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X_tr, y_tr)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means, stds, params = (cv_results[key]
                       for key in ('mean_test_score', 'std_test_score', 'params'))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

-  Best: 0.833905 using {'batch_size': 32, 'epochs': 300}
  - 0.832663 (0.027716) with: {'batch_size': 16, 'epochs': 400}
  - 0.832655 (0.028021) with: {'batch_size': 1, 'epochs': 100} 
  - 0.830171 (0.025037) with: {'batch_size': 64, 'epochs': 150}



In [None]:
# Epoch count and batch size used for all following NN runs
# (the best combination reported by the grid search).
epoch_size = 300
batch_size = 32

In [None]:
# using Grid Search to optimize hyper-parms
# Searched here: optimizer name and kernel initializer, at the chosen epochs/batch size.
model = KerasClassifier(build_fn=create_model, epochs=epoch_size, batch_size=batch_size, verbose=0)

optimizers = ['rmsprop', 'adam', 'Adadelta']
init = ['glorot_uniform', 'he_normal']

param_grid = {'opt': optimizers, 'init': init}
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X_tr, y_tr)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means, stds, params = (cv_results[key]
                       for key in ('mean_test_score', 'std_test_score', 'params'))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

- Best: 0.833905 using {'init': 'he_normal', 'opt': 'rmsprop'}
  - 0.830163 (0.027448) with: {'init': 'glorot_uniform', 'opt': 'adam'}
  - 0.830155 (0.029660) with: {'init': 'glorot_uniform', 'opt': 'Adadelta'}
  

In [None]:
# using Grid Search to optimize hyper-parms
# Searched here: dropout rate, at the chosen epochs/batch size.
model = KerasClassifier(build_fn=create_model, epochs=epoch_size, batch_size=batch_size, verbose=0)

drops = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]

param_grid = {'dr': drops}
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X_tr, y_tr)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means, stds, params = (cv_results[key]
                       for key in ('mean_test_score', 'std_test_score', 'params'))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.824426 using {'dr': 0.2}

In [None]:
# Single NN wrapper built with the chosen epoch count and batch size and the
# default arguments of create_model.
nnmodel = KerasClassifier(build_fn=create_model, epochs=epoch_size, batch_size=batch_size, verbose=0) #, validation_split=0.1) # without validation split # not enough dataset

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of the NN on the training split;
# the cell displays the mean over the folds.
accuracies = cross_val_score(nnmodel, X_tr, y_tr, cv=10, n_jobs=-1)
accuracies.mean()

In [None]:
# Train the NN on the training split (keeping the fit history for the curves)
# and predict the validation split as a flat 1-D array.
history = nnmodel.fit(X_tr, y_tr, callbacks=[])
nnprediction = nnmodel.predict(X_vld).flatten()

In [None]:
# Sort the recorded history keys into loss/accuracy and training/validation groups
# by substring ('loss' or 'acc', with or without 'val').
recorded_metrics = list(history.history.keys())
loss_list = [name for name in recorded_metrics if 'loss' in name and 'val' not in name]
val_loss_list = [name for name in recorded_metrics if 'loss' in name and 'val' in name]
acc_list = [name for name in recorded_metrics if 'acc' in name and 'val' not in name]
val_acc_list = [name for name in recorded_metrics if 'acc' in name and 'val' in name]

In [None]:
# Accuracy and loss per epoch for the single NN (training in blue, validation in green).
# Every curve gets a label so that plt.legend() has entries to show, and the
# x-axis length comes from the recorded history itself rather than from the
# global epoch_size, which may have changed since training.
plt.figure(1)
for l in acc_list:
    plt.plot(range(len(history.history[l])), history.history[l], 'b', label=l)
for l in val_acc_list:
    plt.plot(range(len(history.history[l])), history.history[l], 'g', label=l)
plt.title('Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()


plt.figure(2)
for l in loss_list:
    plt.plot(range(len(history.history[l])), history.history[l], 'b', label=l)
for l in val_loss_list:
    plt.plot(range(len(history.history[l])), history.history[l], 'g', label=l)
plt.title('Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

#### - Voting with nn model

In [None]:
# Four NN wrappers for the voting ensemble, differing only in (epochs, batch_size).
member_settings = [(epoch_size, batch_size), (400, 16), (100, 1), (150, 64)]
nnmodel1, nnmodel2, nnmodel3, nnmodel4 = (
    KerasClassifier(build_fn=create_model, epochs=n_epochs, batch_size=n_batch, verbose=0)
    for n_epochs, n_batch in member_settings
)

# Mark each wrapper explicitly as a classifier.
for member in (nnmodel1, nnmodel2, nnmodel3, nnmodel4):
    member._estimator_type = "classifier"

In [None]:
# VotingClassifier was never imported anywhere in the notebook, so import it here.
from sklearn.ensemble import VotingClassifier

# Hard (majority) vote over the four NN wrappers.
vcnnmodel = VotingClassifier(
    estimators=[('model1', nnmodel1), ('model2', nnmodel2),
                ('model3', nnmodel3), ('model4', nnmodel4)],
    voting='hard',
)

In [None]:
# Fit the voting ensemble on the training split.
vcnnmodel.fit(X_tr, y_tr)

In [None]:
# Ensemble predictions for the validation split, flattened to 1-D.
vcnnprediction = vcnnmodel.predict(X_vld).flatten()

## <h4> 3.3 Result

In [None]:
# accuracy of the random forest on the validation split, in percent
# (arguments in the documented order: y_true first, then y_pred)
rfacc = 100 * metrics.accuracy_score(y_vld, rfprediction)
print('randomforest acc :  {:.2f}% '.format(rfacc))

In [None]:
# accuracy of the single NN on the validation split, in percent
# (arguments in the documented order: y_true first, then y_pred)
nnacc = 100 * metrics.accuracy_score(y_vld, nnprediction)
print('NN acc : {:.2f}% '.format( nnacc))

In [None]:
# accuracy of the NN voting ensemble on the validation split, in percent
# (arguments in the documented order: y_true first, then y_pred)
vcnnacc = 100 * metrics.accuracy_score(y_vld, vcnnprediction)
print('Voting(NN) acc : {:.2f}% '.format( vcnnacc))

In [None]:
# Validation accuracy (percent) of each model, indexed by model name.
accres = pd.Series({'Random Forest': rfacc, 'NN': nnacc, 'Voting(NN)': vcnnacc})

In [None]:
# Horizontal bar chart of validation accuracy per model, lowest first.
fig, ax = plt.subplots(figsize=(8, 8))
accres.sort_values(ascending=True).plot.barh(ax=ax)

# Zoom in on the observed range. The limits are derived from the data: a fixed
# lower bound of 82 hid (or inverted) the chart whenever an accuracy fell below it,
# and using the maximum as upper bound left no margin after the longest bar.
ax.set_xlim(max(0, accres.min() - 2), min(100, accres.max() + 1))
ax.set_xlabel('Accuracy')
ax.set_ylabel('Model')
plt.show()

## <h4> 3.4 Save result

In [None]:
# Load the sample submission file as the template for the output.
#WORK_DIR = '/content'
WORK_DIR = '.'
sample_submission_path = WORK_DIR + '/datasets/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)
submission.head()

In [None]:
# Predict the test set with the random forest and write the submission file.
# NOTE(review): the file name ends in '_vc' although the predictions come from
# rfmodel, not the voting classifier - confirm which model was intended.
res_prediction = rfmodel.predict(X_test)
submission = submission.assign(Survived=res_prediction)
submission.to_csv('./baseline_submission_vc.csv', index=False)