In [32]:
import pandas as pd
import numpy as np
import os

from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn import metrics

# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd
    
#missing median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)
    
def missing_Cabin(df, name):
    df[name] = df[name].fillna('N')
    
def missing_Em(df, name):
    df[name] = df[name].fillna(0)
    
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

df[0:10]


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [44]:
Test_y = df_test.drop(['Pclass','Name', 'Sex','Age','Fare', 'Ticket', 'Cabin', 'Embarked', 'SibSp'], axis=1)

missing_Cabin(df,'Embarked')
missing_Cabin(df_test,'Embarked')
missing_Cabin(df,'Cabin')
missing_Cabin(df_test,'Cabin')

# remove Nans from Both AGE and Fair for both datasets
missing_median(df,'Age')
missing_median(df_test,'Age')

missing_median(df,'Fare')
missing_median(df_test,'Fare')

Kag = df_test.drop(['Pclass','Name', 'Sex','Age','Fare', 'Ticket', 'Cabin', 'Embarked', 'SibSp','Parch'], axis=1)


labelencoder = LabelEncoder()
# label encode sex
df.iloc[:,4] = labelencoder.fit_transform(df.iloc[:,4])
df_test.iloc[:,3] = labelencoder.fit_transform(df_test.iloc[:,3])

# label encode Embark
df.iloc[:,11] = labelencoder.fit_transform(df.iloc[:,11])
df_test.iloc[:,10] = labelencoder.fit_transform(df_test.iloc[:,10])

# label encode Cabin
df.iloc[:,10] = labelencoder.fit_transform(df.iloc[:,10])
df_test.iloc[:,9] = labelencoder.fit_transform(df_test.iloc[:,9])



encode_numeric_zscore(df,'Fare')
encode_numeric_zscore(df_test,'Fare')



In [69]:
X_val = df_test.drop(['PassengerId', 'Name', 'Ticket','Cabin'], axis=1)
y = df['Survived']
x = df.drop(['Survived', 'PassengerId', 'Name', 'Ticket','Cabin'], axis=1)

y = y.values
x = x.values

x.shape[1]

7

In [46]:
# Cross-Validate
kf = KFold(5)
    
oos_y = []
oos_pred = []
fold = 0
for train, test in kf.split(x):
    fold+=1
    print("Fold #{}".format(fold))
        
    x_train = x[train]
    y_train = y[train]
    x_test = x[test]
    y_test = y[test]
    
    model = Sequential()
    model.add(Dense(20, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1,  activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=20, verbose=1, mode='auto')
    model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=1000)
    
    pred = model.predict(x_test)
    
    oos_y.append(y_test)
    oos_pred.append(pred)        

    # Measure this fold's RMSE
    score = np.sqrt(metrics.mean_squared_error(pred,y_test))
    print("Fold score (RMSE): {}".format(score))
    
    scores = model.evaluate(x_test,y_test)
    print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

Fold #1
Epoch 00089: early stopping
Fold score (RMSE): 0.3773198883273282

acc: 81.56%
Fold #2
Epoch 00076: early stopping
Fold score (RMSE): 0.37302330408469303

acc: 83.71%
Fold #3
Epoch 00156: early stopping
Fold score (RMSE): 0.38390559407552205

acc: 79.21%
Fold #4
Epoch 00178: early stopping
Fold score (RMSE): 0.3925733369337584

acc: 76.40%
Fold #5
Epoch 00120: early stopping
Fold score (RMSE): 0.32412180038501376

acc: 85.96%


In [47]:
oos_y

[array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
        1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
        1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0], dtype=int64),
 array([0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
        0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
        0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1,
      

In [63]:
oos_pred

[array([[0.07986587],
        [0.93346226],
        [0.5342689 ],
        [0.91231054],
        [0.07753334],
        [0.07910864],
        [0.3567504 ],
        [0.6041175 ],
        [0.4125828 ],
        [0.8880654 ],
        [0.91969883],
        [0.8897787 ],
        [0.068785  ],
        [0.01053963],
        [0.42902884],
        [0.7848355 ],
        [0.4359694 ],
        [0.14625627],
        [0.6554369 ],
        [0.7343922 ],
        [0.16878463],
        [0.14723584],
        [0.5036083 ],
        [0.30451587],
        [0.22858179],
        [0.12587738],
        [0.14536919],
        [0.10320537],
        [0.61203563],
        [0.07674737],
        [0.31663424],
        [0.95671356],
        [0.611557  ],
        [0.14859486],
        [0.4837317 ],
        [0.36107454],
        [0.14537461],
        [0.07107205],
        [0.5128408 ],
        [0.77504534],
        [0.6440185 ],
        [0.7888973 ],
        [0.1462044 ],
        [0.8595952 ],
        [0.5347871 ],
        [0

In [64]:
prediction = model.predict(X_val)
np.set_printoptions(suppress=True)

prediction

array([[0.0824507 ],
       [0.6530449 ],
       [0.14265956],
       [0.09483577],
       [0.55930156],
       [0.3751515 ],
       [0.6383114 ],
       [0.1337145 ],
       [0.72412044],
       [0.0785833 ],
       [0.09376549],
       [0.3194919 ],
       [0.93491304],
       [0.19526951],
       [0.9471092 ],
       [0.7790512 ],
       [0.14293519],
       [0.344737  ],
       [0.642959  ],
       [0.60684353],
       [0.37906063],
       [0.799721  ],
       [0.91461277],
       [0.5206871 ],
       [0.7942258 ],
       [0.09480845],
       [0.90035063],
       [0.31472972],
       [0.3373078 ],
       [0.10066704],
       [0.19142814],
       [0.12920982],
       [0.5036273 ],
       [0.5129351 ],
       [0.35858104],
       [0.35697475],
       [0.6722905 ],
       [0.67080116],
       [0.11292995],
       [0.20047623],
       [0.11869183],
       [0.31408855],
       [0.09151103],
       [0.8052416 ],
       [0.9376922 ],
       [0.11037681],
       [0.26916823],
       [0.151

In [65]:
count = 0
while count < len(prediction):
    for x in prediction:
        if prediction[count] <= .65:
            prediction[count] = 0
        else:
            prediction[count] = 1
        count = count + 1
print(count)
prediction

418


array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],

In [66]:
binary_values = pd.DataFrame(data=prediction, columns = ["Survived"])
binary_values[0:5]


Unnamed: 0,Survived
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [67]:
result = pd.concat([Kag, binary_values.astype(int)], axis=1 )
result[0:35]

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,0
9,901,0


In [68]:
result.to_csv('pred3.csv', sep=',', index=False)