In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/train.csv


In [2]:

import numpy as np
import keras as K
import tensorflow as tf
import pandas as pd
import math

Using TensorFlow backend.


In [3]:
import pandas as pd
gender_submission = pd.read_csv("../input/titanic/gender_submission.csv")
test = pd.read_csv("../input/titanic/test.csv")
X = pd.read_csv("../input/titanic/train.csv")

#print(X)

In [4]:
X.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [5]:
percentile = [0.2,0.4,0.6,0.8]
X.describe(percentiles = percentile)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
20%,179.0,0.0,1.0,19.0,0.0,0.0,7.8542
40%,357.0,0.0,2.0,25.0,0.0,0.0,10.5
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
60%,535.0,0.0,3.0,31.8,0.0,0.0,21.6792
80%,713.0,1.0,3.0,41.0,1.0,1.0,39.6875
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:

y = X["Survived"]
y.head(3)

0    0
1    1
2    1
Name: Survived, dtype: int64

In [7]:


def clean_data(data):
    data['Fare'] = data['Fare'].fillna(data['Fare'].dropna().median())
    data['Age'] =  data['Age'].fillna(data['Age'].dropna().median())
    
    data.loc[data['Sex'] == 'male', 'Sex'] = 0
    data.loc[data['Sex'] =='female',  'Sex'] = 1
    
    data['Embarked'] = data['Embarked'].fillna('S')
    data.loc[data["Embarked"] == 'S', 'Embarked'] = 0
    data.loc[data['Embarked'] == 'C', 'Embarked'] = 1
    data.loc[data['Embarked'] == 'Q', 'Embarked'] =2

In [8]:
clean_data(X)
clean_data(test)
X.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0


In [9]:
print('check the nan value in test data')
#cabin has many null so remove
del test['Cabin']
print(test.isnull().sum())

check the nan value in test data
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [10]:
print(test.isnull().sum())

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [11]:
## combine test and train as single to apply some function and applying the feature scaling
all_data=[X,test]

In [12]:
# Create new feature FamilySize as a combination of SibSp and Parch
for dataset in all_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

In [13]:
# Define function to extract titles from passenger names
import re

def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""
for dataset in all_data:
    dataset['Title'] = dataset['Name'].apply(get_title)
for dataset in all_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 
                                                 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

In [14]:
for dataset in all_data:
    dataset['Age_Range'] = pd.cut(dataset['Age'], bins=[0,12,20,40,120], labels=['Children','Teenage','Adult','Elder'])

In [15]:
for dataset in all_data:
    dataset['Fare_Range'] = pd.cut(dataset['Fare'], bins=[0,7.91,14.45,31,120], labels=['Low_fare','median_fare',
                                                                                      'Average_fare','high_fare'])

In [16]:
traindf=X
testdf=test

In [17]:
all_dat=[traindf,testdf]

In [18]:
for dataset in all_dat:
    drop_column = ['Age','Fare','Name','Ticket']
    dataset.drop(drop_column, axis=1, inplace = True)

In [19]:
drop_column = ['PassengerId']
traindf.drop(drop_column, axis=1, inplace = True)
#print(testdf)

In [20]:
testdf.drop(drop_column, axis=1, inplace = True)
print(testdf)

     Pclass  Sex  SibSp  Parch  Embarked  FamilySize   Title Age_Range  \
0         3    0      0      0         2           1      Mr     Adult   
1         3    1      1      0         0           2     Mrs     Elder   
2         2    0      0      0         2           1      Mr     Elder   
3         3    0      0      0         0           1      Mr     Adult   
4         3    1      1      1         0           3     Mrs     Adult   
..      ...  ...    ...    ...       ...         ...     ...       ...   
413       3    0      0      0         0           1      Mr     Adult   
414       1    1      0      0         1           1    Rare     Adult   
415       3    0      0      0         0           1      Mr     Adult   
416       3    0      0      0         0           1      Mr     Adult   
417       3    0      1      1         1           3  Master     Adult   

       Fare_Range  
0        Low_fare  
1        Low_fare  
2     median_fare  
3     median_fare  
4     media

In [21]:
all_dat

[     Survived  Pclass  Sex  SibSp  Parch Cabin  Embarked  FamilySize Title  \
 0           0       3    0      1      0   NaN         0           2    Mr   
 1           1       1    1      1      0   C85         1           2   Mrs   
 2           1       3    1      0      0   NaN         0           1  Miss   
 3           1       1    1      1      0  C123         0           2   Mrs   
 4           0       3    0      0      0   NaN         0           1    Mr   
 ..        ...     ...  ...    ...    ...   ...       ...         ...   ...   
 886         0       2    0      0      0   NaN         0           1  Rare   
 887         1       1    1      0      0   B42         0           1  Miss   
 888         0       3    1      1      2   NaN         0           4  Miss   
 889         1       1    0      0      0  C148         1           1    Mr   
 890         0       3    0      0      0   NaN         2           1    Mr   
 
     Age_Range    Fare_Range  
 0       Adult     

In [22]:
testdf.head(5)

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,FamilySize,Title,Age_Range,Fare_Range
0,3,0,0,0,2,1,Mr,Adult,Low_fare
1,3,1,1,0,0,2,Mrs,Elder,Low_fare
2,2,0,0,0,2,1,Mr,Elder,median_fare
3,3,0,0,0,0,1,Mr,Adult,median_fare
4,3,1,1,1,0,3,Mrs,Adult,median_fare


In [23]:
traindf = pd.get_dummies(traindf, columns = ["Pclass","Sex","Title","Age_Range","Embarked","Fare_Range"],
                             prefix=["Pclass", "Sexy","Title","Age_type","Em_type","Fare_type"])

In [24]:
testdf = pd.get_dummies(testdf, columns = ["Pclass","Sex","Title","Age_Range","Embarked","Fare_Range"],
                             prefix=["Pclass", "Sexy","Title","Age_type","Em_type","Fare_type"])

In [25]:
del traindf['Survived']

In [26]:
print(traindf)

     SibSp  Parch Cabin  FamilySize  Pclass_1  Pclass_2  Pclass_3  Sexy_0  \
0        1      0   NaN           2         0         0         1       1   
1        1      0   C85           2         1         0         0       0   
2        0      0   NaN           1         0         0         1       0   
3        1      0  C123           2         1         0         0       0   
4        0      0   NaN           1         0         0         1       1   
..     ...    ...   ...         ...       ...       ...       ...     ...   
886      0      0   NaN           1         0         1         0       1   
887      0      0   B42           1         1         0         0       0   
888      1      2   NaN           4         0         0         1       0   
889      0      0  C148           1         1         0         0       1   
890      0      0   NaN           1         0         0         1       1   

     Sexy_1  Title_Master  ...  Age_type_Teenage  Age_type_Adult  \
0      

In [27]:
del traindf['Cabin']

In [28]:
print(testdf)

     SibSp  Parch  FamilySize  Pclass_1  Pclass_2  Pclass_3  Sexy_0  Sexy_1  \
0        0      0           1         0         0         1       1       0   
1        1      0           2         0         0         1       0       1   
2        0      0           1         0         1         0       1       0   
3        0      0           1         0         0         1       1       0   
4        1      1           3         0         0         1       0       1   
..     ...    ...         ...       ...       ...       ...     ...     ...   
413      0      0           1         0         0         1       1       0   
414      0      0           1         1         0         0       0       1   
415      0      0           1         0         0         1       1       0   
416      0      0           1         0         0         1       1       0   
417      1      1           3         0         0         1       1       0   

     Title_Master  Title_Miss  ...  Age_type_Teenag

In [29]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras import optimizers
#import keras.utils.np_utils.to_categorical

In [30]:
X=X.to_numpy()

In [31]:
y=y.to_numpy()

In [32]:
y = [y]

In [33]:
print(y)
print(np.shape(y))

[array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1

In [34]:
model = Sequential()

model.add(Dense(activation="relu", input_dim=24, units=20, kernel_initializer="uniform"))
model.add(Dropout(0.3))

model.add(Dense(activation="relu", units=17, kernel_initializer="uniform"))
model.add(Dropout(0.2))

model.add(Dense(activation="relu", units=7, kernel_initializer="uniform"))
model.add(Dropout(0.2))

model.add(Dense(activation="sigmoid", units=1, kernel_initializer="uniform"))

In [35]:
K.optimizers.Adamax(learning_rate=0.0002, beta_1=0.9, beta_2=0.999)
model.compile(loss='binary_crossentropy',
              optimizer='adamax',
              metrics=['accuracy'])

In [36]:
model.summary()
#train.plot(kind = 'scatter', x='Age', y = 'Fare', alpha = 0.5, color = 'red')
#plot.show()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 20)                500       
_________________________________________________________________
dropout_1 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 17)                357       
_________________________________________________________________
dropout_2 (Dropout)          (None, 17)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 7)                 126       
_________________________________________________________________
dropout_3 (Dropout)          (None, 7)                 0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                

In [37]:
model.fit(traindf, y, epochs=10000, batch_size=16)
#//make data numerical
#//check for NaN

Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Epoch 61/10000
Epoch 62/10000
Epoch 63/10000
Epoch 64/10000
Epoch 65/10000
Epoch 66/10000
Epoch 67/10000
Epoc

In [38]:
Y_pred.dtype

NameError: name 'Y_pred' is not defined

In [39]:
print(Y_pred)

NameError: name 'Y_pred' is not defined

In [40]:
Y_pred.dtype

NameError: name 'Y_pred' is not defined

In [41]:
test = pd.read_csv("../input/titanic/test.csv")

In [42]:
predictions = pd.DataFrame(Y_pred, columns=['Survived'])
test = pd.read_csv(os.path.join('../input/titanic/', 'test.csv'))
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv('my_output.csv', sep=",", index = False)

NameError: name 'Y_pred' is not defined