In [56]:
import numpy as np
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
from collections import Counter 
from sklearn.utils import shuffle 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

df_train=pd.read_csv('./data/train.csv')
df_test=pd.read_csv('./data/test.csv')
df_sub=pd.read_csv('./data/gender_submission.csv')
df_train.head()
print(df_test.head())

   PassengerId  Pclass                                          Name     Sex  ...   Ticket     Fare  Cabin Embarked
0          892       3                              Kelly, Mr. James    male  ...   330911   7.8292    NaN        Q
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female  ...   363272   7.0000    NaN        S
2          894       2                     Myles, Mr. Thomas Francis    male  ...   240276   9.6875    NaN        Q
3          895       3                              Wirz, Mr. Albert    male  ...   315154   8.6625    NaN        S
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  ...  3101298  12.2875    NaN        S

[5 rows x 11 columns]


In [46]:
sex=pd.get_dummies(df_train['Sex'],drop_first=True)
sex1=pd.get_dummies(df_test['Sex'],drop_first=True)
print(sex.head())
df_train.drop(['Name','Sex','Parch','Ticket','SibSp','Fare','Cabin','Age','Embarked'],axis=1,inplace=True)
df_test.drop(['Name','Sex','Parch','Ticket','SibSp','Fare','Cabin','Age','Embarked'],axis=1,inplace=True)
df_train.head()


   male
0     1
1     0
2     0
3     0
4     1


Unnamed: 0,PassengerId,Survived,Pclass
0,1,0,3
1,2,1,1
2,3,1,3
3,4,1,1
4,5,0,3


In [47]:
df_train=pd.concat([df_train,sex],axis=1)
df_test=pd.concat([df_test,sex1],axis=1)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,male
0,1,0,3,1
1,2,1,1,0
2,3,1,3,0
3,4,1,1,0
4,5,0,3,1


In [48]:
scaler1,scaler2=StandardScaler(),StandardScaler()
train_columns=df_train.columns
test_columns=df_test.columns

df_train=pd.DataFrame(scaler1.fit_transform(df_train))
df_test=pd.DataFrame(scaler2.fit_transform(df_test))

print(df_train)


            0         1         2         3
0   -1.730108 -0.789272  0.827377  0.737695
1   -1.726220  1.266990 -1.566107 -1.355574
2   -1.722332  1.266990  0.827377 -1.355574
3   -1.718444  1.266990 -1.566107 -1.355574
4   -1.714556 -0.789272  0.827377  0.737695
..        ...       ...       ...       ...
886  1.714556 -0.789272 -0.369365  0.737695
887  1.718444  1.266990 -1.566107 -1.355574
888  1.722332 -0.789272  0.827377 -1.355574
889  1.726220  1.266990 -1.566107  0.737695
890  1.730108 -0.789272  0.827377  0.737695

[891 rows x 4 columns]


In [49]:
df_train.columns=train_columns
df_test.columns=test_columns
features=df_train.iloc[:,2:].columns.tolist()
target=df_train.loc[:,'Survived'].name

X_train=df_train.iloc[:,2:].values
y_train=df_train.loc[:,'Survived'].values

In [50]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
from torch.autograd import Variable

class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        self.fc1=nn.Linear(2,512)
        self.fc2=nn.Linear(512,512)
        self.fc3=nn.Linear(512,2)
        self.dropout=nn.Dropout(0.01)

        self.layer=nn.Sequential(
            self.fc1,
            self.dropout,
            self.fc2,
            self.dropout,
            self.fc3
        )
    def forward(self, x):
        for idx,i in enumerate(self.layer):
            x=i(x)
            if idx==1 or idx==3:
                x=F.relu(x)
        return x
model=Net()
print(model)

Net(
  (fc1): Linear(in_features=2, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=2, bias=True)
  (dropout): Dropout(p=0.01, inplace=False)
  (layer): Sequential(
    (0): Linear(in_features=2, out_features=512, bias=True)
    (1): Dropout(p=0.01, inplace=False)
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): Dropout(p=0.01, inplace=False)
    (4): Linear(in_features=512, out_features=2, bias=True)
  )
)


In [51]:
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=1e-3)

batch_size=64
n_epochs=1000
batch_no=len(X_train)//batch_size

train_loss=0
train_loss_min=np.Inf

for epoch in range(n_epochs):
    for i in range(batch_no):
        start=i*batch_size
        end=start+batch_size

        x_var=Variable(torch.FloatTensor(X_train[start:end]))
        y_var=Variable(torch.LongTensor(y_train[start:end]))

        optimizer.zero_grad()
        output=model(x_var)
        loss=criterion(output,y_var)

        loss.backward()
        optimizer.step()

        values,labels=torch.max(output,1)
        if i%63==0 and epoch%100==0:
            print(values)
            print(labels)
        num_right=np.sum(labels.data.numpy()==y_train[start:end])
        train_loss+=loss.item()*batch_size
    
    train_loss=train_loss/len(X_train)
    if train_loss<=train_loss_min:
        print(f'validation loss decreased:\t{train_loss_min}->{train_loss}')
        torch.save(model.state_dict(),'model.pt')
        train_loss_min=train_loss

    if epoch%100==0:
         print(f"Epoch:{epoch+1}\t Train_loss:{train_loss}\t Train Accuracy:{num_right/len(y_train[start:end])}")

print("Train End!!!")








tensor([0.1094, 0.1003, 0.1362, 0.0942, 0.0967, 0.0890, 0.0752, 0.0717, 0.1393,
        0.1412, 0.1256, 0.1208, 0.1048, 0.1130, 0.1254, 0.1154, 0.1008, 0.0354,
        0.1217, 0.0980, 0.0372, 0.0129, 0.1182, 0.0673, 0.1576, 0.1082, 0.0849,
        0.0718, 0.1154, 0.1048, 0.0986, 0.1373, 0.1280, 0.0050, 0.0883, 0.0843,
        0.1088, 0.1057, 0.0805, 0.1331, 0.1103, 0.1004, 0.0861, 0.1178, 0.1457,
        0.0901, 0.0790, 0.1479, 0.0717, 0.1363, 0.0724, 0.0973, 0.0953, 0.1438,
        0.0512, 0.0961, 0.1282, 0.0954, 0.1113, 0.0970, 0.0785, 0.1426, 0.0398,
        0.0960], grad_fn=<MaxBackward0>)
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
validation loss decreased:	inf->0.4714053204297752
Epoch:1	 Train_loss:0.4714053204297752	 Train Accuracy:0.0
validation loss decreased:	0.4714053204297752->0.45913085370783585
valid

In [55]:
###predictions 

X_test=df_test.iloc[:,1:].values
print(X_test)
X_test_var=Variable(torch.FloatTensor(X_test),requires_grad=False)
with torch.no_grad():
    test_result=model(X_test_var)
values,labels= torch.max(test_result,1)
survived=labels.data.numpy()

[[ 0.87348191  0.75592895]
 [ 0.87348191 -1.32287566]
 [-0.31581919  0.75592895]
 [ 0.87348191  0.75592895]
 [ 0.87348191 -1.32287566]
 [ 0.87348191  0.75592895]
 [ 0.87348191 -1.32287566]
 [-0.31581919  0.75592895]
 [ 0.87348191 -1.32287566]
 [ 0.87348191  0.75592895]
 [ 0.87348191  0.75592895]
 [-1.50512029  0.75592895]
 [-1.50512029 -1.32287566]
 [-0.31581919  0.75592895]
 [-1.50512029 -1.32287566]
 [-0.31581919 -1.32287566]
 [-0.31581919  0.75592895]
 [ 0.87348191  0.75592895]
 [ 0.87348191 -1.32287566]
 [ 0.87348191 -1.32287566]
 [-1.50512029  0.75592895]
 [ 0.87348191  0.75592895]
 [-1.50512029 -1.32287566]
 [-1.50512029  0.75592895]
 [-1.50512029 -1.32287566]
 [ 0.87348191  0.75592895]
 [-1.50512029 -1.32287566]
 [ 0.87348191  0.75592895]
 [-1.50512029  0.75592895]
 [ 0.87348191  0.75592895]
 [-0.31581919  0.75592895]
 [-0.31581919  0.75592895]
 [ 0.87348191 -1.32287566]
 [ 0.87348191 -1.32287566]
 [-1.50512029  0.75592895]
 [ 0.87348191  0.75592895]
 [ 0.87348191 -1.32287566]
 

In [53]:
submission = pd.DataFrame({'PassengerId': df_sub['PassengerId'], 'Survived': survived})
submission.to_csv('submission.csv', index=False)