# DataSet & Library Loading

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

pd.set_option("display.max_columns", None)

df_train = pd.read_csv('train.csv')
df_test  = pd.read_csv('test.csv')
# df_sub   = pd.read_csv('submission.csv')

## df_train

In [2]:
# rerange column
columns_titles_train = ['Stroke',
                        'Diabetes_binary',
                        'HighBP',
                        'HighChol',
                        'CholCheck',
                        'BMI',
                        'Smoker',
                        'HeartDiseaseorAttack',
                        'PhysActivity',
                        'Fruits',
                        'Veggies',
                        'HvyAlcoholConsump',
                        'AnyHealthcare',
                        'NoDocbcCost',
                        'GenHlth',
                        'MentHlth',
                        'PhysHlth',
                        'DiffWalk',
                        'Sex',
                        'Age',
                        'Education',
                        'Income']
df_train = df_train.reindex(columns = columns_titles_train)

df_train.head()

Unnamed: 0,Stroke,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,1.0,0.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,0.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


In [3]:
# putting 'Stroke' colums at the start of dataset for better management
# dropped some feature value are ambiguous

df_train.drop(['HighBP',
               'HighChol',
               'CholCheck',
               'PhysActivity',
               'AnyHealthcare',
               'NoDocbcCost',
               'GenHlth',
               'MentHlth',
               'PhysHlth',
               'DiffWalk'],axis=1,inplace=True)

In [4]:
df_train.fillna(df_train.mean(),inplace=True)
Scaler_train = StandardScaler()
train_columns = df_train.columns
df_train = pd.DataFrame(Scaler_train.fit_transform(df_train))
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-0.258115,-1.0,-0.539225,-0.959048,-0.417654,-1.262596,0.520504,-0.211665,1.092665,-1.611588,1.048176,1.053963
1,3.874236,-1.0,-0.539225,1.0427,-0.417654,0.792019,-1.921213,-0.211665,1.092665,1.193954,1.048176,1.053963
2,-0.258115,-1.0,-0.539225,-0.959048,-0.417654,0.792019,0.520504,-0.211665,1.092665,1.544647,1.048176,1.053963
3,-0.258115,-1.0,-0.259854,1.0427,-0.417654,0.792019,0.520504,-0.211665,1.092665,0.843262,1.048176,1.053963
4,-0.258115,-1.0,-0.120169,1.0427,-0.417654,0.792019,0.520504,-0.211665,-0.915193,-0.208817,0.069681,1.053963


In [5]:
df_train.columns = train_columns
df_train.head()

Unnamed: 0,Stroke,Diabetes_binary,BMI,Smoker,HeartDiseaseorAttack,Fruits,Veggies,HvyAlcoholConsump,Sex,Age,Education,Income
0,-0.258115,-1.0,-0.539225,-0.959048,-0.417654,-1.262596,0.520504,-0.211665,1.092665,-1.611588,1.048176,1.053963
1,3.874236,-1.0,-0.539225,1.0427,-0.417654,0.792019,-1.921213,-0.211665,1.092665,1.193954,1.048176,1.053963
2,-0.258115,-1.0,-0.539225,-0.959048,-0.417654,0.792019,0.520504,-0.211665,1.092665,1.544647,1.048176,1.053963
3,-0.258115,-1.0,-0.259854,1.0427,-0.417654,0.792019,0.520504,-0.211665,1.092665,0.843262,1.048176,1.053963
4,-0.258115,-1.0,-0.120169,1.0427,-0.417654,0.792019,0.520504,-0.211665,-0.915193,-0.208817,0.069681,1.053963


In [6]:
features = df_train.iloc[:,1:].columns.tolist()
features

['Diabetes_binary',
 'BMI',
 'Smoker',
 'HeartDiseaseorAttack',
 'Fruits',
 'Veggies',
 'HvyAlcoholConsump',
 'Sex',
 'Age',
 'Education',
 'Income']

In [7]:
target = df_train.loc[:, 'Stroke'].name
target

'Stroke'

In [8]:
X_train = df_train.iloc[:,1:].values
y_train = df_train.loc[:, 'Stroke'].values

## df_test

In [9]:
# same as above for test dataset

columns_titles_test = ['Stroke',
                       'Diabetes_binary',
                       'HighBP',
                       'HighChol',
                       'CholCheck',
                       'BMI',
                       'Smoker',
                       'HeartDiseaseorAttack',
                       'PhysActivity',
                       'Fruits',
                       'Veggies',
                       'HvyAlcoholConsump',
                       'AnyHealthcare',
                       'NoDocbcCost',
                       'GenHlth',
                       'MentHlth',
                       'PhysHlth',
                       'DiffWalk',
                       'Sex',
                       'Age',
                       'Education',
                       'Income']

df_test = df_test.reindex(columns = columns_titles_test)

df_test.head()

Unnamed: 0,Stroke,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,0.0,0.0,0.0,1.0,27.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,3.0,2.0,0.0,0.0,1.0,2.0,5.0,5.0
1,0.0,0.0,0.0,1.0,1.0,39.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,3.0,5.0,0.0,0.0,0.0,3.0,6.0,8.0
2,0.0,0.0,0.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,14.0,0.0,0.0,0.0,2.0,6.0,7.0
3,1.0,0.0,1.0,1.0,1.0,29.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,5.0,8.0
4,0.0,0.0,0.0,0.0,1.0,27.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,13.0,4.0,3.0


In [10]:
# df_test.drop(['HighBP',
#                'HighChol',
#                'CholCheck',
#                'PhysActivity',
#                'AnyHealthcare',
#                'NoDocbcCost',
#                'GenHlth',
#                'MentHlth',
#                'PhysHlth',
#                'DiffWalk',
#                'Sex',
#                'Education',
#                'Income'],axis=1,inplace=True)

df_test.drop(['HighBP',
              'HighChol',
              'CholCheck',
              'PhysActivity',
              'AnyHealthcare',
              'NoDocbcCost',
              'GenHlth',
              'MentHlth',
              'PhysHlth',
              'DiffWalk'],axis=1,inplace=True)

In [11]:
df_test.fillna(df_test.mean(),inplace=True)

Scaler_test = StandardScaler()
test_columns = df_test.columns
df_test = pd.DataFrame(Scaler_test.fit_transform(df_test))
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-0.255971,-1.0,-0.406597,1.069768,-0.413702,-1.238742,0.510427,-0.210283,1.083962,-2.2981,0.09312,-0.294259
1,-0.255971,-1.0,1.305819,1.069768,-0.413702,0.807271,0.510427,-0.210283,-0.922541,-1.947646,1.049859,1.068298
2,-0.255971,-1.0,-0.549299,-0.934782,-0.413702,0.807271,0.510427,-0.210283,-0.922541,-2.2981,1.049859,0.614112
3,3.906696,-1.0,-0.121195,-0.934782,-0.413702,0.807271,0.510427,-0.210283,-0.922541,0.855991,0.09312,1.068298
4,-0.255971,-1.0,-0.406597,1.069768,-0.413702,-1.238742,0.510427,-0.210283,-0.922541,1.5569,-0.863619,-1.20263


In [12]:
df_test.columns  = test_columns
df_test.head()

Unnamed: 0,Stroke,Diabetes_binary,BMI,Smoker,HeartDiseaseorAttack,Fruits,Veggies,HvyAlcoholConsump,Sex,Age,Education,Income
0,-0.255971,-1.0,-0.406597,1.069768,-0.413702,-1.238742,0.510427,-0.210283,1.083962,-2.2981,0.09312,-0.294259
1,-0.255971,-1.0,1.305819,1.069768,-0.413702,0.807271,0.510427,-0.210283,-0.922541,-1.947646,1.049859,1.068298
2,-0.255971,-1.0,-0.549299,-0.934782,-0.413702,0.807271,0.510427,-0.210283,-0.922541,-2.2981,1.049859,0.614112
3,3.906696,-1.0,-0.121195,-0.934782,-0.413702,0.807271,0.510427,-0.210283,-0.922541,0.855991,0.09312,1.068298
4,-0.255971,-1.0,-0.406597,1.069768,-0.413702,-1.238742,0.510427,-0.210283,-0.922541,1.5569,-0.863619,-1.20263


# Pytorch

In [13]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.autograd import Variable

In [14]:
# credit to https://www.kaggle.com/mburakergenc/ttianic-minimal-pytorch-mlp

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(11, 1331)
        self.fc2 = nn.Linear(1331, 1331)
        self.fc3 = nn.Linear(1331, 11)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x
model = Net()
print(model)

Net(
  (fc1): Linear(in_features=11, out_features=1331, bias=True)
  (fc2): Linear(in_features=1331, out_features=1331, bias=True)
  (fc3): Linear(in_features=1331, out_features=11, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


## Pytorch Loss Function (Cross Entropy CE)

In [15]:
criterion = nn.CrossEntropyLoss()

## Pytorch Optimizer (Stochastic Gradient Descent SGD)

In [16]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

## Pytorch Training

### Cuda run over gpu instead cpu

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [18]:
X_train

array([[-1.        , -0.53922458, -0.9590482 , ..., -1.61158832,
         1.04817589,  1.0539633 ],
       [-1.        , -0.53922458,  1.04270047, ...,  1.19395443,
         1.04817589,  1.0539633 ],
       [-1.        , -0.53922458, -0.9590482 , ...,  1.54464728,
         1.04817589,  1.0539633 ],
       ...,
       [ 1.        ,  0.57825718,  1.04270047, ..., -0.20881695,
         0.06968066, -0.33278234],
       [ 1.        ,  1.55605371, -0.9590482 , ..., -0.55950979,
         0.06968066, -1.25727942],
       [ 1.        , -0.39953936,  1.04270047, ..., -0.20881695,
         1.04817589, -0.33278234]])

In [19]:
torch.cuda.is_available()

False

In [20]:
tensor = torch.from_numpy(X_train)
tensor

tensor([[-1.0000, -0.5392, -0.9590,  ..., -1.6116,  1.0482,  1.0540],
        [-1.0000, -0.5392,  1.0427,  ...,  1.1940,  1.0482,  1.0540],
        [-1.0000, -0.5392, -0.9590,  ...,  1.5446,  1.0482,  1.0540],
        ...,
        [ 1.0000,  0.5783,  1.0427,  ..., -0.2088,  0.0697, -0.3328],
        [ 1.0000,  1.5561, -0.9590,  ..., -0.5595,  0.0697, -1.2573],
        [ 1.0000, -0.3995,  1.0427,  ..., -0.2088,  1.0482, -0.3328]],
       dtype=torch.float64)

In [21]:
X_train = torch.FloatTensor(X_train)
X_train = X_train.to(device)
X_train = X_train.cuda() 
X_train.is_cuda

RuntimeError: cuda runtime error (30) : unknown error at /pytorch/aten/src/THC/THCGeneral.cpp:50

In [22]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


## Training

In [None]:
# credit to https://www.kaggle.com/mburakergenc/ttianic-minimal-pytorch-mlp

batch_size = 64
n_epochs = 100
batch_no = len(X_train) // batch_size

train_loss = 0
train_loss_min = np.Inf

# for epoch in range(n_epochs):
#     for i in range(batch_no):
#         start = i * batch_size
#         end   = start + batch_size
#         x_var = Variable(torch.FloatTensor(X_train[start:end]))
#         y_var = Variable(torch.LongTensor(y_train[start:end])) 
        
#         optimizer.zero_grad()
#         output = model(x_var)
#         loss   = criterion(output,y_var)
#         loss.backward()
#         optimizer.step()
        
#         values, labels = torch.max(output, 1)
#         num_right   = np.sum(labels.data.numpy() == y_train[start:end])
#         train_loss += loss.item()*batch_size
    
#     train_loss = train_loss / len(X_train)
#     if train_loss <= train_loss_min:
#         print("Validation loss decreased ({:6f} ===> {:6f}). Saving the model...".format(train_loss_min,train_loss))
#         torch.save(model.state_dict(), "model.pt")
#         train_loss_min = train_loss
    
#     if epoch % 50 == 0:
#         print('')
#         print("Epoch: {} \tTrain Loss: {} \tTrain Accuracy: {}".format(epoch+1, train_loss,num_right / len(y_train[start:end]) ))
# print('Training Ended! ')