In [1]:
'''
STEP 1: LOAD AND CLEAN THE DATASET
dataset comes from: https://www.kaggle.com/c/titanic/data
'''
import time
import pandas as pd
import numpy as np

data = pd.read_csv('train.csv')

#Useful chunks of code for visualizing the dataset
#data.head(10)
#data.isna().sum()/len(data)
#data.dtypes() - all should be of type int or float

#droping missing values, redundant variables, and variable Cabin, which had too many missing values
data = data[~(data.Age.isna())]
data = data[~(data.Embarked.isna())]
data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

#encoding categorical variables (ones that had dtype 'object')
data['Sex'] = data['Sex'].astype('category').cat.codes
data['Embarked'] = data['Embarked'].astype('category').cat.codes

y = data.Survived.values
X = data.drop('Survived', axis=1).values

#spliting into train and test datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, stratify=y)


'''
STEP 2: TRANSFORM TO TENSORS, LOAD INTO DATASET, AND DECLARE DATALOADERS
'''
import torch
import torch.nn as nn

X_train = torch.tensor(X_train, dtype=torch.float, requires_grad=True)
X_test = torch.tensor(X_test, dtype=torch.float, requires_grad=True)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

#Loading dataset into PyTorch class
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

#Declaring DataLoaders over training and test dataset
train_loader = DataLoader(train_dataset, batch_size=498, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=214, shuffle=False)

'''
STEP 3: CREATE MODEL CLASS
'''
class LogisticRegressionModel(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        
    def forward(self, x):
        out = self.linear(x)
        return out

'''
STEP 4: DECLARE MODEL CLASS
'''
input_dim = train_dataset[0][0].shape[0]
output_dim = 2

model = LogisticRegressionModel(input_dim, output_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

'''
STEP 5: DECLARE LOSS, OPTIMIZER AND LEARNING RATE
'''
criterion = nn.CrossEntropyLoss()

learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


'''
STEP 6: TRAIN THE MODEL
'''

num_epochs = 1000
iter = 0
start = time.time()
for epoch in range(num_epochs):
    epoch+=1
    for i, (x_train, labels) in enumerate(train_loader):
        correct_train_predictions = 0
        model.train()
        x_train = x_train.to(device)
        labels = labels.to(device)
        

        optimizer.zero_grad()
        outputs = model(x_train)
        
        loss = criterion(outputs.squeeze(), labels.squeeze())
        

        loss.backward()
        
        optimizer.step()
        
        correct_train_predictions += (torch.max(outputs,1)[1] == labels).sum()
        accuracy_train = 100 * correct_train_predictions / len(train_dataset)
        
        iter += 1
        
        if iter % 50 == 0:
            correct_test_predictions = 0

            with torch.no_grad():
              model.eval()
              for x_test, labels in test_loader:

                  x_test = x_test.to(device)
                  labels = labels.to(device)

                  outputs = model(x_test)
                  

                  correct_test_predictions += (torch.max(outputs,1)[1] == labels).sum()
            
            accuracy_test = 100 * correct_test_predictions / len(test_dataset)
            
            # Print Loss
            print('Epoch: {}. Loss: {}. Train Accuracy: {}%, Test Accuracy: {}%'.format(epoch, np.round(loss.item(),6), accuracy_train, accuracy_test))
print(f"Calculations took {time.time() - start}")

Epoch: 50. Loss: 6.71991. Train Accuracy: 35%, Test Accuracy: 41%
Epoch: 100. Loss: 4.775579. Train Accuracy: 33%, Test Accuracy: 40%
Epoch: 150. Loss: 2.915118. Train Accuracy: 33%, Test Accuracy: 40%
Epoch: 200. Loss: 1.298233. Train Accuracy: 41%, Test Accuracy: 41%
Epoch: 250. Loss: 0.635334. Train Accuracy: 67%, Test Accuracy: 57%
Epoch: 300. Loss: 0.599747. Train Accuracy: 69%, Test Accuracy: 63%
Epoch: 350. Loss: 0.577855. Train Accuracy: 69%, Test Accuracy: 63%
Epoch: 400. Loss: 0.562135. Train Accuracy: 69%, Test Accuracy: 63%
Epoch: 450. Loss: 0.550922. Train Accuracy: 70%, Test Accuracy: 64%
Epoch: 500. Loss: 0.542696. Train Accuracy: 71%, Test Accuracy: 64%
Epoch: 550. Loss: 0.536314. Train Accuracy: 72%, Test Accuracy: 65%
Epoch: 600. Loss: 0.53104. Train Accuracy: 73%, Test Accuracy: 66%
Epoch: 650. Loss: 0.526445. Train Accuracy: 73%, Test Accuracy: 68%
Epoch: 700. Loss: 0.522288. Train Accuracy: 73%, Test Accuracy: 68%
Epoch: 750. Loss: 0.518445. Train Accuracy: 73%, Te

In [2]:
# Comparison with sklearn's LinearRegression implementation 
# (detach method on X_train, and X_test is nessecary to remove track of gradients from tensors inside them)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000, solver='lbfgs')
model.fit(X_train.detach(), y_train)
model.score(X_train.detach(), y_train), model.score(X_test.detach(),y_test)

(0.8172690763052208, 0.8037383177570093)