In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm import tqdm # not sure how much of a performance hit this is since its python while everything else is c
import numpy as np

# --- Input files (relative paths) ---
DATA_PATH_TRAIN = 'data/train.csv'
DATA_PATH_TEST = 'data/test.csv'

# --- Tunable settings; presumably consumed by the training code further down ---
BATCH_SIZE = 3_000
LEARNING_RATE = 1e-3
EPOCHS = 3_000

# --- Compute device: pinned to the CPU ---
DEVICE = torch.device("cpu")

In [6]:
# Read both CSV files into DataFrames.
train_df = pd.read_csv(DATA_PATH_TRAIN)
test_df = pd.read_csv(DATA_PATH_TEST)

# Keep the label column and the test-set IDs before they are dropped below.
survived = train_df['Survived']
test_ids = test_df['PassengerId']

# Columns removed from both frames; the training frame additionally loses
# its 'Survived' label column.
_DROPPED_COLS = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train_df = train_df.drop(columns=_DROPPED_COLS + ['Survived'])
test_df = test_df.drop(columns=_DROPPED_COLS)

# Preprocessing function
def preprocessdata(data, train_cols=None):
    """Clean a feature frame and one-hot encode its object columns.

    Steps, in order:
      1. Drop every column that has fewer than ``int(0.85 * len(data))``
         non-null values.
      2. Fill missing values in non-object columns with that column's median.
      3. Fill missing values in object columns with the literal ``'Unknown'``.
      4. One-hot encode the object columns with ``pd.get_dummies``.
      5. If ``train_cols`` is given, align the result to exactly those
         columns, in that order.

    The input frame is never modified: all work happens on a copy, so the
    function can be re-run on the same frame and always starts from the same
    data.

    NOTE(review): the drop decision and the medians are computed from
    ``data`` itself, so a test frame is cleaned with its own statistics
    rather than the training frame's.

    Args:
        data: DataFrame of raw features.
        train_cols: Optional column labels (e.g. ``train.columns``) the
            result must match. Columns absent from the encoded frame are
            added and filled with 0; columns not listed are discarded.

    Returns:
        A new DataFrame with the cleaned, one-hot encoded features.
    """
    # Minimum number of non-null entries a column needs in order to be kept.
    min_non_null = int(0.85 * len(data))

    # dropna without inplace returns a new frame; the explicit copy guarantees
    # the column assignments below can never reach the caller's DataFrame.
    cleaned = data.dropna(axis=1, thresh=min_non_null).copy()

    numeric_cols = cleaned.select_dtypes(exclude=['object']).columns
    categorical_cols = cleaned.select_dtypes(include=['object']).columns

    cleaned[numeric_cols] = cleaned[numeric_cols].fillna(cleaned[numeric_cols].median())
    cleaned[categorical_cols] = cleaned[categorical_cols].fillna('Unknown')

    encoded = pd.get_dummies(cleaned, columns=categorical_cols, dummy_na=False)

    if train_cols is not None:
        # One step: add missing columns as 0, drop extras, and impose the
        # reference column order.
        encoded = encoded.reindex(columns=train_cols, fill_value=0)

    return encoded

# Encode the training frame first; its resulting column layout is then
# imposed on the test frame so both end up with identical feature columns.
train = preprocessdata(data=train_df)
test = preprocessdata(data=test_df, train_cols=train.columns)

In [8]:
# Quick look at the encoded training features.
print(train)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
train.info()
print(train.describe())

     Pclass  SibSp  Parch     Fare  Sex_female  Sex_male  Embarked_C  \
0         3      1      0   7.2500       False      True       False   
1         1      1      0  71.2833        True     False        True   
2         3      0      0   7.9250        True     False       False   
3         1      1      0  53.1000        True     False       False   
4         3      0      0   8.0500       False      True       False   
..      ...    ...    ...      ...         ...       ...         ...   
886       2      0      0  13.0000       False      True       False   
887       1      0      0  30.0000        True     False       False   
888       3      1      2  23.4500        True     False       False   
889       1      0      0  30.0000       False      True        True   
890       3      0      0   7.7500       False      True       False   

     Embarked_Q  Embarked_S  Embarked_Unknown  
0         False        True             False  
1         False       False            