In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the training CSV into a DataFrame for inspection; preprocess_data()
# reads the file again on its own rather than reusing this frame.
data = pd.read_csv('train.csv')

In [6]:
# Show the first 15 rows of the raw training frame.
data.head(15)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [7]:
def preprocess_data(file_path):
    """Read a Titanic-style training CSV and return model-ready arrays.

    Missing ``Age`` values are replaced by the column median and missing
    ``Embarked`` values by the most frequent port. ``Sex`` and ``Embarked``
    are integer-encoded, ``Age`` and ``Fare`` are standardised, and the
    feature columns plus the ``Survived`` target are extracted.

    Args:
        file_path: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        A ``(features, target)`` tuple of NumPy arrays. ``features`` holds the
        columns Pclass, Sex, Age, SibSp, Parch, Fare, Embarked (in that
        order); ``target`` holds the ``Survived`` column.
    """
    data = pd.read_csv(file_path)

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection: that chained in-place form is deprecated in
    # pandas 2.x and leaves `data` untouched under Copy-on-Write.
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

    # Encode categorical variables as integer codes.
    label_encoder = LabelEncoder()
    data['Sex'] = label_encoder.fit_transform(data['Sex'])
    data['Embarked'] = label_encoder.fit_transform(data['Embarked'])

    # Scale numerical variables.
    # NOTE: the scaler is fitted on every row of the file, i.e. before any
    # train/validation split the caller performs afterwards.
    standard_scaler = StandardScaler()
    data[['Age', 'Fare']] = standard_scaler.fit_transform(data[['Age', 'Fare']])

    # Select features and target.
    features = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].values
    target = data['Survived'].values

    return features, target

# Build the feature matrix X and the label vector y from the training file.
train_file_path = "train.csv"
X, y = preprocess_data(train_file_path)


In [16]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

class TitanicDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    The inputs are stored as given; conversion to tensors happens per item
    in ``__getitem__``.
    """

    def __init__(self, features, labels):
        """Keep references to the indexable ``features`` and ``labels``."""
        self.features, self.labels = features, labels

    def __len__(self):
        """Return the number of samples, taken from ``features``."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a (float32 features, int64 label) pair."""
        sample = torch.tensor(self.features[idx], dtype=torch.float)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample, target

# Wrap the split arrays so DataLoader can batch them.
train_dataset = TitanicDataset(X_train, y_train)
val_dataset = TitanicDataset(X_val, y_val)

# Only the training loader reshuffles each epoch. The validation loader keeps
# a fixed order: validation metrics are aggregated over the whole set, so
# shuffling it adds nothing and only makes evaluation order non-deterministic.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)


In [17]:
class TitanicModel(nn.Module):
    """Fully connected binary classifier with two hidden layers.

    The network is Linear -> ReLU -> Linear -> ReLU -> Linear -> Sigmoid and
    emits one value in (0, 1) per input row.

    Args:
        input_dim: Number of features per sample. Defaults to 7.
        hidden_dims: Widths of the two hidden layers, as a pair. Defaults to
            ``(32, 16)``.
    """

    def __init__(self, input_dim=7, hidden_dims=(32, 16)):
        super().__init__()
        first_width, second_width = hidden_dims
        # The attribute names fc1/fc2/fc3 determine the state_dict keys, so
        # they are kept as-is to stay compatible with saved checkpoints.
        self.fc1 = nn.Linear(input_dim, first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.fc3 = nn.Linear(second_width, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 1)`` probabilities."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.sigmoid(self.fc3(x))

# Create the network whose parameters the optimizer below will update.
model = TitanicModel()


In [18]:
# Set the loss function and optimizer.
# BCELoss takes probabilities, which matches the sigmoid that ends
# TitanicModel.forward; Adam updates every parameter of `model`.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training function
def train(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and report its metrics.

    Args:
        model: Network to train; called on each input batch.
        dataloader: Iterable of ``(inputs, labels)`` batches that also
            exposes ``.dataset`` (its length is the averaging denominator).
        criterion: Loss callable taking ``(outputs, float_labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(epoch_loss, epoch_acc)``: the sample-weighted mean loss as a float
        and the fraction of predictions (thresholded at 0.5) that match the
        labels, as a 0-d float64 tensor.
    """
    model.train()
    running_loss = 0.0
    correct_predictions = 0

    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        # Drop only the trailing axis. A bare squeeze() would also remove the
        # batch axis when a batch holds a single sample, leaving a 0-d output
        # whose shape no longer matches the (1,) label tensor.
        outputs = model(inputs).squeeze(-1)
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()

        # Weight the batch loss by batch size so a short final batch does not
        # skew the epoch average.
        running_loss += loss.item() * inputs.size(0)
        preds = (outputs > 0.5).long()
        correct_predictions += torch.sum(preds == labels)

    epoch_loss = running_loss / len(dataloader.dataset)
    epoch_acc = correct_predictions.double() / len(dataloader.dataset)

    return epoch_loss, epoch_acc

# Validation function
def validate(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    Args:
        model: Network to evaluate; switched to eval mode first.
        dataloader: Iterable of ``(inputs, labels)`` batches that also
            exposes ``.dataset`` (its length is the averaging denominator).
        criterion: Loss callable taking ``(outputs, float_labels)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(epoch_loss, epoch_acc)``: the sample-weighted mean loss as a float
        and the fraction of predictions (thresholded at 0.5) that match the
        labels, as a 0-d float64 tensor.
    """
    model.eval()
    running_loss = 0.0
    correct_predictions = 0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Drop only the trailing axis. A bare squeeze() would also remove
            # the batch axis when a batch holds a single sample, leaving a
            # 0-d output whose shape no longer matches the (1,) label tensor.
            outputs = model(inputs).squeeze(-1)
            loss = criterion(outputs, labels.float())

            # Weight the batch loss by batch size so a short final batch does
            # not skew the average.
            running_loss += loss.item() * inputs.size(0)
            preds = (outputs > 0.5).long()
            correct_predictions += torch.sum(preds == labels)

    epoch_loss = running_loss / len(dataloader.dataset)
    epoch_acc = correct_predictions.double() / len(dataloader.dataset)

    return epoch_loss, epoch_acc

# Use the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Epoch budget and the best validation accuracy seen so far.
num_epochs = 50
best_acc = 0.0

for epoch in range(num_epochs):
    # One pass over the training data, then one over the validation data.
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = validate(model, val_loader, criterion, device)

    # Checkpoint only on a strict improvement in validation accuracy.
    improved = val_acc > best_acc
    if improved:
        best_acc = val_acc
        torch.save(model.state_dict(), "best_model.pth")

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")


Epoch 1/50
Train Loss: 0.7085, Train Acc: 0.3919
Val Loss: 0.6917, Val Acc: 0.4749
Epoch 2/50
Train Loss: 0.6822, Train Acc: 0.6278
Val Loss: 0.6709, Val Acc: 0.6704
Epoch 3/50
Train Loss: 0.6591, Train Acc: 0.6643
Val Loss: 0.6512, Val Acc: 0.6480
Epoch 4/50
Train Loss: 0.6370, Train Acc: 0.6573
Val Loss: 0.6328, Val Acc: 0.6425
Epoch 5/50
Train Loss: 0.6169, Train Acc: 0.6545
Val Loss: 0.6153, Val Acc: 0.6369
Epoch 6/50
Train Loss: 0.5998, Train Acc: 0.6573
Val Loss: 0.6002, Val Acc: 0.6536
Epoch 7/50
Train Loss: 0.5849, Train Acc: 0.6671
Val Loss: 0.5843, Val Acc: 0.6816
Epoch 8/50
Train Loss: 0.5710, Train Acc: 0.6952
Val Loss: 0.5691, Val Acc: 0.7039
Epoch 9/50
Train Loss: 0.5574, Train Acc: 0.7177
Val Loss: 0.5546, Val Acc: 0.7207
Epoch 10/50
Train Loss: 0.5428, Train Acc: 0.7247
Val Loss: 0.5387, Val Acc: 0.7654
Epoch 11/50
Train Loss: 0.5287, Train Acc: 0.7402
Val Loss: 0.5239, Val Acc: 0.7542
Epoch 12/50
Train Loss: 0.5150, Train Acc: 0.7556
Val Loss: 0.5109, Val Acc: 0.7877
E

In [20]:
# Load the best model.
best_model = TitanicModel()
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written from a GPU run still loads on a CPU-only machine.
best_model.load_state_dict(torch.load("best_model.pth", map_location=device))
best_model.to(device)

# Preprocess test data
def preprocess_test_data(file_path):
    """Read a Titanic-style test CSV and return passenger ids plus features.

    Missing ``Age`` and ``Fare`` values are replaced by the column medians
    and missing ``Embarked`` values by the most frequent port. ``Sex`` and
    ``Embarked`` are integer-encoded and ``Age`` and ``Fare`` standardised.

    Args:
        file_path: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        A ``(passenger_ids, features)`` tuple of NumPy arrays. ``features``
        holds the columns Pclass, Sex, Age, SibSp, Parch, Fare, Embarked (in
        that order).
    """
    data = pd.read_csv(file_path)

    passenger_ids = data['PassengerId'].values

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection: that chained in-place form is deprecated in
    # pandas 2.x and leaves `data` untouched under Copy-on-Write.
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())
    # Impute Embarked as well (same rule as preprocess_data) so a missing
    # port never reaches the encoder as NaN.
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

    # NOTE(review): the encoder and scaler below are fitted on this file
    # alone, not on the training data. Codes and scaled values therefore only
    # line up with the training features if both files contain the same
    # categories and similar Age/Fare distributions. Consider reusing the
    # objects fitted during training.
    label_encoder = LabelEncoder()
    data['Sex'] = label_encoder.fit_transform(data['Sex'])
    data['Embarked'] = label_encoder.fit_transform(data['Embarked'])

    standard_scaler = StandardScaler()
    data[['Age', 'Fare']] = standard_scaler.fit_transform(data[['Age', 'Fare']])

    features = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].values

    return passenger_ids, features

# CSV with the passengers to score; it is read by preprocess_test_data().
test_file_path = "test.csv"


In [21]:
# Keep the passenger ids next to the feature matrix so predictions can be
# matched back to passengers when the submission is written.
passenger_ids, X_test = preprocess_test_data(test_file_path)

# Make predictions
def predict(model, features, device):
    """Return hard 0/1 predictions for every row of ``features``.

    Args:
        model: Network producing one probability per input row.
        features: Array-like of rows; converted to a float32 tensor.
        device: Device on which the forward pass runs.

    Returns:
        A NumPy integer array with one 0/1 entry per row, obtained by
        thresholding the model output at 0.5.
    """
    model.eval()
    inputs = torch.tensor(features, dtype=torch.float).to(device)
    with torch.no_grad():
        # Drop only the trailing axis. A bare squeeze() would also remove the
        # row axis for single-row input and return a 0-d array instead of a
        # length-1 array.
        outputs = model(inputs).squeeze(-1)
    predictions = (outputs > 0.5).long().cpu().numpy()
    return predictions

# Score the test passengers with the restored checkpoint.
predictions = predict(best_model, X_test, device)

# Write the two-column submission file without the DataFrame index.
submission_columns = {"PassengerId": passenger_ids, "Survived": predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)
