In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load Titanic dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data = pd.read_csv(url)

# Preprocess the data
def preprocess_data(data):
    # Drop irrelevant columns
    data = data.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)
    
    # Fill missing values
    data["Age"] = data["Age"].fillna(data["Age"].median())
    data["Embarked"] = data["Embarked"].fillna("S")
    
    # Encode categorical variables
    label_enc = LabelEncoder()
    data["Sex"] = label_enc.fit_transform(data["Sex"])
    data["Embarked"] = label_enc.fit_transform(data["Embarked"])
    
    # Separate features and target
    X = data.drop("Survived", axis=1).values
    y = data["Survived"].values
    
    return X, y

X, y = preprocess_data(data)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Define a simple PyTorch model
class TitanicModel(nn.Module):
    def __init__(self, input_dim):
        super(TitanicModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 16)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

# Model setup
input_dim = X_train.shape[1]
model = TitanicModel(input_dim)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 50
batch_size = 32

for epoch in range(num_epochs):
    model.train()
    
    # Mini-batching
    permutation = torch.randperm(X_train_tensor.size(0))
    for i in range(0, X_train_tensor.size(0), batch_size):
        indices = permutation[i:i + batch_size]
        batch_X, batch_y = X_train_tensor[indices], y_train_tensor[indices]
        
        # Forward pass
        outputs = model(batch_X).squeeze()
        loss = criterion(outputs, batch_y)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # Print epoch loss
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")

# Evaluate the model
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor).squeeze()
    predictions = (predictions > 0.5).float()
    accuracy = (predictions == y_test_tensor).sum().item() / y_test_tensor.size(0)
    print(f"Accuracy on test set: {accuracy:.2f}")


Epoch [10/50], Loss: 0.5480
Epoch [20/50], Loss: 0.4757
Epoch [30/50], Loss: 0.4916
Epoch [40/50], Loss: 0.5142
Epoch [50/50], Loss: 0.4153
Accuracy on test set: 0.82


In [None]:
import os
import zipfile
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Set up the Kaggle API environment
os.environ['KAGGLE_CONFIG_DIR'] = "/allah/stuff/ml/kaggle"

# Set the working directory for data
os.chdir('/allah/data/')
print(f"Current working directory: {os.getcwd()}")

# Download the Titanic dataset using Kaggle API
!kaggle competitions download -c titanic

# Extract the downloaded dataset
zip_file = "titanic.zip"
extract_dir = "titanic_data"
with zipfile.ZipFile(zip_file, "r") as zip_ref:
    zip_ref.extractall(extract_dir)
print(f"Extracted files to {extract_dir}")

# Load the Titanic datasets
train_csv = os.path.join(extract_dir, "train.csv")
test_csv = os.path.join(extract_dir, "test.csv")
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

# Display the first few rows of the training dataset
print("Training Dataset Preview:")
print(train_df.head())

# Load and display the training dataset for verification
df = pd.read_csv(train_csv)
print("\nDataset Verification Preview:")
print(df.head())


In [None]:
# Load the dataset
df = pd.read_csv("titanic_data/train.csv")

# Handle missing values
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# Encode categorical variables
label_encoder = LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df['Sex'])  # Encode Sex (male=1, female=0)
df['Embarked'] = label_encoder.fit_transform(df['Embarked'])  # Encode Embarked (C=0, Q=1, S=2)

# Select features and target variable
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = df[features]
y = df['Survived']

# Scale the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Create PyTorch datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [None]:
class TitanicNN(nn.Module):
    def __init__(self, input_dim):
        super(TitanicNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(32, 2)  # Output layer for binary classification

    def forward(self, x):
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.fc3(x)
        return x


In [None]:
input_dim = X_train.shape[1]
model = TitanicNN(input_dim)

criterion = nn.CrossEntropyLoss()  # Suitable for classification tasks
optimizer = optim.Adam(model.parameters(), lr=0.001)



In [None]:
epochs = 500

for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()  # Clear gradients
        outputs = model(X_batch)  # Forward pass
        loss = criterion(outputs, y_batch)  # Compute loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights

        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}")


In [None]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)  # Get predictions
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

accuracy = correct / total
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
from sklearn.metrics import classification_report

# Generate predictions on the test set
y_pred = []
y_true = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        y_pred.extend(predicted.numpy())
        y_true.extend(y_batch.numpy())

# Print classification report
report = classification_report(y_true, y_pred, target_names=["Not Survived", "Survived"])
print(report)


In [None]:
import matplotlib.pyplot as plt

# Example: Plotting the loss curve
losses = []  # Collect losses during training

for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    losses.append(running_loss / len(train_loader))

# Plot the loss curve
plt.plot(range(epochs), losses, label="Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss Curve")
plt.legend()
plt.show()
