In [516]:
import torch
import pandas as pd
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader

# Pre-processing

In [517]:
# Read the train and test datasets
df_train = pd.read_csv('/content/adult.data', header=None)
df_test = pd.read_csv('/content/adult.test', skiprows=[0], header=None)

In [518]:
def clean_data(df):
    # Remove extra spaces in the object features
    df[df.select_dtypes(include=['object']).columns] = df[df.select_dtypes(include=['object']).columns].apply(lambda col: col.map(str.strip))

    # Replace missing values, indicated with '?' in the dataset with Nan
    df.replace('?', pd.NA, inplace=True)

    # Fill the columns with missing values with the Mode as they are categorical values
    df[[1, 6, 13]] = df[[1, 6, 13]].apply(lambda col: col.fillna(col.mode()[0]))

    # Convert categorical to numerical values
    df = pd.get_dummies(df, columns=[1, 3, 5, 6, 7, 8, 9, 13], drop_first=True)

    # Convert target values to 0 and 1
    df[14] = df[14].apply(lambda x: 1 if '>50K' in x else 0)

    return df

In [519]:
df_train = clean_data(df_train)
df_test = clean_data(df_test)

In [520]:
# Reindex test set to have the same columns as the train set
df_test = df_test.reindex(columns=df_train.columns, fill_value=False)  # Fill the missing column with False

# Split training and testing sets
X_train = df_train.drop(14, axis=1)
y_train = df_train[14]
X_test = df_test.drop(14, axis=1)
y_test = df_test[14]

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.values)
X_test = scaler.transform(X_test.values)

In [521]:
# Convert data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32)

# Handle the shapes
y_train = y_train.unsqueeze(1)
y_test = y_test.unsqueeze(1)

# Building the Model

In [522]:
# Define the model structure
class IncomeClassifier(nn.Module):
    def __init__(self, input_size):
        super().__init__()

        # Define layers
        self.layer1 = nn.Linear(input_size, 64)
        self.layer2 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)

        # Define activation function
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.output(x)
        return x

In [523]:
# Initiating the model
input_size = 97
model = IncomeClassifier(input_size)

# Define the loss function and optimizer
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Create a dataset from the tensors
train_dataset = TensorDataset(X_train, y_train)

# Create a DataLoader for mini-batch training
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [524]:
epochs = 10
for epoch in range(epochs):
    model.train()
    running_train_loss = 0.0

    # Zero the gradients before each update
    optimizer.zero_grad()

    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # Forward pass: Get predictions from the model
        outputs = model(inputs)

        # Compute the loss
        loss = loss_fn(outputs, targets)

        # Compute gradients
        loss.backward()

        # Update the model's parameters
        optimizer.step()

        # Calculate the average loss of this epoch
        running_train_loss += loss.item()

    # Compute average training loss for the epoch
    avg_train_loss = running_train_loss / len(train_loader)

    model.eval()
    with torch.no_grad():
        # Compute the validation loss
        val_outputs = model(X_test)
        val_loss = loss_fn(val_outputs, y_test)

        print(f'Epoch [{epoch+1}/{epochs}], Training Loss: {avg_train_loss:.4f}, Validation Loss: {val_loss.item():.4f}')

Epoch [1/10], Training Loss: 0.4702, Validation Loss: 0.3781
Epoch [2/10], Training Loss: 0.3515, Validation Loss: 0.3494
Epoch [3/10], Training Loss: 0.3366, Validation Loss: 0.3450
Epoch [4/10], Training Loss: 0.3279, Validation Loss: 0.3401
Epoch [5/10], Training Loss: 0.3233, Validation Loss: 0.3375
Epoch [6/10], Training Loss: 0.3190, Validation Loss: 0.3381
Epoch [7/10], Training Loss: 0.3186, Validation Loss: 0.3450
Epoch [8/10], Training Loss: 0.3182, Validation Loss: 0.3424
Epoch [9/10], Training Loss: 0.3168, Validation Loss: 0.3368
Epoch [10/10], Training Loss: 0.3086, Validation Loss: 0.3342


In [525]:
# Calculate accuracy
model.eval()

# Initialize variables for tracking accuracy
correct = 0
total = 0

with torch.no_grad():
    # Forward pass for the test data
    val_outputs = model(X_test)

    # Apply sigmoid to convert logits to probabilities
    probabilities = torch.sigmoid(val_outputs)

    # 0.5 threshold for binary classification
    predicted = (probabilities > 0.5).float()

    # Calculate the number of correct predictions
    correct = (predicted == y_test).sum().item()
    total = y_test.size(0)

    # Calculate accuracy
    accuracy = correct / total

print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 84.72%
