<a href="https://colab.research.google.com/github/sravanipopuri2006/-Alpha-beta-pruning-of-Minimax-Search-Algorithm/blob/main/WORKSHOP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder, StandardScaler
import random

# ------------------------------
# REPRODUCIBILITY
# ------------------------------
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# ------------------------------
# LOAD DATA
# ------------------------------
df = pd.read_csv("/content/income.csv")

print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())

# ------------------------------
# DEFINE COLUMNS BASED ON YOUR FILE
# ------------------------------
categorical_cols = ['sex', 'education', 'marital-status', 'workclass', 'occupation']
continuous_cols = ['age', 'hours-per-week']
label_col = 'label'  # final target (0 or 1)

# ------------------------------
# ENCODE CATEGORICAL + LABEL
# ------------------------------
label_enc = LabelEncoder()
df[label_col] = label_enc.fit_transform(df[label_col])

cat_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    cat_encoders[col] = le

# ------------------------------
# SPLIT TRAIN/TEST (25k train, 5k test)
# ------------------------------
train_df = df.iloc[:25000]
test_df = df.iloc[25000:30000]

# ------------------------------
# PREPARE ARRAYS
# ------------------------------
cat_train = np.stack([train_df[col].values for col in categorical_cols], axis=1)
cat_test = np.stack([test_df[col].values for col in categorical_cols], axis=1)
cont_train = np.stack([train_df[col].values for col in continuous_cols], axis=1)
cont_test = np.stack([test_df[col].values for col in continuous_cols], axis=1)
y_train = train_df[label_col].values
y_test = test_df[label_col].values

# ------------------------------
# SCALE CONTINUOUS VARIABLES
# ------------------------------
scaler = StandardScaler()
cont_train = scaler.fit_transform(cont_train)
cont_test = scaler.transform(cont_test)

# ------------------------------
# CONVERT TO TENSORS
# ------------------------------
cat_train = torch.tensor(cat_train, dtype=torch.int64)
cat_test = torch.tensor(cat_test, dtype=torch.int64)
cont_train = torch.tensor(cont_train, dtype=torch.float)
cont_test = torch.tensor(cont_test, dtype=torch.float)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

# ------------------------------
# DATA LOADERS
# ------------------------------
train_ds = TensorDataset(cat_train, cont_train, y_train)
test_ds = TensorDataset(cat_test, cont_test, y_test)
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False)

# ------------------------------
# MODEL DEFINITION
# ------------------------------
class TabularModel(nn.Module):
    def __init__(self, emb_dims, n_cont, hidden=50, p=0.4):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(categories, size)
                                     for categories, size in emb_dims])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        n_emb = sum([size for _, size in emb_dims])
        self.layers = nn.Sequential(
            nn.Linear(n_emb + n_cont, hidden),
            nn.ReLU(),
            nn.BatchNorm1d(hidden),
            nn.Dropout(p),
            nn.Linear(hidden, 2)
        )

    def forward(self, x_cat, x_cont):
        embeddings = [e(x_cat[:, i]) for i, e in enumerate(self.embeds)]
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)
        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        return self.layers(x)

# ------------------------------
# EMBEDDINGS
# ------------------------------
cat_sizes = [int(df[col].nunique()) for col in categorical_cols]
emb_dims = [(size, min(50, (size + 1)//2)) for size in cat_sizes]

# ------------------------------
# MODEL, LOSS, OPTIMIZER
# ------------------------------
model = TabularModel(emb_dims, n_cont=len(continuous_cols), hidden=50, p=0.4)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# ------------------------------
# TRAINING LOOP
# ------------------------------
epochs = 300
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for cat_batch, cont_batch, y_batch in train_dl:
        optimizer.zero_grad()
        outputs = model(cat_batch, cont_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    if (epoch + 1) % 50 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_dl):.4f}")

# ------------------------------
# EVALUATION
# ------------------------------
model.eval()
with torch.no_grad():
    y_pred, y_true = [], []
    test_loss = 0
    for cat_batch, cont_batch, y_batch in test_dl:
        outputs = model(cat_batch, cont_batch)
        loss = criterion(outputs, y_batch)
        test_loss += loss.item()
        preds = torch.argmax(outputs, dim=1)
        y_pred.extend(preds.tolist())
        y_true.extend(y_batch.tolist())

test_loss /= len(test_dl)
accuracy = np.mean(np.array(y_pred) == np.array(y_true))
print(f"\n✅ Test Loss: {test_loss:.4f}")
print(f"✅ Test Accuracy: {accuracy*100:.2f}%")


Dataset shape: (30000, 10)
Columns: ['age', 'sex', 'education', 'education-num', 'marital-status', 'workclass', 'occupation', 'hours-per-week', 'income', 'label']
Epoch 50/300, Loss: 0.2779
Epoch 100/300, Loss: 0.2781
Epoch 150/300, Loss: 0.2745
Epoch 200/300, Loss: 0.2735
Epoch 250/300, Loss: 0.2727
Epoch 300/300, Loss: 0.2726

✅ Test Loss: 0.2549
✅ Test Accuracy: 88.38%
