<a href="https://colab.research.google.com/github/sanjaykumar-nb/DLWORKSHOP/blob/main/DLworkshop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, random_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import random

# Reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)


In [2]:
df = pd.read_csv("income.csv")
print(df.shape)
df.head()


(30000, 10)


Unnamed: 0,age,sex,education,education-num,marital-status,workclass,occupation,hours-per-week,income,label
0,27,Male,HS-grad,9,Never-married,Private,Craft-repair,40,<=50K,0
1,47,Male,Masters,14,Married,Local-gov,Exec-managerial,50,>50K,1
2,59,Male,HS-grad,9,Divorced,Self-emp,Prof-specialty,20,<=50K,0
3,38,Female,Prof-school,15,Never-married,Federal-gov,Prof-specialty,57,>50K,1
4,64,Female,11th,7,Widowed,Private,Farming-fishing,40,<=50K,0


In [3]:
# Example columns — adjust according to your dataset
categorical_cols = ['workclass', 'education', 'marital-status', 'occupation',
                    'relationship', 'race', 'sex', 'native-country']
continuous_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain',
                   'capital-loss', 'hours-per-week']
label_col = 'income'   # e.g., '<=50K' or '>50K'


In [4]:
# 🧹 Clean column names (remove any extra spaces)
df.columns = df.columns.str.strip()

# ✅ Define columns based on your dataset
categorical_cols = ['sex', 'education', 'marital-status', 'workclass', 'occupation']
continuous_cols = ['age', 'education-num', 'hours-per-week']
label_col = 'label'   # since label = 0/1 (already numeric target)

# 🧩 Encode categorical features
from sklearn.preprocessing import LabelEncoder

label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# 🏷️ Target Encoding (if needed)
# 'label' is already 0/1, but if you prefer to use 'income' instead, uncomment below:
# income_le = LabelEncoder()
# df['income'] = income_le.fit_transform(df['income'])
# label_col = 'income'  # then change label column name in later steps

print("✅ Categorical and label encoding completed successfully.")


✅ Categorical and label encoding completed successfully.


In [5]:
X_cats = df[categorical_cols].values
X_conts = df[continuous_cols].values
y = df[label_col].values

# Train-test split
X_cats_train, X_cats_test, X_conts_train, X_conts_test, y_train, y_test = train_test_split(
    X_cats, X_conts, y, test_size=5000, train_size=25000, random_state=SEED)


In [6]:
X_cats_train = torch.tensor(X_cats_train, dtype=torch.int64)
X_cats_test = torch.tensor(X_cats_test, dtype=torch.int64)
X_conts_train = torch.tensor(X_conts_train, dtype=torch.float)
X_conts_test = torch.tensor(X_conts_test, dtype=torch.float)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)


In [7]:
train_ds = TensorDataset(X_cats_train, X_conts_train, y_train)
test_ds = TensorDataset(X_cats_test, X_conts_test, y_test)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=64)


In [8]:
# Get embedding sizes
cat_sizes = [len(label_encoders[col].classes_) for col in categorical_cols]
embeddings = [(size, min(50, (size+1)//2)) for size in cat_sizes]

class TabularModel(nn.Module):
    def __init__(self, emb_sizes, n_cont, out_sz, hidden_sz, dropout):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_sizes])
        self.emb_drop = nn.Dropout(dropout)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        emb_dim = sum([nf for ni, nf in emb_sizes])
        self.fc1 = nn.Linear(emb_dim + n_cont, hidden_sz)
        self.bn1 = nn.BatchNorm1d(hidden_sz)
        self.fc2 = nn.Linear(hidden_sz, out_sz)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x_cat, x_cont):
        x = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)

        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x


In [9]:
model = TabularModel(embeddings, len(continuous_cols), 2, 50, 0.4)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 300
for epoch in range(epochs):
    total_loss = 0
    model.train()
    for x_cat, x_cont, yb in train_loader:
        optimizer.zero_grad()
        preds = model(x_cat, x_cont)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if (epoch+1) % 25 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")


Epoch 25/300, Loss: 0.2759
Epoch 50/300, Loss: 0.2721
Epoch 75/300, Loss: 0.2690
Epoch 100/300, Loss: 0.2679
Epoch 125/300, Loss: 0.2696
Epoch 150/300, Loss: 0.2661
Epoch 175/300, Loss: 0.2671
Epoch 200/300, Loss: 0.2668
Epoch 225/300, Loss: 0.2671
Epoch 250/300, Loss: 0.2666
Epoch 275/300, Loss: 0.2683
Epoch 300/300, Loss: 0.2642


In [10]:
model.eval()
correct, total, test_loss = 0, 0, 0

with torch.no_grad():
    for x_cat, x_cont, yb in test_loader:
        outputs = model(x_cat, x_cont)
        loss = criterion(outputs, yb)
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == yb).sum().item()
        total += yb.size(0)

accuracy = correct / total
print(f"Test Loss: {test_loss/len(test_loader):.4f}")
print(f"Test Accuracy: {accuracy*100:.2f}%")


Test Loss: 0.2496
Test Accuracy: 88.38%


In [12]:
def predict_new_data(model, label_encoders, scaler, new_data, categorical_cols, label_col):
    import numpy as np
    import pandas as pd

    # Convert new_data to DataFrame if needed
    if isinstance(new_data, dict):
        new_data = pd.DataFrame([new_data])
    elif isinstance(new_data, list):
        new_data = pd.DataFrame(new_data)

    # Encode categorical columns safely
    for col in categorical_cols:
        if col in new_data.columns:
            le = label_encoders[col]
            new_data[col] = new_data[col].apply(
                lambda x: le.transform([x])[0] if x in le.classes_
                else -1  # Assign -1 for unseen category
            )

    # Ensure all expected columns are present
    missing_cols = set(scaler.feature_names_in_) - set(new_data.columns)
    for col in missing_cols:
        new_data[col] = 0  # fill missing columns

    # Reorder columns
    new_data = new_data[scaler.feature_names_in_]

    # Scale numeric columns
    new_data_scaled = scaler.transform(new_data)

    # Predict
    prediction = model.predict(new_data_scaled)

    # Decode if target was label encoded
    if label_col in label_encoders:
        prediction = label_encoders[label_col].inverse_transform(prediction)

    return prediction


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import joblib

# Separate features and target
X = df.drop(['income', 'label'], axis=1)  # or only 'label' if 'income' is your target
y = df['label']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical columns
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)


In [14]:
joblib.dump(model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')


['label_encoders.pkl']

In [15]:
model = joblib.load('model.pkl')
scaler = joblib.load('scaler.pkl')
label_encoders = joblib.load('label_encoders.pkl')


In [16]:
new_data = {
    'age': 35,
    'sex': 'Male',
    'education': 'Masters',
    'education-num': 14,
    'marital-status': 'Married',
    'workclass': 'Private',
    'occupation': 'Prof-specialty',
    'hours-per-week': 45
}

prediction = predict_new_data(model, label_encoders, scaler, new_data, categorical_cols, 'label')
print("Predicted label:", prediction)


Predicted label: [1]
