In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
class SalaryDataset:
    def __init__(self, csv_file):
        self.data = pd.read_csv(csv_file)
        print("Initial data shape:", self.data.shape)
        print("Initial salary range:", self.data['Salary'].min(), "-", self.data['Salary'].max())
        
        self.preprocess_data()
        
    def preprocess_data(self):
        print("\nMissing values before cleaning:")
        print(self.data.isnull().sum())
        
        numeric_columns = ['Age', 'Years of Experience', 'Salary']
        categorical_columns = ['Gender', 'Education Level', 'Job Title']
        
        for col in numeric_columns:
            self.data[col] = pd.to_numeric(self.data[col], errors='coerce')
            self.data[col] = self.data[col].fillna(self.data[col].median())
        
        for col in categorical_columns:
            self.data[col] = self.data[col].astype(str)
            self.data[col] = self.data[col].fillna(self.data[col].mode()[0])
        
        print("\nMissing values after cleaning:")
        print(self.data.isnull().sum())

        self.data['Education Level'] = pd.Categorical(self.data['Education Level']).codes
        self.data['Experience_Education'] = (
            self.data['Years of Experience'] * 
            self.data['Education Level']
        )
        
        Q1 = self.data['Salary'].quantile(0.25)
        Q3 = self.data['Salary'].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        self.data = self.data[
            (self.data['Salary'] >= lower_bound) & 
            (self.data['Salary'] <= upper_bound)
        ]
        
        X_numeric = self.data[['Age', 'Years of Experience', 'Experience_Education']].values
        X_categorical = self.data[categorical_columns]
        
        self.numeric_transformer = RobustScaler(quantile_range=(1, 99))
        self.categorical_transformer = OneHotEncoder(
            drop='first', 
            sparse_output=False, 
            handle_unknown='ignore'
        )
        
        X_numeric_scaled = self.numeric_transformer.fit_transform(X_numeric)
        
        X_categorical_encoded = self.categorical_transformer.fit_transform(X_categorical)
        
        self.X = np.hstack([X_numeric_scaled, X_categorical_encoded])
        
        self.salary_scaler = RobustScaler(quantile_range=(1, 99))
        self.y = self.salary_scaler.fit_transform(
            self.data['Salary'].values.reshape(-1, 1)
        ).ravel()
        
        self.X = self.X.astype(np.float32)
        self.y = self.y.astype(np.float32)
        
        print("\nProcessed data information:")
        print("Features shape:", self.X.shape)
        print("Target shape:", self.y.shape)
        
        if np.any(np.isnan(self.X)) or np.any(np.isnan(self.y)):
            raise ValueError("NaN values found in processed data")
    
    def get_data(self):
        return torch.FloatTensor(self.X), torch.FloatTensor(self.y)
    
    def inverse_transform_salary(self, scaled_salary):
        return self.salary_scaler.inverse_transform(scaled_salary.reshape(-1, 1))

In [3]:
class SalaryPredictor(nn.Module):
    def __init__(self, input_size):
        super(SalaryPredictor, self).__init__()
        
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 16)
        self.fc5 = nn.Linear(16, 1)
        
        self.res1 = nn.Linear(input_size, 64)
        self.res2 = nn.Linear(64, 32)
        
        self.bn1 = nn.BatchNorm1d(128)
        self.bn2 = nn.BatchNorm1d(64)
        self.bn3 = nn.BatchNorm1d(32)
        self.bn4 = nn.BatchNorm1d(16)
        
        self.dropout = nn.Dropout(0.2)
        
        self._initialize_weights()
        
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
    
    def forward(self, x):
        identity1 = self.res1(x)
        
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        
        x = F.relu(self.bn2(self.fc2(x)))
        x = x + identity1 
        x = self.dropout(x)
        
        identity2 = self.res2(x)
        
        x = F.relu(self.bn3(self.fc3(x)))
        x = x + identity2 
        x = self.dropout(x)
        
        x = F.relu(self.bn4(self.fc4(x)))
        x = self.dropout(x)
        
        x = self.fc5(x)
        return x

In [4]:
def train_model(model, X, y, epochs=300, lr=0.001, batch_size=32):
    criterion = nn.HuberLoss(delta=1.0) 
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=15, verbose=True)
    
    
    y_binned = pd.qcut(y.numpy(), q=5, labels=False) 
    X_train, X_test, y_train, y_test = train_test_split(
        X.numpy(), y.numpy(), test_size=0.2, random_state=42, stratify=y_binned
    )
    
    X_train = torch.FloatTensor(X_train)
    X_test = torch.FloatTensor(X_test)
    y_train = torch.FloatTensor(y_train)
    y_test = torch.FloatTensor(y_test)
    
    train_data = torch.utils.data.TensorDataset(X_train, y_train)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
    
    print("\nStarting training:")
    print(f"Training samples: {len(X_train)}")
    print(f"Test samples: {len(X_test)}")
    
    best_loss = float('inf')
    best_model_state = None
    patience = 25
    patience_counter = 0
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        num_batches = 0
        
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs.squeeze(), batch_y)
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            
            total_loss += loss.item()
            num_batches += 1
        
        avg_loss = total_loss / num_batches
        
        model.eval()
        with torch.no_grad():
            test_outputs = model(X_test)
            test_loss = criterion(test_outputs.squeeze(), y_test)
            
            y_mean = torch.mean(y_test)
            ss_tot = torch.sum((y_test - y_mean) ** 2)
            ss_res = torch.sum((y_test - test_outputs.squeeze()) ** 2)
            r2 = 1 - (ss_res / ss_tot)
            
            scheduler.step(test_loss)
            
            if test_loss < best_loss:
                best_loss = test_loss
                best_model_state = model.state_dict().copy()
                patience_counter = 0
            else:
                patience_counter += 1
            
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break
            
            if (epoch + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], '
                      f'Train Loss: {avg_loss:.6f}, '
                      f'Test Loss: {test_loss.item():.6f}, '
                      f'R²: {r2:.4f}')
    
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    
    return model


In [5]:
if __name__ == "__main__":
    torch.manual_seed(42)
    np.random.seed(42)
    
    try:
        print("Loading and preprocessing data...")
        dataset = SalaryDataset('../data/Salary_Data.csv')
        X, y = dataset.get_data()
        
        print("\nInitializing model...")
        model = SalaryPredictor(input_size=X.shape[1])
        
        print("\nTraining model...")
        model = train_model(model, X, y)
        
        print("\nMaking predictions...")
        model.eval()
        with torch.no_grad():
            print("\nComparison of actual vs predicted salaries:")
            total_error = 0
            n_samples = min(5, len(X))
            
            for i in range(n_samples):
                pred = model(X[i:i+1])
                actual = dataset.inverse_transform_salary(y[i:i+1].numpy())
                predicted = dataset.inverse_transform_salary(pred.numpy())
                error_pct = abs(actual[0][0] - predicted[0][0]) / actual[0][0] * 100
                total_error += error_pct
                print(f"Actual: ${actual[0][0]:.2f}, Predicted: ${predicted[0][0]:.2f}, Error: {error_pct:.1f}%")
            
            print(f"\nAverage error: {total_error/n_samples:.1f}%")
                
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        import traceback
        print(traceback.format_exc())

Loading and preprocessing data...
Initial data shape: (6704, 6)
Initial salary range: 350.0 - 250000.0

Missing values before cleaning:
Age                    2
Gender                 2
Education Level        3
Job Title              2
Years of Experience    3
Salary                 5
dtype: int64

Missing values after cleaning:
Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64

Processed data information:
Features shape: (6704, 206)
Target shape: (6704,)

Initializing model...

Training model...





Starting training:
Training samples: 5363
Test samples: 1341
Epoch [10/300], Train Loss: 0.016978, Test Loss: 0.006661, R²: 0.8350
Epoch [20/300], Train Loss: 0.010827, Test Loss: 0.003894, R²: 0.9035
Epoch [30/300], Train Loss: 0.008263, Test Loss: 0.003447, R²: 0.9146
Epoch [40/300], Train Loss: 0.006778, Test Loss: 0.002613, R²: 0.9353
Epoch [50/300], Train Loss: 0.006245, Test Loss: 0.002088, R²: 0.9483
Epoch [60/300], Train Loss: 0.005815, Test Loss: 0.002370, R²: 0.9413
Epoch [70/300], Train Loss: 0.005647, Test Loss: 0.001607, R²: 0.9602
Epoch [80/300], Train Loss: 0.005558, Test Loss: 0.001822, R²: 0.9549
Epoch [90/300], Train Loss: 0.005110, Test Loss: 0.001567, R²: 0.9612
Epoch [100/300], Train Loss: 0.005207, Test Loss: 0.001543, R²: 0.9618
Epoch [110/300], Train Loss: 0.005434, Test Loss: 0.001681, R²: 0.9584
Epoch [120/300], Train Loss: 0.004996, Test Loss: 0.001856, R²: 0.9540
Epoch [130/300], Train Loss: 0.004976, Test Loss: 0.001632, R²: 0.9596
Early stopping at epoch 

In [6]:
save_path = 'model.pth'
torch.save(model.state_dict(), save_path)
print(f"Model saved to {save_path}")

Model saved to model.pth
