In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [18]:
data = pd.read_csv('data/job_descriptions_30000.csv')
data = data[0:1000]
data.head() 

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,1017340707950150,5 to 10 Years,BBA,$55K-$84K,Panama City,Panama,8.5379,-80.7821,Contract,93242,...,242.271.4459,Procurement Manager,Supplier Diversity Manager,The Muse,Promote diversity and inclusion in the supply ...,"{'Transportation Benefits, Professional Develo...",Supplier diversity programs Diversity and incl...,Promote supplier diversity initiatives and inc...,RWE AG,"{""Sector"":""Energy"",""Industry"":""Energy - Utilit..."
1,2421048253959975,0 to 12 Years,MBA,$61K-$108K,Tunis,Tunisia,33.8869,9.5375,Part-Time,18411,...,579.442.3566,Architectural Designer,Architectural Drafter,Idealist,Architectural Drafters assist architects and e...,"{'Employee Assistance Programs (EAP), Tuition ...",Architectural drafting AutoCAD 2D and 3D model...,Prepare detailed architectural drawings and pl...,Asian Paints,"{""Sector"":""Consumer Goods"",""Industry"":""Paints ..."
2,1822636506606589,0 to 11 Years,M.Com,$57K-$82K,Harare,Zimbabwe,-19.0154,29.1549,Full-Time,120621,...,858-776-8996,Art Teacher,Art Education Coordinator,ZipRecruiter,An Art Education Coordinator plans and manages...,"{'Employee Referral Programs, Financial Counse...",Art education curriculum Program development T...,"Coordinate art education programs, curriculum ...",Laboratory Corp. of America,"{""Sector"":""Healthcare Services"",""Industry"":""He..."
3,3068000579894602,5 to 12 Years,B.Com,$56K-$95K,Tirana,Albania,41.1533,20.1683,Temporary,128908,...,938.587.7586x35852,Environmental Consultant,Environmental Impact Analyst,Internships.com,Environmental Impact Analysts assess the envir...,"{'Transportation Benefits, Professional Develo...",Environmental impact analysis Data collection ...,Assess the environmental impact of projects an...,Massachusetts Mutual Life Insurance,"{""Sector"":""Insurance"",""Industry"":""Insurance: L..."
4,1747904829392680,4 to 13 Years,BCA,$58K-$122K,City of Baghdad,Iraq,33.2232,43.6793,Temporary,114717,...,(405)990-8581x57164,Art Teacher,Art Education Coordinator,LinkedIn,An Art Education Coordinator plans and manages...,"{'Employee Referral Programs, Financial Counse...",Art education curriculum Program development T...,"Coordinate art education programs, curriculum ...",Sartorius AG,"{""Sector"":""Lab Equipment"",""Industry"":""Life Sci..."


In [19]:
le = LabelEncoder()
data['Experience'] = le.fit_transform(data['Experience'])
data['Qualifications'] = le.fit_transform(data['Qualifications'])
data['Work Type'] = le.fit_transform(data['Work Type'])
data['Company Size'] = le.fit_transform(data['Company Size'])

In [20]:
def extract_lower_bound(salary_range):
    try:
        cleaned = ''.join(c for c in salary_range if c.isdigit() or c in '.-')
        lower_bound = float(cleaned.split('-')[0])
        return lower_bound
    except:
        return np.nan

data['Salary_Numeric'] = data['Salary Range'].apply(extract_lower_bound)

data = data.dropna(subset=['Salary_Numeric'])

data['Salary_Bin'] = pd.qcut(data['Salary_Numeric'], q=5, labels=[0, 1, 2, 3, 4])

X = data[['Experience', 'Qualifications', 'Work Type', 'Company Size']]
y = data['Salary_Bin']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [22]:
class SalaryDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.FloatTensor(features)
        self.labels = torch.LongTensor(labels.astype(int))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [23]:
class SalaryNet(nn.Module):
    def __init__(self, input_size):
        super(SalaryNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.fc4 = nn.Linear(16, 5)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.fc4(x)
        return x

In [24]:
def train_pytorch_model(model, train_loader, criterion, optimizer, num_epochs=50):
    model.train()
    for epoch in range(num_epochs):
        for features, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

In [25]:
def predict_pytorch_model(model, test_loader):
    model.eval()
    all_preds = []
    with torch.no_grad():
        for features, _ in test_loader:
            outputs = model(features)
            _, predicted = torch.max(outputs.data, 1)
            all_preds.extend(predicted.numpy())
    return np.array(all_preds)

In [26]:
train_dataset = SalaryDataset(X_train_scaled, y_train)
test_dataset = SalaryDataset(X_test_scaled, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [27]:
pytorch_model = SalaryNet(X_train.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(pytorch_model.parameters(), lr=0.001)
train_pytorch_model(pytorch_model, train_loader, criterion, optimizer)

In [28]:
classifiers = {
    'Neural Network (PyTorch)': pytorch_model,
    'Neural Network (SKLearn)': MLPClassifier(hidden_layer_sizes=(64, 32, 16), max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'SVM': SVC(kernel='rbf')
}

In [29]:
results = {}

for name, clf in classifiers.items():
    if name == 'Neural Network (PyTorch)':
        y_pred = predict_pytorch_model(clf, test_loader)
    elif name == 'Neural Network (Keras)':
        y_pred = np.argmax(clf.predict(X_test_scaled), axis=1)
    else:
        clf.fit(X_train_scaled, y_train)
        y_pred = clf.predict(X_test_scaled)
    
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1-score': f1_score(y_test, y_pred, average='weighted')
    }



In [15]:
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")


Neural Network (PyTorch):
Accuracy: 0.2300
Precision: 0.1816
Recall: 0.2300
F1-score: 0.1922

Neural Network (SKLearn):
Accuracy: 0.1800
Precision: 0.1761
Recall: 0.1800
F1-score: 0.1756

Random Forest:
Accuracy: 0.2150
Precision: 0.2008
Recall: 0.2150
F1-score: 0.1947

SVM:
Accuracy: 0.2750
Precision: 0.1923
Recall: 0.2750
F1-score: 0.1793


In [30]:
def predict_salary_range(experience, qualifications, work_type, company_size):
    input_data = np.array([[experience, qualifications, work_type, company_size]])
    input_data_scaled = scaler.transform(input_data)

    predictions = {}

    pytorch_model.eval()
    with torch.no_grad():
        input_tensor = torch.FloatTensor(input_data_scaled)
        output = pytorch_model(input_tensor)
        _, predicted = torch.max(output, 1)
        predictions['Neural Network (PyTorch)'] = predicted.item()

    for name, clf in classifiers.items():
        if name not in ['Neural Network (PyTorch)', 'Neural Network (Keras)']:
            predictions[name] = clf.predict(input_data_scaled)[0]

    return predictions

In [31]:
sample_experience = 2  
sample_qualifications = 1  
sample_work_type = 0
sample_company_size = 3  

sample_predictions = predict_salary_range(sample_experience, sample_qualifications, sample_work_type, sample_company_size)

print("\nSample Predictions:")
for model, prediction in sample_predictions.items():
    print(f"{model}: Salary Range Category {prediction}")

def map_prediction_to_range(prediction):
    ranges = [
        "0 - 50,000",
        "50,001 - 100,000",
        "100,001 - 150,000",
        "150,001 - 200,000",
        "200,001+"
    ]
    return ranges[prediction]

print("\nSample Predictions with Salary Ranges:")
for model, prediction in sample_predictions.items():
    salary_range = map_prediction_to_range(prediction)
    print(f"{model}: {salary_range}")




Sample Predictions:
Neural Network (PyTorch): Salary Range Category 0
Neural Network (SKLearn): Salary Range Category 0
Random Forest: Salary Range Category 0
SVM: Salary Range Category 0

Sample Predictions with Salary Ranges:
Neural Network (PyTorch): 0 - 50,000
Neural Network (SKLearn): 0 - 50,000
Random Forest: 0 - 50,000
SVM: 0 - 50,000


