In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
data = pd.read_csv('job_descriptions.csv')
data = data[0:1000]
data.head() 

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,1089843540111562,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,54.2361,-4.5481,Intern,26801,...,001-381-930-7517x737,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie..."
1,398454096642776,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,38.9697,59.5563,Intern,100340,...,461-509-4216,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com..."
2,481640072963533,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",22.1987,113.5439,Temporary,84525,...,9687619505,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P..."
3,688192671473044,4 to 11 Years,PhD,$65K-$91K,Porto-Novo,Benin,9.3077,2.3158,Full-Time,129896,...,+1-820-643-5431x47576,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O..."
4,117057806156508,1 to 12 Years,MBA,$64K-$87K,Santiago,Chile,-35.6751,-71.5429,Intern,53944,...,343.975.4702x9340,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ..."


In [3]:
le = LabelEncoder()
data['Experience'] = le.fit_transform(data['Experience'])
data['Qualifications'] = le.fit_transform(data['Qualifications'])
data['Work Type'] = le.fit_transform(data['Work Type'])
data['Company Size'] = le.fit_transform(data['Company Size'])

In [5]:
def extract_lower_bound(salary_range):
    try:
        cleaned = ''.join(c for c in salary_range if c.isdigit() or c in '.-')
        lower_bound = float(cleaned.split('-')[0])
        return lower_bound
    except:
        return np.nan

data['Salary_Numeric'] = data['Salary Range'].apply(extract_lower_bound)

data = data.dropna(subset=['Salary_Numeric'])

data['Salary_Bin'] = pd.qcut(data['Salary_Numeric'], q=5, labels=[0, 1, 2, 3, 4])

X = data[['Experience', 'Qualifications', 'Work Type', 'Company Size']]
y = data['Salary_Bin']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [13]:
class SalaryDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.FloatTensor(features)
        self.labels = torch.LongTensor(labels.astype(int))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [14]:
class SalaryNet(nn.Module):
    def __init__(self, input_size):
        super(SalaryNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.fc4 = nn.Linear(16, 5)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.fc4(x)
        return x

In [15]:
def train_pytorch_model(model, train_loader, criterion, optimizer, num_epochs=50):
    model.train()
    for epoch in range(num_epochs):
        for features, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

In [16]:
def predict_pytorch_model(model, test_loader):
    model.eval()
    all_preds = []
    with torch.no_grad():
        for features, _ in test_loader:
            outputs = model(features)
            _, predicted = torch.max(outputs.data, 1)
            all_preds.extend(predicted.numpy())
    return np.array(all_preds)

In [17]:
train_dataset = SalaryDataset(X_train_scaled, y_train)
test_dataset = SalaryDataset(X_test_scaled, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [18]:
pytorch_model = SalaryNet(X_train.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(pytorch_model.parameters(), lr=0.001)
train_pytorch_model(pytorch_model, train_loader, criterion, optimizer)

In [20]:
classifiers = {
    'Neural Network (PyTorch)': pytorch_model,
    'Neural Network (SKLearn)': MLPClassifier(hidden_layer_sizes=(64, 32, 16), max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'SVM': SVC(kernel='rbf')
}

In [21]:
results = {}

for name, clf in classifiers.items():
    if name == 'Neural Network (PyTorch)':
        y_pred = predict_pytorch_model(clf, test_loader)
    elif name == 'Neural Network (Keras)':
        y_pred = np.argmax(clf.predict(X_test_scaled), axis=1)
    else:
        clf.fit(X_train_scaled, y_train)
        y_pred = clf.predict(X_test_scaled)
    
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1-score': f1_score(y_test, y_pred, average='weighted')
    }



In [22]:
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")


Neural Network (PyTorch):
Accuracy: 0.2800
Precision: 0.2855
Recall: 0.2800
F1-score: 0.2648

Neural Network (SKLearn):
Accuracy: 0.2150
Precision: 0.2185
Recall: 0.2150
F1-score: 0.2157

Random Forest:
Accuracy: 0.2200
Precision: 0.2208
Recall: 0.2200
F1-score: 0.2154

SVM:
Accuracy: 0.2900
Precision: 0.2806
Recall: 0.2900
F1-score: 0.1981


In [23]:
def predict_salary_range(experience, qualifications, work_type, company_size):
    input_data = np.array([[experience, qualifications, work_type, company_size]])
    input_data_scaled = scaler.transform(input_data)

    predictions = {}

    pytorch_model.eval()
    with torch.no_grad():
        input_tensor = torch.FloatTensor(input_data_scaled)
        output = pytorch_model(input_tensor)
        _, predicted = torch.max(output, 1)
        predictions['Neural Network (PyTorch)'] = predicted.item()

    for name, clf in classifiers.items():
        if name not in ['Neural Network (PyTorch)', 'Neural Network (Keras)']:
            predictions[name] = clf.predict(input_data_scaled)[0]

    return predictions

In [24]:
sample_experience = 2  
sample_qualifications = 1  
sample_work_type = 0
sample_company_size = 3  

sample_predictions = predict_salary_range(sample_experience, sample_qualifications, sample_work_type, sample_company_size)

print("\nSample Predictions:")
for model, prediction in sample_predictions.items():
    print(f"{model}: Salary Range Category {prediction}")

def map_prediction_to_range(prediction):
    ranges = [
        "0 - 50,000",
        "50,001 - 100,000",
        "100,001 - 150,000",
        "150,001 - 200,000",
        "200,001+"
    ]
    return ranges[prediction]

print("\nSample Predictions with Salary Ranges:")
for model, prediction in sample_predictions.items():
    salary_range = map_prediction_to_range(prediction)
    print(f"{model}: {salary_range}")




Sample Predictions:
Neural Network (PyTorch): Salary Range Category 3
Neural Network (SKLearn): Salary Range Category 3
Random Forest: Salary Range Category 4
SVM: Salary Range Category 3

Sample Predictions with Salary Ranges:
Neural Network (PyTorch): 150,001 - 200,000
Neural Network (SKLearn): 150,001 - 200,000
Random Forest: 200,001+
SVM: 150,001 - 200,000


