In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
data = pd.read_csv('../data/job_descriptions.csv')

In [5]:
data[['salary_lower', 'salary_upper']] = data['Salary Range'].str.extract(r'\$(\d+)K-\$(\d+)K')
data['salary_lower'] = pd.to_numeric(data['salary_lower'])
data['salary_upper'] = pd.to_numeric(data['salary_upper'])

In [6]:
data['salary_category'] = pd.cut(data['salary_upper'], bins=[0, 50, 75, 100, np.inf], labels=['Low', 'Medium', 'High', 'Very High'])

In [7]:
features = ['Experience', 'latitude', 'longitude', 'Company Size']

In [8]:
le = LabelEncoder()
for col in features:
    if data[col].dtype == 'object':
        data[col] = le.fit_transform(data[col])

In [9]:
X = data[features]
y = data['salary_category']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
classifiers = {
    'MLP': MLPClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(random_state=42)
}

In [13]:
results = {}

for name, clf in classifiers.items():
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    
    report = classification_report(y_test, y_pred, output_dict=True)
    results[name] = {
        'classification_report': report,
    }
    
    y_score = clf.predict_proba(X_test_scaled)
    for i, class_name in enumerate(clf.classes_):
        fpr, tpr, _ = roc_curve(y_test == class_name, y_score[:, i])
        roc_auc = auc(fpr, tpr)
        results[name]['roc_auc'][class_name] = {'fpr': fpr, 'tpr': tpr, 'auc': roc_auc}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
for name, result in results.items():
    print(f"\n{name} Classification Report:")
    print(pd.DataFrame(result['classification_report']).transpose())

fig, axes = plt.subplots(2, 3, figsize=(20, 15))
axes = axes.ravel()

for i, (name, result) in enumerate(results.items()):
    sns.heatmap(result['confusion_matrix'], annot=True, fmt='d', ax=axes[i])
    axes[i].set_title(f'{name} Confusion Matrix')
    axes[i].set_xlabel('Predicted')
    axes[i].set_ylabel('True')

plt.tight_layout()
plt.show()