In [1]:
# This notebook covers model development with max_salary as the target variable

In [2]:
import pandas as pd

In [3]:
# Location of the cleaned job-postings CSV, relative to the notebook's working directory.
file_path = 'Data/cleaned_postings.csv'

# Read the whole file into the DataFrame the rest of the notebook works on.
df = pd.read_csv(filepath_or_buffer=file_path)

#### Remove outliers

In [4]:
# Apply the 1.5 * IQR rule to max_salary: rows whose value falls outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are dropped.
max_salary_values = df['max_salary']

Q1, Q3 = max_salary_values.quantile(0.25), max_salary_values.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# `between` includes both endpoints, i.e. lower_bound <= value <= upper_bound.
df = df[max_salary_values.between(lower_bound, upper_bound)]

#### Arrange the data

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Columns that are turned into integer codes with LabelEncoder.
label_cols = ['company_name', 'title', 'location', 'formatted_work_type', 'application_type', 'formatted_experience_level', 'work_type', 'pay_period']

# Take an explicit copy before writing into the frame: `df` is the result of a
# boolean filter, and assigning into such a frame can raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df.copy()

# Keep one fitted encoder per column so every integer code can still be mapped
# back to its original label (a single shared encoder only remembers the
# classes of the column it was fitted on last).
label_encoders = {}
for col in label_cols:
    encoder = LabelEncoder()
    # Plain column assignment replaces the column, so it takes the integer
    # dtype returned by the encoder; `df.loc[:, col] = ...` may instead write
    # in place and keep the column's previous (typically object) dtype.
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder
# After the loop `encoder` is the encoder fitted on the last entry of label_cols.

# Features: everything except the target and the two columns listed below.
# NOTE(review): 'pay_period' is encoded above but then excluded from X.
# NOTE(review): min_salary stays in X; confirm it is available at prediction time.
X = df.drop(columns=['max_salary', 'original_listed_time', 'pay_period'])
y = df['max_salary']

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Use Linear Regression (with degree-2 polynomial features)

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Expand the features to degree-2 polynomial terms. The expansion is fitted on
# the training split and then re-applied unchanged to the test split.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Ordinary least squares on the expanded training features.
lin_reg_model = LinearRegression()
lin_reg_model.fit(X_train_poly, y_train)

# Predict on the held-out split and score the predictions.
y_pred_poly = lin_reg_model.predict(X_test_poly)

mae_poly, mse_poly, r2_poly = (
    metric(y_test, y_pred_poly)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

In [7]:
# One-row summary of the polynomial regression scores on the test split.
model_report = dict(zip(
    ("Mean Absolute Error (MAE)", "Mean Squared Error (MSE)", "R-squared (R²)"),
    ([mae_poly], [mse_poly], [r2_poly]),
))

# Last expression of the cell, so the table is rendered as the cell output.
pd.DataFrame(data=model_report)

Unnamed: 0,Mean Absolute Error (MAE),Mean Squared Error (MSE),R-squared (R²)
0,29985.397338,1623594000.0,0.547373


#### Using RandomForest Regressor

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees and a fixed seed, trained on the un-expanded
# training features.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred_rf = rf_model.predict(X_test)

mae_rf, mse_rf, r2_rf = (
    metric(y_test, y_pred_rf)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

In [9]:
# One-row summary of the random forest scores on the test split.
model_report = {
    label: [score]
    for label, score in (
        ("Mean Absolute Error (MAE)", mae_rf),
        ("Mean Squared Error (MSE)", mse_rf),
        ("R-squared (R²)", r2_rf),
    )
}

# Last expression of the cell, so the table is rendered as the cell output.
pd.DataFrame(data=model_report)

Unnamed: 0,Mean Absolute Error (MAE),Mean Squared Error (MSE),R-squared (R²)
0,7096.714084,155485600.0,0.956654


In [10]:
# Importance scores exposed by the fitted random forest, in the column order of X.
feature_importance = rf_model.feature_importances_

# Pair each feature name with its score; rows keep their positional index.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importance,
})

# Most important features first; displayed as the cell output.
importance_table.sort_values('Importance', ascending=False)

Unnamed: 0,Feature,Importance
4,min_salary,0.349439
8,formatted_experience_level,0.238959
1,title,0.127039
5,formatted_work_type,0.090703
9,work_type,0.0781
0,company_name,0.044424
2,location,0.02351
6,remote_allowed,0.019665
3,views,0.018569
7,application_type,0.009591
