In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PowerTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Load the raw dataset from a CSV file located next to the notebook.
data = pd.read_csv(filepath_or_buffer="my_df.csv")

In [3]:
# Columns that are one-hot encoded before modelling.
categorical_columns = [
    'work_year',
    'experience_level',
    'job_title',
    'employee_residence',
    'remote_ratio',
    'company_location',
    'company_size',
]

In [4]:
# Separate the regression target from the remaining columns.
y = data['salary_in_usd']
X = data.drop('salary_in_usd', axis=1)

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# One-hot encode the categorical features into a dense array.
# handle_unknown='ignore' encodes a category that was not present when the
# encoder was fitted as an all-zero group instead of raising, so the fitted
# pipeline can still score rows with, e.g., a job title it has never seen.
# Categories seen during fit are encoded exactly as with 'error'.
# Only the columns in `categorical_columns` are passed on: no `remainder` is
# given, so ColumnTransformer's default ('drop') discards every other column.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
    ])

In [7]:
# Full modelling pipeline: encode -> power-transform -> random forest.
# NOTE(review): the 'scaler' step runs PowerTransformer on the one-hot encoded
# output; tree ensembles are usually insensitive to monotonic feature
# transforms, so this step may be unnecessary -- confirm before removing.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('scaler', PowerTransformer()),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
    ]
)

In [8]:
# Fit every pipeline step on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [9]:
# Pipeline.score delegates to the final estimator's score method, which for a
# regressor is the coefficient of determination (R^2) -- not the mean squared
# error -- so the value is named and labelled accordingly.
r2_test = pipeline.score(X_test, y_test)
print("R-squared:", r2_test)

Mean Squared Error: 0.45117765253407804


In [10]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate_regression_model(model, X_test, y_test):
    """Compute standard regression metrics for a fitted model on held-out data.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)``.
    X_test : array-like
        Features, passed unchanged to ``model.predict``.
    y_test : array-like
        True target values.

    Returns
    -------
    dict
        ``'MAE'``, ``'MSE'``, ``'RMSE'``, ``'R-squared'`` and ``'MAPE'``.
        MAPE is expressed in percent and is averaged over the samples whose
        true value is non-zero; it is ``nan`` when every true value is zero.
    """
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Flatten both sides to 1-D float arrays before the element-wise MAPE
    # arithmetic: a column-vector target of shape (n, 1) would otherwise
    # broadcast against (n,) predictions into an (n, n) matrix and silently
    # produce a wrong average.
    true_flat = np.asarray(y_test, dtype=float).ravel()
    pred_flat = np.asarray(y_pred, dtype=float).ravel()

    # A percentage error is undefined where the true value is zero; skip those
    # samples instead of letting a division by zero turn the metric into inf.
    nonzero = true_flat != 0
    if nonzero.any():
        relative_error = (true_flat[nonzero] - pred_flat[nonzero]) / true_flat[nonzero]
        mape = np.mean(np.abs(relative_error)) * 100
    else:
        mape = float('nan')

    return {
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R-squared': r2,
        'MAPE': mape
    }

In [11]:
# Score the fitted pipeline on the held-out split and print one metric per line.
evaluation_results = evaluate_regression_model(pipeline, X_test, y_test)
report_lines = ["Evaluation Results:"]
report_lines.extend(f"{metric}: {value}" for metric, value in evaluation_results.items())
print("\n".join(report_lines))


Evaluation Results:
MAE: 35106.35922559665
MSE: 2019450891.5291407
RMSE: 44938.3009417261
R-squared: 0.45117765253407804
MAPE: 33.28555328308481


In [12]:
import pickle

# Serialize the pipeline object to disk so it can be reloaded without retraining.
# Only unpickle this file from a trusted location: loading a pickle can run code.
pipeline_file = 'pipeline_model.pkl'
with open(pipeline_file, mode='wb') as out_file:
    pickle.dump(pipeline, out_file)