In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PowerTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Read the raw dataset from the CSV file that sits next to this notebook.
data = pd.read_csv(filepath_or_buffer="my_df.csv")

In [3]:
# data.columns  # uncomment to list the columns available in `data`

In [13]:
# Columns fed to the one-hot encoder; every one of them is treated as a
# categorical feature, including the numeric-looking ones.
categorical_columns = [
    'work_year',
    'experience_level',
    'job_title',
    'employee_residence',
    'remote_ratio',
    'company_location',
    'company_size',
]

In [14]:
# The regression target is the salary in USD; every remaining column of
# `data` is kept as a candidate feature.
y = data['salary_in_usd']
X = data.drop(columns=['salary_in_usd'])

In [15]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# One-hot encode the categorical features into a dense array.
# Only the columns listed in `categorical_columns` are passed through this
# transformer; no `remainder` is given, so ColumnTransformer's default applies
# to every other column of X.
#
# handle_unknown='ignore': a category that appears only in the held-out split
# (or in new data scored with the saved model) is encoded as an all-zero row
# for that feature instead of raising ValueError at predict/score time.
# Categories seen during fit are encoded exactly as before.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
    ])

In [23]:
# End-to-end model: encode the categorical columns, apply a power transform to
# the encoded matrix, then fit a random forest with a fixed seed.
# NOTE(review): the power transform runs on one-hot indicator columns feeding
# a tree ensemble - confirm this step is actually wanted.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('scaler', PowerTransformer()),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
    ]
)


In [24]:
# Fit every pipeline step on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [25]:
# Pipeline.score delegates to the final estimator's score(); for a regressor
# that is the coefficient of determination (R^2), not the mean squared error.
# Higher is better, with 1.0 meaning a perfect fit.
r2 = pipeline.score(X_test, y_test)
print("R-squared:", r2)

Mean Squared Error: 0.44073059349582766


In [27]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate_regression_model(model, X_test, y_test):
    """Compute standard regression metrics for a fitted model on held-out data.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_test : array-like or DataFrame
        Features, passed unchanged to ``model.predict``.
    y_test : array-like
        True target values corresponding to ``X_test``.

    Returns
    -------
    dict
        Keys ``'MAE'``, ``'MSE'``, ``'RMSE'``, ``'R-squared'`` and ``'MAPE'``.
        MAPE is expressed in percent and is computed over the samples whose
        true value is non-zero; it is NaN when every true value is zero.
    """
    # Use plain float arrays so the element-wise arithmetic below is purely
    # positional and does not depend on a pandas index.
    y_true = np.asarray(y_test, dtype=float)
    y_pred = np.asarray(model.predict(X_test), dtype=float)

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    # A relative error is undefined where the true value is zero; dividing
    # there would turn the whole average into inf/NaN, so skip those samples.
    nonzero = y_true != 0
    if nonzero.any():
        relative_errors = np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])
        mape = np.mean(relative_errors) * 100
    else:
        mape = float('nan')

    return {
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R-squared': r2,
        'MAPE': mape
    }

In [28]:
# Score the fitted pipeline on the held-out split and print one metric per line.
evaluation_results = evaluate_regression_model(
    model=pipeline,
    X_test=X_test,
    y_test=y_test,
)

print("Evaluation Results:")
for metric, value in evaluation_results.items():
    print(f"{metric}: {value}")


Evaluation Results:
MAE: 35602.36874289402
MSE: 2119851876.194034
RMSE: 46041.849183042534
R-squared: 0.44073059349582766
MAPE: 33.39760442358062


In [29]:
import pickle

# Persist the fitted pipeline (preprocessing + model) to disk.
# NOTE: a pickle file must only ever be loaded from a trusted source, since
# unpickling can execute arbitrary code.
pipeline_file = 'pipeline_model.pkl'
with open(pipeline_file, mode='wb') as file:
    pickle.dump(obj=pipeline, file=file)