In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
import joblib

In [8]:
df = pd.read_csv("/content/drive/MyDrive/dataset/salaray_data.csv")

In [9]:
df.replace(['', 'NA', 'NaN', 'nan'], np.nan, inplace=True)
df.dropna(inplace=True)
df['Years of Experience'] = df['Years of Experience'].replace(0, np.nan)
df = df[df['Salary'] > 10000].dropna(subset=['Age', 'Years of Experience', 'Salary'])

In [10]:
df = df[['Age', 'Years of Experience', 'Education Level', 'Gender', 'Salary']]

In [11]:
ohe = OneHotEncoder(drop='first', sparse_output=False)
encoded_columns = ohe.fit_transform(df[['Gender', 'Education Level']])
encoded_df = pd.DataFrame(encoded_columns, columns=ohe.get_feature_names_out(['Gender', 'Education Level']))
df = pd.concat([df[['Age', 'Years of Experience', 'Salary']].reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)


In [12]:
X = df.drop(columns='Salary')
y = df['Salary']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [14]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor(random_state=42))
])

In [15]:
# Define hyperparameters for RandomizedSearchCV
param_grid = {
    'rf__n_estimators': [50, 100, 200, 300],
    'rf__max_depth': [None, 10, 20, 30, 40],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': ['auto', 'sqrt', 'log2']
}

In [16]:
search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=20,
    cv=5,
    random_state=42,
    n_jobs=-1,
    scoring='neg_mean_squared_error'
)

In [17]:
search.fit(X_train, y_train)

20 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "

In [18]:
best_model = search.best_estimator_

In [19]:
joblib.dump(best_model, 'salary_predictor_rf.pkl')
joblib.dump(ohe, 'encoder.pkl')
expected_features = X.columns.tolist()
joblib.dump(expected_features, 'expected_features.pkl')

['expected_features.pkl']

In [20]:
print("Best Parameters:", search.best_params_)
print("Best Score:", search.best_score_)

Best Parameters: {'rf__n_estimators': 100, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1, 'rf__max_features': 'log2', 'rf__max_depth': None}
Best Score: -230563872.31517324
