In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import joblib

In [None]:
# Read the raw salary CSV from the mounted Drive folder into a DataFrame.
dataset_path = "/content/drive/MyDrive/dataset/salaray_data.csv"
df = pd.read_csv(dataset_path)

In [None]:
# Normalise textual missing-value markers to NaN, then discard every row that
# has at least one missing value.
df = df.replace(['', 'NA', 'NaN', 'nan'], np.nan).dropna()

# An experience value of exactly 0 is turned into NaN, so those rows are
# removed by the subset check below.
df = df.assign(**{'Years of Experience': df['Years of Experience'].replace(0, np.nan)})

# Keep only salaries above 10000 and require the numeric fields to be present.
salary_mask = df['Salary'] > 10000
df = df.loc[salary_mask].dropna(subset=['Age', 'Years of Experience', 'Salary'])


In [None]:
# Restrict the frame to the model inputs plus the target; any other column
# (such as "Job Title") is left out.
selected_columns = ['Age', 'Years of Experience', 'Education Level', 'Gender', 'Salary']
df = df[selected_columns]


In [None]:
# One-hot encode the two categorical columns. drop='first' omits the first
# category of each column, and sparse_output=False yields a dense array.
categorical_cols = ['Gender', 'Education Level']
ohe = OneHotEncoder(drop='first', sparse_output=False)
encoded_columns = ohe.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=ohe.get_feature_names_out(categorical_cols),
)

# Both pieces get a fresh 0..n-1 index so the column-wise concat pairs up
# rows by position rather than by the index left over from filtering.
numeric_part = df[['Age', 'Years of Experience', 'Salary']].reset_index(drop=True)
df = pd.concat([numeric_part, encoded_df.reset_index(drop=True)], axis=1)


In [None]:
# The target vector is the salary column; every remaining column is a feature.
y = df['Salary']
X = df.drop('Salary', axis=1)

In [None]:
# Hold out a 0.33 fraction of the rows for testing; the fixed random_state
# keeps the partition the same across runs.
split_parts = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train the RandomForestRegressor model on the scaled training features.
# random_state is pinned (to the same seed used for the train/test split) so
# the forest's internal randomness is repeatable: re-running the notebook on
# a fresh kernel fits the same model instead of a different one each time.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_scaled, y_train)

In [None]:
# Persist the three fitted objects to the working directory:
# the regressor, the feature scaler, and the one-hot encoder.
joblib.dump(value=rf, filename='salary_predictor_rf.pkl')
joblib.dump(value=scaler, filename='scaler.pkl')
joblib.dump(value=ohe, filename='encoder.pkl')

In [None]:
# Record the feature column order of X and save it, so the Flask app can
# refer to the expected ordering.
expected_features = list(X.columns)
joblib.dump(value=expected_features, filename='expected_features.pkl')