In [1]:
# income_predictor.ipynb
import os

# Make sure the 'models' output directory exists before any artifact is
# written into it; exist_ok=True keeps this from raising when the directory
# is already present (e.g. when the script is run again).
os.makedirs('models', exist_ok=True)

import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

# ---------------------------------------------------------------------------
# Salary model training: read the CSV, label-encode the categorical columns,
# fit a random-forest regressor, persist the artifacts and print test metrics.
# ---------------------------------------------------------------------------

# Read the raw data. The path is absolute and machine-specific -- adjust it
# to wherever the CSV lives on your machine.
df = pd.read_csv(r"C:\Users\Riya\Desktop\Employee_Salary_prediction\dataset.csv")

# Drop every row that contains a missing value in any column.
df.dropna(inplace=True)

# One dedicated encoder per categorical column. They are kept as separate
# top-level names so they stay individually accessible after this runs.
gender_encoder = LabelEncoder()
education_encoder = LabelEncoder()
job_encoder = LabelEncoder()

# (column to encode, encoder fitted on it, file the fitted encoder goes to)
_categorical_specs = (
    ('Gender', gender_encoder, 'models/gender_encoder.pkl'),
    ('Education Level', education_encoder, 'models/education_encoder.pkl'),
    ('Job Title', job_encoder, 'models/job_encoder.pkl'),
)

# Replace each categorical column with its integer codes. The encoders are
# fitted on the full (null-free) frame, i.e. before the train/test split.
for _column, _encoder, _path in _categorical_specs:
    df[_column] = _encoder.fit_transform(df[_column])

# Persist the fitted encoders only after every column has been encoded.
for _column, _encoder, _path in _categorical_specs:
    joblib.dump(_encoder, _path)

# Model inputs and regression target.
_feature_columns = [
    'Age',
    'Gender',
    'Education Level',
    'Job Title',
    'Years of Experience',
]
X = df[_feature_columns]
y = df['Salary']

# Hold out 20% of the rows for evaluation; the seed is fixed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random forest with 100 trees; fit() returns the fitted estimator itself.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# NOTE(review): the model is written to 'dataset.pkl' in the current working
# directory, not under 'models/' next to the encoders -- confirm that whatever
# loads the model expects this name and location.
joblib.dump(model, 'dataset.pkl')

# Report R^2 and RMSE on the held-out rows.
y_pred = model.predict(X_test)
_r2 = r2_score(y_test, y_pred)
print("R2 Score:", _r2)
_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", _rmse)


R2 Score: 0.9401211962232473
RMSE: 11981.834393075002
