In [6]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# === Load Data ===
df = pd.read_csv("../data/salary_Data.csv")  # <-- Replace this path
print("✅ Data loaded:", df.shape)

# === Features and Target ===
# Rows without a target value cannot be used for supervised training, so they
# are dropped before the features (X) are separated from the target (y).
df_final = df.dropna(subset=["Salary"])
X = df_final.drop(columns="Salary")
y = df_final["Salary"]


# === Train/Test Split ===
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# === Define Columns ===
# These names must match the CSV header exactly; update them if your column
# names differ.
numeric_cols = ["Age", "Years of Experience"]
categorical_cols = ["Gender", "Education Level", "Job Title"]

# === Preprocessing Pipelines ===
# Numeric features: fill missing values with the column mean, then standardize.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fitting are ignored at
# transform time instead of raising.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own pipeline.
preprocessor_with_impute = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# === Define Models ===
# Both tree ensembles use the same size and seed, so the settings live in one
# place; the dict key is the display name used when reporting and saving.
_ensemble_kwargs = {"n_estimators": 100, "random_state": 42}

models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(**_ensemble_kwargs),
    "Gradient Boosting": GradientBoostingRegressor(**_ensemble_kwargs),
}

# === Train, Evaluate and Save Best ===
# Fit one pipeline per candidate model, score it with R² on the held-out split,
# and remember the highest-scoring fitted pipeline.
#
# NOTE(review): the winner is chosen on the same test split that is used to
# report its score, so the printed R² of the best model is optimistically
# biased. Consider selecting via cross-validation on the training split.
best_score = -float("inf")
best_model = None
best_model_name = ""

for name, model in models.items():
    # Pipeline does not copy its steps. Reusing one ColumnTransformer instance
    # for every candidate would make all pipelines share a single fitted
    # preprocessor, which each later fit() overwrites -- including the one
    # inside the pipeline already stored in `best_model`. Cloning gives every
    # pipeline its own independent preprocessor (the `preprocessor_with_impute`
    # template itself is left unfitted).
    pipeline = Pipeline([
        ("preprocess", clone(preprocessor_with_impute)),
        ("model", model)
    ])
    pipeline.fit(X_train, y_train)
    score = pipeline.score(X_test, y_test)  # R² for regressors
    print(f"{name} R² Score: {score:.3f}")

    # Strict '>' keeps the first model on ties; a NaN score never wins.
    if score > best_score:
        best_score = score
        best_model = pipeline
        best_model_name = name

# === Save Best Model ===
# `best_model` is still None when no candidate produced a comparable score
# (e.g. `models` was empty or every R² came back as NaN). Fail loudly instead
# of pickling None under the meaningless name "models/_salary_model.pkl".
if best_model is None:
    raise RuntimeError("No model was successfully trained and scored; nothing to save.")

# Persist the whole fitted pipeline (preprocessing + estimator) so it can be
# loaded later and applied directly to raw feature rows.
os.makedirs("models", exist_ok=True)
model_path = f"models/{best_model_name.replace(' ', '_').lower()}_salary_model.pkl"
joblib.dump(best_model, model_path)
print(f"\n✅ Best model '{best_model_name}' saved at: {model_path}")



✅ Data loaded: (6704, 6)
Linear Regression R² Score: 0.878
Random Forest R² Score: 0.985
Gradient Boosting R² Score: 0.927

✅ Best model 'Random Forest' saved at: models/random_forest_salary_model.pkl
