<a href="https://colab.research.google.com/github/priya-gurjar24/Salary-prediction-using-ensemble/blob/main/Salary_prediction_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only step: ask the user to upload the dataset from their machine.
# In the recorded run the uploaded file was "Balitmore_salry.csv.zip" (a zip
# archive, not the bare CSV), so it must be extracted before pandas can read
# the CSV path used by the following cells.
from google.colab import files
uploaded = files.upload()  # NOTE(review): presumably maps uploaded file names to their contents — confirm against the Colab docs


Saving Balitmore_salry.csv.zip to Balitmore_salry.csv.zip


In [None]:
# Load the raw salary table for a first look at its columns.
#
# The upload step stores a zip archive ("Balitmore_salry.csv.zip"), not the CSV
# itself, so reading DATA_PATH directly fails with FileNotFoundError unless the
# archive has already been unpacked. Extract it on demand so this cell works on
# a fresh kernel regardless of the order in which cells are run.
import os
import zipfile

import pandas as pd

DATA_PATH = '/content/Balitmore_salry.csv'
ZIP_PATH = DATA_PATH + '.zip'

if not os.path.exists(DATA_PATH):
    if not os.path.exists(ZIP_PATH):
        # Fail with an actionable message instead of pandas' bare path error.
        raise FileNotFoundError(
            f"Neither {DATA_PATH} nor {ZIP_PATH} exists; upload the dataset first."
        )
    # Unpack next to the archive so the CSV lands at DATA_PATH.
    with zipfile.ZipFile(ZIP_PATH) as archive:
        archive.extractall(os.path.dirname(DATA_PATH))

df = pd.read_csv(DATA_PATH)
print(df.head())


FileNotFoundError: [Errno 2] No such file or directory: '/content/Balitmore_salry.csv'

In [None]:
# salary_ensemble.py  (run in a notebook cell or as a script)
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, VotingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# -------- CONFIG - edit these values to match your setup --------
DATA_PATH = "Balitmore_salry.csv"   # location of the extracted CSV
TARGET = "annualsalary"             # column the models learn to predict
OUTPUT_DIR = "artifacts"            # directory for the saved model and predictions
CURRENT_YEAR = 2025                 # reference year for the tenure feature
TOP_N_JOBTITLES = 50                # most frequent job titles kept verbatim
os.makedirs(OUTPUT_DIR, exist_ok=True)

# -------- 1) Load & light cleaning ----------
# "Unnamed: 0" and "name" are not used as features; they are dropped when
# present and silently skipped when absent.
df = pd.read_csv(DATA_PATH).drop(columns=["Unnamed: 0", "name"], errors="ignore")

# Force the target to a numeric dtype; values that cannot be parsed become NaN
# and those rows are discarded.
df[TARGET] = pd.to_numeric(df[TARGET], errors="coerce")
df = df[df[TARGET].notna()].reset_index(drop=True)

# Years elapsed between the hire year and the reference year.
df["tenure_years"] = CURRENT_YEAR - df["hire_year"]

# Keep only the TOP_N_JOBTITLES most common job titles; every other title is
# replaced by the single label "OTHER" to limit cardinality.
top_jobs = df["jobtitle"].value_counts().head(TOP_N_JOBTITLES).index
is_top_job = df["jobtitle"].isin(top_jobs)
df["jobtitle_mod"] = df["jobtitle"].where(is_top_job, other="OTHER")

# -------- 2) Feature selection ----------
NUMERIC_COLS = ["hire_year", "hire_month", "tenure_years"]
CATEGORICAL_COLS = ["agencyid", "agency", "jobtitle_mod"]

# Feature frame (numeric columns first, then categorical) and a float copy of
# the target.
feature_cols = [*NUMERIC_COLS, *CATEGORICAL_COLS]
X = df[feature_cols].copy()
y = df[TARGET].to_numpy(dtype=float, copy=True)

# -------- 3) Train/test split ----------
# 80/20 holdout with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# -------- 4) Preprocessing pipeline ----------
# Numeric columns are standardised.
num_pipeline = Pipeline(steps=[("scale", StandardScaler())])

# Categorical columns are one-hot encoded. No sparse-related argument is passed,
# so the encoder's default output format applies; min_frequency=0.01 groups
# rare categories and categories unseen during fit are ignored at transform.
cat_pipeline = OneHotEncoder(min_frequency=0.01, handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, NUMERIC_COLS),
        ("cat", cat_pipeline, CATEGORICAL_COLS),
    ],
    remainder="drop",
)

# Fit on the training split only, then apply the fitted transform to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


# -------- 5) Models (ensemble) ----------
# Three base regressors with fixed seeds; settings chosen to train reasonably fast.
rf = RandomForestRegressor(n_estimators=200, max_depth=15, n_jobs=-1, random_state=42)
hgb = HistGradientBoostingRegressor(max_iter=300, random_state=42)
ridge = Ridge(random_state=42)

# Each estimator sits in a single-step Pipeline. The preprocessor is NOT part
# of these pipelines: they consume the already-transformed matrices.
rf_pipe = Pipeline(steps=[("rf", rf)])
hgb_pipe = Pipeline(steps=[("hgb", hgb)])
ridge_pipe = Pipeline(steps=[("ridge", ridge)])

# Dense copy of the training matrix, shared by every fit that is given dense input.
X_train_dense = X_train_processed.toarray()

# Fit the base models one by one. The random forest receives the matrix as
# produced by the preprocessor; the other two receive the dense copy.
print("Fitting RandomForest...")
rf_pipe.fit(X_train_processed, y_train)
print("Fitting HistGradientBoosting...")
hgb_pipe.fit(X_train_dense, y_train)
print("Fitting Ridge baseline...")
ridge_pipe.fit(X_train_dense, y_train)

# Ensemble that averages the predictions of the three base pipelines.
# NOTE(review): the ensemble is fitted separately here, so the base models are
# presumably trained a second time inside it — confirm this is intended.
voter = VotingRegressor(
    estimators=[
        ("rf", rf_pipe),
        ("hgb", hgb_pipe),
        ("ridge", ridge_pipe),
    ]
)
print("Fitting VotingRegressor (ensemble)...")
voter.fit(X_train_dense, y_train)


# -------- 6) Evaluate ----------
def eval_preds(y_true, y_pred):
    """Score predictions against the ground truth.

    Args:
        y_true: observed target values.
        y_pred: predicted target values, aligned with ``y_true``.

    Returns:
        dict with keys "MAE", "RMSE" and "R2". RMSE is obtained by taking the
        square root of the mean squared error.
    """
    mse = mean_squared_error(y_true, y_pred)
    return {
        "MAE": mean_absolute_error(y_true, y_pred),
        "RMSE": np.sqrt(mse),
        "R2": r2_score(y_true, y_pred),
    }

# Holdout predictions for every model. The random forest is given the matrix as
# produced by the preprocessor; the others are given a dense copy of it.
X_test_dense = X_test_processed.toarray()
preds = {
    "RandomForest": rf_pipe.predict(X_test_processed),
    "HistGradientBoosting": hgb_pipe.predict(X_test_dense),
    "Ridge": ridge_pipe.predict(X_test_dense),
    "VotingRegressor": voter.predict(X_test_dense),
}

# One metrics dict per model, then a table with one row per model and the
# model name in a "model" column.
results = {label: eval_preds(y_test, y_hat) for label, y_hat in preds.items()}
results_df = pd.DataFrame(results).T
results_df = results_df.reset_index().rename(columns={"index": "model"})
print("\nEvaluation on holdout set:")
print(results_df)

# -------- 7) Feature importance (from RF) ----------
# Best-effort report: any failure while assembling the table is printed, not raised.
try:
    # Names of the encoded categorical columns, taken from the fitted encoder.
    ohe = preprocessor.named_transformers_["cat"]
    cat_feature_names = ohe.get_feature_names_out(CATEGORICAL_COLS).tolist()
    # Numeric names first, then the encoded categorical names.
    feature_names = [*NUMERIC_COLS, *cat_feature_names]
    rf_model = rf_pipe.named_steps["rf"]
    importances = rf_model.feature_importances_
    imp_df = (
        pd.DataFrame({"feature": feature_names, "importance": importances})
        .sort_values("importance", ascending=False)
    )
    print("\nTop RF feature importances (top 20):")
    print(imp_df.head(20))
except Exception as err:
    print("\nCould not extract feature importances cleanly:", err)

# -------- 8) Save artifacts ----------
# The ensemble was trained on matrices produced by `preprocessor`, and the
# `jobtitle_mod` / `tenure_years` features depend on `top_jobs` and
# CURRENT_YEAR. Persisting the model alone would leave it unusable on raw rows
# in a fresh session, so the preprocessing state is saved next to it.
# To score new data: rebuild the features, then call
# voter.predict(preprocessor.transform(X_new).toarray()).
joblib.dump(voter, os.path.join(OUTPUT_DIR, "salary_voting_regressor.joblib"))
joblib.dump(
    {
        "preprocessor": preprocessor,
        "top_jobs": list(top_jobs),
        "current_year": CURRENT_YEAR,
        "numeric_cols": list(NUMERIC_COLS),
        "categorical_cols": list(CATEGORICAL_COLS),
    },
    os.path.join(OUTPUT_DIR, "salary_preprocessing.joblib"),
)

# Holdout targets alongside the ensemble's predictions, for later inspection.
pd.DataFrame({"y_true": y_test, "y_pred_voting": preds["VotingRegressor"]}).to_csv(
    os.path.join(OUTPUT_DIR, "test_predictions.csv"), index=False
)
print(f"\nSaved model and predictions to {OUTPUT_DIR}/")

Fitting RandomForest...
Fitting HistGradientBoosting...
Fitting Ridge baseline...
Fitting VotingRegressor (ensemble)...

Evaluation on holdout set:
                  model          MAE          RMSE        R2
0          RandomForest  5670.337822  11616.555714  0.770273
1  HistGradientBoosting  5903.925087  11387.868460  0.779229
2                 Ridge  7128.740457  12141.486755  0.749042
3       VotingRegressor  6054.128599  11339.184278  0.781112

Top RF feature importances (top 20):
                              feature  importance
26        jobtitle_mod_AIDE BLUE CHIP    0.305572
24                agency_Youth Summer    0.212347
17           agency_Police Department    0.094442
11             agency_Fire Department    0.084391
2                        tenure_years    0.055482
0                           hire_year    0.054689
1                          hire_month    0.041176
6         agencyid_infrequent_sklearn    0.036914
32                 jobtitle_mod_OTHER    0.019113
20     ag

In [None]:
import zipfile
import os

# Uploaded archive and the directory it is unpacked into.
zip_file_path = '/content/Balitmore_salry.csv.zip'
extracted_file_path = '/content/'

# Unpack every member of the archive; the handle is closed when the block exits.
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extracted_file_path)

# Path at which the extracted CSV is expected to be found.
DATA_PATH = os.path.join(extracted_file_path, 'Balitmore_salry.csv')

In [None]:
# Read the extracted CSV and print its first five rows as a sanity check.
import pandas as pd

DATA_PATH = '/content/Balitmore_salry.csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print(df.head(n=5))

   Unnamed: 0                name                       jobtitle agencyid  \
0           0     Aaron,Keontae E                 AIDE BLUE CHIP   W02200   
1           1    Aaron,Patricia G  Facilities/Office Services II   A03031   
2           2       Aaron,Petra L     ASSISTANT STATE'S ATTORNEY   A29005   
3           3  Abaineh,Yohannes T                 EPIDEMIOLOGIST   A65026   
4           4    Abbene,Anthony M         POLICE OFFICER TRAINEE   A99416   

                    agency  hire_date  hire_month  hire_year  annualsalary  
0             Youth Summer         10           6       2013         11310  
1       OED-Employment Dev         24          10       1979         53428  
2  States Attorneys Office         25           9       2006         68300  
3   HLTH-Health Department         23           7       2009         62000  
4        Police Department         24           7       2013         43999  
