In [6]:
pip install pandas numpy scikit-learn joblib matplotlib seaborn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [9]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import r2_score, mean_absolute_error


# Read the salary records from the CSV that sits next to the notebook.
data = pd.read_csv("salary_prediction_data.csv")

print("Dataset Loaded!")
print(data.head())


# "Salary" is the regression target; every remaining column is a model input.
y = data["Salary"]
X = data.drop(columns="Salary")


# String-valued columns that must be one-hot encoded before modelling.
categorical_features = ["Education", "Location", "Job_Title", "Gender"]
# Integer-valued columns that are handed to the model unchanged.
numerical_features = ["Experience", "Age"]


# Every model input is routed by name: categoricals are one-hot encoded
# (categories unseen at fit time encode as all zeros instead of raising),
# numerics pass through as-is. Any column not listed above is dropped, so an
# unexpected extra column in the CSV cannot silently become a feature.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ("num", "passthrough", numerical_features),
    ],
    remainder='drop'
)


# Base learners for the ensemble. Both are seeded so that re-running the
# notebook yields the same fitted model and the same reported metrics.
rf = RandomForestRegressor(n_estimators=200, random_state=42)
gb = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, random_state=42)

# VotingRegressor averages the predictions of its fitted base learners.
ensemble = VotingRegressor([
    ('rf', rf),
    ('gb', gb)
])


# Single pipeline object: preprocessing and the ensemble are fitted, applied
# and saved together, so raw DataFrame rows can be passed to predict().
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", ensemble)
])


# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the preprocessing + ensemble pipeline on the training portion only.
model.fit(X_train, y_train)
print("Model Training Completed!")


# Score the fitted pipeline on the held-out rows.
predictions = model.predict(X_test)
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print("\nModel Performance:")
print("R2 Score:", r2)
print("Mean Absolute Error:", mae)


# Persist the whole fitted pipeline so it can be reloaded for inference.
joblib.dump(model, "salary_ensemble_model.pkl")
print("\nModel saved as salary_ensemble_model.pkl")

Dataset Loaded!
     Education  Experience  Location Job_Title  Age  Gender         Salary
0  High School           8     Urban   Manager   63    Male   84620.053665
1          PhD          11  Suburban  Director   59    Male  142591.255894
2     Bachelor          28  Suburban   Manager   61  Female   97800.255404
3  High School          29     Rural  Director   45    Male   96834.671282
4          PhD          25     Urban   Analyst   26  Female  132157.786175
Model Training Completed!

Model Performance:
R2 Score: 0.8579168690419762
Mean Absolute Error: 8765.964656699742

Model saved as salary_ensemble_model.pkl


In [None]:
import joblib
import pandas as pd

# Reload the persisted pipeline. joblib.load unpickles the file, so only
# load model files that come from a trusted source.
model = joblib.load("salary_ensemble_model.pkl")

# One-row frame with the same input columns used for training.
sample = pd.DataFrame(
    [
        {
            "Education": "Master",
            "Experience": 5,
            "Location": "Urban",
            "Job_Title": "Engineer",
            "Age": 29,
            "Gender": "Male",
        }
    ]
)

prediction = model.predict(sample)

# predict() returns one value per input row; report the only one, to 2 decimals.
predicted_salary = round(prediction[0], 2)
print("Predicted Salary: ₹", predicted_salary)

Predicted Salary: ₹ 103699.48
