In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
df = pd.read_csv('glassdoor_cleaned_new.csv')

df = df.dropna(subset=['avg_salary_rupees'])

X = df[['Job Title', 'Location', 'Rating', 'company_name_cleaned']]
y = df['avg_salary_rupees']
X = pd.get_dummies(X, columns=['Job Title', 'Location', 'company_name_cleaned'], drop_first=True)

In [3]:
X = X.dropna()
y = y.loc[X.index] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso Regression": Lasso(alpha=0.1),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42)
}

In [5]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_rmse = sqrt(mean_squared_error(y_test, lr.predict(X_test)))
print(f"Linear Regression ➡️ RMSE: {lr_rmse:,.2f}")

Linear Regression ➡️ RMSE: 650,013.55


In [6]:
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
lasso_rmse = sqrt(mean_squared_error(y_test, lasso.predict(X_test)))
print(f"Lasso Regression ➡️ RMSE: {lasso_rmse:,.2f}")

Lasso Regression ➡️ RMSE: 630,684.84


  model = cd_fast.enet_coordinate_descent(


In [7]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_rmse = sqrt(mean_squared_error(y_test, rf.predict(X_test)))
print(f"Random Forest ➡️ RMSE: {rf_rmse:,.2f}")

Random Forest ➡️ RMSE: 386,538.20


In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from math import sqrt


rmse_scorer = make_scorer(lambda y_true, y_pred: sqrt(mean_squared_error(y_true, y_pred)))


param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 3]
}


In [10]:
rf = RandomForestRegressor(random_state=42)

grid = GridSearchCV(estimator=rf,
                    param_grid=param_grid,
                    scoring=rmse_scorer,
                    cv=5,
                    n_jobs=-1,
                    verbose=1)

grid.fit(X, y)
print(f"Best Cross-Validated RMSE: {grid.best_score_:,.2f}")
print("Best Parameters:", grid.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Cross-Validated RMSE: 678,575.60
Best Parameters: {'max_depth': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 100}


In [11]:

# from sklearn.linear_model import Lasso
# from sklearn.model_selection import GridSearchCV

# lasso_grid = GridSearchCV(
#     estimator=Lasso(max_iter=50000), 
#     param_grid={'alpha': [0.1, 1, 10, 50, 100]},
#     scoring='neg_mean_squared_error',
#     cv=5
# )

# lasso_grid.fit(X, y)
# print("Best alpha for Lasso:", lasso_grid.best_params_)

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingRegressor

rf_best = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=3,
    random_state=42  
)

lasso_tuned = make_pipeline(StandardScaler(), Lasso(alpha=100, max_iter=50000))

ensemble = VotingRegressor([
    ('rf', rf_best),
    ('lasso', lasso_tuned),
    ('lr', lr)
])

ensemble.fit(X_train, y_train)

y_pred = ensemble.predict(X_test)


In [15]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rmse = sqrt(mean_squared_error(y_test, y_pred))
print(f"🔁 Updated Ensemble Test ➡️ RMSE: {rmse:,.2f}")


🔁 Updated Ensemble Test ➡️ RMSE: 453,568.93
