In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')


df=pd.read_csv("C:/Users/GOHIL RAJENDRASINH/Downloads/stack-overflow-developer-survey-2024/survey_results_public.csv")
df.head()




# **EDA and Data Cleaning**

> **Dropping rows which has null values in the target column(Compensation in USD) in the beginning as it has no use since, the purpose is to predict salary and the only required data are the rows which have the salary records.**

In [None]:
df_cleaned=df[df['ConvertedCompYearly'].notnull()]  #Dropping rows in which salary column has null values

df_cleaned

In [None]:
print("Basic Information of Dataset after dropping null values of target i.e. ConvertedCompYearly:\n")

df_cleaned.info()

In [None]:
df_cleaned = df_cleaned.dropna(subset=['EdLevel'])


> **Selecting columns that are necessary for predicting income**

In [None]:
necessary_df= df_cleaned[['ConvertedCompYearly','Age','EdLevel','YearsCodePro','Country','Industry','LanguageHaveWorkedWith','PlatformHaveWorkedWith','ToolsTechHaveWorkedWith','WorkExp']]

necessary_df.head()

In [None]:
necessary_df["Industry"].unique()

In [None]:
necessary_df["Country"].unique()

In [None]:
necessary_df["Country"].value_counts()

**There are a lot of countries in the dataset that doesn't have significant amount of developers hence, changing the country names to 'Other' for which the No. of developers are 500 or less to make one category**

In [None]:
def shorten_categories(categories,cutoff):
    categories_map = {}
    for i in range(len(categories)):
        if categories.values[i]>=cutoff:
            categories_map[categories.index[i]] = categories.index[i]
        else:
            categories_map[categories.index[i]] = 'Other'
    return categories_map

country_map = shorten_categories(necessary_df['Country'].value_counts(),500)
necessary_df["Country"] = necessary_df["Country"].map(country_map)

In [None]:
necessary_df["Country"].value_counts()

In [None]:
plt.figure(figsize=(10, 15))
sns.countplot(y='Country', data=necessary_df, order=necessary_df['Country'].value_counts().index)
plt.title('No. of Developers Country-wise')
plt.xlabel('Number of Developers')
plt.ylabel('Country')
plt.show()

In [None]:
plt.figure(figsize=(12, 8))
prog_Tools = necessary_df['ToolsTechHaveWorkedWith'].str.split(';', expand=True).stack().value_counts().nlargest(10)
prog_Tools.plot(kind='barh', color='Blue')
plt.title('Top 10 Most Popular Programming Tools')
plt.xlabel('Number of Developers')
plt.ylabel('Prog tools')
plt.show()

In [None]:
plt.figure(figsize=(12, 8))
popular_languages = necessary_df['LanguageHaveWorkedWith'].str.split(';', expand=True).stack().value_counts().nlargest(10)
popular_languages.plot(kind='barh', color='Red')
plt.title('Top 10 Most Popular Programming Languages')
plt.xlabel('Number of Developers')
plt.ylabel('Programming Language')
plt.show()

> # **Outlier Visualisation and Treatment**

In [None]:
fig,ax = plt.subplots(1,1,figsize=(12,8))
plt.ticklabel_format(style = 'plain')

necessary_df.boxplot("ConvertedCompYearly","Country",ax=ax)
plt.suptitle("Compensation V/s Country")
plt.title('Boxplot to visualise Outliers')
plt.ylabel("Salary")
plt.xticks(rotation=90)

plt.show()

> **On the basis of the boxplot above, it seems that there are lot of false values entered salary-wise, one of the best way to identify these are by spreading the compensation values across different countries to get a good understanding of the outliers**

In [None]:
data = necessary_df.groupby('Country')['ConvertedCompYearly'].describe().reset_index()[['Country','25%','75%']]
data.head()

In [None]:
necessary_df = necessary_df.merge(data, on = "Country",how ="left")
necessary_df.head()

In [None]:


#slightly adapted from https://www.kaggle.com/code/pavithrasivan98/salaryprediction

mask = necessary_df["ConvertedCompYearly"]<necessary_df['25%']
necessary_df.loc[mask,"ConvertedCompYearly"] = necessary_df["25%"]

mask = necessary_df["ConvertedCompYearly"]>necessary_df['75%']
necessary_df.loc[mask,"ConvertedCompYearly"] = necessary_df["75%"]

necessary_df.head()

In [None]:
necessary_df.drop(['25%','75%'],axis=1,inplace=True)

In [None]:
necessary_df.info()

In [None]:
fig,ax = plt.subplots(1,1,figsize=(15,10))
plt.ticklabel_format(style = 'plain')

necessary_df.boxplot("ConvertedCompYearly","Country",ax=ax)
plt.suptitle("Compensation V/s Country")
plt.title('Boxplot to visualise Outliers')
plt.ylabel("Salary")
plt.xticks(rotation=90)

plt.show()

In [None]:
necessary_df['Compensation'] = 'Low Compensation'
necessary_df.loc[necessary_df['ConvertedCompYearly'] > 50000, 'Compensation'] = 'High Compensation'
necessary_df.head()

In [None]:
plt.figure(figsize=(8,8))
sns.countplot(data=necessary_df,x='Compensation')
plt.title('Salary-wise Distribution')
plt.xlabel('Compensation')
plt.ylabel('No. of developers')
plt.show()

> **checking value counts of target column to see if the data is reliable for just judging through Accuracy, otherwise Confusion Matrix could be used**

In [None]:
print(necessary_df['Compensation'].value_counts())

In [None]:
# necessary_df.drop(['ConvertedCompYearly'],axis=1,inplace=True)

necessary_df

In [None]:
print("Basic Information of the new Dataset with only selected features:\n",necessary_df.info())
print("\n\nDescription of the new Dataset with only selected features:\n",necessary_df.describe())

> # **Visualizing and Eliminating Duplicate Rows**

In [None]:
necessary_df.duplicated().sum()


In [None]:
duplicate = necessary_df[necessary_df.duplicated()]
duplicate

In [None]:
necessary_df = necessary_df.drop_duplicates()

In [None]:
necessary_df.shape

> # **Analyzing and Eliminating Null values with appropriate methods for each column**

In [None]:
print("null values:\n",necessary_df.isnull().sum().sort_values(ascending=False))

In [None]:
print(necessary_df['Industry'].unique())
print(necessary_df['Industry'].value_counts())

> **The Industry column has an 'other' category hence, replacing 'NaN' values with 'other' as both are undefined inputs. It will result in eliminating null values**

In [None]:
necessary_df['Industry']=necessary_df['Industry'].fillna('Other')

print(necessary_df['Industry'].value_counts())

In [None]:
plt.figure(figsize=(12,8))
sns.countplot(data=necessary_df,y='Industry')
plt.title('Industry-wise Distribution')
plt.xlabel('No. of Developers')
plt.ylabel('Industry')
plt.show()

> **Filling up null values of Work experince(numerical) column with median as it is less sensitive to the outliers as compared to mean.**

In [None]:
necessary_df['WorkExp']=necessary_df['WorkExp'].fillna(necessary_df['WorkExp'].median())

print("null values:\n",necessary_df.isnull().sum().sort_values(ascending=False))

In [None]:
necessary_df.isnull().sum().sort_values(ascending=False)



> **Filling up null values of 'dtype=object' columns with mode.**

In [None]:
#  you‚Äôre replacing missing values in those specific columns with that column‚Äôs most frequent value.
column_with_null = ["PlatformHaveWorkedWith","ToolsTechHaveWorkedWith","LanguageHaveWorkedWith","YearsCodePro"]
necessary_df[column_with_null]=necessary_df[column_with_null].fillna(necessary_df.mode().iloc[0])

In [None]:
necessary_df.isnull().sum().sort_values(ascending=False)


In [None]:
necessary_df.info()

In [None]:
necessary_df['Age'].unique()

**Replacing age groups to their initial age number for better visualisation instead of applying encoder which might give them random labels.**

In [None]:
replace_age=['25','45','35','17','55','18','65','0']

necessary_df['Age'] = necessary_df['Age'].replace(['25-34 years old', '45-54 years old', '35-44 years old',
       'Under 18 years old', '55-64 years old', '18-24 years old',
       '65 years or older', 'Prefer not to say'], replace_age)

necessary_df['Age'] = necessary_df['Age'].astype(int) #making sure all the values are integer by converting


In [None]:
#graph of age vs no. of developers
plt.figure(figsize=(8,8))
sns.countplot(data=necessary_df,x='Age')
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('No. of Developers')
plt.show()

In [None]:
necessary_df['EdLevel'].unique()

In [None]:
plt.figure(figsize=(10, 8))
sns.histplot(data=necessary_df, y='EdLevel')
plt.title('Education Level Distribution')
plt.xlabel('No. of Developers')
plt.ylabel('Education Level')
plt.show()

> **Replacing Education level based on ranks in a descending order such as Ph.d=8,Master's=7 and so on for simplifying identification**

In [None]:
#convert into the number
replace_edu=['6','4','7','2','8','5','3','1']
necessary_df['EdLevel'] = necessary_df['EdLevel'].replace(['Bachelor‚Äôs degree (B.A., B.S., B.Eng., etc.)',
       'Some college/university study without earning a degree',
       'Master‚Äôs degree (M.A., M.S., M.Eng., MBA, etc.)',
       'Primary/elementary school',
       'Professional degree (JD, MD, Ph.D, Ed.D, etc.)',
       'Associate degree (A.A., A.S., etc.)',
       'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)',
       'Something else'], replace_edu)

necessary_df['EdLevel'] = necessary_df['EdLevel'].astype(int)


necessary_df.head()

In [None]:
necessary_df['YearsCodePro'].unique()

> **Replacing the only two object values of Professional coding experience to convert dtype to int for further processing**

In [None]:
necessary_df['YearsCodePro'] = necessary_df['YearsCodePro'].replace('Less than 1 year', '1')
necessary_df['YearsCodePro'] = necessary_df['YearsCodePro'].replace('More than 50 years', '51')

necessary_df['YearsCodePro'] = necessary_df['YearsCodePro'].astype(int)

In [None]:
necessary_df['WorkExp'].unique()

> **Converting float dtype to int by rounding off the values, *rounding off* so that the decimal values doesn't get eliminated as it is**

In [None]:
necessary_df['WorkExp'] = necessary_df['WorkExp'].round().astype(int) 

In [None]:
necessary_df['WorkExp'].unique()

In [None]:
plt.figure(figsize=(12, 8))
sns.scatterplot(x='Age', y='YearsCodePro', hue='Compensation', data=necessary_df, palette='viridis', size='Compensation', sizes=(50, 200))
plt.title('Relationship among Age, YearsCodePro, and Compensation')
plt.xlabel('Age')
plt.ylabel('YearsCodePro')
plt.legend(title='Compensation')
plt.show()

In [None]:
corr_columns = necessary_df[['WorkExp','YearsCodePro','EdLevel','Age']]

correlation_matrix = corr_columns.corr()#creating a correlation matrix of selected columns

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".3f", linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

# **Encoding**

> **Using Label Encoder to encode categorical columns, Using *'Label Encoding'* as there are a lot of different values**

In [None]:
from sklearn.preprocessing import LabelEncoder
LEnc=LabelEncoder()
object_cols=['Compensation','Country','Industry','LanguageHaveWorkedWith','PlatformHaveWorkedWith','ToolsTechHaveWorkedWith']
necessary_df[object_cols]=necessary_df[object_cols].apply(LEnc.fit_transform)
necessary_df

In [None]:
corr_columns = necessary_df[['YearsCodePro','EdLevel','Age','Compensation']]

correlation_matrix = corr_columns.corr()#creating a correlation matrix of selected columns

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".3f", linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

# **Regression - Machine Learning**

In [None]:
column_name = 'Compensation'
column_0 = necessary_df.pop(column_name)
necessary_df.insert(0, column_name, column_0)
necessary_df.head(5)

In [None]:
X = necessary_df.iloc[:,2:]
y = necessary_df.iloc[:,1] 
print("Features X\n",X[0:5])
print("Target y\n", y[0:5])


> **k-Nearest Neighbour**

In [None]:
# import sklearn.model_selection as model_selection
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.preprocessing import StandardScaler
# from sklearn import metrics
# import numpy as np

# # Train-test split
# X_train, X_test, y_train, y_test = model_selection.train_test_split(
#     X, y, test_size=0.3, random_state=1
# )

# # Normalization
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

# # Lists to store results
# k_list = []
# rmse_list = []
# r2_list = []

# # Loop over k values (20‚Äì25)
# for k in range(20, 26):
#     clf_knn = KNeighborsRegressor(
#         n_neighbors=k, weights="distance", metric="euclidean"
#     )
#     clf_knn.fit(X_train, y_train)
#     y_pred = clf_knn.predict(X_test)

#     rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
#     r2 = metrics.r2_score(y_test, y_pred)

#     print(f"k={k} ‚Üí RMSE: {rmse:.2f}, R¬≤: {r2:.3f}")

#     k_list.append(k)
#     rmse_list.append(rmse)
#     r2_list.append(r2)


> **Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import model_selection, metrics

# Train-test split
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=4
)

# Use regressor instead of classifier
DT_reg = DecisionTreeRegressor(max_depth=3,min_impurity_decrease=0.01)
DT_reg.fit(X_train, y_train)

# Predictions
y_pred = DT_reg.predict(X_test)

# Evaluate with regression metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

def rmse(y_true, y_pred):
    # Works on all sklearn versions
    try:
        return mean_squared_error(y_true, y_pred, squared=False)  # new API
    except TypeError:
        return np.sqrt(mean_squared_error(y_true, y_pred))        # old API

# After you predict:
# y_pred = model.predict(X_test)
print("RMSE:", rmse(y_test, y_pred))
print("MAE :", mean_absolute_error(y_test, y_pred))
print("R¬≤  :", r2_score(y_test, y_pred))


In [None]:
#Grid searchCv :
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

param_grid = {
    'max_depth': [5, 8, 12, 15],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [None, 'sqrt', 'log2']
}

grid = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid, cv=5, scoring='r2')
grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)
best_model = grid.best_estimator_


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics

# Limit tree growth

DT_reg = DecisionTreeRegressor(
    max_depth=8,           # restrict tree depth
    min_samples_split=2,  # need at least 10 samples to split
    min_samples_leaf=8,    # each leaf must have 5 samples
    random_state=42,
    max_features=None
)


DT_reg.fit(X_train, y_train)

y_pred = DT_reg.predict(X_test)
# print("RMSE:", metrics.mean_squared_error(y_test, y_pred, squared=False))
print("R¬≤  :", metrics.r2_score(y_test, y_pred))


In [None]:
#randomforestRegressor
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(
    n_estimators=550, max_depth=17, min_samples_leaf=4, random_state=42,max_features='sqrt'
)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

print("R¬≤  :", r2_score(y_test, y_pred))


In [None]:


# import numpy as np
# import pandas as pd
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import RandomizedSearchCV, train_test_split
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# # Ensure X and y are standard pandas/numpy objects
# X = pd.DataFrame(X) if not isinstance(X, pd.DataFrame) else X
# y = pd.Series(y) if not isinstance(y, pd.Series) else y

# # Train-test split
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.3, random_state=42
# )

# # Initialize Random Forest (class itself, not fitted instance)
# rf = RandomForestRegressor(random_state=42)

# # Hyperparameter grid (smaller & faster than full GridSearch)
# param_grid = {
#     'n_estimators': [100, 200, 300, 400],
#     'max_depth': [None, 10, 15, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['sqrt', 'log2']
# }

# # RandomizedSearchCV (picklable-safe, faster)
# random_search = RandomizedSearchCV(
#     estimator=rf,
#     param_distributions=param_grid,
#     n_iter=50,        # number of random combinations
#     cv=3,             # 3-fold CV for speed
#     scoring='r2',
#     n_jobs=1,         # safe option to avoid BrokenProcessPool
#     verbose=2,
#     random_state=42
# )

# # Fit RandomizedSearchCV
# random_search.fit(X_train, y_train)

# # Best parameters
# print("Best Parameters:", random_search.best_params_)

# # Best model
# best_rf = random_search.best_estimator_

# # Predictions on test set
# y_pred = best_rf.predict(X_test)

# # Evaluation
# rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# mae = mean_absolute_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print(f"Test RMSE: {rmse:.2f}")
# print(f"Test MAE : {mae:.2f}")
# print(f"Test R¬≤  : {r2:.3f}")


In [None]:
# import numpy as np
# import pandas as pd
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import RandomizedSearchCV, train_test_split
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline

# # ------------------------------
# # Step 1: Outlier removal (target variable)
# # ------------------------------
# def remove_outliers(df, target_col):
#     Q1 = df[target_col].quantile(0.25)
#     Q3 = df[target_col].quantile(0.75)
#     IQR = Q3 - Q1
#     lower = Q1 - 1.5 * IQR
#     upper = Q3 + 1.5 * IQR
#     return df[(df[target_col] >= lower) & (df[target_col] <= upper)]

# # Combine X and y temporarily
# df = pd.concat([X, y.rename("target")], axis=1)
# df_clean = remove_outliers(df, "target")
# y_clean = df_clean["target"]
# X_clean = df_clean.drop(columns=["target"])

# # ------------------------------
# # Step 2: Encode categorical variables
# # ------------------------------
# cat_cols = X_clean.select_dtypes(include=['object', 'category']).columns.tolist()
# num_cols = X_clean.select_dtypes(exclude=['object', 'category']).columns.tolist()

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
#     ],
#     remainder='passthrough'  # leave numeric columns as-is
# )

# # ------------------------------
# # Step 3: Train-test split
# # ------------------------------
# X_train, X_test, y_train, y_test = train_test_split(
#     X_clean, y_clean, test_size=0.3, random_state=42
# )

# # ------------------------------
# # Step 4: Random Forest + RandomizedSearchCV
# # ------------------------------
# rf = RandomForestRegressor(random_state=42)

# param_grid = {
#     'n_estimators': [200, 300, 400, 500, 600],
#     'max_depth': [None, 15, 20, 25],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['sqrt', 'log2']
# }

# pipeline = Pipeline(steps=[('preprocessor', preprocessor),
#                            ('regressor', rf)])

# random_search = RandomizedSearchCV(
#     estimator=pipeline,
#     param_distributions={
#         'regressor__' + key: value for key, value in param_grid.items()
#     },
#     n_iter=50,
#     cv=3,
#     scoring='r2',
#     n_jobs=1,  # safe option
#     verbose=2,
#     random_state=42
# )

# # Fit RandomizedSearchCV
# random_search.fit(X_train, y_train)

# # ------------------------------
# # Step 5: Evaluation
# # ------------------------------
# best_model = random_search.best_estimator_
# y_pred = best_model.predict(X_test)

# rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# mae = mean_absolute_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print("Best Parameters:", random_search.best_params_)
# print(f"Test RMSE: {rmse:.2f}")
# print(f"Test MAE : {mae:.2f}")
# print(f"Test R¬≤  : {r2:.3f}")


> **Logistic Regression**

In [None]:
X = necessary_df.iloc[:,2:]
y = necessary_df.iloc[:,1] 
print("Features X\n",X[0:5])
print("Target y\n", y[0:5])

**Linear Regression**

In [None]:
#Linear Regression 
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn import model_selection
import numpy as np

# Train-test split
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=4
)

# Scale features
scaler = MinMaxScaler() 
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Regression model
LR_reg = LinearRegression()
LR_reg.fit(X_train, y_train)

# Predictions
y_pred = LR_reg.predict(X_test)

# RMSE function (works for all sklearn versions)
def rmse(y_true, y_pred):
    try:
        return metrics.mean_squared_error(y_true, y_pred, squared=False)  # New API
    except TypeError:
        return np.sqrt(metrics.mean_squared_error(y_true, y_pred))        # Old API

# Evaluate regression performance
print("RMSE:", rmse(y_test, y_pred))
print("MAE :", metrics.mean_absolute_error(y_test, y_pred))
print("R¬≤  :", metrics.r2_score(y_test, y_pred))


# **Evaluation of Regressior**

In [None]:
# LR_reg = LinearRegression()
# from  sklearn.metrics  import RocCurveDisplay
# roc_lr  = RocCurveDisplay.from_estimator(LR_reg , X_test , y_test)
# roc_dt  = RocCurveDisplay.from_estimator( DT_reg , X_test , y_test , ax= roc_lr. ax_)
# # roc_knn  = RocCurveDisplay.from_estimator( clf_reg , X_test , y_test , ax= roc_lr. ax_)

# #As per the ROC curve below Logistic regression has the best performance followed by KNN on the basis of Area Under the Curve

# **Ensemble Learning**

> **Voting Method**

In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Example regressor ensemble
voting_reg = VotingRegressor(estimators=[
    ('dt', DT_reg),        # DecisionTreeRegressor
    # ('knn', knn_reg),      # KNeighborsRegressor
    ('lr', LR_reg)         # LinearRegression
])

voting_reg.fit(X_train, y_train)
y_pred = voting_reg.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)
print("R¬≤ Score:", r2_score(y_test, y_pred))


> **Random Forest Technique**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import numpy as np

# Split the data
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=4
)

# Train regression model
rf_reg = RandomForestRegressor(n_estimators=15, random_state=1)
rf_reg.fit(X_train, y_train)

# Predictions
y_train_pred = rf_reg.predict(X_train)
y_test_pred = rf_reg.predict(X_test)

# Evaluate with regression metrics
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
train_r2 = metrics.r2_score(y_train, y_train_pred)   # Added
test_r2 = metrics.r2_score(y_test, y_test_pred)

print(f"Train RMSE: {train_rmse:.2f}, R¬≤: {train_r2:.3f}")
print(f"Test RMSE : {test_rmse:.2f}, R¬≤: {test_r2:.3f}")



> **Bagging Method**

Using Decision Tree as base estimator for bagging below:

In [None]:
from sklearn.ensemble import BaggingRegressor
from sklearn import metrics
import numpy as np

# Train-test split
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=4
)

# Train Bagging Regressor
clf_ensemble_Bagging = BaggingRegressor(estimator=DT_reg, n_estimators=350, random_state=1)

clf_ensemble_Bagging.fit(X_train, y_train)

# Predictions
y_train_pred = clf_ensemble_Bagging.predict(X_train)
y_test_pred = clf_ensemble_Bagging.predict(X_test)

# Evaluate for regression
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
train_r2 = metrics.r2_score(y_train, y_train_pred)
test_r2 = metrics.r2_score(y_test, y_test_pred)

print(f"Train RMSE: {train_rmse:.2f}, R¬≤: {train_r2:.3f}")
print(f"Test RMSE : {test_rmse:.2f}, R¬≤: {test_r2:.3f}")


Using KNN as base estimator for bagging below:

In [196]:
#
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection, metrics
import numpy as np

# Train-test split
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=3
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# KNN base regressor
clf_knn = KNeighborsRegressor(n_neighbors=15)

# Bagging Regressor
clf_ensemble_Bagging = BaggingRegressor(
    estimator=clf_knn,
    n_estimators=50,
    random_state=5,
    max_samples=0.8,
    max_features=0.8
)

# Train model
clf_ensemble_Bagging.fit(X_train_scaled, y_train)

# Predictions
y_train_pred = clf_ensemble_Bagging.predict(X_train_scaled)
y_test_pred = clf_ensemble_Bagging.predict(X_test_scaled)

# Regression metrics
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
train_r2 = metrics.r2_score(y_train, y_train_pred)
test_r2 = metrics.r2_score(y_test, y_test_pred)

print(f"Train RMSE: {train_rmse:.2f}, R¬≤: {train_r2:.3f}")
print(f"Test RMSE : {test_rmse:.2f}, R¬≤: {test_r2:.3f}")


Train RMSE: 28365.04, R¬≤: 0.635
Test RMSE : 30100.52, R¬≤: 0.590


In [None]:
# DecisionTreeRegressor with Bagging
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn import metrics
import numpy as np

dt = DecisionTreeRegressor(random_state=42, max_depth=10, min_samples_leaf=2)

bagging = BaggingRegressor(
    estimator=dt,
    n_estimators=50,  # more trees
    random_state=42,
    max_samples=0.8,  # each tree trained on 80% of data
    max_features=0.8
)

bagging.fit(X_train, y_train)

# Predictions
y_train_pred = bagging.predict(X_train)
y_test_pred = bagging.predict(X_test)

# R¬≤ scores
train_r2 = metrics.r2_score(y_train, y_train_pred)
test_r2 = metrics.r2_score(y_test, y_test_pred)

# RMSE scores
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))

print(f"Train RMSE: {train_rmse:.2f}, R¬≤: {train_r2:.3f}")
print(f"Test  RMSE: {test_rmse:.2f}, R¬≤: {test_r2:.3f}")


In [197]:
#as the GridSearchCV will time consuming
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

# Model
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=12,
    min_samples_leaf=5,
    random_state=42,
    min_samples_split=8,
    max_features='sqrt'
)

# Train
rf.fit(X_train, y_train)

# Predictions
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

# R¬≤ scores
train_r2 = metrics.r2_score(y_train, y_train_pred)
test_r2 = metrics.r2_score(y_test, y_test_pred)

train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))

print(f"Train RMSE: {train_rmse:.2f}")
print(f"Test  RMSE: {test_rmse:.2f}")

print(f"Train R¬≤ Score: {train_r2:.3f}")
print(f"Test  R¬≤ Score: {test_r2:.3f}")


Train RMSE: 19465.90
Test  RMSE: 22216.83
Train R¬≤ Score: 0.828
Test  R¬≤ Score: 0.776


In [198]:
#After the apply the randomized search(as GridSearchCV are to time taking)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
import numpy as np

# Stage 1: Fast parameter search
param_dist = {
    'max_depth': [10, 12, 15],
    'min_samples_leaf': [2, 4, 6],
    'min_samples_split': [5, 10, 15],
    'max_features': ['sqrt', 0.8, 1.0]
}

rand_search = RandomizedSearchCV(
    RandomForestRegressor(n_estimators=100, random_state=42),  # Fewer trees for speed
    param_distributions=param_dist,
    n_iter=15,           # Try only 15 random combos
    cv=3,                # 3-fold CV for speed
    scoring='r2',
    n_jobs=-1,
    random_state=42
)

print("üîç Running fast parameter search...")
rand_search.fit(X_train, y_train)
best_params = rand_search.best_params_
print("‚úÖ Best parameters (fast search):", best_params)

# Stage 2: Retrain with 500 trees using best parameters
rf_final = RandomForestRegressor(
    n_estimators=500,
    random_state=42,
    **best_params
)

print("üöÄ Retraining with best parameters and 500 trees...")
rf_final.fit(X_train, y_train)

# Predictions
y_train_pred = rf_final.predict(X_train)
y_test_pred = rf_final.predict(X_test)

# Evaluation
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
train_r2 = metrics.r2_score(y_train, y_train_pred)
test_r2 = metrics.r2_score(y_test, y_test_pred)

print(f"\nüìä Final Model Performance:")
print(f"Train RMSE: {train_rmse:.2f}, R¬≤: {train_r2:.3f}")
print(f"Test  RMSE: {test_rmse:.2f}, R¬≤: {test_r2:.3f}")
print(y_test_pred)


üîç Running fast parameter search...
‚úÖ Best parameters (fast search): {'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 0.8, 'max_depth': 15}
üöÄ Retraining with best parameters and 500 trees...

üìä Final Model Performance:
Train RMSE: 16861.78, R¬≤: 0.871
Test  RMSE: 20866.17, R¬≤: 0.803
[ 74656.5740886   65109.7296169  120610.5128602  ... 153802.83925331
  28392.15312548  62220.48375166]


In [None]:
# from sklearn.preprocessing import LabelEncoder
# import pickle

# le_country = LabelEncoder()
# le_country.fit(necessary_df['Country'])

# # repeat for other columns
# le_industry = LabelEncoder().fit(necessary_df['Industry'])
# le_language = LabelEncoder().fit(necessary_df['LanguageHaveWorkedWith'])
# le_platform = LabelEncoder().fit(necessary_df['PlatformHaveWorkedWith'])
# le_tools = LabelEncoder().fit(necessary_df['ToolsTechHaveWorkedWith'])

# data = {
#     "model": rf_final,
#     "country": le_country,
#     "industry": le_industry,
#     "languagehaveworkedwith": le_language,  # Note the exact key name from your training
#     "platformhaveworkedwith": le_platform,  # Note the exact key name from your training
#     "toolstechhaveworkedwith": le_tools    # Note the exact key name from your training
# }

# with open("rf_final_model.pkl", "wb") as file:
#     pickle.dump(data, file)

# with open("rf_final_model.pkl", "rb") as file:
#     regressor = pickle.load(file)


# print(f"üíæ Model saved to {rf_final}")

# from sklearn.preprocessing import LabelEncoder
# import pickle

# # List of categorical columns
# object_cols = ['Country','Industry','LanguageHaveWorkedWith','PlatformHaveWorkedWith','ToolsTechHaveWorkedWith']

# # Dictionary to store separate encoders
# encoders = {}

# for col in object_cols:
#     le = LabelEncoder()
#     necessary_df[col] = le.fit_transform(necessary_df[col].astype(str))
#     encoders[col] = le   # store encoder for this column

# # Save model
# with open("rf_final_model.pkl", "wb") as f:
#     pickle.dump(regressor, f)

# # Save encoders
# with open("encoders.pkl", "wb") as f:
#     pickle.dump(encoders, f)

# print("‚úÖ Model saved to rf_final_model.pkl and encoders saved to encoders.pkl")


üíæ Model saved to RandomForestRegressor(max_depth=15, max_features=0.8, min_samples_leaf=4,
                      min_samples_split=15, n_estimators=500, random_state=42)
‚úÖ Model saved to rf_final_model.pkl and encoders saved to encoders.pkl


In [None]:
# from sklearn.preprocessing import LabelEncoder
# import pickle

# # Assuming necessary_df is your original DataFrame with raw categorical data
# # Fit LabelEncoders on raw categorical columns
# le_country = LabelEncoder().fit(necessary_df['Country'])
# le_industry = LabelEncoder().fit(necessary_df['Industry'])
# le_language = LabelEncoder().fit(necessary_df['LanguageHaveWorkedWith'])
# le_platform = LabelEncoder().fit(necessary_df['PlatformHaveWorkedWith'])
# le_tools = LabelEncoder().fit(necessary_df['ToolsTechHaveWorkedWith'])

# # Create the data dictionary with the model and fitted encoders
# data = {
#     "model": rf_final,
#     "country": le_country,
#     "industry": le_industry,
#     "languagehaveworkedwith": le_language,
#     "platformhaveworkedwith": le_platform,
#     "toolstechhaveworkedwith": le_tools
# }

# # Save the dictionary to a .pkl file
# with open("rf_final_model.pkl", "wb") as file:
#     pickle.dump(data, file)

# print("‚úÖ Model and encoders saved to rf_final_model.pkl")

# # Optional: Verify the classes
# print("Country classes:", le_country.classes_)
# print("Industry classes:", le_industry.classes_)
# print("Language classes:", le_language.classes_)
# print("Platform classes:", le_platform.classes_)
# print("Tools classes:", le_tools.classes_)

In [None]:
# import pickle
# with open('rf_final_model.pkl', 'rb') as f:
#     data = pickle.load(f)
# print(data.keys())  # Should show 'model', 'country', 'industry', etc.
# print(necessary_df.columns)

dict_keys(['model', 'country', 'industry', 'languagehaveworkedwith', 'platformhaveworkedwith', 'toolstechhaveworkedwith'])
Index(['Compensation', 'ConvertedCompYearly', 'Age', 'EdLevel', 'YearsCodePro',
       'Country', 'Industry', 'LanguageHaveWorkedWith',
       'PlatformHaveWorkedWith', 'ToolsTechHaveWorkedWith', 'WorkExp'],
      dtype='object')


In [202]:
from sklearn.preprocessing import LabelEncoder
import pickle

encoders = {}
encoders['Country'] = LabelEncoder()
encoders['Country'].fit(necessary_df['Country'])
# Similarly for other categorical columns

# Save encoders
with open('encoders.pkl', 'wb') as f:
    pickle.dump(encoders, f)


In [203]:
with open('encoders.pkl', 'rb') as f:
    encoders = pickle.load(f)
print(encoders.keys())


dict_keys(['Country'])


> **Boosting Method**

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn import metrics
import numpy as np

# Train-test split
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=5
)

# AdaBoost Regressor with Decision Tree Regressor as base
clf_ensemble_Boost = AdaBoostRegressor(
    estimator=DT_reg,  # DT_reg should be a DecisionTreeRegressor
    n_estimators=18,
    random_state=1
)
clf_ensemble_Boost.fit(X_train, y_train)

# Predictions
y_train_pred = clf_ensemble_Boost.predict(X_train)
y_test_pred = clf_ensemble_Boost.predict(X_test)

# Regression metrics
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
train_r2 = metrics.r2_score(y_train, y_train_pred)
test_r2 = metrics.r2_score(y_test, y_test_pred)

print(f"Train RMSE: {train_rmse:.2f}, R¬≤: {train_r2:.3f}")
print(f"Test RMSE : {test_rmse:.2f}, R¬≤: {test_r2:.3f}")


In [None]:
#Decision Tree using the RandomizedSearchCV
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
import numpy as np

# Stage 1: Fast search
base_tree = DecisionTreeRegressor(random_state=42)

param_dist = {
    'n_estimators': [50, 75, 100],                 # fewer for search
    'learning_rate': [0.05, 0.1, 0.3, 1.0],
    'estimator__max_depth': [3, 5, 7],
    'estimator__min_samples_leaf': [2, 4, 6]
}

rand_search = RandomizedSearchCV(
    AdaBoostRegressor(estimator=base_tree, random_state=42),
    param_distributions=param_dist,
    n_iter=10,          # only 10 random combos
    cv=3,               # faster than 5 folds
    scoring='r2',
    n_jobs=-1,
    random_state=42
)

print("üîç Running fast AdaBoost search...")
rand_search.fit(X_train, y_train)
best_params = rand_search.best_params_
print("‚úÖ Best parameters found:", best_params)

# Stage 2: Retrain with more estimators for accuracy
final_tree = DecisionTreeRegressor(
    max_depth=best_params['estimator__max_depth'],
    min_samples_leaf=best_params['estimator__min_samples_leaf'],
    random_state=42
)

final_model = AdaBoostRegressor(
    estimator=final_tree,
    n_estimators=200,                     # more boosting rounds
    learning_rate=best_params['learning_rate'],
    random_state=42
)

print("üöÄ Retraining final AdaBoost model...")
final_model.fit(X_train, y_train)

# Predictions
y_train_pred = final_model.predict(X_train)
y_test_pred = final_model.predict(X_test)

# Evaluation
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
train_r2 = metrics.r2_score(y_train, y_train_pred)
test_r2 = metrics.r2_score(y_test, y_test_pred)

print(f"\nüìä Final Model Performance:")
print(f"Train RMSE: {train_rmse:.2f}, R¬≤: {train_r2:.3f}")
print(f"Test  RMSE: {test_rmse:.2f}, R¬≤: {test_r2:.3f}")
print(y_test_pred)


In [206]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
import pickle

# Load the data
df = pd.read_csv("C:/Users/GOHIL RAJENDRASINH/Downloads/stack-overflow-developer-survey-2024/survey_results_public.csv")

# Select necessary columns
necessary_columns = ['Age', 'EdLevel', 'YearsCodePro', 'Country', 'Industry', 'LanguageHaveWorkedWith', 'PlatformHaveWorkedWith', 'ToolsTechHaveWorkedWith', 'WorkExp', 'ConvertedCompYearly']
necessary_df = df[necessary_columns].copy()

# Handle missing values
necessary_df = necessary_df.dropna(subset=necessary_columns)

# Verify initial data
print("Sample data before preprocessing:")
print(necessary_df.head())
print("\nData types before encoding:")
print(necessary_df.dtypes)

# Education level mapping
ed_level_mapping = {
    "Bachelor‚Äôs degree (B.A., B.S., B.Eng., etc.)": 6,
    "Some college/university study without earning a degree": 4,
    "Master‚Äôs degree (M.A., M.S., M.Eng., MBA, etc.)": 7,
    "Primary/elementary school": 2,
    "Professional degree (JD, MD, Ph.D, Ed.D, etc.)": 8,
    "Associate degree (A.A., A.S., etc.)": 5,
    "Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)": 3,
    "Something else": 1
}
necessary_df['EdLevel'] = necessary_df['EdLevel'].map(ed_level_mapping).astype(float)

# Handle Age column (convert age ranges to midpoints)
age_mapping = {
    "Under 18 years old": 14,
    "18-24 years old": 21,
    "25-34 years old": 29.5,
    "35-44 years old": 39.5,
    "45-54 years old": 49.5,
    "55-64 years old": 59.5,
    "65 years or older": 70
}
necessary_df['Age'] = necessary_df['Age'].map(age_mapping).astype(float)

# Handle multi-value columns by taking the first value
for col in ['LanguageHaveWorkedWith', 'PlatformHaveWorkedWith', 'ToolsTechHaveWorkedWith']:
    necessary_df[col] = necessary_df[col].apply(lambda x: x.split(';')[0] if isinstance(x, str) else x)

# Fit LabelEncoders on RAW string columns (before transformation)
le_country = LabelEncoder().fit(necessary_df['Country'])
le_industry = LabelEncoder().fit(necessary_df['Industry'])
le_language = LabelEncoder().fit(necessary_df['LanguageHaveWorkedWith'])
le_platform = LabelEncoder().fit(necessary_df['PlatformHaveWorkedWith'])
le_tools = LabelEncoder().fit(necessary_df['ToolsTechHaveWorkedWith'])

# Verify classes (must show strings)
print("\nEncoder classes after fit (must be strings):")
print("Country classes:", le_country.classes_)
print("Industry classes:", le_industry.classes_)
print("Language classes:", le_language.classes_)
print("Platform classes:", le_platform.classes_)
print("Tools classes:", le_tools.classes_)

# Transform the DataFrame columns to numbers for training (if re-training)
necessary_df['Country'] = le_country.transform(necessary_df['Country'])
necessary_df['Industry'] = le_industry.transform(necessary_df['Industry'])
necessary_df['LanguageHaveWorkedWith'] = le_language.transform(necessary_df['LanguageHaveWorkedWith'])
necessary_df['PlatformHaveWorkedWith'] = le_platform.transform(necessary_df['PlatformHaveWorkedWith'])
necessary_df['ToolsTechHaveWorkedWith'] = le_tools.transform(necessary_df['ToolsTechHaveWorkedWith'])

# Convert other numeric columns to float
for col in ['YearsCodePro', 'WorkExp']:
    necessary_df[col] = pd.to_numeric(necessary_df[col], errors='coerce')

# Prepare features and target (for reference, but skip training if model is good)
X = necessary_df.drop('ConvertedCompYearly', axis=1)
y = necessary_df['ConvertedCompYearly']
X = X.fillna(X.mean())
y = y.fillna(y.mean())

# Use existing rf_final (assume it's from your good model)
# If re-training is needed, uncomment and run the training block below
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# param_dist = {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
# rf = RandomForestRegressor(random_state=42)
# rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
# rf_random.fit(X_train, y_train)
# rf_final = rf_random.best_estimator_

# Save model and encoders to rf_final_model.pkl
data = {
    "model": rf_final,  # Use your existing rf_final
    "country": le_country,
    "industry": le_industry,
    "languagehaveworkedwith": le_language,
    "platformhaveworkedwith": le_platform,
    "toolstechhaveworkedwith": le_tools
}
model_filename = "rf_final_model.pkl"
with open(model_filename, 'wb') as file:
    pickle.dump(data, file)
print(f"üíæ Model and encoders saved to {model_filename}")

Sample data before preprocessing:
                 Age                                            EdLevel  \
72   18-24 years old  Secondary school (e.g. American high school, G...   
379  35-44 years old    Master‚Äôs degree (M.A., M.S., M.Eng., MBA, etc.)   
389  25-34 years old  Some college/university study without earning ...   
392  35-44 years old     Professional degree (JD, MD, Ph.D, Ed.D, etc.)   
398  45-54 years old  Some college/university study without earning ...   

    YearsCodePro                                            Country  \
72             1                                           Pakistan   
379            6                                             Turkey   
389            7                           United States of America   
392           18  United Kingdom of Great Britain and Northern I...   
398           30                           United States of America   

                                 Industry  \
72                   Software Development