## Initialization


In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import plotly.io as pio
pio.renderers.default = "notebook_connected"

from sklearn.preprocessing import StandardScaler,LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV , cross_val_score, KFold
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings. filterwarnings('ignore')



## Data Loading

In [2]:
df= pd.read_csv("/kaggle/input/stackoverflow-developer-suvery-2022/survey_results_public.csv")

In [3]:
df.head()

Unnamed: 0,ResponseId,MainBranch,Employment,RemoteWork,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,LearnCodeCoursesCert,YearsCode,...,TimeSearching,TimeAnswering,Onboarding,ProfessionalTech,TrueFalse_1,TrueFalse_2,TrueFalse_3,SurveyLength,SurveyEase,ConvertedCompYearly
0,1,None of these,,,,,,,,,...,,,,,,,,,,
1,2,I am a developer by profession,"Employed, full-time",Fully remote,Hobby;Contribute to open-source projects,,,,,,...,,,,,,,,Too long,Difficult,
2,3,"I am not primarily a developer, but I write co...","Employed, full-time","Hybrid (some remote, some in-person)",Hobby,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Books / Physical media;Friend or family member...,Technical documentation;Blogs;Programming Game...,,14.0,...,,,,,,,,Appropriate in length,Neither easy nor difficult,40205.0
3,4,I am a developer by profession,"Employed, full-time",Fully remote,I don’t code outside of work,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Books / Physical media;School (i.e., Universit...",,,20.0,...,,,,,,,,Appropriate in length,Easy,215232.0
4,5,I am a developer by profession,"Employed, full-time","Hybrid (some remote, some in-person)",Hobby,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Other online resources (e.g., videos, blogs, f...",Technical documentation;Blogs;Stack Overflow;O...,,8.0,...,,,,,,,,Too long,Easy,


## EDA

In [4]:
df.columns

Index(['ResponseId', 'MainBranch', 'Employment', 'RemoteWork',
       'CodingActivities', 'EdLevel', 'LearnCode', 'LearnCodeOnline',
       'LearnCodeCoursesCert', 'YearsCode', 'YearsCodePro', 'DevType',
       'OrgSize', 'PurchaseInfluence', 'BuyNewTool', 'Country', 'Currency',
       'CompTotal', 'CompFreq', 'LanguageHaveWorkedWith',
       'LanguageWantToWorkWith', 'DatabaseHaveWorkedWith',
       'DatabaseWantToWorkWith', 'PlatformHaveWorkedWith',
       'PlatformWantToWorkWith', 'WebframeHaveWorkedWith',
       'WebframeWantToWorkWith', 'MiscTechHaveWorkedWith',
       'MiscTechWantToWorkWith', 'ToolsTechHaveWorkedWith',
       'ToolsTechWantToWorkWith', 'NEWCollabToolsHaveWorkedWith',
       'NEWCollabToolsWantToWorkWith', 'OpSysProfessional use',
       'OpSysPersonal use', 'VersionControlSystem', 'VCInteraction',
       'VCHostingPersonal use', 'VCHostingProfessional use',
       'OfficeStackAsyncHaveWorkedWith', 'OfficeStackAsyncWantToWorkWith',
       'OfficeStackSyncHaveWork

In [5]:
df.shape

(73268, 79)

In [6]:
df['DevType'].value_counts()

Developer, full-stack                                                                                                                                                                                                                                                                                                                                                                                                                  7142
Developer, back-end                                                                                                                                                                                                                                                                                                                                                                                                                    5301
Developer, front-end                                                                                                                            

In [7]:
df['EdLevel'].value_counts()

Bachelor’s degree (B.A., B.S., B.Eng., etc.)                                          30276
Master’s degree (M.A., M.S., M.Eng., MBA, etc.)                                       15486
Some college/university study without earning a degree                                 9326
Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)     7904
Associate degree (A.A., A.S., etc.)                                                    2236
Other doctoral degree (Ph.D., Ed.D., etc.)                                             2169
Primary/elementary school                                                              1806
Something else                                                                         1247
Professional degree (JD, MD, etc.)                                                     1121
Name: EdLevel, dtype: int64

In [8]:
df.groupby('DevType' )['ConvertedCompYearly'].mean()

DevType
Academic researcher                                             111513.238095
Academic researcher;Blockchain                                  133374.000000
Academic researcher;Blockchain;Security professional                      NaN
Academic researcher;Cloud infrastructure engineer                37116.000000
Academic researcher;Cloud infrastructure engineer;Blockchain      2579.000000
                                                                    ...      
System administrator                                            259908.698413
System administrator;Blockchain                                   3096.000000
System administrator;Blockchain;Security professional                     NaN
System administrator;Marketing or sales professional             48000.000000
System administrator;Security professional                       62012.500000
Name: ConvertedCompYearly, Length: 9984, dtype: float64

In [9]:
df.groupby( 'EdLevel' )['ConvertedCompYearly'].mean()

EdLevel
Associate degree (A.A., A.S., etc.)                                                   192651.583661
Bachelor’s degree (B.A., B.S., B.Eng., etc.)                                          183214.077204
Master’s degree (M.A., M.S., M.Eng., MBA, etc.)                                       155733.868786
Other doctoral degree (Ph.D., Ed.D., etc.)                                            195173.994876
Primary/elementary school                                                             290821.318584
Professional degree (JD, MD, etc.)                                                     81958.780449
Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)    143931.798423
Some college/university study without earning a degree                                159608.905270
Something else                                                                        135111.158774
Name: ConvertedCompYearly, dtype: float64

## Visualization

In [10]:
def plot_line_chart(df, column, line=""):
    if line == '':
        line = df[column].value_counts().keys()[0:20]
    data = df[column].value_counts()[0:20]
    fig=px.line(x=line,y=data)
    fig.show()
    
def plot_bar_chart(df, column, line=""):
    if line == '':
        line = df[column].value_counts().keys()[0:20]
    data = df[column].value_counts()[0:20]
    fig=px.bar(x=line,y=data)
    fig.show()
    
def plot_pie_chart(df, column, line = ''):
    if line == '':
        line = df[column].value_counts().keys()[0:20]
    data = df[column].value_counts()[0:20]
    fig=px.pie(names=line,values=data)
    fig.show()

In [11]:
df.columns

Index(['ResponseId', 'MainBranch', 'Employment', 'RemoteWork',
       'CodingActivities', 'EdLevel', 'LearnCode', 'LearnCodeOnline',
       'LearnCodeCoursesCert', 'YearsCode', 'YearsCodePro', 'DevType',
       'OrgSize', 'PurchaseInfluence', 'BuyNewTool', 'Country', 'Currency',
       'CompTotal', 'CompFreq', 'LanguageHaveWorkedWith',
       'LanguageWantToWorkWith', 'DatabaseHaveWorkedWith',
       'DatabaseWantToWorkWith', 'PlatformHaveWorkedWith',
       'PlatformWantToWorkWith', 'WebframeHaveWorkedWith',
       'WebframeWantToWorkWith', 'MiscTechHaveWorkedWith',
       'MiscTechWantToWorkWith', 'ToolsTechHaveWorkedWith',
       'ToolsTechWantToWorkWith', 'NEWCollabToolsHaveWorkedWith',
       'NEWCollabToolsWantToWorkWith', 'OpSysProfessional use',
       'OpSysPersonal use', 'VersionControlSystem', 'VCInteraction',
       'VCHostingPersonal use', 'VCHostingProfessional use',
       'OfficeStackAsyncHaveWorkedWith', 'OfficeStackAsyncWantToWorkWith',
       'OfficeStackSyncHaveWork

In [12]:
plot_bar_chart(df,"Country")

In [13]:
line = ['developer', 'student', 'half-developer', 'hobby', 'neither','used-developer' ]
plot_pie_chart(df, 'MainBranch', line)

In [14]:
plot_bar_chart(df,"EdLevel")

In [15]:
plot_bar_chart(df,"YearsCodePro")

In [16]:
plot_bar_chart(df,"Age")

In [17]:
plot_pie_chart(df,"RemoteWork")

In [18]:
df_DevType = pd.DataFrame()
num = 0
str_list = []
for i in df['DevType']:
    num = num + 1
    if pd.isnull(i):
        continue
    data = i.split(";")
    for j in data:
        str_list.append(j)
df_DevType = pd.DataFrame(str_list, columns=['DevType'])
df_DevType.value_counts()

DevType                                      
Developer, full-stack                            28701
Developer, back-end                              26595
Developer, front-end                             15915
Developer, desktop or enterprise applications     9546
Developer, mobile                                 7634
DevOps specialist                                 6170
Student                                           5595
Cloud infrastructure engineer                     5283
Database administrator                            4934
System administrator                              4908
Developer, embedded applications or devices       3923
Project manager                                   3897
Designer                                          3764
Engineer, data                                    3600
Engineering manager                               3574
Data scientist or machine learning specialist     3424
Data or business analyst                          3201
Developer, QA or te

In [19]:
plot_bar_chart(df_DevType,"DevType")
plot_pie_chart(df_DevType,"DevType")

In [20]:
df_Employment = pd.DataFrame()
num = 0
str_list = []
for i in df['Employment']:
    num = num + 1
    if pd.isnull(i):
        continue
    data = i.split(";")
    for j in data:
        str_list.append(j)
df_Employment = pd.DataFrame(str_list, columns=['Employment'])
df_Employment.value_counts()

Employment                                          
Employed, full-time                                     49199
Student, full-time                                      10932
Independent contractor, freelancer, or self-employed    10721
Employed, part-time                                      4154
Student, part-time                                       3722
Not employed, but looking for work                       3381
Not employed, and not looking for work                   1244
I prefer not to say                                       611
Retired                                                   396
dtype: int64

In [21]:
plot_bar_chart(df_Employment,"Employment")
plot_pie_chart(df_Employment,"Employment")

## Feature Selection

In [22]:
df = df[["Country", "EdLevel", "YearsCodePro", "Age", "Employment","Gender", "RemoteWork", "ConvertedCompYearly"]]
df = df.rename({"ConvertedCompYearly": "Salary"}, axis=1)

In [23]:
df.columns

Index(['Country', 'EdLevel', 'YearsCodePro', 'Age', 'Employment', 'Gender',
       'RemoteWork', 'Salary'],
      dtype='object')

In [24]:
df.shape

(73268, 8)

In [25]:
df = df[df["Salary"].notnull()]

In [26]:
df.shape

(38071, 8)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38071 entries, 2 to 73121
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country       38071 non-null  object 
 1   EdLevel       38054 non-null  object 
 2   YearsCodePro  37935 non-null  object 
 3   Age           37984 non-null  object 
 4   Employment    38067 non-null  object 
 5   Gender        37938 non-null  object 
 6   RemoteWork    38015 non-null  object 
 7   Salary        38071 non-null  float64
dtypes: float64(1), object(7)
memory usage: 2.6+ MB


In [28]:
df.isnull().sum()

Country           0
EdLevel          17
YearsCodePro    136
Age              87
Employment        4
Gender          133
RemoteWork       56
Salary            0
dtype: int64

In [29]:
df["Country"].value_counts()

United States of America                                8707
Germany                                                 2912
United Kingdom of Great Britain and Northern Ireland    2657
India                                                   2173
Canada                                                  1481
                                                        ... 
Fiji                                                       1
Palau                                                      1
Monaco                                                     1
Saint Lucia                                                1
Seychelles                                                 1
Name: Country, Length: 159, dtype: int64

In [30]:
df["EdLevel"].value_counts()

Bachelor’s degree (B.A., B.S., B.Eng., etc.)                                          18056
Master’s degree (M.A., M.S., M.Eng., MBA, etc.)                                        9877
Some college/university study without earning a degree                                 4497
Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)     1776
Other doctoral degree (Ph.D., Ed.D., etc.)                                             1366
Associate degree (A.A., A.S., etc.)                                                    1273
Professional degree (JD, MD, etc.)                                                      624
Something else                                                                          359
Primary/elementary school                                                               226
Name: EdLevel, dtype: int64

In [31]:
def print_unique_col_values(df):
       for column in df:
            if df[column].dtypes=='object':
                print(f'{column}: {df[column].unique()}') 

In [32]:
print_unique_col_values(df)

Country: ['United Kingdom of Great Britain and Northern Ireland' 'Israel'
 'Netherlands' 'United States of America' 'Czech Republic' 'Austria'
 'Italy' 'Canada' 'Germany' 'Ireland' 'Poland' 'Madagascar' 'Norway'
 'Taiwan' 'France' 'Brazil' 'Uruguay' 'Sweden' 'Spain' 'Turkey' 'Romania'
 'Singapore' 'India' 'Belgium' 'Bulgaria' 'Greece' 'Portugal'
 'Russian Federation' 'Saudi Arabia' 'Mexico' 'Kenya' 'Switzerland'
 'Latvia' 'South Africa' 'Thailand' 'China' 'Montenegro' 'Finland'
 'Slovakia' 'Japan' 'Denmark' 'Australia' 'Viet Nam' 'Argentina' 'Hungary'
 'Tunisia' 'Bangladesh' 'Ukraine' 'Maldives' 'Hong Kong (S.A.R.)' 'Egypt'
 'Serbia' 'Pakistan' 'Nepal' 'Croatia' 'Indonesia'
 'Bosnia and Herzegovina' 'Armenia' 'Lithuania'
 'Iran, Islamic Republic of...' 'Belarus' 'Costa Rica' 'Mauritius'
 'Estonia' 'Kazakhstan' 'Morocco' 'Philippines' 'Chile' 'New Zealand'
 'Slovenia' 'Ecuador' 'Cyprus' 'Peru' 'Colombia' 'Afghanistan' 'Nicaragua'
 'Andorra' 'Republic of Korea' 'Lebanon' 'South Korea' 'M

In [33]:
def shorten_categories(categories, cutoff):
    categorical_map = {}
    for i in range(len(categories)):
        if categories.values[i] >= cutoff:
            categorical_map[categories.index[i]] = categories.index[i]
        else:
            categorical_map[categories.index[i]] = 'Other'
    return categorical_map

In [34]:
country_map = shorten_categories(df.Country.value_counts(), 199)
df['Country'] = df['Country'].map(country_map)
df.Country.value_counts()

United States of America                                8707
Other                                                   4381
Germany                                                 2912
United Kingdom of Great Britain and Northern Ireland    2657
India                                                   2173
Canada                                                  1481
France                                                  1378
Brazil                                                  1283
Spain                                                    996
Poland                                                   977
Netherlands                                              900
Australia                                                833
Italy                                                    812
Sweden                                                   665
Russian Federation                                       534
Switzerland                                              520
Turkey                  

In [35]:
df = df[df["Salary"] <= 200000]
df = df[df["Salary"] >= 10000]
df = df[df['Country'] != 'Other']

In [36]:
fig = px.box(df, x="Country", y="Salary")
fig.show()

In [37]:
df["YearsCodePro"].unique()

array(['5', '6', '2', '10', '4', '22', '20', '9', '14', '21', '15', '3',
       '25', '7', '8', '12', '19', '1', '17', '24', '11', '23',
       'Less than 1 year', '18', '40', '37', '46', '13', '32', '31', '39',
       '27', '28', '16', '30', '34', '35', '26', '42', '38', '36', '43',
       '41', '44', '29', nan, '33', '45', '48', '50', '47',
       'More than 50 years', '49'], dtype=object)

In [38]:
def clean_experience(x):
    if x ==  'More than 50 years':
        return 50
    if x == 'Less than 1 year':
        return 0.5
    return float(x)

df['YearsCodePro'] = df['YearsCodePro'].apply(clean_experience)

In [39]:
def clean_Gender(x):
    if x in ['Man' ,'Woman']:
        return x
    else:
        return 'other'

df['Gender'] = df['Gender'].apply(clean_Gender)

In [40]:
df.tail(10)

Unnamed: 0,Country,EdLevel,YearsCodePro,Age,Employment,Gender,RemoteWork,Salary
73107,Brazil,"Professional degree (JD, MD, etc.)",10.0,25-34 years old,"Employed, full-time",Man,"Hybrid (some remote, some in-person)",72707.0
73110,Canada,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",9.0,25-34 years old,"Employed, full-time",Woman,Fully remote,60906.0
73111,United States of America,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",5.0,25-34 years old,"Employed, full-time",Man,"Hybrid (some remote, some in-person)",115000.0
73112,Austria,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",3.0,25-34 years old,"Employed, full-time",Man,"Hybrid (some remote, some in-person)",52255.0
73113,United States of America,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",5.0,25-34 years old,"Employed, full-time",Man,Fully remote,94000.0
73114,Germany,"Associate degree (A.A., A.S., etc.)",2.0,18-24 years old,"Employed, full-time;Independent contractor, fr...",Man,"Hybrid (some remote, some in-person)",41058.0
73116,United States of America,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",16.0,35-44 years old,"Employed, full-time",Man,"Hybrid (some remote, some in-person)",115000.0
73119,United States of America,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",1.0,25-34 years old,"Employed, full-time",Man,"Hybrid (some remote, some in-person)",70000.0
73120,Spain,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",2.0,,"Employed, full-time",other,Fully remote,35192.0
73121,United Kingdom of Great Britain and Northern I...,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",3.0,25-34 years old,"Employed, full-time",other,"Hybrid (some remote, some in-person)",75384.0


In [41]:
df['EdLevel'].value_counts()

Bachelor’s degree (B.A., B.S., B.Eng., etc.)                                          12908
Master’s degree (M.A., M.S., M.Eng., MBA, etc.)                                        7703
Some college/university study without earning a degree                                 3346
Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)     1361
Other doctoral degree (Ph.D., Ed.D., etc.)                                             1107
Associate degree (A.A., A.S., etc.)                                                    1009
Professional degree (JD, MD, etc.)                                                      439
Something else                                                                          288
Primary/elementary school                                                               153
Name: EdLevel, dtype: int64

In [42]:
def clean_education(x):
    if x =='Bachelor’s degree (B.A., B.S., B.Eng., etc.)':
        return 'Bachelor’s degree'
    if x=='Master’s degree (M.A., M.S., M.Eng., MBA, etc.)':
        return 'Master’s degree'
    if x in ['Professional degree (JD, MD, etc.)','Other doctoral degree (Ph.D., Ed.D., etc.)']:
        return 'Post grad'
    return 'Less than a Bachelors'

df['EdLevel'] = df['EdLevel'].apply(clean_education)

In [48]:
df['Age'].value_counts()

25-34 years old       13935
35-44 years old        7486
18-24 years old        3166
45-54 years old        2589
older than 55          1126
Under 18 years old       24
Name: Age, dtype: int64

In [47]:
def clean_age(x):
    if x=='Under 18 years old':
        return 'Under 18 years old' 
    if x=='18-24 years old':
        return '18-24 years old' 
    if x =='25-34 years old':
        return '25-34 years old'
    if x =='35-44 years old':
        return '35-44 years old'
    if x =='45-54 years old':
        return '45-54 years old'
    return 'older than 55'

df['Age'] = df['Age'].apply(clean_age)

In [49]:
df.sample(10)

Unnamed: 0,Country,EdLevel,YearsCodePro,Age,Employment,Gender,RemoteWork,Salary
6253,United Kingdom of Great Britain and Northern I...,Bachelor’s degree,8.0,25-34 years old,"Employed, full-time",Man,Full in-person,36180.0
34917,France,Master’s degree,20.0,35-44 years old,"Employed, full-time;Independent contractor, fr...",Man,Full in-person,53322.0
1824,India,Bachelor’s degree,6.0,25-34 years old,"Employed, full-time",Man,"Hybrid (some remote, some in-person)",70935.0
11074,Canada,Less than a Bachelors,3.0,25-34 years old,"Employed, full-time",Man,Fully remote,62467.0
60039,Czech Republic,Master’s degree,9.0,35-44 years old,"Employed, full-time;Independent contractor, fr...",Man,Fully remote,59760.0
12290,United States of America,Bachelor’s degree,15.0,35-44 years old,"Employed, full-time",other,Fully remote,182000.0
14472,United States of America,Bachelor’s degree,15.0,35-44 years old,"Employed, full-time",Man,Fully remote,97000.0
21620,Germany,Master’s degree,8.0,35-44 years old,"Employed, full-time",Man,Fully remote,78916.0
34401,Czech Republic,Less than a Bachelors,9.0,35-44 years old,"Employed, full-time;Independent contractor, fr...",Man,Fully remote,31176.0
8524,Finland,Less than a Bachelors,4.0,35-44 years old,"Employed, full-time",Woman,"Hybrid (some remote, some in-person)",40956.0


In [50]:
df.shape

(28326, 8)

In [51]:
df['Employment'].value_counts()

Employed, full-time                                                                             23773
Independent contractor, freelancer, or self-employed                                             1954
Employed, full-time;Independent contractor, freelancer, or self-employed                         1806
Employed, part-time                                                                               474
Independent contractor, freelancer, or self-employed;Employed, part-time                          144
Employed, full-time;Employed, part-time                                                            79
Employed, full-time;Independent contractor, freelancer, or self-employed;Employed, part-time       56
Retired                                                                                            15
I prefer not to say                                                                                14
Employed, part-time;Retired                                                       

In [52]:
df = df[df["Employment"] == "Employed, full-time"]
df = df.drop("Employment", axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23773 entries, 2 to 73121
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country       23773 non-null  object 
 1   EdLevel       23773 non-null  object 
 2   YearsCodePro  23701 non-null  float64
 3   Age           23773 non-null  object 
 4   Gender        23773 non-null  object 
 5   RemoteWork    23772 non-null  object 
 6   Salary        23773 non-null  float64
dtypes: float64(2), object(5)
memory usage: 1.5+ MB


In [53]:
df=df.dropna()

In [54]:
df.isnull().sum()

Country         0
EdLevel         0
YearsCodePro    0
Age             0
Gender          0
RemoteWork      0
Salary          0
dtype: int64

## Pycaret 

In [55]:
!pip install --pre pycaret -q

In [56]:
from pycaret.regression import *

In [57]:
setup(data = df,target = 'Salary',session_id = 85)

Unnamed: 0,Description,Value
0,Session id,85
1,Target,Salary
2,Target type,Regression
3,Original data shape,"(23700, 7)"
4,Transformed data shape,"(23700, 19)"
5,Transformed train set shape,"(16590, 19)"
6,Transformed test set shape,"(7110, 19)"
7,Numeric features,1
8,Categorical features,5
9,Preprocess,True


<pycaret.regression.oop.RegressionExperiment at 0x7c0983e33340>

In [58]:
compare_models() 

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,20619.9155,792706011.1527,28149.1243,0.6341,0.4005,0.3486,0.663
catboost,CatBoost Regressor,20641.8384,800188919.9207,28281.5558,0.6306,0.3996,0.3457,3.435
xgboost,Extreme Gradient Boosting,21074.7785,835199283.2,28892.8393,0.6145,0.4107,0.3524,0.988
br,Bayesian Ridge,21469.3095,837930625.8408,28938.9489,0.6132,0.4289,0.3644,0.16
lr,Linear Regression,21470.1626,838053209.5055,28941.1258,0.6131,0.429,0.3644,0.59
lasso,Lasso Regression,21470.4127,838079541.3522,28941.5808,0.6131,0.429,0.3644,0.272
ridge,Ridge Regression,21470.028,838022863.7841,28940.592,0.6131,0.4289,0.3644,0.203
llar,Lasso Least Angle Regression,21470.5276,838079710.113,28941.5847,0.6131,0.429,0.3644,0.163
en,Elastic Net,22056.7969,880516900.7707,29665.572,0.5935,0.4223,0.3783,0.163
rf,Random Forest Regressor,22504.1494,937116620.762,30601.5513,0.5675,0.4307,0.3764,1.509


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

## Train Test Split

In [59]:
X = df.drop("Salary", axis=1)
y = df["Salary"]

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.10, random_state=85)

In [61]:
X_train.sample(5)

Unnamed: 0,Country,EdLevel,YearsCodePro,Age,Gender,RemoteWork
44167,Turkey,Post grad,8.0,25-34 years old,Man,"Hybrid (some remote, some in-person)"
64180,United States of America,Bachelor’s degree,17.0,35-44 years old,Man,Fully remote
4350,South Africa,Master’s degree,22.0,45-54 years old,Man,Full in-person
8412,United Kingdom of Great Britain and Northern I...,Bachelor’s degree,3.0,25-34 years old,Man,Fully remote
64151,France,Master’s degree,17.0,35-44 years old,Man,"Hybrid (some remote, some in-person)"


# Column Transformer

In [62]:

# Define the preprocessing steps for each type of column
categorical_preprocessor = OneHotEncoder(handle_unknown='ignore')
# ordinal_preprocessor = LabelEncoder()
numeric_preprocessor = StandardScaler()
poly_features = PolynomialFeatures()

# Define the steps for the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_preprocessor, ['Country', 'Age', 'Gender', 'RemoteWork', 'EdLevel']),
        ('num', numeric_preprocessor, ['YearsCodePro'])
#         
    ])

In [63]:
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

## Hyper Parameter tunning

In [64]:
def parameter_finder (model, parameters):
    
    start = time.time()
    
    grid = GridSearchCV(model, 
                        param_grid = parameters, 
                        refit = True, 
                        cv = KFold(shuffle = True, random_state = 1), 
                        n_jobs = -1)
    grid_fit = grid.fit(X_train_transformed, y_train)
    y_train_pred = grid_fit.predict(X_train_transformed)
    y_pred = grid_fit.predict(X_test_transformed)
    
    train_score =grid_fit.score(X_train_transformed, y_train)
    test_score = grid_fit.score(X_test_transformed, y_test)
    RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
    
    model_name = str(model).split('(')[0]
    
    end = time.time()
    
    print(f"The best parameters for {model_name} model is: {grid_fit.best_params_}")
    print("--" * 10)
    print(f"(R2 score) in the training set is {train_score:0.2%} for {model_name} model.")
    print(f"(R2 score) in the testing set is {test_score:0.2%} for {model_name} model.")
    print(f"RMSE is {RMSE:,} for {model_name} model.")
    print("--" * 10)
    print(f"Runtime of the program is: {end - start:0.2f}")
    
       
    return train_score, test_score, RMSE

## Make Model

In [65]:
import time
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor


## GradientBoostingRegressor

In [None]:
gbr = GradientBoostingRegressor(random_state = 1)
param_gbr = {'n_estimators': np.arange(50, 201,50),
            'learning_rate': [0.001, 0.01,0.1],
            'max_depth': np.arange(3,11,4),
            'min_samples_split' : np.arange(3,11, 4),
            'min_samples_leaf' : np.arange(3,11, 4)}

gbr_train_score, gbr_test_score,gbr_RMSE = parameter_finder(gbr, param_gbr)

## XGBRegressor

In [None]:
# xgboost = XGBRegressor(n_jobs = -1)
# param_xgboost = {'n_estimators': [100,300],
#              'learning_rate': [0.1,0.05],
#              'subsample': [0.75],
#              'colsample_bytree': [1],
#              'max_depth': [3,4,5],
#              'gamma': [0]}

# xgboost_train_score, xgboost_test_score, xgboost_RMSE = parameter_finder(xgboost, param_xgboost)

## Pipeline

In [66]:
# Define the steps for the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))  # GradientBoostingRegressor 
])

In [67]:
pipeline.fit(X_train, y_train)

In [68]:
y_pred = pipeline.predict(X_test)

In [70]:
r2_score(y_test, y_pred)

0.6200793693485266

## Save Model

In [71]:
import joblib
joblib.dump(pipeline, 'pipeline.joblib')

['pipeline.joblib']