# Build a Random Forest Regression Model

Import libraries

In [1]:
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import mean_squared_error


## Read the data

Read in the .csv file.  This .csv file has been reduced to only the necessary columns.  
  
Look at the data shape to determine the number of rows and columns.

In [2]:
# Load the reduced 2022 Kaggle survey CSV directly from GitHub (needs network access).
DATA_URL = "https://raw.githubusercontent.com/mikecolbert/3300-salary-prediction-model/refs/heads/main/2022_kaggle_survey_results_public.csv"
data = pd.read_csv(DATA_URL)
print(data.shape)  # (rows, columns)

data.head()  # preview the first few rows

(23997, 9)


Unnamed: 0,age,gender,country,highest_deg,code_experience,current_title,industry,company_size,annual_comp
0,30-34,Man,India,,,,,,
1,30-34,Man,Algeria,Master’s degree,1-3 years,,,,
2,18-21,Man,Egypt,Bachelor’s degree,1-3 years,,,,
3,55-59,Man,France,Some college/university study without earning ...,10-20 years,Data Scientist,Online Service/Internet-based Services,0-49 employees,"25,000-29,999"
4,45-49,Man,India,Bachelor’s degree,5-10 years,,,,


<br>

## Clean the data

How many cells are NULL in each column?

In [3]:
# Count the missing entries in each column
data.isnull().sum()

age                    0
gender                 0
country                0
highest_deg          599
code_experience      754
current_title      13367
industry           14903
company_size       14931
annual_comp        15861
dtype: int64

What proportion of the data contains one or more missing values (NULL) in the row?

In [4]:
# Fraction of rows with at least one missing value:
# (total rows - rows that survive dropna()) / total rows
(data.shape[0] - data.dropna().shape[0])/data.shape[0]
# about 66% of the rows have at least one missing value

0.6609576197024628

We could impute the missing values (for example, with each column's most frequent value, since these columns are categorical), but we are not focused on tuning, so we will just drop the rows with missing values.

In [5]:
# Remove every row that has a missing value in any column, modifying `data` itself
data.dropna(axis=0, how="any", inplace=True)

How has the data shape changed from the original?  
rows, columns  

In [6]:
# (rows, columns) left after dropping the incomplete rows
data.shape

(8136, 9)

Look at the first few rows of the data frame.

In [7]:
# Preview the first rows; the index keeps its original labels, so it now has gaps
data.head()

Unnamed: 0,age,gender,country,highest_deg,code_experience,current_title,industry,company_size,annual_comp
3,55-59,Man,France,Some college/university study without earning ...,10-20 years,Data Scientist,Online Service/Internet-based Services,0-49 employees,"25,000-29,999"
7,30-34,Man,Germany,Bachelor’s degree,10-20 years,Software Engineer,Insurance/Risk Assessment,250-999 employees,"100,000-124,999"
8,70+,Man,Australia,Doctoral degree,20+ years,Research Scientist,Government/Public Service,"1000-9,999 employees","100,000-124,999"
13,40-44,Man,United States of America,Doctoral degree,10-20 years,Developer Advocate,Computers/Technology,"1000-9,999 employees","200,000-249,999"
16,40-44,Man,United States of America,Master’s degree,10-20 years,Data Scientist,Computers/Technology,"1000-9,999 employees","200,000-249,999"


<br>

## Prep the variables to use in our model

Split the data into x and y data frames separating independent and dependent variables for prediction.

In [8]:
# Features (independent variables). Note that 'industry' is not selected.
feature_columns = [
    "age",
    "gender",
    "country",
    "highest_deg",
    "code_experience",
    "current_title",
    "company_size",
]
x = data.loc[:, feature_columns]

# Target (dependent variable), kept as a one-column DataFrame
y = data.loc[:, ["annual_comp"]]

Review the first few rows of each data frame.

In [9]:
# Target frame: the single annual_comp column, still holding salary-range strings
y.head()

Unnamed: 0,annual_comp
3,"25,000-29,999"
7,"100,000-124,999"
8,"100,000-124,999"
13,"200,000-249,999"
16,"200,000-249,999"


In [10]:
# Feature frame: the seven selected columns, still holding category strings
x.head()

Unnamed: 0,age,gender,country,highest_deg,code_experience,current_title,company_size
3,55-59,Man,France,Some college/university study without earning ...,10-20 years,Data Scientist,0-49 employees
7,30-34,Man,Germany,Bachelor’s degree,10-20 years,Software Engineer,250-999 employees
8,70+,Man,Australia,Doctoral degree,20+ years,Research Scientist,"1000-9,999 employees"
13,40-44,Man,United States of America,Doctoral degree,10-20 years,Developer Advocate,"1000-9,999 employees"
16,40-44,Man,United States of America,Master’s degree,10-20 years,Data Scientist,"1000-9,999 employees"


In the x data frame, map the categorical variables to numerical values for use in the regression model.

In [11]:
# Label-encode the categorical features (uses LabelEncoder from sklearn.preprocessing).
# Each string column gets a companion integer column named "<column>_enc".
columns_to_encode = [
    "age",
    "gender",
    "country",
    "highest_deg",
    "code_experience",
    "current_title",
    "company_size",
]

for column in columns_to_encode:
    # A fresh encoder per column, so every column gets its own independent set of codes
    x[f"{column}_enc"] = LabelEncoder().fit_transform(x[column])

You should now see both the categorical and encoded columns (_enc) in the data frame.

In [12]:
# Each original column should now have a matching integer *_enc column
x.head()

Unnamed: 0,age,gender,country,highest_deg,code_experience,current_title,company_size,age_enc,gender_enc,country_enc,highest_deg_enc,code_experience_enc,current_title_enc,company_size_enc
3,55-59,Man,France,Some college/university study without earning ...,10-20 years,Data Scientist,0-49 employees,8,0,15,6,1,4,0
7,30-34,Man,Germany,Bachelor’s degree,10-20 years,Software Engineer,250-999 employees,3,0,16,0,1,11,3
8,70+,Man,Australia,Doctoral degree,20+ years,Research Scientist,"1000-9,999 employees",10,0,2,1,2,10,2
13,40-44,Man,United States of America,Doctoral degree,10-20 years,Developer Advocate,"1000-9,999 employees",5,0,55,1,1,5,2
16,40-44,Man,United States of America,Master’s degree,10-20 years,Data Scientist,"1000-9,999 employees",5,0,55,3,1,4,2


Print a map of the categorical values and the corresponding encoded values.  Copy the returned data and paste it somewhere you can refer back to it frequently.

In [13]:
# Print, for every encoded column, which integer code stands for which category
# (uses LabelEncoder from sklearn.preprocessing).

# One encoder object is reused; each fit() call replaces its classes_
le = LabelEncoder()

columns_to_encode = [
    "age",
    "gender",
    "country",
    "highest_deg",
    "code_experience",
    "current_title",
    "company_size",
]

for column in columns_to_encode:
    # Learn the categories present in this column
    le.fit(x[column])

    # classes_[i] is the category encoded as integer i, so this maps code -> category
    mapping = dict(enumerate(le.classes_))

    # Display the mapping
    print(column, "mapping:", mapping)

age mapping: {0: '18-21', 1: '22-24', 2: '25-29', 3: '30-34', 4: '35-39', 5: '40-44', 6: '45-49', 7: '50-54', 8: '55-59', 9: '60-69', 10: '70+'}
gender mapping: {0: 'Man', 1: 'Nonbinary', 2: 'Prefer not to say', 3: 'Prefer to self-describe', 4: 'Woman'}
country mapping: {0: 'Algeria', 1: 'Argentina', 2: 'Australia', 3: 'Bangladesh', 4: 'Belgium', 5: 'Brazil', 6: 'Cameroon', 7: 'Canada', 8: 'Chile', 9: 'China', 10: 'Colombia', 11: 'Czech Republic', 12: 'Ecuador', 13: 'Egypt', 14: 'Ethiopia', 15: 'France', 16: 'Germany', 17: 'Ghana', 18: 'Hong Kong (S.A.R.)', 19: 'I do not wish to disclose my location', 20: 'India', 21: 'Indonesia', 22: 'Iran, Islamic Republic of...', 23: 'Ireland', 24: 'Israel', 25: 'Italy', 26: 'Japan', 27: 'Kenya', 28: 'Malaysia', 29: 'Mexico', 30: 'Morocco', 31: 'Nepal', 32: 'Netherlands', 33: 'Nigeria', 34: 'Other', 35: 'Pakistan', 36: 'Peru', 37: 'Philippines', 38: 'Poland', 39: 'Portugal', 40: 'Romania', 41: 'Russia', 42: 'Saudi Arabia', 43: 'Singapore', 44: 'Sout

Review the data frame again. We must drop the non-encoded columns.

In [14]:
# x still carries both the string columns and their *_enc counterparts
x.head()

Unnamed: 0,age,gender,country,highest_deg,code_experience,current_title,company_size,age_enc,gender_enc,country_enc,highest_deg_enc,code_experience_enc,current_title_enc,company_size_enc
3,55-59,Man,France,Some college/university study without earning ...,10-20 years,Data Scientist,0-49 employees,8,0,15,6,1,4,0
7,30-34,Man,Germany,Bachelor’s degree,10-20 years,Software Engineer,250-999 employees,3,0,16,0,1,11,3
8,70+,Man,Australia,Doctoral degree,20+ years,Research Scientist,"1000-9,999 employees",10,0,2,1,2,10,2
13,40-44,Man,United States of America,Doctoral degree,10-20 years,Developer Advocate,"1000-9,999 employees",5,0,55,1,1,5,2
16,40-44,Man,United States of America,Master’s degree,10-20 years,Data Scientist,"1000-9,999 employees",5,0,55,3,1,4,2


Drop the categorical columns. Rename the encoded columns.

In [15]:

# Drop the original string-valued (categorical) columns in a single call
categorical_columns = [
    "age",
    "gender",
    "country",
    "highest_deg",
    "code_experience",
    "current_title",
    "company_size",
]
x.drop(columns=categorical_columns, inplace=True)

# Strip the "_enc" suffix so each encoded column takes over its original name
x.rename(
    columns={f"{name}_enc": name for name in categorical_columns},
    inplace=True,
)


Review the x data frame again to ensure dropping and renaming happened correctly.

In [16]:
# Only the seven integer-encoded columns should remain, under their original names
x.head()

Unnamed: 0,age,gender,country,highest_deg,code_experience,current_title,company_size
3,8,0,15,6,1,4,0
7,3,0,16,0,1,11,3
8,10,0,2,1,2,10,2
13,5,0,55,1,1,5,2
16,5,0,55,3,1,4,2


Review the y data frame. Notice the y data is also categorical.

In [17]:
# y has not been transformed yet: annual_comp still holds salary-range strings
y.head()

Unnamed: 0,annual_comp
3,"25,000-29,999"
7,"100,000-124,999"
8,"100,000-124,999"
13,"200,000-249,999"
16,"200,000-249,999"


What are the unique categorical values in the annual_comp column?

In [18]:
# List the distinct salary-range labels present in the target column
y.annual_comp.unique()

array(['25,000-29,999', '100,000-124,999', '200,000-249,999',
       '150,000-199,999', '90,000-99,999', '30,000-39,999', '3,000-3,999',
       '50,000-59,999', '125,000-149,999', '15,000-19,999', '5,000-7,499',
       '10,000-14,999', '20,000-24,999', '$0-999', '7,500-9,999',
       '4,000-4,999', '80,000-89,999', '2,000-2,999', '250,000-299,999',
       '1,000-1,999', '$500,000-999,999', '70,000-79,999',
       '60,000-69,999', '40,000-49,999', '>$1,000,000', '300,000-499,999'],
      dtype=object)

<br>

### <font color="crimson"> This step is super-sketchy. </font> 

I did this so the model would return a specific predicted salary value rather than a categorical salary range.

I create a random integer between the low and high values in the categorical salary range.

In [19]:
# Uses the standard-library `random` module (imported at the top of the notebook).

def calculate_midpoint(salary_range, rng=random):
    """Turn a categorical salary range into a single integer salary.

    Despite the name, this does NOT return the midpoint of the range: it
    draws a random integer between the low and high bounds (both inclusive).

    Parameters
    ----------
    salary_range : str
        A range label such as "25,000-29,999" or "$0-999", or the
        open-ended top bracket ">$1,000,000".
    rng : object with a ``randint(low, high)`` method, optional
        Source of randomness. Defaults to the (unseeded) ``random`` module,
        so results differ from run to run; pass ``random.Random(seed)`` to
        make the draw reproducible.

    Returns
    -------
    int
        1000000 for ">$1,000,000"; otherwise a random integer within the range.
    """
    if salary_range == ">$1,000,000":
        # The top bracket has no upper bound; 1,000,000 is used as a placeholder.
        return 1000000

    # Remove thousands separators and dollar signs, then split "low-high".
    low, high = salary_range.replace(",", "").replace("$", "").split("-")

    # randint already returns an int, so no rounding is needed.
    return rng.randint(int(low), int(high))

# Replace each salary-range string in 'annual_comp' with a number via calculate_midpoint.
# NOTE: this cell is not idempotent. On a second run the column already holds
# integers, and calculate_midpoint calls .replace() on each value, which raises
# AttributeError for an int. Re-create y before running this cell again.
y["annual_comp"] = y["annual_comp"].apply(calculate_midpoint)

# Display the updated DataFrame
print(y)

       annual_comp
3            25149
7           112788
8           102077
13          240417
16          240534
...            ...
23984       115184
23989         2869
23990          708
23994        29674
23995        18029

[8136 rows x 1 columns]


Review the data frame. Have the categorical ranges been replaced by random values?

In [20]:
# annual_comp should now hold integers instead of salary-range strings
y.head()

Unnamed: 0,annual_comp
3,25149
7,112788
8,102077
13,240417
16,240534


<br>

## Build the machine learning model

Split the x and y data frames into training and testing data for the model.

In [21]:
# Hold out 20% of the rows for testing (uses train_test_split from sklearn.model_selection).
# random_state=0 makes the row assignment itself repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

In [22]:
# (rows, columns) of the held-out test features
x_test.shape

(1628, 7)

Train a random forest regression model.

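I'm not a machine learning model person. I'm not entirely sure what n_estimators and max_depth are doing in this part of the code. (For reference: `n_estimators` is the number of decision trees in the forest, and `max_depth` limits how many levels deep each tree may grow.)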

In [23]:
# Train a random forest regressor (RandomForestRegressor from sklearn.ensemble).
#   n_estimators=70 -> the forest is built from 70 trees
#   max_depth=7     -> each tree may grow at most 7 levels deep
#   random_state=0  -> fixes the forest's internal randomness so that fitting
#                      the same training data yields the same model
model = RandomForestRegressor(n_estimators=70, max_depth=7, random_state=0)

# y_train is a one-column DataFrame; ravel() flattens it to the 1-D target fit() expects
model.fit(x_train, y_train.values.ravel())

# Predictions for the held-out rows, used for evaluation
y_predict = model.predict(x_test)

<br>

## Evaluate the model

Get the mean absolute error and r2 score for the model. 

If you are a machine learning person, continue to tune the model to get the best possible scores.

In [24]:
# Score the held-out predictions (mean_absolute_error and r2_score come from sklearn.metrics)
mae = mean_absolute_error(y_test, y_predict)
print("MAE : ", mae)

r2 = r2_score(y_test, y_predict)
print("r2 score : ", r2)

MAE :  37669.38643787937
r2 score :  0.26277569139081336


<br>

## Export the model

When you are happy with your model performance, dump it out to a file.

In [25]:
# Serialize the fitted model to 'salary_predict_model.ml' (path is relative to the
# current working directory); joblib.dump returns the list of file names it wrote
joblib.dump(model,'salary_predict_model.ml')

['salary_predict_model.ml']

<br>

## Test the exported model

To test your model, load the model file you created in the step above.

In [26]:
# Reload the model from the file we just created, replacing the in-memory `model`
# NOTE: joblib files are pickle-based, so only load files you created or trust
model = joblib.load('salary_predict_model.ml')

Then run it, passing in the independent variables (features). These are the mapped numerical labels of the categorical values. You can get the numeric values by looking at the mapping you copied and pasted out to a file a few steps earlier.

To predict a salary, you must pass in the values in the correct order (age, gender, country, highest_deg, code_experience, current_title, company_size).

Making a salary prediction for myself in 5 years:

1: '22-24'  
0: 'Man'  
55: 'United States of America'  
0: 'Bachelor’s degree'  
3: '3-5 years'  
1: 'Data Analyst (Business, Marketing, Financial, Quantitative, etc)'  
1: '10,000 or more employees'  

The returned value will be different for everyone, because the salary values are randomly generated.

I made a salary prediction for what I think my salary may look like in the next 5 years.

In [27]:
# Predict the salary for a new data point. The values are the encoded features in
# training-column order: age, gender, country, highest_deg, code_experience,
# current_title, company_size.
model.predict([[1,0,55,0,3,1,1]])

# generates a warning because we're passing just a plain list, not a pandas dataframe with 
# the same feature names the model was trained on




array([120889.58942364])

In [28]:
# To get rid of the warning, wrap the same encoded values in a one-row DataFrame
# whose columns carry the feature names.
feature_names = [
    "age",
    "gender",
    "country",
    "highest_deg",
    "code_experience",
    "current_title",
    "company_size",
]
encoded_row = [1, 0, 55, 0, 3, 1, 1]

input_data = pd.DataFrame([encoded_row], columns=feature_names)


In [29]:
# Show the one-row frame that will be passed to the model
print(input_data)

   age  gender  country  highest_deg  code_experience  current_title  \
0    1       0       55            0                3              1   

   company_size  
0             1  


In [30]:
# Predict from the named-column frame; predict() returns an array, so element 0 is our single result
prediction = model.predict(input_data)
# Format as dollars with thousands separators and two decimal places
print(f"predicted salary: ${prediction[0]:,.2f}")

predicted salary: $120,889.59
