In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

In [85]:
# read the job postings CSV (path is relative to the working directory) into a dataframe
job_postings_data = pd.read_csv(filepath_or_buffer='job_postings.csv')

In [86]:
# count the missing values per column in the raw data (kept for reference)
missing_data = job_postings_data.isna().sum()
# print(missing_data)

# keep only postings that have a description and BOTH salary bounds;
# min_salary is filtered explicitly because it is used as a model feature later
job_postings_data = job_postings_data.dropna(subset=['description', 'max_salary', 'min_salary'])

# before dropping the median salary, save an independent copy of the dataframe
# (.copy() so later changes to either frame cannot leak into the other)
jobs_with_median = job_postings_data.copy()
# drop the median salary for now-- just since there are so many NaN values
job_postings_data = job_postings_data.drop(columns='med_salary')

# now observe all the columns that still have missing data
missing_data = job_postings_data.isna().sum()
print(missing_data)



job_id                           0
company_id                     117
title                            0
description                      0
max_salary                       0
min_salary                       0
pay_period                       0
formatted_work_type              0
location                         0
applies                       1939
original_listed_time             0
remote_allowed                4624
views                          647
job_posting_url                  0
application_url               2396
application_type                 0
expiry                           0
closed_time                   5243
formatted_experience_level    1610
skills_desc                   5488
listed_time                      0
posting_domain                2784
sponsored                        0
work_type                        0
currency                         0
compensation_type                0
dtype: int64


**Predicting Entry Level Jobs based on Min and Max Salary**

In [68]:
# keep only the postings that have a labelled experience level
job_postings_data = job_postings_data[job_postings_data['formatted_experience_level'].notna()]
# create a df with just the salary (the two salary bounds are the only features)
experience_level = job_postings_data[["max_salary", "min_salary"]]

# create our X and y
X = experience_level
y = job_postings_data["formatted_experience_level"]

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# seed the forest as well as the split, otherwise the report changes on every run
forest_model = RandomForestClassifier(random_state=42)
forest_model.fit(X_train, y_train)

predictions = forest_model.predict(X_test)
# zero_division=0 still reports 0.00 for classes that are never predicted,
# but without emitting the undefined-metric warning
print(f"RandomForestClassifier Classification: \n {classification_report(y_test, predictions, zero_division=0)}")


RandomForestClassifier Classification: 
                   precision    recall  f1-score   support

       Associate       0.31      0.21      0.25       126
        Director       0.48      0.27      0.35        48
     Entry level       0.61      0.55      0.58       215
       Executive       0.00      0.00      0.00         6
      Internship       0.50      0.29      0.36         7
Mid-Senior level       0.64      0.78      0.70       381

        accuracy                           0.59       783
       macro avg       0.42      0.35      0.37       783
    weighted avg       0.56      0.59      0.57       783



I got an error about the classification target having multiple classes. For now, since I mostly care about entry level, I'm going to one-hot encode the experience level and predict only the entry-level indicator (entry level vs. every other level).

In [69]:
# one 0/1 indicator column per experience level
one_hot_data = pd.get_dummies(job_postings_data, columns=['formatted_experience_level'], dtype=int)
# print(one_hot_data.columns)
# split the data
X = one_hot_data[['min_salary', 'max_salary']]
# binary target: 1 for entry-level postings, 0 for every other level
y = one_hot_data['formatted_experience_level_Entry level']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# fixed seed so the report below is reproducible from run to run
forest_model = RandomForestClassifier(random_state=42)
forest_model.fit(X_train, y_train)

predictions = forest_model.predict(X_test)
print(f"RandomForestClassifier Classification: \n {classification_report(y_test, predictions)}")

RandomForestClassifier Classification: 
               precision    recall  f1-score   support

           0       0.83      0.89      0.86       568
           1       0.64      0.51      0.57       215

    accuracy                           0.79       783
   macro avg       0.73      0.70      0.71       783
weighted avg       0.78      0.79      0.78       783



In [70]:
# now run a grid search with different parameters
param_grid = {"n_estimators": [100, 200],
              "max_depth": [4,5,6,7,8],
              "criterion": ['gini', 'entropy']}
# search over a fresh, seeded forest: the search works on copies of the estimator it
# is given, so a fixed random_state is what makes the best params/score repeatable
forest_grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, scoring='f1')
# run the search
forest_grid.fit(X_train, y_train)
print(f"Best Parameters: {forest_grid.best_params_}\n")
print(f"Best Score: {forest_grid.best_score_}\n")

# predict the labels for the test set (uses the best configuration found by the search)
forest_grid_predicted = forest_grid.predict(X_test)
# create the confusion matrix
confusion = confusion_matrix(y_test, forest_grid_predicted)
print(f"Confusion Matrix: \n{confusion}")

Best Parameters: {'criterion': 'gini', 'max_depth': 8, 'n_estimators': 100}

Best Score: 0.5516687067322682

Confusion Matrix: 
[[520  48]
 [124  91]]


In [95]:
# list the columns of the saved frame that still carries med_salary
median_frame_columns = jobs_with_median.columns
print(median_frame_columns)

Index(['job_id', 'company_id', 'title', 'description', 'max_salary',
       'med_salary', 'min_salary', 'pay_period', 'formatted_work_type',
       'location', 'applies', 'original_listed_time', 'remote_allowed',
       'views', 'job_posting_url', 'application_url', 'application_type',
       'expiry', 'closed_time', 'formatted_experience_level', 'skills_desc',
       'listed_time', 'posting_domain', 'sponsored', 'work_type', 'currency',
       'compensation_type'],
      dtype='object')


I started trying to figure out whether including the median salary improves the predictions, but I'm pretty sure that dropping the NaN values for both "formatted_experience_level" and "med_salary" removes every row that still has a formatted_experience_level, leaving nothing to train on. I'll look more into this later.

In [108]:
# NOTE(review): this experiment is intentionally left disabled. jobs_with_median was
# saved after the rows without max_salary had already been removed, and filtering it
# on med_salary as well reportedly leaves nothing to train on -- TODO confirm with
# jobs_with_median['med_salary'].notna().sum() before re-enabling.
# # now look at it with the median salary
# jobs_with_median = jobs_with_median[jobs_with_median['formatted_experience_level'].notna()]
# jobs_with_median = jobs_with_median[jobs_with_median['med_salary'].notna()]

# one_hot_median = pd.get_dummies(jobs_with_median, columns=['formatted_experience_level'], dtype=int)
# print(one_hot_median.columns)
# # split the data
# X = one_hot_median[['min_salary', 'max_salary', 'med_salary']]
# y = one_hot_median['formatted_experience_level_Entry level']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# forest_model = RandomForestClassifier()
# forest_model.fit(X_train, y_train)

# predictions = forest_model.predict(X_test)
# print(f"RandomForestClassifier Classification: \n {classification_report(y_test, predictions)}")


Now I want to see how well salary can predict the other experience levels that aren't entry-level  

In [11]:
# take a few steps back with the data: reload the raw CSV and redo the cleaning
job_postings_data = pd.read_csv('job_postings.csv')
missing_data = job_postings_data.isna().sum()
# print(missing_data)

# keep only postings with a description, both salary bounds and a labelled
# experience level (min_salary is checked too, since it is one of the two
# features the models below are trained on)
job_postings_data = job_postings_data.dropna(
    subset=['description', 'max_salary', 'min_salary', 'formatted_experience_level']
)
# drop the median salary for now-- just since there are so many NaN values
job_postings_data = job_postings_data.drop(columns='med_salary')

In [12]:
# create a function for predicting based off of formatted experience level
def experience_level_predictions(experience_level, data=None, random_state=42):
    """Grid-search a random forest that predicts one experience-level indicator from salary.

    The features are the ``min_salary`` and ``max_salary`` columns and the target is
    the indicator column named by ``experience_level``. The data is split 80/20, a
    grid search scored by F1 is run on the training part, and the best parameters,
    the best search score and the test-set confusion matrix are printed.

    Parameters
    ----------
    experience_level : str
        Name of the indicator column to predict, e.g.
        ``'formatted_experience_level_Entry level'``.
    data : pandas.DataFrame, optional
        Frame holding the salary columns and the indicator column. Defaults to the
        notebook-level ``one_hot_data``, looked up when the function is called.
    random_state : int, optional
        Seed used for both the train/test split and the forest, so that repeated
        calls print the same results.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    KeyError
        If ``experience_level`` is not a column of the data.
    """
    if data is None:
        # fall back to the frame built in the notebook so existing calls keep working
        data = one_hot_data
    if experience_level not in data.columns:
        raise KeyError(f"{experience_level!r} is not a column of the data")

    X = data[['min_salary', 'max_salary']]
    y = data[experience_level]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    # no separate fit of the bare forest is needed: the search fits its own copies
    # and then refits the best configuration on the whole training set
    forest_model = RandomForestClassifier(random_state=random_state)

    # now run a grid search with different parameters
    param_grid = {"n_estimators": [100, 200],
                  "max_depth": [4, 5, 6, 7, 8],
                  "criterion": ['gini', 'entropy']}
    forest_grid = GridSearchCV(forest_model, param_grid, scoring='f1')
    # run the search
    forest_grid.fit(X_train, y_train)
    print(f"Best Parameters: {forest_grid.best_params_}\n")
    print(f"Best Score: {forest_grid.best_score_}\n")

    # predict the labels for the test set
    forest_grid_predicted = forest_grid.predict(X_test)
    # create the confusion matrix
    confusion = confusion_matrix(y_test, forest_grid_predicted)
    print(f"Confusion Matrix: \n{confusion}")

In [14]:
# expand formatted_experience_level into one integer indicator column per level,
# then show the resulting column names
one_hot_data = pd.get_dummies(
    data=job_postings_data,
    columns=['formatted_experience_level'],
    dtype=int,
)
print(one_hot_data.columns)


Index(['job_id', 'company_id', 'title', 'description', 'max_salary',
       'min_salary', 'pay_period', 'formatted_work_type', 'location',
       'applies', 'original_listed_time', 'remote_allowed', 'views',
       'job_posting_url', 'application_url', 'application_type', 'expiry',
       'closed_time', 'skills_desc', 'listed_time', 'posting_domain',
       'sponsored', 'work_type', 'currency', 'compensation_type',
       'formatted_experience_level_Associate',
       'formatted_experience_level_Director',
       'formatted_experience_level_Entry level',
       'formatted_experience_level_Executive',
       'formatted_experience_level_Internship',
       'formatted_experience_level_Mid-Senior level'],
      dtype='object')


Associate

In [15]:
# grid search with the Associate indicator column as the target
experience_level_predictions(experience_level='formatted_experience_level_Associate')

Best Parameters: {'criterion': 'gini', 'max_depth': 8, 'n_estimators': 100}

Best Score: 0.0294307577454274

Confusion Matrix: 
[[657   0]
 [126   0]]


Director

In [16]:
# grid search with the Director indicator column as the target
experience_level_predictions(experience_level='formatted_experience_level_Director')

Best Parameters: {'criterion': 'gini', 'max_depth': 8, 'n_estimators': 200}

Best Score: 0.16320685434516521

Confusion Matrix: 
[[731   4]
 [ 42   6]]


Entry level (just for organization's sake)

In [17]:
# grid search with the Entry level indicator column as the target
experience_level_predictions(experience_level='formatted_experience_level_Entry level')

Best Parameters: {'criterion': 'gini', 'max_depth': 8, 'n_estimators': 100}

Best Score: 0.5564285457502075

Confusion Matrix: 
[[521  47]
 [125  90]]


Executive

In [18]:
# grid search with the Executive indicator column as the target
experience_level_predictions(experience_level='formatted_experience_level_Executive')

Best Parameters: {'criterion': 'gini', 'max_depth': 6, 'n_estimators': 200}

Best Score: 0.0808080808080808

Confusion Matrix: 
[[777   0]
 [  6   0]]


Internship

In [19]:
# grid search with the Internship indicator column as the target
experience_level_predictions(experience_level='formatted_experience_level_Internship')

Best Parameters: {'criterion': 'gini', 'max_depth': 7, 'n_estimators': 200}

Best Score: 0.5219047619047619

Confusion Matrix: 
[[776   0]
 [  6   1]]


Mid-Senior Level

In [20]:
# grid search with the Mid-Senior level indicator column as the target
experience_level_predictions(experience_level='formatted_experience_level_Mid-Senior level')

Best Parameters: {'criterion': 'gini', 'max_depth': 8, 'n_estimators': 200}

Best Score: 0.7226236581282536

Confusion Matrix: 
[[268 134]
 [107 274]]


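From this, we can conclude that salary predicts mid-senior level jobs best: that model has the highest cross-validated F1 score (about 0.72), followed by entry level (about 0.56). Note that the score reported by the grid search is F1, not accuracy; accuracy would be misleading for the rare levels (for example, the Executive model never predicts the positive class and is still right on 777 of 783 test rows). Also, for every experience level, the Gini criterion was the one selected by the grid search.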