In [47]:
import numpy as np
import pandas as pd


### Load pre-processed data

In [72]:
class Data():
    
    def __init__(self, data, target):
        
        if type(target) == str:
            self._target_data = data[target]
            data.drop(target, axis=1)
            self._feature_data = data
        else:
            self._target_data = data.iloc[:, -1]
            data.iloc[:, 0:-1]
            self._feature_data = data
            
    def get_features(self):
        return self._feature_data
    
    def get_target(self):
        return self._target_data
    

In [97]:
df_train = pd.read_csv('../data.nosync/processed/one_hot_data.csv', index_col=[0])
df_train['salary'] = pd.read_csv('../data.nosync/processed/salary_data.csv', index_col=[0])
df_train.drop(["jobLevel", "degree", "major", "industry"], axis=1, inplace=True)
df_train.drop(0, axis=0, inplace=True)  # frst salary alue is Nan removed it
df_train.head()

Unnamed: 0,1HOT_jobLevel_CEO,1HOT_jobLevel_CFO,1HOT_jobLevel_CTO,1HOT_jobLevel_JANITOR,1HOT_jobLevel_JUNIOR,1HOT_jobLevel_MANAGER,1HOT_jobLevel_SENIOR,1HOT_jobLevel_VICE_PRESIDENT,1HOT_degree_BACHELORS,1HOT_degree_DOCTORAL,...,1HOT_major_NONE,1HOT_major_PHYSICS,1HOT_industry_AUTO,1HOT_industry_EDUCATION,1HOT_industry_FINANCE,1HOT_industry_HEALTH,1HOT_industry_OIL,1HOT_industry_SERVICE,1HOT_industry_WEB,salary
1,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,101.0
2,0,0,0,0,0,0,0,1,0,1,...,0,1,0,0,0,1,0,0,0,137.0
3,0,0,0,0,0,1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,142.0
4,0,0,0,0,0,0,0,1,1,0,...,0,1,0,0,1,0,0,0,0,163.0
5,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,113.0


In [98]:
data = Data(df_train, target='salary')

### Find a base-line model

In this section, we will build 4 models (with default paramters) and evaluate them using root-mean-square-error and cross-validation. We will then choose model with lowest RMSE and thrive to optimize that.


In [105]:
from sklearn.model_selection import cross_val_score

class ModelSelector():
    
    def __init__(self, models_dict, data, score_metric='neg_mean_squared_error'):
        self._data = data
        self._models = models_dict
        self._score_metric = score_metric
        self._is_trained = {x:False for x in self._models.keys()}
        self._best_model = None
        self._min_error = float("inf")
    
    def get_best_model(self):
        if not all(self._is_trained.values()):
            for model_name, rgr_model in self._models.items():
                if not self._is_trained[model_name]:
                    print(f"calculating the score for {model_name}")
                    score = -1 * cross_val_score(rgr_model, self._data.get_features(), self._data.get_target(), scoring=self._score_metric, cv=5).mean()
                    if score < self._min_error:
                        self._min_error = score
                        self._best_model = {"model_name": model_name, "model":rgr_model, "score": self._min_error}
                    self._is_trained[model_name] = True
        return self._best_model
    
    def update_models(self, models_dict):
        self._is_trained.update({model_name:False for model_name in models_dict.keys()})
        self._models.update(models_dict)
    

In [106]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

models_dict = dict()
models_dict['LinearRegression'] = LinearRegression(fit_intercept=True, normalize=False, copy_X=True)
models_dict['RandomForest'] = RandomForestRegressor()
models_dict['KNN'] = KNeighborsRegressor(weights='distance')
models_dict['GradientBoost'] = GradientBoostingRegressor(loss='lad')

model_selector = ModelSelector(data=data, models_dict=models_dict)
best_model = model_selector.get_best_model()
print(f"best model is {best_model['model_name']} with RMSE={best_model['score']}")


calculating the score for LinearRegression
calculating the score for RandomForest




calculating the score for KNN
calculating the score for GradientBoost
best model is LinearRegression with RMSE=3.1391796097506038e-24
