<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Train-test-split" data-toc-modified-id="Train-test-split-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Train-test split</a></span></li><li><span><a href="#Fit-the-model" data-toc-modified-id="Fit-the-model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Fit the model</a></span></li></ul></div>

In [32]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import pickle

In [33]:
df = pd.read_csv('df_model_eu.csv')

In [34]:
df.head()

Unnamed: 0.1,Unnamed: 0,age,gender,country,education,experience_prog,experience_ml,industry,company_size_g,team_size_g,role_group,programming_lang_c,salary_avg
0,10,4,0,10,4,4,4,1,1,0,0,4,12500.0
1,11,7,0,14,2,5,1,8,2,0,2,2,2500.0
2,17,2,1,6,3,2,0,0,0,0,1,2,27500.0
3,24,2,0,5,3,1,0,8,0,0,3,2,35000.0
4,38,0,0,0,2,2,0,0,0,3,1,2,500.0


In [35]:
df = df.drop(['Unnamed: 0'], axis=1)

### Train-test split

In [36]:
X = df.drop('salary_avg', axis=1)
y = df['salary_avg']

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [16]:
X_train_num = X_train.select_dtypes(np.number)
X_test_num = X_test.select_dtypes(np.number)

In [17]:
transformer = StandardScaler()
transformer.fit(X_train_num)

# saving in a pickle
with open('std_transformer.pickle', 'wb') as file:
    pickle.dump(transformer, file)
    
# loading from a pickle  
with open('std_transformer.pickle', 'rb') as file:
    loaded_transformer = pickle.load(file)

X_train_ = loaded_transformer.transform(X_train_num)
X_test_ = loaded_transformer.transform(X_test_num)

### Fit the model

In [38]:
clf = RandomForestRegressor(max_depth=2, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

0.14854292656918644


In [39]:
from sklearn.model_selection import cross_val_score
clf = RandomForestRegressor(max_depth=2, random_state=0)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
print(cross_val_scores)
#print(np.mean(cross_val_scores))

[0.12768419 0.13713707 0.14748605 0.04608074 0.13593804 0.13326222
 0.16367541 0.11816777 0.15816573 0.18505628]


In [19]:
from sklearn.metrics import accuracy_score
scores =[]
for k in range(1, 200):
    rfc = RandomForestRegressor(n_estimators=k)
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))

import matplotlib.pyplot as plt
%matplotlib inline

# plot the relationship between K and testing accuracy
plt.plot(x_axis, y_axis)
plt.plot(range(1, 200), scores)
plt.xlabel('Value of n_estimators for Random Forest Classifier')
plt.ylabel('Testing Accuracy')


ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

In [20]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [70, 80, 90, 100, 300],
    'min_samples_split': [5, 10, 15, 20, 30],
    #'min_leaf_size': [50, 60, 70],
    'min_samples_leaf' : [1, 3, 2, 4, 5, 7, 9],
    'max_depth': [12, 14, 16, 18, 20],
    'max_features': ['sqrt', 'auto', 'log2']
#    'max_samples' : ['None', 0.5]
    }
clf = RandomForestRegressor(random_state=100)

grid_search = GridSearchCV(clf, param_grid, cv=5,return_train_score=True,n_jobs=-1)
grid_search.fit(X_train,y_train)
grid_search.best_params_ #To check the best set of parameters returned


{'max_depth': 12,
 'max_features': 'sqrt',
 'min_samples_leaf': 3,
 'min_samples_split': 15,
 'n_estimators': 80}

In [None]:
n_estimators = [50, 100, 150, 200, 500, 1000]
# number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# max number of level in the trees
max_depth = [2,4,6,8,10]
# min number of samples required to split the node
min_samples_split = [2,4]
# min number of samples 
min_samples_leaf = [1, 2]

In [21]:
clf = RandomForestRegressor(random_state=0, max_features='sqrt', min_samples_leaf=3,min_samples_split=15, max_depth=12, n_estimators=80)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
print(cross_val_scores)
#print(np.mean(cross_val_scores))

[0.28340773 0.27589047 0.28078926 0.15164818 0.24323967 0.302079
 0.26413083 0.17323278 0.36615569 0.32222087]


In [22]:
clf = RandomForestRegressor(random_state=0, max_features='sqrt', min_samples_leaf=3, min_samples_split=15, max_depth=12, n_estimators=80)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

0.32235871717068154


In [23]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [24]:
from sklearn.metrics import mean_squared_error
t= mean_squared_error(y_test, y_pred, squared=False)


In [25]:
#MSE = np.square(np.subtract(y_actual,y_predicted)).mean() 
import math
RMSE = math.sqrt(t)
print(RMSE)

189.63502365623583


In [None]:
#h2o 

In [40]:
clf.fit(X_train, y_train)
X_train.head()
feature_names = X_train.columns
feature_names = list(feature_names)

df_fs = pd.DataFrame(list(zip(feature_names, clf.feature_importances_)))
df_fs.columns = ['columns_name', 'score_feature_importance']
df_fs.sort_values(by=['score_feature_importance'], ascending = False)

Unnamed: 0,columns_name,score_feature_importance
5,experience_ml,0.594259
9,role_group,0.175988
2,country,0.115091
4,experience_prog,0.067443
0,age,0.045657
6,industry,0.001562
1,gender,0.0
3,education,0.0
7,company_size_g,0.0
8,team_size_g,0.0


In [41]:
df1t = df.drop(['gender','education','company_size_g','team_size_g','programming_lang_c'], axis=1)

In [42]:
df1t.head()

Unnamed: 0,age,country,experience_prog,experience_ml,industry,role_group,salary_avg
0,4,10,4,4,1,0,12500.0
1,7,14,5,1,8,2,2500.0
2,2,6,2,0,0,1,27500.0
3,2,5,1,0,8,3,35000.0
4,0,0,2,0,0,1,500.0


In [43]:
X1 = df1t.drop('salary_avg', axis=1)
y1 = df1t['salary_avg']

In [49]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.20, random_state=42)

In [50]:
clf = RandomForestRegressor(max_depth=2, random_state=0)
clf.fit(X1_train, y1_train)
print(clf.score(X1_test, y1_test))

0.14854292656918644


In [51]:
clf = RandomForestRegressor(random_state=0, max_features='sqrt', min_samples_leaf=3, min_samples_split=15, max_depth=12, n_estimators=80)
clf.fit(X1_train, y1_train)
print(clf.score(X1_test, y1_test))

0.32203550656151725


In [47]:
df2t = df.drop(['company_size_g','team_size_g'],axis=1)

In [48]:
X2 = df2t.drop('salary_avg', axis=1)
y2 = df2t['salary_avg']

In [52]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.20, random_state=42)

In [54]:
clf = RandomForestRegressor(max_depth=2, random_state=0)
clf.fit(X2_train, y2_train)
print(clf.score(X2_test, y2_test))

0.14854292656918644


In [55]:
clf = RandomForestRegressor(random_state=0, max_features='sqrt', min_samples_leaf=3, min_samples_split=15, max_depth=12, n_estimators=80)
clf.fit(X2_train, y2_train)
print(clf.score(X2_test, y2_test))

0.31309728630435196
