In [1]:
# ✅ to remove, just for testing models
import math
import pandas as pd
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from xgboost.sklearn import XGBRegressor

import numpy as np
from sklearn.metrics import  mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import get_scorer, make_scorer
from interpret.glassbox import ExplainableBoostingRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
df_preprocessed = pd.read_csv(r"data_preprocessed\with_outliers")

# Models

In [3]:
## Train / test split 
y = df_preprocessed.pop('average_rating')

X_train, X_test, y_train, y_test = train_test_split(df_preprocessed, y, test_size=0.3, random_state=42)

In [4]:
## Result Tab
global_result = pd.DataFrame(columns=['model name','RMSE','mae','r2_score'])

## Baseline

In [5]:
len(y_test)

3339

In [6]:
mean_ratings = y_test.mean().round(2)
y_pred = np.full((len(y_test)), mean_ratings)
y_pred

array([3.93, 3.93, 3.93, ..., 3.93, 3.93, 3.93])

##  Random forest

In [7]:
parameters = {
    'n_estimators': [50, 100, 200],
    'max_depth': [7, 10, 12, 15],
    'min_samples_split': [2,5],
    'min_samples_leaf': [5, 10, 15]
}
regr = RandomForestRegressor(random_state=0)

In [8]:
clf = GridSearchCV(estimator = regr, param_grid = parameters,cv=3,verbose=2, n_jobs = -1)

In [9]:
clf.fit(X_train,y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


In [10]:
clf.best_params_

{'max_depth': 15,
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'n_estimators': 200}

In [11]:
best_rf = clf.best_estimator_
y_pred = clf.predict(X_test)

In [12]:
importance_feature = pd.DataFrame(best_rf.feature_importances_,index=X_train.columns,columns=['Importance']).sort_values(by=['Importance'],ascending=False)
importance_feature.sort_values(by=['Importance'],ascending=True,inplace=True)
importance_feature

Unnamed: 0,Importance
times_d1,0.00391
times_d2,0.004779
editions_by_work,0.006861
people_d1,0.00733
is_series,0.00894
people_d2,0.008982
places_d2,0.012698
places_d1,0.013936
text_reviews_count,0.020405
work_ratings_count,0.041147


In [13]:
plt.figure(figsize=(18,8))
plt.barh(y=importance_feature.index,width=importance_feature.values.reshape((-1,)))
plt.show()

NameError: name 'plt' is not defined

In [None]:
### Metrics
# calculate errors
errors = mean_squared_error(y_test,y_pred)
rmse = math.sqrt(errors)
print(f'RMSE : {rmse}')
mae = mean_absolute_error(y_test,y_pred)
print(f'mae : {mae}')
r2 = r2_score(y_test,y_pred).round(2)
print(f'r2_score : {r2.round(2)}')

# Create dictionnary to aggregte all result 
dict = {'model name' : 'Random Forest','RMSE':rmse,'mae':mae,'r2_score':r2 }
global_result.loc[len(global_result)] = pd.Series(data=dict,index=dict.keys())


RMSE : 0.25939264882569735
mae : 0.18935588668323555
r2_score : 0.21


In [None]:
pred = pd.DataFrame({'Actual': y_test.tolist(), 'Predicted': y_pred.tolist()}).head(25)
pred.head(5)

Unnamed: 0,Actual,Predicted
0,3.92,3.989234
1,3.58,3.694723
2,4.13,3.974168
3,3.73,3.838373
4,4.05,3.76585


## XGBoost

In [None]:
xgb = XGBRegressor()

In [None]:
parameters = {'nthread':[6],
              'objective':['reg:linear'],
              'learning_rate': [0.01,.03], #so called `eta` value
              'max_depth': [5,7,15],
              'subsample': [0.5],
              'n_estimators': [500,1000]}

In [None]:
xgb_grid = GridSearchCV(xgb,
                        parameters,
                        cv = 2,
                        n_jobs = 5,
                        verbose=True)

In [None]:
xgb_grid.fit(X_train,y_train)

Fitting 2 folds for each of 12 candidates, totalling 24 fits


In [None]:
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

0.17544607457941103
{'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 1000, 'nthread': 6, 'objective': 'reg:linear', 'subsample': 0.5}


In [None]:
optimal_xgb = xgb_grid.best_estimator_

In [None]:
y_pred = optimal_xgb.predict(X_test)

In [None]:
### Metrics
# calculate errors
errors = mean_squared_error(y_test,y_pred)
rmse = math.sqrt(errors)
print(f'RMSE : {rmse}')
mae = mean_absolute_error(y_test,y_pred)
print(f'mae : {mae}')
r2 = r2_score(y_test,y_pred).round(2)
print(f'r2_score : {r2.round(2)}')

# Create dictionnary to aggregte all result 
dict = {'model name' : 'XGBoost','RMSE':rmse,'mae':mae,'r2_score':r2 }
global_result.loc[len(global_result)] = pd.Series(data=dict,index=dict.keys())

pred = pd.DataFrame({'Actual': y_test.tolist(), 'Predicted': y_pred.tolist()}).head(25)

RMSE : 0.258106578611859
mae : 0.18755382690789565
r2_score : 0.22


In [None]:
#script for cross validation



#function to preview the results
def preview_metrics(YTrue, YPred):
        
        MAE = mean_absolute_error(YTrue, YPred)
        MSE = mean_squared_error(YTrue, YPred)
        R2 = r2_score(YTrue, YPred)
               
        print("MAE", MAE.round(4))
        print("RMSE", np.sqrt(MSE).round(4))
        print("R2:",R2.round(2))


models = [
     
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    XGBRegressor(),
    AdaBoostRegressor(),
    ExplainableBoostingRegressor(),
    MLPRegressor(),
    #GaussianProcessRegressor()
]

models_names = [ 
    
       
    'Linear Regression',
    'Decision Tree',
    'Random Forest',
    'XGB',
    'Ada Boost',
    'Explainable Boosting', 
    'MLP',
    #'GaussianProcessRegressor'
        
]



for model, name in zip(models, models_names):
        
    #fit model
    #model.fit(X_train,y_train)
    
    scoring = ['r2','neg_mean_absolute_error','neg_root_mean_squared_error']
    scores = cross_validate(model, X_train,y_train, cv=10, scoring=scoring, return_estimator=True)
    #print("HERE ScorEs",scores['test_neg_root_mean_squared_error'])    
    INDEX = np.argmax(scores['test_r2'])    
    #print('INDEX', INDEX)    
    bestModel = scores['estimator'][INDEX]
    pred_y_test = bestModel.predict(X_test)
    pred_y_train = bestModel.predict(X_train)
       
    print("\n***************************")
    print("*****", name ,'******')
    print("***************************")
    #print("\n******", "Training Results" ,'******\n')
    #preview_metrics(y_train, pred_y_train)  
    print("******", "Testing Results" ,'******\n')
    preview_metrics(y_test, pred_y_test)




***************************
***** Linear Regression ******
***************************
****** Testing Results ******

MAE 0.2262
RMSE 0.3677
R2: 0.05

***************************
***** Decision Tree ******
***************************
****** Testing Results ******

MAE 0.2827
RMSE 0.4383
R2: -0.35


KeyboardInterrupt: 

In [None]:
from sklearn_genetic.space import Categorical, Integer, Continuous
from sklearn.model_selection import  StratifiedKFold
from sklearn_genetic import GASearchCV


clf =  RandomForestRegressor()
#Choose the parameters range for the genetic algorithm
#This part will change according to the classifier
param_grid = { 
            #'C': Continuous(0.0,10.0),
              #'kernel': Categorical(['linear', 'poly', 'rbf', 'sigmoid']),
              #'gamma': Categorical(['scale', 'auto']),
              #'degree': Integer(3, 5),
              #'gamma': Continuous(0.0, 10.0),
              #'shrinking': Categorical([True, False]),
              #'probability': Categorical([True, False]),
              #'break_ties': Categorical([True, False]),
              #'tol':Continuous(0.0,0.1)
              #'decision_function_shape': Categorical(['ovr']),
              #'kernel': Categorical([True, False]),
              'max_depth': Integer(10, 100),
              #'max_leaf_nodes': Integer(2, 35),
              'n_estimators': Integer(100, 500),
              'min_samples_split': Integer(2, 50),
              'min_samples_leaf': Integer(5,50),
    
            }


evolved_estimator = GASearchCV(estimator=clf,
                               cv=10,
                               scoring='r2',
                               population_size=5,
                               generations=2,
                               tournament_size=3,
                               elitism=True,
                               crossover_probability=0.8,
                               mutation_probability=0.1,
                               param_grid=param_grid,
                               criteria='max',
                               #algorithm='eaMuPlusLambda',
                               n_jobs=-1,
                               verbose=True,
                               keep_top_k=4)


evolved_estimator.fit(X_train,y_train)

print(evolved_estimator.best_params_)

pred_y_test = evolved_estimator.predict(X_test)

print("******", "Testing Results" ,'******\n')
preview_metrics(y_test, pred_y_test)

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	5     	0.202089	0.0123714  	0.215418   	0.178726   
1  	10    	0.214516	0.00440196 	0.219009   	0.206075   
2  	9     	0.21734 	0.00162335 	0.219379   	0.215418   
{'max_depth': 63, 'n_estimators': 295, 'min_samples_split': 5, 'min_samples_leaf': 18}
****** Testing Results ******

MAE 0.2055
RMSE 0.3195
R2: 0.28
