In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import xgboost as xgb
import os

In [2]:
from tqdm import tqdm
import time

In [3]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [4]:
dir_path = '/content/gdrive/My Drive/ME354 Project'

In [5]:
def get_files(dir_path, filename):
    return pd.read_csv(os.path.join(dir_path, filename), header=None)

In [6]:
data = get_files(dir_path=dir_path, filename='data.csv')

In [7]:
data

Unnamed: 0,0,1,2,3,4
0,AT,V,AP,RH,PE
1,14.96,41.76,1024.07,73.17,463.26
2,25.18,62.96,1020.04,59.08,444.37
3,5.11,39.4,1012.16,92.14,488.56
4,20.86,57.32,1010.24,76.64,446.48
...,...,...,...,...,...
9564,16.65,49.69,1014.01,91,460.03
9565,13.19,39.18,1023.67,66.78,469.62
9566,31.32,74.33,1012.92,36.48,429.57
9567,24.48,69.45,1013.86,62.39,435.74


In [8]:
data = data.rename(columns=data.iloc[0])

In [9]:
data = data.drop(0)

In [10]:
data["AT"] = data.AT.astype(float)
data["V"] = data.V.astype(float)
data["AP"] = data.AP.astype(float)
data["RH"] = data.RH.astype(float)
data["PE"] = data.PE.astype(float)

In [11]:
y = data['PE']

In [12]:
X = data.drop(['PE'], axis=1)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=None)

In [14]:
X_train.dtypes

AT    float64
V     float64
AP    float64
RH    float64
dtype: object

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from xgboost import XGBRegressor

In [16]:
model = XGBRegressor(max_depth=5, n_estimators=120)
# evaluate model
model.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=None, n_estimators=120,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [17]:
y_pr= model.predict(X_train)
mean_absolute_error(y_train, y_pr)

2.318480929376439

In [18]:
yt_pr= model.predict(X_test)
mean_absolute_error(y_test, yt_pr)

2.561559850041022

In [19]:
r2_score(y_test, yt_pr)

0.9632783565440705

In [20]:
mean_squared_error(y_test, yt_pr)

10.864315088228356

In [21]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

In [30]:
xgb1 = XGBRegressor()
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 500, stop = 1500, num = 10)]
# Number of features to consider at every split
max_features = [24
                ,25,23]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 35, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [8, 6, 7]
# Minimum number of samples required at each leaf node
min_samples_leaf = [4, 5, 3]
gamma =[0.013,0.01275,0.0125]
learning_rate=[0.065,0.0675,0.07]
# Method of selecting samples for training each tree
bootstrap = [True, False]
min_child_weight= [1,2,3]
alpha=[0.00275, 0.0025,0.00225]
colsample_bytree = [0.75, 0.7,0.8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'gamma' : gamma,
               'learning_rate' : learning_rate,
               'bootstrap' : bootstrap,
               'min_child_weight' : min_child_weight,
               'alpha' : alpha,
               'colsample_bytree' : colsample_bytree}
print(random_grid)

{'n_estimators': [500, 611, 722, 833, 944, 1055, 1166, 1277, 1388, 1500], 'max_features': [24, 25, 23], 'max_depth': [5, 8, 11, 14, 17, 20, 23, 26, 29, 32, 35, None], 'min_samples_split': [8, 6, 7], 'min_samples_leaf': [4, 5, 3], 'gamma': [0.013, 0.01275, 0.0125], 'learning_rate': [0.065, 0.0675, 0.07], 'bootstrap': [True, False], 'min_child_weight': [1, 2, 3], 'alpha': [0.00275, 0.0025, 0.00225], 'colsample_bytree': [0.75, 0.7, 0.8]}


In [31]:
rf_random = RandomizedSearchCV(estimator = xgb1, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 18.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 33.8min finished




RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=3, min_child_weight=1,
                                          missing=None, n_estimators=100,
                                          n_jobs=1, nthread=None,
                                          objective='reg:linear',
                                          random_state=0, reg_alpha=...
                                        'learning_rate': [0.065, 0.0675, 0.07],
                                        'max_depth': [5, 8, 11, 14, 17, 20, 23,
                            

In [33]:
rf_random.best_params_

{'alpha': 0.0025,
 'bootstrap': True,
 'colsample_bytree': 0.75,
 'gamma': 0.01275,
 'learning_rate': 0.0675,
 'max_depth': 8,
 'max_features': 24,
 'min_child_weight': 2,
 'min_samples_leaf': 4,
 'min_samples_split': 7,
 'n_estimators': 1055}

In [34]:
model = XGBRegressor(alpha= 0.0025,
 bootstrap= True,
 colsample_bytree= 0.75,
 gamma= 0.01275,
 learning_rate= 0.0675,
 max_depth= 8,
 max_features= 24,
 min_child_weight= 2,
 min_samples_leaf= 4,
 min_samples_split=7,
 n_estimators= 1055)
# evaluate model
model.fit(X_train, y_train)



XGBRegressor(alpha=0.0025, base_score=0.5, booster='gbtree', bootstrap=True,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.75,
             gamma=0.01275, importance_type='gain', learning_rate=0.0675,
             max_delta_step=0, max_depth=8, max_features=24, min_child_weight=2,
             min_samples_leaf=4, min_samples_split=7, missing=None,
             n_estimators=1055, n_jobs=1, nthread=None, objective='reg:linear',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [35]:
y_pr= model.predict(X_train)
mean_absolute_error(y_train, y_pr)

0.5430322713779661

In [36]:
yt_pr= model.predict(X_test)
mean_absolute_error(y_test, yt_pr)

1.978136399606926

In [37]:
r2_score(y_test, yt_pr)

0.9758898190290755

In [38]:
np.sqrt(mean_squared_error(y_test, yt_pr))

2.6707937829198585

In [None]:
model = XGBRegressor()
# evaluate the model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

MAE: -3.003 (0.071)
