In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
import csv as csv
from xgboost import plot_importance
from matplotlib import pyplot
from sklearn.model_selection import  train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import median_absolute_error, mean_absolute_error
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
sns.set()
%matplotlib inline

In [12]:
data = pd.concat([pd.read_csv('data1.csv'), pd.read_csv('data2.csv'), pd.read_csv('data3.csv')]
                   , axis = 0)

In [14]:
data.drop(data.columns[0], axis=1, inplace=True) ## removing the row index no
data.head()

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count,meter,day_of_week,month_,day_,meter_reading,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,Education,7432,2008.0,,0,0,1,4,0.0,12.666667,2.428571,4.429167,0.666667,1015.621739,318.333333,4.283333
1,0,1,Education,2720,2004.0,,0,0,1,4,0.0,12.666667,2.428571,4.429167,0.666667,1015.621739,318.333333,4.283333
2,0,2,Education,5376,1991.0,,0,0,1,4,0.0,12.666667,2.428571,4.429167,0.666667,1015.621739,318.333333,4.283333
3,0,3,Education,23685,2002.0,,0,0,1,4,0.0,12.666667,2.428571,4.429167,0.666667,1015.621739,318.333333,4.283333
4,0,4,Education,116607,1975.0,,0,0,1,4,0.0,12.666667,2.428571,4.429167,0.666667,1015.621739,318.333333,4.283333


In [15]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=['meter_reading']), 
                                                    data[['meter_reading']], 
                                                    test_size=0.25, 
                                                    random_state=42, shuffle=True)

In [16]:
features = ['site_id','building_id','square_feet','meter','month_','air_temperature','dew_temperature',
            'wind_speed','cloud_coverage','wind_direction','day_','precip_depth_1_hr']

In [17]:
le = LabelEncoder()
X_train.primary_use = le.fit_transform(X_train['primary_use'])
X_test.primary_use = le.transform(X_test['primary_use'])

In [18]:
X_train.columns = [col.rstrip('_') for col in X_train.columns] 
X_test.columns = [col.rstrip('_') for col in X_test.columns] 

In [19]:
def huber_approx_obj(train, preds):
    """
    Function returns gradient and hessein of the Pseudo-Huber function.
    """
    d = preds - train
    h = 1  ## constant
    scale = 1 + (d / h) ** 2
    scale_sqrt = np.sqrt(scale)
    grad = d / scale_sqrt
    hess = 1 / scale / scale_sqrt
    return grad, hess

In [20]:
## define huber loss - minimizing it means maximizing its negative
def huber_loss(preds, train):
    d = preds - train
    h = 1
    return -1 * np.sum(np.sqrt(1 + (d/h)**2) - 1)

In [88]:
#pipeline

num_features = ['site_id', 'building_id', 'primary_use', 'square_feet', 'meter', 'month',
                'day', 'air_temperature', 'dew_temperature', 'wind_speed', 'wind_direction',
                'cloud_coverage', 'precip_depth_1_hr']

num_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', preprocessing.StandardScaler())])


preprocessor = ColumnTransformer(transformers=[('num', num_transformer, num_features)])

regressor = RandomForestRegressor(random_state=42,n_estimators = 100)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regressor)])
        

In [95]:
#crossvalidation

# Number of features to consider at every split
max_features = range(5,14)
# Maximum number of levels in tree
max_depth = [5,8,10,12,15,18,20,22,25]
# Minimum number of samples required to split a node
min_samples_split = np.linspace(0.1, 1.0, 10, endpoint=True)
# Minimum number of samples required at each leaf node
min_samples_leaf = range(1,8)
# Method of selecting samples for training each tree
bootstrap = [True, False]


hyperparameters = {'regressor__max_depth' : max_depth,
           'regressor__max_features': max_features,
           'regressor__min_samples_split': min_samples_split,
           'regressor__min_samples_leaf': min_samples_leaf,
           'regressor__bootstrap': bootstrap}

search = RandomizedSearchCV(pipeline, param_distributions=hyperparameters, n_iter=40, 
                            scoring='neg_median_absolute_error', random_state=42, cv=5, 
                            verbose=1, n_jobs=-1, return_train_score=True)
search.fit(X_train[num_features], y_train)



Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    0.4s finished
  self._final_estimator.fit(Xt, y, **fit_params)


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('preprocessor',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('num',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('imputer',
                                                                                                SimpleImputer(add_indicator=False,
                                                                     

In [96]:
def report_best_scores(results, n_top=3):
    """Function gives hyperparameters for the top n models"""
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [97]:
report_best_scores(search.cv_results_, 10)

Model with rank: 1
Mean validation score: -0.716 (std: 0.076)
Parameters: {'regressor__min_samples_split': 0.5, 'regressor__min_samples_leaf': 2, 'regressor__max_features': 2, 'regressor__bootstrap': True}

Model with rank: 2
Mean validation score: -0.727 (std: 0.069)
Parameters: {'regressor__min_samples_split': 0.5, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 1, 'regressor__bootstrap': True}

Model with rank: 3
Mean validation score: -0.812 (std: 0.095)
Parameters: {'regressor__min_samples_split': 0.6, 'regressor__min_samples_leaf': 3, 'regressor__max_features': 2, 'regressor__bootstrap': True}

Model with rank: 4
Mean validation score: -0.821 (std: 0.104)
Parameters: {'regressor__min_samples_split': 0.8, 'regressor__min_samples_leaf': 2, 'regressor__max_features': 2, 'regressor__bootstrap': True}

Model with rank: 4
Mean validation score: -0.821 (std: 0.104)
Parameters: {'regressor__min_samples_split': 0.7000000000000001, 'regressor__min_samples_leaf': 3, 'regressor_

In [98]:
y_train_pred = search.predict(X_train[num_features]) 
y_test_pred = search.predict(X_test[num_features])

In [99]:
train_medae = median_absolute_error(y_train, y_train_pred)
test_medae = median_absolute_error(y_test, y_test_pred)
print(f'MEDAE: Train = {train_medae:.2f} , Test = {test_medae:.2f}')

MEDAE: Train = 0.65 , Test = 0.88


In [100]:
train_medae = huber_loss(y_train.values.ravel(), y_train_pred)
test_medae = median_absolute_error(y_test.values.ravel(), y_test_pred)
print(f'MEDAE: Train = {train_medae:.2f} , Test = {test_medae:.2f}')

MEDAE: Train = -330.26 , Test = 0.88
