In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from urllib.request import urlopen
import optuna 
import functools
from scipy import stats
import datetime

def opt(X_train, y_train, X_test, y_test, trial): 
    """Optuna objective: fit one XGBoost regressor and return its hold-out RMSE.

    The first four arguments are meant to be bound up front (e.g. with
    ``functools.partial``) so that Optuna only has to supply ``trial``.

    Parameters
    ----------
    X_train, y_train : training features / target passed to ``fit``.
    X_test, y_test : hold-out features / target used to score the trial.
    trial : optuna trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Root mean squared error of the fitted model on ``(X_test, y_test)``.
    """
    # The suggest_* calls stay in a fixed order so that a seeded sampler
    # produces the same sequence of configurations from run to run.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 400),
        # 'reg:linear' is the deprecated name of this objective; XGBoost
        # renamed it to 'reg:squarederror' (same squared-error loss).
        # Single-choice categoricals are used so that these fixed settings
        # are recorded with the trial and end up in ``study.best_params``.
        'objective': trial.suggest_categorical('objective', ['reg:squarederror']),
        'eval_metric': trial.suggest_categorical('eval_metric', ['rmse']),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
        'learning_rate': trial.suggest_discrete_uniform('learning_rate', 0.1, 0.11, 0.01),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 0.9, 0.1),
    }
    xgboost_tuna = xgb.XGBRegressor(random_state=0, **params)
    xgboost_tuna.fit(X_train, y_train)
    tuna_pred_test = xgboost_tuna.predict(X_test)
    # RMSE = sqrt(MSE); Optuna minimises this value by default.
    return np.sqrt(MSE(y_test, tuna_pred_test))


def weekendflag(df, weekdays):
    """Return a copy of ``df`` with a 0/1 ``weekend`` column appended.

    A row is flagged 1 when the weekday number of its ``date`` value
    (as returned by ``.weekday()``) is contained in ``weekdays``, else 0.
    The input frame itself is left unmodified.
    """
    flagged = df.copy()
    # 'ml' is only a scratch column holding the weekday number; it is
    # removed again before the frame is handed back.
    flagged['ml'] = flagged['date'].map(lambda stamp: stamp.weekday())
    flagged['weekend'] = flagged['ml'].map(lambda day: int(day in weekdays))
    return flagged.drop(columns=['ml'])


def XGBForecaster(df, frequency='hourly', test_path=r'test_pavJagI.csv',
                  output_path='pred_all3.csv', n_trials=100):
    """Tune and fit an XGBoost regressor on ``df`` and forecast the test file.

    Pipeline:
      1. derive calendar and weekend features from ``df['date']``;
      2. replace missing ``value`` entries with the column mean;
      3. tune hyper-parameters with Optuna (objective ``opt``) on a random
         70/30 split, then refit on all rows with the best parameters;
      4. predict every row of ``test_path`` and write the predictions to
         ``output_path``.

    Parameters
    ----------
    df : DataFrame with a ``date`` column, the raw feature columns
        (``temperature``, ``var1``, ``pressure``, ``windspeed``, ``var2``)
        and the target column ``value``. The caller's frame is not modified.
    frequency : only ``'hourly'`` is supported.
    test_path : CSV holding the rows to forecast.
    output_path : CSV file the predictions are written to.
    n_trials : number of Optuna trials.

    Returns
    -------
    DataFrame indexed by date with columns ``date`` and ``value``.

    Raises
    ------
    ValueError
        If ``frequency`` is not ``'hourly'``.
    """
    if frequency != 'hourly':
        # Previously this fell through to ``return data`` with ``data`` unbound.
        raise ValueError("Unsupported frequency %r; only 'hourly' is implemented" % (frequency,))

    feature_cols = ['temperature', 'var1', 'pressure', 'windspeed', 'var2',
                    'hour', 'dayofweek', 'quarter', 'month', 'year',
                    'dayofyear', 'dayofmonth', 'weekofyear', 'weekend']

    # ---- training data -------------------------------------------------
    # Work on a private copy: the caller often passes a filtered slice, and
    # writing to it would mutate their data and raise SettingWithCopyWarning.
    df = df.copy()
    # Callers in this notebook pass an already-parsed datetime column.
    df['date'] = pd.to_datetime(df['date'], format='%d-%m-%Y %H:%M')
    df5 = create_features_hourly(df)
    df5 = weekendflag(df5, [5, 6])
    df5 = df5.sort_values('date')
    df5["value"] = df5["value"].fillna(value=df5["value"].mean())
    # Index the *engineered* frame. Indexing ``df`` here (as before) threw
    # away the weekend flag, the sort and the NaN fill, and made the column
    # selection below fail on the missing 'weekend' column.
    df5 = df5.set_index('date')

    X = df5.loc[:, feature_cols]
    y = df5.loc[:, 'value']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)
    print(y)

    # ---- hyper-parameter search and final fit ---------------------------
    study = optuna.create_study()
    study.optimize(functools.partial(opt, X_train, y_train, X_test, y_test), n_trials=n_trials)
    # Same seed as the models scored inside ``opt`` so the refit matches
    # the configuration that was actually tuned.
    xg_reg5 = xgb.XGBRegressor(random_state=0, **study.best_params)
    m5 = xg_reg5.fit(X, y)

    # ---- rows to forecast -------------------------------------------------
    # The first CSV column is dropped (presumably a row ID - confirm).
    data_pred5 = pd.read_csv(test_path).iloc[:, 1:].copy()
    data_pred5['var2'] = data_pred5['var2'].map({'A': 0, 'B': 1, 'C': 2})
    data_pred5 = data_pred5.rename(columns={'datetime': 'date'})
    data_pred5['date'] = pd.to_datetime(data_pred5['date'], format='%d-%m-%Y %H:%M')
    data_pred5 = create_features_hourly1(data_pred5)
    data_pred5 = weekendflag(data_pred5, [5, 6])

    # Predict month by month (keeps the per-month progress print and the
    # year/month output order), collecting the pieces in a list and
    # concatenating once instead of growing a frame with DataFrame.append.
    monthly_frames = []
    for year in data_pred5['date'].dt.year.unique():
        year_rows = data_pred5[data_pred5['date'].dt.year == year]
        for month in year_rows['date'].dt.month.unique():
            month_rows = year_rows[year_rows['date'].dt.month == month]
            features = month_rows.set_index('date')
            outputDf = pd.DataFrame({'value': m5.predict(features[feature_cols])},
                                    index=features.index)
            # Keep the date both as index and as a regular column, as before.
            outputDf.insert(0, 'date', outputDf.index)
            print(outputDf.head(1))
            monthly_frames.append(outputDf)

    if monthly_frames:
        data = pd.concat(monthly_frames)
    else:
        # Empty test file: still return a frame with the expected columns.
        data = pd.DataFrame(columns=['date', 'value'])
    data.to_csv(output_path)
    return data
  

# Raw columns every input frame must provide, in output order.
_RAW_COLUMNS = ['temperature', 'var1', 'pressure', 'windspeed', 'var2', 'date']
# Calendar columns derived from 'date', in output order.
_CALENDAR_COLUMNS = ['hour', 'dayofweek', 'quarter', 'month', 'year',
                     'dayofyear', 'dayofmonth', 'weekofyear']


def _add_calendar_columns(df):
    """Add the calendar columns derived from ``df['date']`` to ``df`` in place."""
    stamps = df['date'].dt
    df['hour'] = stamps.hour
    df['dayofweek'] = stamps.dayofweek
    df['quarter'] = stamps.quarter
    df['month'] = stamps.month
    df['year'] = stamps.year
    df['dayofyear'] = stamps.dayofyear
    df['dayofmonth'] = stamps.day
    if hasattr(stamps, 'isocalendar'):
        # ``.dt.weekofyear`` was removed in pandas 2.0; ``isocalendar().week``
        # is its replacement. It returns a nullable UInt32 column, so convert
        # back to a plain numpy dtype (float only if missing dates are present).
        iso_week = stamps.isocalendar().week
        df['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        # pandas releases that predate ``isocalendar``.
        df['weekofyear'] = stamps.weekofyear
    return df


def create_features_hourly(df):
    """
    Creates time series features from the datetime column ``date`` (training data).

    Returns a new frame with the raw feature columns, ``date``, the derived
    calendar columns and the target ``value``, in that order.
    Note: the derived columns are also added to the passed-in ``df``.
    """
    _add_calendar_columns(df)
    return df[_RAW_COLUMNS + _CALENDAR_COLUMNS + ['value']]


def create_features_hourly1(df):
    """
    Creates time series features from the datetime column ``date`` (prediction data).

    Same as ``create_features_hourly`` but without the target ``value``
    column, for frames that have no target.
    Note: the derived columns are also added to the passed-in ``df``.
    """
    _add_calendar_columns(df)
    return df[_RAW_COLUMNS + _CALENDAR_COLUMNS]


import os
import sys

# Show the kernel's working directory; the CSV files used in this notebook
# are opened with relative paths, so they are resolved against this folder.
cwd = os.getcwd()
print(cwd, flush=True)


C:\Users\sourav.modi\Documents


In [None]:
class test(object):
    """Thin driver around ``XGBForecaster`` that clamps the forecast values."""

    def runMlAlgorithms(self,inputData2,upperLimit,lowerLimit,frequency='hourly'):
        """Run the forecaster on ``inputData2`` and clamp its predictions.

        Parameters
        ----------
        inputData2 : training frame handed to ``XGBForecaster``.
        upperLimit, lowerLimit : bounds applied to the predicted ``value``
            column (values below ``lowerLimit`` are raised to it first, then
            values above ``upperLimit`` are lowered to it).
        frequency : forwarded to ``XGBForecaster`` (it used to be ignored and
            ``'hourly'`` was always passed).

        Returns
        -------
        The forecast frame with the clamped ``value`` column (previously the
        result was computed and then discarded).
        """
        outputDf = XGBForecaster(inputData2,frequency = frequency)
        outputDf['value'] = np.where(outputDf['value']<lowerLimit,lowerLimit,outputDf['value'])
        outputDf['value'] = np.where(outputDf['value']>upperLimit,upperLimit,outputDf['value'])
        return outputDf


In [None]:
if __name__ == "__main__":
    # Clamp bounds for the forecast: consumption cannot be negative, and the
    # upper bound is large enough to act as "no upper limit".
    UPPER_LIMIT = 10000000000000000
    LOWER_LIMIT = 0

    t = test()
    # The first CSV column is dropped (presumably a row ID - confirm).
    inputData = pd.read_csv(r'train_6BJx641.csv').iloc[:,1:].copy()
    inputData['var2'] = inputData['var2'].map({'A': 0, 'B': 1,'C': 2})
    inputData = inputData.rename(columns = {'datetime':'date', 'electricity_consumption':'value'})
    # Parse the timestamps once; the per-year / per-month subsets below
    # inherit the datetime dtype, so they do not need to be re-parsed.
    inputData['date'] = pd.to_datetime(inputData['date'],format='%Y-%m-%d %H:%M')

    # NOTE: every call below tunes and fits a fresh model on one month of
    # data and rewrites the forecaster's output CSV, so on disk only the
    # predictions of the last month processed remain.
    for year in inputData['date'].dt.year.unique():
        inputData1 = inputData[inputData['date'].dt.year == year]
        for month in inputData1['date'].dt.month.unique():
            # Explicit copy: the forecaster writes columns into the frame it
            # receives, which on a filtered slice raises SettingWithCopyWarning.
            inputData2 = inputData1[inputData1['date'].dt.month == month].copy()
            t.runMlAlgorithms(inputData2,UPPER_LIMIT,LOWER_LIMIT,frequency='hourly')

In [9]:
        import pandas as pd
        from sklearn.model_selection import train_test_split

        # The raw CSV only holds the measured columns; the calendar columns
        # (hour, dayofweek, ...) and the weekend flag selected below have to
        # be derived first, otherwise the column selection raises KeyError.
        # Preparation mirrors the training driver: same rename, same var2
        # encoding, same timestamp format.
        df5 = pd.read_csv(r'train_6BJx641.csv')
        df5 = df5.rename(columns={'datetime': 'date'})
        df5['var2'] = df5['var2'].map({'A': 0, 'B': 1, 'C': 2})
        df5['date'] = pd.to_datetime(df5['date'], format='%Y-%m-%d %H:%M')
        features = weekendflag(create_features_hourly1(df5), [5, 6])

        X = features.loc[:,['temperature','var1','pressure','windspeed','var2','hour','dayofweek','quarter','month','year','dayofyear','dayofmonth','weekofyear','weekend']]
        y=df5.loc[:,'electricity_consumption']
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

float64
