In [69]:
import pandas as pd
import numpy as np
import datetime as dt
import sklearn as sklearn
from sklearn.impute import SimpleImputer
import dateparser
#!pip install xgboost
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import imageio
import os

Read the Preprocessed CSV File

In [73]:
df= pd.read_csv("final_data.csv")
cols=list(df.columns)
cols

['Unnamed: 0', 'Datetime', 'AEP_MW']

In [74]:
df=pd.DataFrame.drop(df,labels=cols[0],axis=1)
cols=list(df.columns)
cols[0]


'Datetime'

In [75]:
df[cols[0]]=pd.to_datetime(df[cols[0]], infer_datetime_format=True)
df

Unnamed: 0,Datetime,AEP_MW
0,2004-10-01 01:00:00,12379.0
1,2004-10-01 02:00:00,11935.0
2,2004-10-01 03:00:00,11692.0
3,2004-10-01 04:00:00,11597.0
4,2004-10-01 05:00:00,11681.0
...,...,...
98611,2015-12-31 20:00:00,15030.0
98612,2015-12-31 21:00:00,14642.0
98613,2015-12-31 22:00:00,14212.0
98614,2015-12-31 23:00:00,13843.0


In [76]:
cols=list(df.columns)
cols
df.dtypes

Datetime    datetime64[ns]
AEP_MW             float64
dtype: object

Create Features 

In [77]:
def hours(x):
    return x.hour
def minutes(x):
    return x.minute
def seconds(x):
    return x.second
def years(x):
    return x.year
def months(x):
    return x.month
def days(x):
    return x.day
def quarters(x):
    return x.quarter
def daysofweek(x):
    return x.dayofweek
def weeksofyear(x):
    return x.weekofyear
def daysofyear(x):
    return x.dayofyear

In [78]:
def create_features(df):
    """
    Creates time series features from datetime index
    """
    df['hour'] = df[cols[0]].apply(hours)
    df['dayofweek'] = df[cols[0]].apply(daysofweek)
    df['quarter'] = df[cols[0]].apply(quarters)
    df['month'] = df[cols[0]].apply(months)
    df['year'] = df[cols[0]].apply(years)
    df['dayofmonth'] = df[cols[0]].apply(days)
    df['weekofyear'] = df[cols[0]].apply(weeksofyear)
    df['dayofyear'] = df[cols[0]].apply(daysofyear)
    
    X = df[['hour','dayofweek','quarter','month','year',
           'dayofmonth','weekofyear','dayofyear']]
    return X

In [79]:
create_features(df)

Unnamed: 0,hour,dayofweek,quarter,month,year,dayofmonth,weekofyear,dayofyear
0,1,4,4,10,2004,1,40,275
1,2,4,4,10,2004,1,40,275
2,3,4,4,10,2004,1,40,275
3,4,4,4,10,2004,1,40,275
4,5,4,4,10,2004,1,40,275
...,...,...,...,...,...,...,...,...
98611,20,3,4,12,2015,31,53,365
98612,21,3,4,12,2015,31,53,365
98613,22,3,4,12,2015,31,53,365
98614,23,3,4,12,2015,31,53,365


In [80]:
total_rows = df[cols[1]].size

In [86]:
train=df.loc[0:round(0.8*total_rows),]
train=train[[cols[0],cols[1]]]

In [89]:
test=df.loc[(round(0.8*total_rows)+1):,]
test=test[[cols[0],cols[1]]]

In [90]:
test

Unnamed: 0,Datetime,AEP_MW
78894,2013-10-01 07:00:00,13104.0
78895,2013-10-01 08:00:00,13905.0
78896,2013-10-01 09:00:00,13955.0
78897,2013-10-01 10:00:00,14186.0
78898,2013-10-01 11:00:00,14640.0
...,...,...
98611,2015-12-31 20:00:00,15030.0
98612,2015-12-31 21:00:00,14642.0
98613,2015-12-31 22:00:00,14212.0
98614,2015-12-31 23:00:00,13843.0


Unnamed: 0,Datetime,AEP_MW
0,2004-10-01 01:00:00,12379.0
1,2004-10-01 02:00:00,11935.0
2,2004-10-01 03:00:00,11692.0
3,2004-10-01 04:00:00,11597.0
4,2004-10-01 05:00:00,11681.0
...,...,...
78889,2013-10-01 02:00:00,11279.0
78890,2013-10-01 03:00:00,10980.0
78891,2013-10-01 04:00:00,10921.0
78892,2013-10-01 05:00:00,11047.0


In [91]:
X_train, y_train = create_features(train), train[cols[1]]
X_test, y_test   = create_features(test), test[cols[1]]

X_train.shape, y_train.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/panda

((78894, 8), (78894,))

In [98]:
reg = xgb.XGBRegressor(n_estimators=1000)
model=reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        early_stopping_rounds=50, #stop if 50 consequent rounds without decrease of error
        verbose=False)

In [99]:
print(model)

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=1000, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
             validate_parameters=False, verbosity=None)


In [100]:
xgb_model=model.save_model
