In [1]:
import numpy as np
import sklearn
import pandas as pd

In [2]:
# Load the training data and report how many rows it holds.
df = pd.read_csv("train_electricity.csv")
print("Dataset has", len(df), "entries.")

# Per-column overview table: column name, dtype, smallest and largest value.
header = f"\n\t{'Column':20s} | {'Type':8s} | {'Min':12s} | {'Max':12s}\n"
print(header)
for col_name, col in df.items():
    dtype_name = str(col.dtype)
    lowest, highest = col.min(), col.max()
    print(f"\t{col_name:20s} | {dtype_name:8s} | {lowest:12.1f} | {highest:12.1f}")

Dataset has 419403 entries.

	Column               | Type     | Min          | Max         

	Date                 | int64    | 1262487660.0 | 1514947775.0
	Consumption_MW       | float64  |         44.0 |      26209.0
	Coal_MW              | float64  |       -485.0 |       5702.0
	Gas_MW               | float64  |       -414.0 |       2666.0
	Hidroelectric_MW     | float64  |          0.0 |       4728.0
	Nuclear_MW           | float64  |          0.0 |       1450.0
	Wind_MW              | float64  |       -521.0 |       7944.0
	Solar_MW             | float64  |         -6.0 |        859.0
	Biomass_MW           | float64  |          0.0 |        110.0
	Production_MW        | float64  |          0.0 |      11295.0


In [3]:
## 2. Adding some datetime related features

def add_datetime_features(df):
    """Return a copy of ``df`` extended with calendar features derived from ``Date``.

    ``Date`` is interpreted as a Unix timestamp in seconds. The returned frame
    gains a ``Datetime`` column (the parsed timestamp, kept for later
    filtering), plain numeric/boolean calendar columns, and one-hot encoded
    ``Month_1..Month_12`` and ``Dayofweek_0..Dayofweek_6`` columns.

    The one-hot columns are always emitted in full, even for months or weekdays
    that do not occur in ``df``, so frames covering different date ranges end
    up with the same column set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``Date`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the added features.
    """
    features = ["Year", "Week", "Day", "Dayofyear", "Month", "Dayofweek",
                "Is_year_end", "Is_year_start", "Is_month_end", "Is_month_start",
                "Hour", "Minute",]
    # Fixed category sets keep the dummy columns independent of the data seen.
    one_hot_categories = {
        "Month": list(range(1, 13)),
        "Dayofweek": list(range(7)),
    }

    df = df.copy()  # do not touch the caller's frame
    datetime = pd.to_datetime(df["Date"], unit="s")

    df['Datetime'] = datetime  # We won't use this for training, but we'll remove it later

    for feature in features:
        if feature == "Week" and hasattr(datetime.dt, "isocalendar"):
            # `.dt.week` was removed from recent pandas; isocalendar() is its replacement.
            new_column = datetime.dt.isocalendar().week.astype("int64")
        else:
            new_column = getattr(datetime.dt, feature.lower())

        if feature in one_hot_categories:
            as_category = new_column.astype(
                pd.CategoricalDtype(categories=one_hot_categories[feature])
            )
            dummies = pd.get_dummies(as_category, prefix=feature, dtype="uint8")
            df = pd.concat([df, dummies], axis=1)
        else:
            df[feature] = new_column
    return df

# Enrich the training frame with the calendar features, then show the resulting columns.
df = df.pipe(add_datetime_features)
df.columns

Index(['Date', 'Consumption_MW', 'Coal_MW', 'Gas_MW', 'Hidroelectric_MW',
       'Nuclear_MW', 'Wind_MW', 'Solar_MW', 'Biomass_MW', 'Production_MW',
       'Datetime', 'Year', 'Week', 'Day', 'Dayofyear', 'Month_1', 'Month_2',
       'Month_3', 'Month_4', 'Month_5', 'Month_6', 'Month_7', 'Month_8',
       'Month_9', 'Month_10', 'Month_11', 'Month_12', 'Dayofweek_0',
       'Dayofweek_1', 'Dayofweek_2', 'Dayofweek_3', 'Dayofweek_4',
       'Dayofweek_5', 'Dayofweek_6', 'Is_year_end', 'Is_year_start',
       'Is_month_end', 'Is_month_start', 'Hour', 'Minute'],
      dtype='object')

In [4]:
## 3. Split data into train / validation (leaving the last six months for validation)

from dateutil.relativedelta import relativedelta

# Everything from six calendar months before the latest timestamp onwards is held out.
six_months = relativedelta(months=6)
eval_from = df['Datetime'].max() - six_months

before_cutoff = df['Datetime'] < eval_from
from_cutoff = df['Datetime'] >= eval_from
train_df = df.loc[before_cutoff]
valid_df = df.loc[from_cutoff]

# Report the time span and size of each partition.
print(f"Train data: {train_df['Datetime'].min()} -> {train_df['Datetime'].max()} | {len(train_df)} samples.")
print(f"Valid data: {valid_df['Datetime'].min()} -> {valid_df['Datetime'].max()} | {len(valid_df)} samples.")

Train data: 2010-01-03 03:01:00 -> 2017-07-03 02:43:44 | 392461 samples.
Valid data: 2017-07-03 02:53:34 -> 2018-01-03 02:49:35 | 26942 samples.


In [7]:
## 4. Prepare data for XGBoosting (DataFrame --> DMatrix)

import xgboost as xgb

# Target column, plus the columns that must not be fed to the model as features.
label_col = "Consumption_MW"
to_drop = [label_col, "Date", "Datetime"]

# One DMatrix per partition: features are everything not in `to_drop`,
# labels are the target column of the same partition.
xg_trn_data = xgb.DMatrix(
    data=train_df.drop(to_drop, axis="columns"),
    label=train_df[label_col],
)
xg_vld_data = xgb.DMatrix(
    data=valid_df.drop(to_drop, axis="columns"),
    label=valid_df[label_col],
)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


In [8]:
## 5. Train (mostly with default parameters; it overfits like hell)
num_round = 300  # number of boosting rounds

# Choose the objective name from the installed xgboost version.
# NOTE: this is a plain string comparison of the version, not a numeric one.
if xgb.__version__ > '0.82':
    objective = "reg:squarederror"
else:
    objective = 'reg:linear'

xgb_param = dict(objective=objective, eta=0.1, booster='gbtree', max_depth=5)

# Both partitions are evaluated after every round so their RMSE can be compared.
watchlist = [(xg_trn_data, "train"), (xg_vld_data, "valid")]

bst = xgb.train(xgb_param, xg_trn_data, num_boost_round=num_round, evals=watchlist)

[0]	train-rmse:6003	valid-rmse:6268.89
[1]	train-rmse:5405	valid-rmse:5676.09
[2]	train-rmse:4866.97	valid-rmse:5145.04
[3]	train-rmse:4382.97	valid-rmse:4668.05
[4]	train-rmse:3947.54	valid-rmse:4237.93
[5]	train-rmse:3555.93	valid-rmse:3846.19
[6]	train-rmse:3203.62	valid-rmse:3496.25
[7]	train-rmse:2886.87	valid-rmse:3184.39
[8]	train-rmse:2602	valid-rmse:2898.13
[9]	train-rmse:2345.94	valid-rmse:2639.74
[10]	train-rmse:2115.87	valid-rmse:2405.6
[11]	train-rmse:1909.15	valid-rmse:2193.43
[12]	train-rmse:1723.3	valid-rmse:2006.79
[13]	train-rmse:1556.63	valid-rmse:1840.91
[14]	train-rmse:1406.97	valid-rmse:1678.69
[15]	train-rmse:1272.7	valid-rmse:1542.74
[16]	train-rmse:1152.28	valid-rmse:1421.52
[17]	train-rmse:1044.56	valid-rmse:1303.82
[18]	train-rmse:948.282	valid-rmse:1203.31
[19]	train-rmse:861.91	valid-rmse:1112.92
[20]	train-rmse:785.227	valid-rmse:1038.98
[21]	train-rmse:716.919	valid-rmse:970.202
[22]	train-rmse:655.579	valid-rmse:906.738
[23]	train-rmse:601.609	valid-rmse

[190]	train-rmse:168.971	valid-rmse:310.033
[191]	train-rmse:168.851	valid-rmse:309.838
[192]	train-rmse:168.473	valid-rmse:309.723
[193]	train-rmse:168.3	valid-rmse:309.348
[194]	train-rmse:168.014	valid-rmse:307.924
[195]	train-rmse:167.937	valid-rmse:307.474
[196]	train-rmse:167.662	valid-rmse:307.039
[197]	train-rmse:167.478	valid-rmse:307.257
[198]	train-rmse:167.232	valid-rmse:306.921
[199]	train-rmse:167.107	valid-rmse:306.674
[200]	train-rmse:166.888	valid-rmse:306.415
[201]	train-rmse:166.548	valid-rmse:306.776
[202]	train-rmse:166.285	valid-rmse:306.886
[203]	train-rmse:166.26	valid-rmse:306.878
[204]	train-rmse:165.925	valid-rmse:306.751
[205]	train-rmse:165.882	valid-rmse:306.64
[206]	train-rmse:165.738	valid-rmse:306.313
[207]	train-rmse:165.629	valid-rmse:306.141
[208]	train-rmse:165.309	valid-rmse:305.995
[209]	train-rmse:165.136	valid-rmse:305.813
[210]	train-rmse:164.884	valid-rmse:305.838
[211]	train-rmse:164.589	valid-rmse:304.49
[212]	train-rmse:164.416	valid-rmse:3

In [1]:
## 6. Read test dataset, use the bst for prediction, save submission csv

# NOTE(review): the training file is read from the working directory while this
# one is read from "input/" - confirm both paths match the actual data layout.
test_df = pd.read_csv("input/test_electricity.csv")
test_df = add_datetime_features(test_df)

# The booster expects exactly the feature columns it was trained on, in the same
# order. One-hot columns may be absent when the test period does not cover every
# month/weekday, so align explicitly instead of relying on the columns matching.
test_features = test_df.drop(columns=["Date", "Datetime"])
expected_features = bst.feature_names or list(test_features.columns)
missing = [name for name in expected_features if name not in test_features.columns]
unexpected_missing = [
    name for name in missing if not name.startswith(("Month_", "Dayofweek_"))
]
if unexpected_missing:
    # Only absent one-hot indicators can safely default to 0; anything else is a data problem.
    raise ValueError(f"Test data lacks training features: {unexpected_missing}")
test_features = test_features.reindex(columns=expected_features, fill_value=0)

xgb_test_data = xgb.DMatrix(test_features)

# The submission holds the original timestamp next to the predicted consumption.
solution_df = pd.DataFrame(test_df["Date"])
solution_df["Consumption_MW"] = bst.predict(xgb_test_data)
solution_df.to_csv("sample_submission.csv", index=False)
print("Done!")

NameError: name 'pd' is not defined