# Questions to the data
1. Does the number of houses in each house group remain stable, or was a construction site completed within the given time frame?
2. What was the weather in the region?
3. What were the prices of gas, oil, and electricity?

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from typing import List, Tuple, Any

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import warnings
from data.starting_kit.utils import create_submission
warnings.filterwarnings("ignore")

In [9]:
# Load the raw training table. Besides `pseudo_id`, the column labels are
# timestamp strings (they are parsed as such in the next cell).
data = pd.read_csv("../data/public_data/train.csv")
# Keep the identifiers aside; they become the row index of the submission frame.
final_index = data["pseudo_id"]

KeyboardInterrupt: 

In [None]:
# Work on a copy without the identifier column so that only measurement
# columns remain for feature preparation.
data_ = data.drop(columns=["pseudo_id"])
# Turn the string column labels into datetime objects.
data_.columns = [
    datetime.strptime(label, "%Y-%m-%d %H:%M:%S")
    for label in data_.columns
]
data_.head()

# GROUP BY DAY

In [None]:
# Transpose so that timestamps become the row index, then sum all readings
# that fall on the same calendar date (one row per day afterwards).
data_ = data_.T.pipe(lambda by_time: by_time.groupby(by_time.index.date).sum())
data_.head()

In [None]:
# Relabel the columns with consecutive integers starting at 1.
data_.columns = list(range(1, len(data_.columns) + 1))
data_.head()

In [None]:
# Check whether the dates are in ascending order; this matters for the
# time-series features (rolling windows, lags) computed later.
# `Index.is_monotonic` was removed in pandas 2.0; `is_monotonic_increasing`
# is the equivalent property and is available in older versions as well.
print(data_.index.is_monotonic_increasing)

In [None]:
# Daily calendar spanning the whole development phase.
new_date_range = pd.date_range("2017-01-01", "2019-03-31", freq="D")
# Align the frame to that calendar; dates missing from the training data
# (the ones to be predicted) show up as all-NaN rows.
data_ = data_.reindex(index=new_date_range)
# using dummy values in test set , fill_value = 100
# df_ = df_.T

#data_.head()

In [None]:
# Remember which dates have no value yet. Column 1 is used as the probe,
# i.e. a date counts as a test date when column 1 is NaN on that day.
idx_test_date = data_.loc[data_[1].isna()].index
idx_test_date

In [None]:
# Fill every gap with the next available later value so that the feature
# code below never sees NaN. `fillna(method="backfill")` is deprecated in
# recent pandas; `bfill()` performs the same fill.
data_ = data_.bfill()

In [None]:
#plt.rcParams.update({'figure.figsize':(9,3), 'figure.dpi':160})
#data_[1].plot()
#plt.title('Energy use forecasts for houshold group' + " 1")
#plt.show()

In [None]:
# Per-column count of missing values and the same figure as a percentage
# of the number of rows.
(
    data_.isna()
    .sum()
    .to_frame()
    .assign(perc=lambda counts: 100 * counts[0] / data_.shape[0])
    .rename(columns={0: 'Number of missed data  points', 'perc': '% of missed data points'})
)

# Adding features

In [None]:
# Independent copy of the daily frame, extended with calendar features
# derived from the date index.
df = data_.copy(deep=True).assign(
    weekday=data_.index.weekday,
    dayofyear=data_.index.dayofyear,
)

In [None]:
# Saturday (5) and Sunday (6) are flagged as weekend.
df["is_weekend"] = (data_.index.weekday >= 5).astype(np.int32)
df["month"] = data_.index.month
# Map months to a season number 1..4: Dec-Feb -> 1, Mar-May -> 2,
# Jun-Aug -> 3, Sep-Nov -> 4.
df["season"] = (3 + data_.index.month % 12) // 3

In [None]:
def create_features(houshold_id, df=None) -> pd.DataFrame:
    """Build the model input frame for one household column.

    Args:
        houshold_id: Label of the household column in ``df``.
        df: Frame holding the household column plus the calendar columns
            ``weekday``, ``dayofyear``, ``is_weekend``, ``season`` and
            ``month``. When omitted, the module-level ``df`` is looked up at
            call time, so re-running the cell that rebuilds ``df`` can never
            leave this function reading a stale frame.

    Returns:
        A new frame (the input is not modified) with the household column,
        the calendar columns, a 7-row rolling ``std`` and ``mean`` of the
        household column, and ``lag_1`` .. ``lag_7``. Leading NaNs produced
        by the rolling window and the shifts are back-filled.
    """
    if df is None:
        df = globals()["df"]

    base_columns = [houshold_id, "weekday", "dayofyear", "is_weekend", "season", "month"]
    # Explicit copy: new columns are added below and must not touch `df`.
    df_new = df[base_columns].copy()
    usage = df_new[houshold_id]

    # NOTE(review): the rolling window includes the current row, so `std` and
    # `mean` contain the value being predicted -- confirm this is intended.
    weekly_window = usage.rolling(7)
    df_new["std"] = weekly_window.std().bfill()
    df_new["mean"] = weekly_window.mean().bfill()

    for lag in range(1, 8):
        df_new[f"lag_{lag}"] = usage.shift(lag).bfill()
    return df_new

In [None]:
def get_weeks(idx) -> List[List[pd.Timestamp]]:
    """Split a sequence of dates into consecutive chunks of seven entries.

    The previous implementation only stored a chunk when the *next* entry
    arrived, so the final complete week was never returned. Every complete
    group of seven is now included.

    Args:
        idx: Iterable of dates (any values work), in the order in which they
            should be grouped.

    Returns:
        A list of 7-element lists. A trailing group with fewer than seven
        entries is left out, as before, because the code that consumes the
        weeks addresses the seventh day of each one.
    """
    days = list(idx)
    days_per_week = 7
    complete_weeks = len(days) // days_per_week
    return [
        days[start:start + days_per_week]
        for start in range(0, complete_weeks * days_per_week, days_per_week)
    ]

In [None]:
# Group the dates that still need a prediction into lists of days, one list per week.
weeks_test_date = get_weeks(idx_test_date)

In [None]:
def data_split_accumlated(result, weeks, n, houshold) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.DataFrame]:
    """Split a feature frame into training rows and the rows of one target week.

    Training data is everything dated strictly before the first day of
    ``weeks[n]``; the rows to predict are those from the first to the last
    day of ``weeks[n]`` (inclusive).

    Args:
        result: Feature frame indexed by date, containing column ``houshold``.
        weeks: List of weeks, each a list of dates in ascending order.
        n: Position of the week to predict within ``weeks``.
        houshold: Label of the target column in ``result``.

    Returns:
        A 4-tuple ``(train_features, train_target, predict_features,
        predict_target)``. ``train_target`` is a Series; ``predict_target``
        is a one-column DataFrame so callers can attach further columns.
    """
    week_days = weeks[n]
    print(week_days)
    # Use the last listed day rather than a fixed position so that the
    # window always matches the week that was passed in.
    week_start, week_end = week_days[0], week_days[-1]

    history = result[result.index < week_start]
    result_splitted_features = history.drop([houshold], axis=1)
    result_splitted_target = history[houshold]
    print("target: ", result_splitted_target)

    in_week = (result.index >= week_start) & (result.index <= week_end)
    to_predict = result[in_week]
    result_splitted_to_predict_features = to_predict.drop([houshold], axis=1)
    result_splitted_to_predict_target = to_predict[houshold].to_frame()

    return result_splitted_features, result_splitted_target, result_splitted_to_predict_features, result_splitted_to_predict_target

In [None]:
# Week-by-week forecasting, one household column at a time.
# For every week to predict, a fresh LinearRegression is fitted on all rows
# dated before that week and used to predict the week's seven days.
# all_predictions[i][j] holds the prediction array for household i, week j.
all_predictions = []
 
for houshold in data_.columns:
    houshold_predictions = []
    print("***** Houshold " + str(houshold) + " dataset created ****** ")
    for week in range(len(weeks_test_date)):
        #print("week "+  str(week) + " splitting started")
        # Features are rebuilt on every iteration on purpose: `df` is updated
        # with the previous week's predictions at the end of the loop body, so
        # lag and rolling features for later weeks are derived from them.
        result = create_features(houshold)
        features, target, features_predict, target_predict = data_split_accumlated(result, weeks_test_date, week, houshold)
        model_linear = LinearRegression()
        model_linear.fit(features, target)
        print("trained on " + str(week))
        prediction = model_linear.predict(features_predict)
        houshold_predictions.append(prediction)
        week_timestamps = weeks_test_date[week]
        # Attach the predictions to the date-indexed frame so the assignment
        # into `df` below is aligned by date.
        target_predict['predict'] = prediction
        # Overwrite the placeholder values of this week with the predictions
        # (label-based slice, both end points inclusive).
        df.loc[week_timestamps[0]:week_timestamps[6],houshold] = target_predict['predict']
        print("=============================================")
        print(prediction)
        #print(df.loc[week_timestamps[0]:week_timestamps[6],houshold])
        print("=============================================")
    all_predictions.append(houshold_predictions)

In [None]:
# All predicted dates in one flat list, in week order; used as the column
# labels of the submission frame.
weeks_columns = []
for _week_days in weeks_test_date:
    weeks_columns.extend(_week_days)
weeks_columns

In [None]:
def flatten(hous):
    """Concatenate the per-week sequences in ``hous`` into one flat list."""
    flat = []
    for weekly_values in hous:
        flat.extend(weekly_values)
    return flat

In [None]:
# One flat list of daily predictions per household (weeks concatenated in order).
flatened_predictions = list(map(flatten, all_predictions))

In [None]:
# One row per household, one column per predicted date.
# The column labels are passed as a flat list: wrapping them in another list
# (`[weeks_columns]`) makes pandas build a one-level MultiIndex, which turns
# every column label into a 1-tuple.
df_from_list = pd.DataFrame(flatened_predictions, columns=weeks_columns, index=final_index)

In [None]:
# Move the identifier index back into a regular column so it is written to the CSV.
df_from_list = df_from_list.reset_index()

In [None]:
# Persist the daily predictions; the positional row index is not written.
df_from_list.to_csv("./sample_submission_daily_max.csv", index = False)

In [None]:
df_from_list

In [3]:
# Reload the daily predictions written above.
daily = pd.read_csv("./sample_submission_daily_max.csv")
# NOTE(review): the hourly file is not produced anywhere in the visible part
# of this notebook -- presumably it comes from a separate run; confirm.
hourly = pd.read_csv("./sample_submission_hourly_max.csv")

In [4]:
# Hand both frames to the starting-kit helper; judging by the captured output
# below it writes a timestamped submission zip -- see data.starting_kit.utils.
create_submission(daily,hourly)

wrote submission-2022-06-25_11-10-25.780745.zip
