In [64]:
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

ModuleNotFoundError: No module named 'xgboost'

In [2]:
df_train = pd.read_csv(
    'Data/train.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
    skiprows=range(1, 66458909)  # 2016-01-01
)

In [3]:
# store_nbr, item_nbr, date, id, onpromotion
# Dates: 8/16/2017 -> 8/31/2017
df_test = pd.read_csv(
    "Data/test.csv", usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

In [4]:
items = pd.read_csv(
    "Data/items.csv",
).set_index("item_nbr")


In [5]:
# COLUMNS:  date	store_nbr	item_nbr	unit_sales	onpromotion
# dates from 1/1/2017 -> 8/15/2017
df_2017 = df_train.loc[df_train.date>=pd.datetime(2017,1,1)]
del df_train

In [6]:
# Creates a dataframe with dates as columns: 1/1/2017 -> 8/15/2017 
# index: store_nbr, item_nbr
# values: T/F is OnPromotion?
promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)

In [7]:
# adds 0:00:00 to columns
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)

In [8]:
# Columns: dates from 8/16/2017 -> 8/31/2017
# Index: store_nbr, item_nbr
# Values: T/F onpromotion?
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)

In [9]:
# adds 0:00:00 to columns
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)

In [10]:
# Probably fills in the blanks for any store-item combo that sold before 8/15 but not between 8/15 and 8/31/2017
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)

In [11]:
# Combines train and test. 
# Columns: dates from 1/1/2017 to 8/31/2017
# Index: store_nbr, item_nbr
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)

In [12]:
del promo_2017_test, promo_2017_train

In [13]:
# Columns: dates from 1/1/2017 to 8/15/2017
# Index: store_nbr, item_nbr
# Values: unit_sales
df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)

In [14]:
# adds 0:00:00 to columns
df_2017.columns = df_2017.columns.get_level_values(1)

In [15]:
# Index: item_nbr
# Columns: family, class, perishable
items = items.reindex(df_2017.index.get_level_values(1))

In [16]:
# Gets values for columns that match from dt-minus up to dt (# of periods)
# Subtracts minus days from dt, then displays the next periods days
def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

In [44]:
def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
        #"mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
        #"mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        #"mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
        #"mean_30_2017": get_timespan(df_2017, t2017, 30, 30).mean(axis=1).values,
        #"mean_60_2017": get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values,
        #"mean_140_2017": get_timespan(df_2017, t2017, 140, 140).mean(axis=1).values,
        #"promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values,
        #"promo_60_2017": get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values,
        #"promo_140_2017": get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values
    })
    #for i in range(7):
        #X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        #X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values
    #for i in range(16):
        #X["promo_{}".format(i)] = promo_2017[
            #t2017 + timedelta(days=i)].values.astype(np.uint8)
    if is_train:
        # gets the unit sales for the 16 days following t2017
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

In [62]:
print("Preparing dataset...")
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for i in range(6):  # i in 0, 1, 2, 3, 4, 5
    # creates timedelta objects of 0, 7, 14, 21, 28, 35 days
    delta = timedelta(days=7 * i)
    #y_tmp provides the unit_sales from t2017 + delta and for the next 16days
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)



X_train = pd.concat(X_l, axis=0)
# y_train appears to be unit sales for 16 days following 5/31/2017 up to 6/15/2017
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
#
# y_val consists of unit sales of 16 days following 7/26/2017 up to 8/10/2017
X_val, y_val = prepare_dataset(date(2017, 7, 26))

X_test = prepare_dataset(date(2017, 8, 16), is_train=False)


Preparing dataset...


In [65]:
my_model = GradientBoostingRegressor()
my_model.fit(X_train, y_train)

ValueError: bad input shape (1005090, 16)