In [None]:
from pathlib import Path
from warnings import simplefilter

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

simplefilter("ignore")

# Set Matplotlib defaults.
# The seaborn style sheets were renamed to "seaborn-v0_8-*" in Matplotlib 3.6 and
# the old names were removed in later releases, so use whichever whitegrid name
# this Matplotlib provides and fall back to the default style if neither exists.
_whitegrid_style = next(
    (
        name
        for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if name in plt.style.available
    ),
    "default",
)
plt.style.use(_whitegrid_style)
plt.rc("figure", autolayout=True, figsize=(11, 5))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
# Shared keyword arguments for pandas `.plot()` calls on raw series.
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
%config InlineBackend.figure_format = 'retina'


# annotations: https://stackoverflow.com/a/49238256/5769929
def seasonal_plot(X, y, period, freq, ax=None):
    """Plot `y` against `freq` with one coloured line per value of `period`.

    Parameters
    ----------
    X : DataFrame holding the columns named by `y`, `period` and `freq`.
    y : name of the column drawn on the y axis.
    period : name of the column whose distinct values become separate lines.
    freq : name of the column drawn on the x axis.
    ax : optional Matplotlib axes; a new figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        ax = plt.subplots()[1]
    n_periods = X[period].nunique()
    colors = sns.color_palette("husl", n_colors=n_periods)
    ax = sns.lineplot(
        data=X,
        x=freq,
        y=y,
        hue=period,
        palette=colors,
        ci=False,
        legend=False,
        ax=ax,
    )
    ax.set_title(f"Seasonal Plot ({period}/{freq})")
    # Label every line just right of the plot area, at the height of its last
    # point, in the line's own colour (used instead of a legend).
    labels = X[period].unique()
    for artist, label in zip(ax.lines, labels):
        last_y = artist.get_ydata()[-1]
        ax.annotate(
            label,
            xy=(1, last_y),
            xytext=(6, 0),
            xycoords=ax.get_yaxis_transform(),
            textcoords="offset points",
            color=artist.get_color(),
            size=14,
            va="center",
        )
    return ax


def plot_periodogram(ts, detrend='linear', ax=None):
    """Plot the periodogram of a daily series with frequencies in cycles per year.

    Parameters
    ----------
    ts : 1-D array-like of observations, treated as one sample per day.
    detrend : passed through to `scipy.signal.periodogram`.
    ax : optional axes to draw on; a new figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    from scipy.signal import periodogram

    # Samples per year for daily data. Written as a literal because
    # pd.Timedelta("1Y") is rejected by pandas >= 2.0; 365.2425 days is the
    # value older pandas used for "1Y", so results are unchanged.
    fs = 365.2425
    frequencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(frequencies, spectrum, color="purple")
    ax.set_xscale("log")
    # Ticks are placed at cycles-per-year values with calendar names.
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
        ],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax

In [None]:
# Load the competition files; `date` is parsed on read so it can later be used
# as a time index.
train_df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jan-2022/train.csv',
    parse_dates=['date'],
)
test_df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jan-2022/test.csv',
    parse_dates=['date'],
)

In [None]:
# Plain-text dump of the training frame for a quick look at its columns and size.
print(train_df)

In [None]:
# First rows of the test frame (rendered as the cell output).
test_df.head()

In [None]:
# One group per (country, store, product) combination. The misspelled name
# `train_gropued` is kept because the rolling-trend cells refer to it.
train_gropued = train_df.groupby(by=['country', 'store', 'product'])
print(train_gropued.head(n=2))

In [None]:
# Centered rolling mean of num_sold over 30 rows (at least 15 required),
# computed separately inside each (country, store, product) group.
month_trend_gr = (
    train_gropued['num_sold']
    .rolling(window=30, min_periods=15, center=True)
    .mean()
)
# reset_index() exposes the original row index as the auto-named column
# 'level_3' (this relies on train_df's index being unnamed); index by it and
# sort so the series is back in original row order.
month_trend = (
    month_trend_gr
    .reset_index()
    .set_index('level_3')['num_sold']
    .sort_index()
)
print(month_trend)

In [None]:
# Centered rolling mean of num_sold over 7 rows (at least 4 required),
# computed separately inside each (country, store, product) group.
week_trend_gr = (
    train_gropued['num_sold']
    .rolling(window=7, min_periods=4, center=True)
    .mean()
)
# 'level_3' is the auto-generated name of the original row index after
# reset_index(); index by it and sort to restore original row order.
week_trend = (
    week_trend_gr
    .reset_index()
    .set_index('level_3')['num_sold']
    .sort_index()
)
print(week_trend)

In [None]:
# Centered rolling mean of num_sold over 365 rows (at least 180 required),
# computed separately inside each (country, store, product) group.
year_trend_gr = (
    train_gropued['num_sold']
    .rolling(window=365, min_periods=180, center=True)
    .mean()
)
# 'level_3' is the auto-generated name of the original row index after
# reset_index(); index by it and sort to restore original row order.
year_trend = (
    year_trend_gr
    .reset_index()
    .set_index('level_3')['num_sold']
    .sort_index()
)
print(year_trend)

In [None]:
#encode dates
def transform_date(df):
    """Index `df` by daily period and append calendar feature columns.

    The `date` column becomes a daily PeriodIndex and the columns
    `day_of_week`, `day_of_year`, `year` and `week` are derived from it.
    The input frame is left untouched; a new frame is returned.
    """
    by_day = df.set_index('date').to_period('D')
    days = by_day.index
    return by_day.assign(
        day_of_week=days.dayofweek,
        day_of_year=days.dayofyear,
        year=days.year,
        week=days.week,
    )
# Training frame indexed by daily period, with the calendar columns added.
trending_train = transform_date(df=train_df)
print(trending_train)


In [None]:
#encode predictors: country, store, product
from sklearn.preprocessing import LabelEncoder
def transform_labels(df):
    """Return a copy of `df` with the categorical columns integer-encoded.

    The columns `country`, `store` and `product` are each replaced by the
    codes of a `LabelEncoder` fitted on that column. The encoder is refitted
    on every call, so the codes depend only on the values present in `df`.
    NOTE(review): train and test therefore get matching codes only if both
    contain the same set of categories — confirm for the data used.

    The caller's frame is not modified; the encoded copy is returned.
    """
    encoded = df.copy()
    for column in ('country', 'store', 'product'):
        encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded
# Replace the string categories in the training frame with integer codes.
trending_train = transform_labels(df=trending_train)
print(trending_train)

In [None]:
# Feature frame for the trend models: everything except the row identifier
# and the target.
X_trending = trending_train.drop(['row_id', 'num_sold'], axis=1)
print(X_trending)

In [None]:
#year trend model
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows (fixed seed) to score the yearly-trend model.
# NOTE(review): this is a random row split, not a split by date — confirm that
# is intended for time-series data.
X_train_year, X_test_year, y_train_year, y_test_year = train_test_split(
    X_trending,
    year_trend,
    test_size=0.2,
    random_state=42,
)

import numpy as np
def smape(A, F):
    """Symmetric mean absolute percentage error, in percent.

    Parameters
    ----------
    A, F : equal-length array-likes (actual and forecast values). The metric
        is symmetric, so argument order does not affect the result. Values
        are compared positionally; any pandas index is ignored.

    Returns
    -------
    float : 100 * mean(|A - F| / ((|A| + |F|) / 2)). Pairs where both values
        are zero contribute 0 instead of NaN.

    NOTE: this function is defined identically in several cells of this
    notebook; keep the copies in sync.
    """
    actual = np.asarray(A, dtype=float)
    forecast = np.asarray(F, dtype=float)
    abs_error = np.abs(actual - forecast)
    scale = (np.abs(actual) + np.abs(forecast)) / 2
    # scale == 0 only when both values are 0, i.e. a perfect prediction, so
    # the term is left at 0 rather than evaluating 0/0.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return 100 / actual.shape[0] * np.sum(ratio)

from xgboost import XGBRegressor

# Gradient-boosted regressor that learns the yearly rolling trend from the
# calendar/label features. (Author's note: eta = 0.15 without lags.)
year_model = XGBRegressor(eta=0.25)
year_model.fit(X_train_year, y_train_year)
y_pred_year = year_model.predict(X_test_year)
# SMAPE on the held-out rows; shown as the cell output.
smape(y_pred_year,y_test_year)


In [None]:
#month trend model
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows (fixed seed) to score the monthly-trend model.
X_train_month, X_test_month, y_train_month, y_test_month = train_test_split(
    X_trending,
    month_trend,
    test_size=0.2,
    random_state=42,
)

import numpy as np
def smape(A, F):
    """Symmetric mean absolute percentage error, in percent.

    Parameters
    ----------
    A, F : equal-length array-likes (actual and forecast values). The metric
        is symmetric, so argument order does not affect the result. Values
        are compared positionally; any pandas index is ignored.

    Returns
    -------
    float : 100 * mean(|A - F| / ((|A| + |F|) / 2)). Pairs where both values
        are zero contribute 0 instead of NaN.

    NOTE: this function is defined identically in several cells of this
    notebook; keep the copies in sync.
    """
    actual = np.asarray(A, dtype=float)
    forecast = np.asarray(F, dtype=float)
    abs_error = np.abs(actual - forecast)
    scale = (np.abs(actual) + np.abs(forecast)) / 2
    # scale == 0 only when both values are 0, i.e. a perfect prediction, so
    # the term is left at 0 rather than evaluating 0/0.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return 100 / actual.shape[0] * np.sum(ratio)

from xgboost import XGBRegressor

# Gradient-boosted regressor that learns the monthly rolling trend from the
# calendar/label features. (Author's note: eta = 0.15 without lags.)
month_model = XGBRegressor(eta=0.3)
month_model.fit(X_train_month, y_train_month)
y_pred_month = month_model.predict(X_test_month)
# SMAPE on the held-out rows; shown as the cell output.
smape(y_pred_month,y_test_month)

In [None]:
#week trend model
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows (fixed seed) to score the weekly-trend model.
X_train_week, X_test_week, y_train_week, y_test_week = train_test_split(
    X_trending,
    week_trend,
    test_size=0.2,
    random_state=42,
)

import numpy as np
def smape(A, F):
    """Symmetric mean absolute percentage error, in percent.

    Parameters
    ----------
    A, F : equal-length array-likes (actual and forecast values). The metric
        is symmetric, so argument order does not affect the result. Values
        are compared positionally; any pandas index is ignored.

    Returns
    -------
    float : 100 * mean(|A - F| / ((|A| + |F|) / 2)). Pairs where both values
        are zero contribute 0 instead of NaN.

    NOTE: this function is defined identically in several cells of this
    notebook; keep the copies in sync.
    """
    actual = np.asarray(A, dtype=float)
    forecast = np.asarray(F, dtype=float)
    abs_error = np.abs(actual - forecast)
    scale = (np.abs(actual) + np.abs(forecast)) / 2
    # scale == 0 only when both values are 0, i.e. a perfect prediction, so
    # the term is left at 0 rather than evaluating 0/0.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return 100 / actual.shape[0] * np.sum(ratio)

from xgboost import XGBRegressor

# Gradient-boosted regressor that learns the weekly rolling trend from the
# calendar/label features. (Author's note: eta = 0.15 without lags.)
week_model = XGBRegressor(eta=0.5)
week_model.fit(X_train_week, y_train_week)
y_pred_week = week_model.predict(X_test_week)
# SMAPE on the held-out rows; shown as the cell output.
smape(y_pred_week,y_test_week)

In [None]:
# Fourier dataset: group a copy of the training frame by series so that
# deterministic features can be built per (country, store, product).
X_fourier = train_df.copy().groupby(['country', 'store', 'product'])
print(X_fourier.head())

In [None]:
# Iterating a groupby yields (key, frame) pairs; show each group's columns.
for group_key, group_frame in X_fourier:
    print(group_frame.columns)

In [None]:
#for key, group in X_fourier:
#    print(group)
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

def make_fourier(df_grouped):
    """Build deterministic time features for every group and stack them.

    Parameters
    ----------
    df_grouped : pandas groupby object whose group frames contain a `date`
        column and a `row_id` column.

    Returns
    -------
    DataFrame with, per group, the `DeterministicProcess.in_sample()` columns
    (constant, linear trend, seasonal indicators, annual Fourier terms) plus
    an `id` column holding the group's `row_id`, concatenated in group order
    and indexed by daily period.

    NOTE(review): the features are built from each group's own index, so the
    trend and seasonal columns are presumably relative to that group's first
    date. Frames starting on a different date (e.g. the test set) may get
    columns that do not line up with the training ones — confirm, and consider
    `DeterministicProcess.out_of_sample` for forecasting.
    """
    fourier = CalendarFourier(freq="A", order=12)  # 12 sin/cos pairs for "A"nnual seasonality
    Xs = []
    for _, group in df_grouped:
        group = group.set_index(['date']).to_period('D')
        dp = DeterministicProcess(
            index=group.index,
            constant=True,               # dummy feature for bias (y-intercept)
            order=1,                     # trend (order 1 means linear)
            seasonal=True,               # weekly seasonality (indicators)
            additional_terms=[fourier],  # annual seasonality (fourier)
            drop=True,                   # drop terms to avoid collinearity
        )
        X = dp.in_sample()
        # Keep the row identifier so callers can restore original row order.
        X['id'] = group['row_id']
        Xs.append(X)
    return pd.concat(Xs)
# Deterministic (trend / seasonal / Fourier) features for all training groups.
data = make_fourier(X_fourier)

In [None]:
# Show the Fourier features ordered by the `id` column (the source row_id).
print(data.sort_values(by = ['id']))

In [None]:
# Put the Fourier features back into row_id order, then place them side by
# side with the calendar/label features. Both parts get a fresh positional
# index so that the column-wise concat lines up row by row.
X = data.sort_values(by=['id']).drop(columns=['id'])
X = pd.concat(
    [
        X.reset_index().drop(columns=['date']),
        X_trending.reset_index().drop(columns=['date']),
    ],
    axis=1,
).assign(
    # The rolling trends (indexed by original row number) are appended last,
    # in the order year, month, week.
    year_trend=year_trend.reset_index().drop(columns=['level_3']),
    month_trend=month_trend.reset_index().drop(columns=['level_3']),
    week_trend=week_trend.reset_index().drop(columns=['level_3']),
)
print(X)

In [None]:
# Regression target: the raw daily sales count.
Y = train_df['num_sold']
print(Y)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows (fixed seed) to validate the final model.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

def smape(A, F):
    """Symmetric mean absolute percentage error, in percent.

    Parameters
    ----------
    A, F : equal-length array-likes (actual and forecast values). The metric
        is symmetric, so argument order does not affect the result. Values
        are compared positionally; any pandas index is ignored.

    Returns
    -------
    float : 100 * mean(|A - F| / ((|A| + |F|) / 2)). Pairs where both values
        are zero contribute 0 instead of NaN.

    NOTE: this function is defined identically in several cells of this
    notebook; keep the copies in sync.
    """
    actual = np.asarray(A, dtype=float)
    forecast = np.asarray(F, dtype=float)
    abs_error = np.abs(actual - forecast)
    scale = (np.abs(actual) + np.abs(forecast)) / 2
    # scale == 0 only when both values are 0, i.e. a perfect prediction, so
    # the term is left at 0 rather than evaluating 0/0.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return 100 / actual.shape[0] * np.sum(ratio)

# Sanity check: row/column counts of the two splits.
print(X_train.shape)
print(X_test.shape)

In [None]:
#from xgboost import XGBRegressor

#model = XGBRegressor(eta=0.01) #eta = 0.12 sin lags eta = 1, smape=4.34 con los dias encoding y lags y trend y fourier
#model.fit(X_train, Y_train)
#Y_pred = model.predict(X_test)

#smape(Y_test, Y_pred)

In [None]:
#from sklearn.linear_model import LinearRegression
#model = LinearRegression(fit_intercept=False)
#model.fit(X_train, Y_train)
#Y_pred = model.predict(X_test)

#smape(Y_test, Y_pred)

In [None]:
from tensorflow import keras
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler is fitted on the training split only
# and reused unchanged for the validation split and, later, the test set.
scaler = StandardScaler()
X_train_nn = scaler.fit_transform(X_train)
X_test_nn = scaler.transform(X_test)

# Small fully connected regressor: 30 -> 10 -> 5 -> 1 units.
# `input_shape` is only meaningful on the first layer of a Sequential model,
# so it is not repeated on the later layers.
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", kernel_initializer="he_normal", input_shape=X_train.shape[1:]),
    keras.layers.Dense(10, activation="relu", kernel_initializer="he_normal"),
    keras.layers.Dense(5, activation="relu", kernel_initializer="he_normal"),
    keras.layers.Dense(1),
])

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
optimizer = keras.optimizers.SGD(learning_rate=0.01, momentum=0.8)
model.compile(loss='mean_absolute_percentage_error', optimizer=optimizer)
# The held-out split is passed as validation data so its loss is reported
# after every epoch.
history = model.fit(X_train_nn, Y_train, epochs=200, validation_data=(X_test_nn, Y_test))

In [None]:
# The network's output has a trailing axis of length 1; drop it so the
# predictions are 1-D like Y_test, then report SMAPE as the cell output.
Y_pred = np.squeeze(model.predict(X_test_nn), axis=1)
smape(Y_test, Y_pred)

In [None]:
# Plain-text dump of the test frame before preparing its features.
print(test_df)

In [None]:
### predict in the original test set
## prepare the dataset

def transform_date(df):
    """Index `df` by daily period and append calendar feature columns.

    The `date` column becomes a daily PeriodIndex and the columns
    `day_of_week`, `day_of_year`, `year` and `week` are derived from it.
    The input frame is left untouched; a new frame is returned.

    NOTE: this repeats the `transform_date` defined in the training-data cell
    earlier in the notebook; keep the two in sync.
    """
    by_day = df.set_index('date').to_period('D')
    days = by_day.index
    return by_day.assign(
        day_of_week=days.dayofweek,
        day_of_year=days.dayofyear,
        year=days.year,
        week=days.week,
    )
# Test frame indexed by daily period, with the calendar columns added.
test_trending = transform_date(df=test_df)
print(test_trending)

In [None]:
#encode labels in test_trending
from sklearn.preprocessing import LabelEncoder
def transform_labels(df):
    """Return a copy of `df` with the categorical columns integer-encoded.

    The columns `country`, `store` and `product` are each replaced by the
    codes of a `LabelEncoder` fitted on that column. The encoder is refitted
    on every call, so the codes depend only on the values present in `df`.
    NOTE(review): train and test therefore get matching codes only if both
    contain the same set of categories — confirm for the data used.

    The caller's frame is not modified; the encoded copy is returned.

    NOTE: this repeats the `transform_labels` defined in the training-data
    cell earlier in the notebook; keep the two in sync.
    """
    encoded = df.copy()
    for column in ('country', 'store', 'product'):
        encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded
# Replace the string categories in the test frame with integer codes.
test_trending = transform_labels(df=test_trending)
print(test_trending)

In [None]:
# Test features for the trend models: everything except the row identifier
# (the test frame has no target column to drop).
X_test_trending = test_trending.drop(['row_id'], axis=1)
X_test_trending.head()

In [None]:
#add year trending
# The test set has no num_sold to smooth, so the yearly trend feature is
# predicted by the trend model fitted on the training data.
year_trend_test = year_model.predict(X_test_trending)
print(year_trend_test[:10])

In [None]:
#add month trending
# Monthly trend feature for the test set, predicted by the fitted trend model.
month_trend_test = month_model.predict(X_test_trending)
print(month_trend_test[:10])

In [None]:
#add week trending
# Weekly trend feature for the test set, predicted by the fitted trend model.
week_trend_test = week_model.predict(X_test_trending)
print(week_trend_test[:10])

In [None]:
#fourier dataset
# Group a copy of the test frame by series, mirroring the training-side
# grouping, so the same deterministic features can be built per group.
X_fourier_test = test_df.copy().groupby(['country', 'store', 'product'])
print(X_fourier_test.head())

In [None]:
#add to X_test_trending
# Assemble the test design matrix with the same column order as the training
# one: Fourier features, calendar/label features, then the three trends.
# Note that `data` and `X` are reused here and overwrite the training-side
# objects of the same name.
data = make_fourier(X_fourier_test)
X = data.sort_values(by=['id']).drop(columns=['id'])
X = pd.concat(
    [
        X.reset_index().drop(columns=['date']),
        X_test_trending.reset_index().drop(columns=['date']),
    ],
    axis=1,
).assign(
    # Trend columns come from the trend models' predictions.
    year_trend=year_trend_test,
    month_trend=month_trend_test,
    week_trend=week_trend_test,
)

print(X)


In [None]:
#predict in the original test set
#Y_hat = model.predict(X)
#print(Y_hat.shape)

In [None]:
#transform original x test
# Apply the scaler fitted on the training split to the test design matrix.
# NOTE: X is overwritten with its own transformed value, so this cell is not
# idempotent; running it twice would scale already-scaled data.
X = scaler.transform(X)

In [None]:
# Predict on the scaled test matrix and drop the trailing length-1 axis so
# Y_hat is a 1-D array with one value per test row.
Y_hat = np.squeeze(model.predict(X), axis=1)
print(Y_hat.shape)

In [None]:
#
#Y_hat = np.squeeze(Y_hat, axis=1)
#result = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv')
#result['num_sold'] = pd.Series(Y_hat)

#result.to_csv('/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv', index=False)


In [None]:
# Pair each test row_id with its prediction. Any output filename would work;
# 'submission.csv' is the one chosen here.
my_submission = pd.DataFrame(
    {
        'row_id': test_df['row_id'],
        'num_sold': Y_hat,
    }
)
my_submission.to_csv('submission.csv', index=False)

In [None]:
#from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

#fourier = CalendarFourier(freq="A", order=12)  # 12 sin/cos pairs for "A"nnual seasonality

#dp = DeterministicProcess(
#    index=X_fourier.index,
#    constant=True,               # dummy feature for bias (y-intercept)
#    order=1,                     # trend (order 1 means linear)
#    seasonal=True,               # weekly seasonality (indicators)
#    additional_terms=[fourier],  # annual seasonality (fourier)
#    drop=True,                   # drop terms to avoid collinearity
#)

#X = dp.in_sample() 

In [None]:
#train = train_df[(train_df.country == 'Finland') & (train_df.store == 'KaggleMart') & (train_df['product'] == 'Kaggle Hat')]
#train = train.loc[:,['date','num_sold']]
#train = train.set_index('date').to_period('D')
#print(train.head())
#print(train.shape)

In [None]:
#month_trend = train['num_sold'].rolling(window=30, min_periods=15, center=True).mean()
#print(month_trend.head())
#month_trend.shape

In [None]:
#week_trend = train['num_sold'].rolling(window=7, min_periods=4, center=True).mean()
#week_trend.head()

In [None]:
#year_trend = train['num_sold'].rolling(window=365, min_periods=180, center=True).mean()
#year_trend.head()

In [None]:
#trending_train = train.copy()
#trending_train['day_of_week'] = trending_train.index.dayofweek
#trending_train['day_of_year'] = trending_train.index.dayofyear
#trending_train['year'] = trending_train.index.year
#trending_train['week'] = trending_train.index.week
#def transform_date(df):
#    df = df.copy()
#    df['day_of_week'] = df.index.dayofweek
#    df['day_of_year'] = df.index.dayofyear
#    df['year'] = df.index.year
#    df['week'] = df.index.week
#    return df
#trending_train = transform_date(df=train)
#print(trending_train.head())
#print(trending_train.tail())
#train = transform_date(df=train)

In [None]:
#train.head()

In [None]:
#Y_trending = week_trend
#X_trending = trending_train.drop(columns=['num_sold'])
#print(Y_trending.head())
#print(X_trending.head())

In [None]:
#from sklearn.model_selection import train_test_split
#X_train_trending, X_test_trending, y_train_trending, y_test_trending = train_test_split(X_trending, Y_trending, test_size=0.2, random_state=42)

#def smape(A, F):    
#    N = A.shape[0] 
#    return 100 / N * np.sum(np.divide(np.abs(A - F), np.divide(np.abs(A) + np.abs(F), 2)))

#print(X_train_trending.shape)
#print(X_test_trending.shape)

In [None]:
#Y_trending.plot()

In [None]:
#y_test_trending.reset_index().plot()

In [None]:
#from xgboost import XGBRegressor

#model_trending = XGBRegressor(eta=0.1) #eta = 0.15 sin lags
#model_trending.fit(X_train_trending, y_train_trending)
#y_pred_trending = model_trending.predict(X_test_trending)

In [None]:
#from sklearn.metrics import mean_squared_error
#import math
#print(mean_squared_error(y_test_trending, y_pred_trending))
#print(math.sqrt(mean_squared_error(Y_test, Y_predicted)))
#pd.DataFrame(y_pred_trending).plot()

In [None]:
#import numpy as np
#def smape(A, F):    
#    N = A.shape[0] 
#    return 100 / N * np.sum(np.divide(np.abs(A - F), np.divide(np.abs(A) + np.abs(F), 2)))
#smape(y_pred_trending,y_test_trending)

In [None]:
#plot_periodogram(train.num_sold);

In [None]:
#from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

#fourier = CalendarFourier(freq="A", order=12)  # 12 sin/cos pairs for "A"nnual seasonality

#dp = DeterministicProcess(
#    index=train.index,
#    constant=True,               # dummy feature for bias (y-intercept)
#    order=1,                     # trend (order 1 means linear)
#    seasonal=True,               # weekly seasonality (indicators)
#    additional_terms=[fourier],  # annual seasonality (fourier)
#    drop=True,                   # drop terms to avoid collinearity
#)

#X = dp.in_sample() 

In [None]:
#X.head()

In [None]:
#X.shape

In [None]:
## add lags
#def make_lags(ts, lags):
#    return pd.concat(
#        {
#            f'y_lag_{i}': ts.shift(i)
#            for i in range(1, lags + 1)
#        },
#        axis=1)


#X_lags = make_lags(train.num_sold, lags=6)
#X_lags = X_lags.fillna(0.0)

In [None]:
#X_lags.head()

In [None]:
#X_lags.shape

In [None]:
#X_join = pd.concat([X, X_lags], axis=1)

In [None]:
#X_join.shape


In [None]:
#X = X_join.copy()

In [None]:
#X['month_trend'] = month_trend
#X['week_trend'] = week_trend
#X['year_trend'] = year_trend
#X = transform_date(X)
#print(X.head())

In [None]:
#Y = train['num_sold']

#from sklearn.model_selection import train_test_split
#X_train, X_dev, y_train, y_dev = train_test_split(X, Y, test_size=0.2, random_state=42)

#def smape(A, F):    
#    N = A.shape[0] 
#    return 100 / N * np.sum(np.divide(np.abs(A - F), np.divide(np.abs(A) + np.abs(F), 2)))

#print(X_train.shape)
#print(X_dev.shape)

In [None]:
#X_train.head()

In [None]:
#model = LinearRegression(fit_intercept=False)
#model.fit(X_train, y_train)

In [None]:
#y_pred = model.predict(X_dev)
#import numpy as np
#smape(y_dev, y_pred)

In [None]:
#y_pred[0:10]

In [None]:
#y_dev.head()

In [None]:
#X_fore = dp.out_of_sample(steps=365)
#y_fore = pd.Series(model.predict(X_fore), index=X_fore.index)

#ax = Y.plot(color='0.25', style='.', title="Tunnel Traffic - Seasonal Forecast")
#ax = y_pred.plot(ax=ax, label="", color='C3',style='.')

#ax = y_fore.plot(ax=ax, label="Seasonal Forecast", color='C3',style='.')


In [None]:
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#X_train_nn = scaler.fit_transform(X_train)
#X_dev_nn = scaler.transform(X_dev)

In [None]:
#from tensorflow import keras

#model.add(keras.layers.Conv1D(filters=64, kernel_size =100, strides=(20),activation="relu", X_train.shape[1:]))
#model.add(keras.layers.Conv1D(filters=16, kernel_size =2, strides=(2),activation="relu"))
#model.add(keras.layers.MaxPooling1D(pool_size=3,strides=1))

#model = keras.models.Sequential([
#    keras.layers.Conv1D(filters=5, kernel_size =5, strides=(2),activation="relu", input_shape=(X_train.shape[1],1)),
#    keras.layers.MaxPooling1D(pool_size=3,strides=1),
#    keras.layers.Dense(10, activation="relu",kernel_initializer="he_normal"),
#    keras.layers.Dense(5, activation="relu",kernel_initializer="he_normal"),
#    keras.layers.Dense(1)
#    ])
#optimizer = keras.optimizers.SGD(lr=0.01, momentum=0.9)
#model.compile(loss='mean_absolute_percentage_error', optimizer=optimizer)


In [None]:
#history = model.fit(X_train_nn, y_train, epochs=100, validation_data=(X_dev_nn, y_dev))

In [None]:
#y_pred = model.predict(X_dev_nn)
#Y_p = np.squeeze(y_pred, axis=1)
#smape(y_dev, Y_p)

In [None]:
#from sklearn.svm import SVR
#svm_poly_reg = SVR(kernel="rbf",C=100, epsilon=0.01)
#svm_poly_reg.fit(X_train,y_train)

#y_pred = svm_poly_reg.predict(X_dev)

#smape(y_dev, y_pred)

In [None]:
#from xgboost import XGBRegressor

#model = XGBRegressor(eta=0.1) #eta = 0.12 sin lags eta = 1, smape=4.34 con los dias encoding y lags y trend y fourier
#model.fit(X_train, y_train)
#y_pred = model.predict(X_dev)

#smape(y_dev, y_pred)