In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from datetime import date, datetime
import time
import calendar

In [None]:
# Load the competition tables. `train` and `test` use their `id` column as the index.
train = pd.read_csv('../input/store-sales-time-series-forecasting/train.csv' , index_col = 'id')
# Oil table; later cells read its `date` and `dcoilwtico` columns.
oil = pd.read_csv('../input/store-sales-time-series-forecasting/oil.csv')
test =pd.read_csv('../input/store-sales-time-series-forecasting/test.csv' , index_col = 'id')
# Submission template; its `sales` column is overwritten with predictions further down.
submission = pd.read_csv('../input/store-sales-time-series-forecasting/sample_submission.csv')

In [None]:
def preprocess_train(df):
    """Add calendar features derived from the ``date`` column.

    The frame is modified in place and also returned. Columns written:

    * ``date``    -- replaced by ``datetime.date`` objects
    * ``weekday`` -- 0 (Monday) through 6 (Sunday)
    * ``year``, ``month``, ``day`` -- calendar components of the date
    * ``eomd``    -- number of days in that date's month
    * ``payday``  -- 1 on the 15th and on the last day of the month, else 0

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column holding ISO ``YYYY-MM-DD`` strings or
        ``datetime.date`` objects.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # pd.to_datetime accepts both ISO strings and date objects, so re-running
    # the cell on an already-processed frame works instead of raising the way
    # a per-row date.fromisoformat call does.
    parsed = pd.to_datetime(df['date'])

    # Store plain date objects (not datetime64) so equality joins against
    # other frames keyed by datetime.date keep matching.
    df['date'] = parsed.dt.date
    df['weekday'] = parsed.dt.weekday
    df['year'] = parsed.dt.year
    df['month'] = parsed.dt.month
    df['day'] = parsed.dt.day
    df['eomd'] = parsed.dt.days_in_month
    df['payday'] = ((df['day'] == df['eomd']) | (df['day'] == 15)).astype(int)
    return df

# Add the calendar feature columns to both frames (the function returns the frame it was given).
train = preprocess_train(train)
test = preprocess_train(test)

In [None]:
# Preview the first rows of the training frame with the new calendar columns.
train.head()

In [None]:
def preprocess_oil(oil):
    """Fill missing oil prices with their calendar-month mean and parse the dates.

    Parameters
    ----------
    oil : pandas.DataFrame
        Frame with a ``date`` column (ISO ``YYYY-MM-DD`` strings or
        ``datetime.date`` objects) and a numeric ``dcoilwtico`` column that
        may contain NaN.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns. ``date`` holds ``datetime.date``
        objects and every NaN in ``dcoilwtico`` is replaced by the mean of the
        non-missing values in the same year and month. A month with no
        observed value at all stays NaN. The input frame is not modified.
    """
    cleaned = oil.copy()
    parsed = pd.to_datetime(cleaned['date'])

    # YYYYMM integer key identifying each calendar month.
    month_key = parsed.dt.year * 100 + parsed.dt.month
    monthly_mean = cleaned.groupby(month_key)['dcoilwtico'].transform('mean')

    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: that chained form does not update the frame when
    # pandas Copy-on-Write is active, which silently leaves the gaps as NaN.
    cleaned['dcoilwtico'] = cleaned['dcoilwtico'].fillna(monthly_mean)

    # Plain date objects, so the column can be joined against frames whose
    # `date` column also holds datetime.date values.
    cleaned['date'] = parsed.dt.date
    return cleaned

# Replace the raw oil table with the cleaned one; merge_tables reads this global.
oil = preprocess_oil(oil)

In [None]:
def merge_tables(df, oil_df=None):
    """Left-join the oil table onto ``df`` by ``date`` without leaving gaps.

    A plain left merge produces NaN in the oil columns for every date that
    has no row in the oil table, and NaN features make
    ``LinearRegression.fit`` / ``predict`` raise. To avoid that, the oil table
    is first extended to cover every date present in ``df``: each missing date
    takes the most recent earlier value, and dates before the first oil row
    take the first available value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column.
    oil_df : pandas.DataFrame, optional
        Table keyed by ``date`` to join in. Defaults to the notebook-level
        ``oil`` frame. Its ``date`` values must be of the same type as
        ``df['date']`` so they compare and sort together.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the oil columns appended; row order of ``df`` is kept.
    """
    if oil_df is None:
        oil_df = oil

    # reindex needs unique labels, so keep one row per date.
    by_date = oil_df.drop_duplicates(subset='date').set_index('date')

    # Every date either side knows about, in chronological order, so the
    # forward fill walks through time regardless of how `df` is sorted.
    needed_dates = set(df['date'].dropna().unique())
    known_dates = set(by_date.index.dropna())
    timeline = sorted(known_dates | needed_dates)

    filled = by_date.reindex(timeline).ffill().bfill()
    filled = filled.rename_axis('date').reset_index()

    return df.merge(filled, on='date', how='left')

# Attach the oil columns to both frames, matched on `date`.
train = merge_tables(train)
test = merge_tables(test)

In [None]:
# Column dtypes and non-null counts of the merged training frame.
train.info()

In [None]:
# Per-column count of missing values in the merged training frame.
train.isnull().sum()

In [None]:
# Separate the target from the features: `pop` returns the `sales` column
# and removes it from `train` in the same step.
y = train.pop('sales')

In [None]:
# Number of distinct values in the target.
y.nunique()

In [None]:
# Number of distinct `family` values in the training frame.
train['family'].nunique()

In [None]:
# Per-column count of missing values in the merged test frame.
test.isnull().sum()

In [None]:
# Number of distinct `family` values in the test frame (compare with the training count).
test['family'].nunique()

In [None]:
# NOTE: this rebinds the name `datetime` to the module, shadowing the
# `datetime` class imported in the first cell (`from datetime import date, datetime`).
import datetime
# Parse `date` into a datetime column; values that cannot be parsed become NaT (errors='coerce').
train['date_ti'] = pd.to_datetime(train['date'] , errors = 'coerce')
# Encode each date as a single integer of the form YYYYMMDD.
# NOTE(review): a NaT produced above would presumably make the integer cast fail -- confirm there are none.
train['date_conv'] = train['date_ti'].dt.strftime("%Y%m%d").astype(int)

In [None]:
# Check the dtype of the parsed date column.
train['date_ti'].dtype

In [None]:
# Drop the original and the parsed date columns in place; the integer `date_conv` column stays.
train.drop(columns = ['date' , 'date_ti'] , axis = 1 , inplace = True)

In [None]:
# Same date encoding for the test frame; here `date` itself is overwritten with the parsed values
# (unparseable entries become NaT because of errors='coerce').
test['date'] = pd.to_datetime(test['date'] , errors = 'coerce')
# Integer of the form YYYYMMDD, matching the `date_conv` column built for train.
test['date_conv'] = test['date'].dt.strftime("%Y%m%d").astype(int)

In [None]:
# Drop the parsed `date` column in place; the integer `date_conv` column stays.
test.drop('date' , axis = 1 , inplace = True)

In [None]:
# Preview the test frame after the date encoding.
test.head()

In [None]:
from sklearn.metrics import mean_squared_log_error
def rmsle(y1, y2):
    """Return the square root of ``mean_squared_log_error(y1, y2)``.

    Both arguments are passed straight through to scikit-learn's
    ``mean_squared_log_error`` in the order given.
    """
    mean_sq_log_err = mean_squared_log_error(y1, y2)
    return mean_sq_log_err ** 0.5

In [None]:
def _take_rows(data, start, stop):
    """Return rows ``start:stop`` of ``data`` by position (pandas objects or plain sequences/arrays)."""
    if hasattr(data, 'iloc'):
        return data.iloc[start:stop]
    return data[start:stop]


def time_series_k_fold(x, y, n_splits = 10, model_factory = None):
    """Expanding-window validation that scores each fold with RMSLE.

    The rows are cut, in their existing order, into consecutive chunks of
    ``len(y) // n_splits`` rows. For each ``n`` in ``1 .. n_splits - 1`` a
    fresh model is fitted on the first ``n`` chunks and scored on chunk
    ``n + 1``. Any remainder rows after the last full chunk are never used.
    Rows are taken by position, so the caller is responsible for passing
    them in chronological order.

    Parameters
    ----------
    x : pandas.DataFrame or array-like
        Feature rows.
    y : pandas.Series or array-like
        Target values aligned row-for-row with ``x``.
    n_splits : int, default 10
        Number of chunks; ``n_splits - 1`` folds are evaluated.
    model_factory : callable, optional
        Zero-argument callable returning an unfitted estimator exposing
        ``fit(X, y)`` and ``predict(X)``. Defaults to ``LinearRegression``.

    Returns
    -------
    list of float
        RMSLE of each fold, in fold order. The fold's target shape and the
        running score list are also printed after every fold.
    """
    if model_factory is None:
        model_factory = LinearRegression

    step = len(y) // n_splits
    scores = []
    for n in range(1, n_splits):
        split = step * n
        x_train = _take_rows(x, 0, split)
        x_test = _take_rows(x, split, split + step)
        y_train = _take_rows(y, 0, split)
        y_test = _take_rows(y, split, split + step)

        model = model_factory()
        model.fit(x_train, y_train)

        # Log error is undefined for negative values, so floor predictions at
        # zero (vectorised rather than one Python-level max() per row).
        y_pred = np.clip(model.predict(x_test), 0, None)
        print(y_test.shape)
        # Pass the truth first and the prediction second, the order
        # scikit-learn metrics expect.
        scores.append(rmsle(y_test, y_pred))
        print(scores)
    return scores

In [None]:
# One-hot encode the categorical columns of both frames.
categorical_cols = ['weekday', 'month', 'year', 'store_nbr', 'family']
train = pd.get_dummies(train, columns = categorical_cols)
test = pd.get_dummies(test, columns = categorical_cols)

# The two frames are encoded independently, so a category present in only one
# of them (for example a month or year that never occurs in the test period)
# yields different dummy columns. Give the test frame exactly the training
# columns, in the same order: missing dummies become all-zero columns and
# dummies the model was never trained on are dropped.
test = test.reindex(columns = train.columns, fill_value = 0)

In [None]:
# Preview the training frame after one-hot encoding.
train.head()

In [None]:
# Length of the target, for comparison with the number of feature rows.
y.shape

In [None]:
# Run the time-ordered validation with the default n_splits=10; the returned list of fold scores is the cell output.
time_series_k_fold(train, y)

In [None]:
# XGBRegressor is already imported in the first cell; it is only needed here for the commented-out alternative below.
from xgboost import XGBRegressor
# Fit the final model on the full training set.
model = LinearRegression()
# model = XGBRegressor()
model.fit(train , y)

In [None]:
# In-sample predictions on the training rows; not referenced by any later cell shown here.
train_pre = model.predict(train)

In [None]:
# Display the encoded test frame that is about to be scored.
test

In [None]:
# Floor the predictions at zero, as time_series_k_fold does for its folds:
# a linear model can output negative sales, which are invalid under the
# log-based error used for evaluation.
pre = np.clip(model.predict(test), 0, None)

In [None]:
# Write the predictions into the submission template. `pre` is the raw output of model.predict,
# so the assignment is by position -- assumes the test rows are in the same order as sample_submission; TODO confirm.
submission['sales'] = pre

In [None]:
# Preview the first rows of the filled-in submission.
submission.head()

In [None]:
# Save the submission to the working directory, without writing the DataFrame index as a column.
submission.to_csv('submission.csv' , index = False)

In [None]:
# Display the final submission frame.
submission