In [None]:
#import all libraries
from datetime import datetime as dt

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly_express as px
import seaborn as sns
import xgboost as xgb
from plotly.subplots import make_subplots
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
#read the four competition csv files into data frames
#(category names, item catalogue, daily sales records and shop names, going by the file names)
categories, items, train, shop = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/competitive-data-science-predict-future-sales/item_categories.csv',
        '../input/competitive-data-science-predict-future-sales/items.csv',
        '../input/competitive-data-science-predict-future-sales/sales_train.csv',
        '../input/competitive-data-science-predict-future-sales/shops.csv',
    )
)

In [None]:
#preview the first five rows of the raw sales data
train.head(n=5)

In [None]:
#turn the date column into datetime objects
#NOTE(review): the competition file is understood to store dates day-first as dd.mm.yyyy
#(e.g. 02.01.2013) - confirm against the raw csv. Without an explicit format pandas reads
#ambiguous values month-first, which silently swaps day and month (or fails on day > 12).
train['date']= pd.to_datetime(train['date'], format='%d.%m.%Y')

In [None]:
#order the sales records chronologically (oldest first)
train = train.sort_values("date", ascending=True)

In [None]:
#revenue of each sales record: item price multiplied by the number of items sold that day
train['rev_per_day'] = train['item_price'].mul(train['item_cnt_day'])

In [None]:
#check that the rev_per_day column is now part of the sales data
train.head(n=5)

In [None]:
#create new data frames by summing the numeric columns per (date, shop) and per date
#reset_index() must be called: without the parentheses train_shops would hold the bound
#method instead of a data frame
train_shops = train.groupby(['date', 'shop_id']).sum().reset_index()
train_overall = train.groupby(['date']).sum().reset_index()

In [None]:
#preview the per-date totals
train_overall.head(n=5)

In [None]:
#rolling mean over a window of 10 rows, shifted down by one row so that
#each value only summarises the rows before it
revenue_window = train_overall['rev_per_day'].rolling(window=10)
train_overall['rolling_mean'] = revenue_window.mean().shift(periods=1)

In [None]:
#scatter of the revenue per date with the rolling mean drawn on top as a line
fig = px.scatter(data_frame=train_overall, x="date", y="rev_per_day", title='rev per day')
fig.add_trace(
    go.Scatter(x=train_overall['date'], y=train_overall['rolling_mean'], mode='lines')
)
fig.show()

In [None]:
#keep only the revenue and the date for the modelling steps
ready = train_overall.loc[:, ['rev_per_day', 'date']]

In [None]:
#preview the two selected columns
ready.head(n=5)

In [None]:
#creating a moving time horizon: for every row that has 10 earlier rows,
#collect the current revenue (a) and the revenue 1..10 rows back (b..k) as our features
n_lags = 10
revenue = ready['rev_per_day'].to_numpy()
#one list per lag; lag 0 is the current row, lag 10 the row ten positions earlier
a, b, c, d, e, f, g, h, l, j, k = (
    [revenue[row - lag] for row in range(n_lags, len(revenue))]
    for lag in range(n_lags + 1)
)


    

In [None]:
#the first 11 rows are exactly the rows behind the first entry of each lag list
ready.iloc[:11]

In [None]:
#collecting the lag lists in a data frame: column 'i' is the current period,
#'i-1' ... 'i-10' are the periods 1 to 10 rows back
lag_names = ['i'] + ['i-' + str(lag) for lag in range(1, 11)]
lag_values = [a, b, c, d, e, f, g, h, l, j, k]
previous_periods = pd.DataFrame(dict(zip(lag_names, lag_values)))

In [None]:
#preview the lag feature table
previous_periods.head(n=5)

In [None]:
#combining our data frames by row position instead of by revenue value:
#previous_periods has one row for every row of ready that has a full history, in the
#same order, so the leading rows of ready without history are skipped and the rest is
#aligned one-to-one. Joining on the float revenue would duplicate rows whenever two
#days happen to have the same revenue.
rows_without_history = len(ready) - len(previous_periods)
df = pd.concat(
    [ready.iloc[rows_without_history:].reset_index(drop=True),
     previous_periods.reset_index(drop=True)],
    axis=1,
)

In [None]:
#preview the combined revenue, date and lag columns
df.head(n=5)

In [None]:
#summary statistics of the numeric columns
df.describe()

In [None]:
#column 'i' repeats rev_per_day, so it is removed
df = df.drop(columns='i')

In [None]:
#plotting the current period as points and the 1, 2 and 3 periods back as lines
fig = px.scatter(df, x="date", y="rev_per_day", title='rev per day')
for lag_column in ('i-1', 'i-2', 'i-3'):
    fig.add_scatter(x=df['date'], y=df[lag_column], mode='lines')

fig.show()

In [None]:
#preview the data frame before the date is split up
df.head(n=5)

In [None]:
#separating the date into numeric year / month / day columns so it can be fed into the models
#the .dt accessors work on the whole column at once instead of calling a lambda per row
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
#the datetime column itself is no longer needed as a feature
df=df.drop(columns=['date'])

In [None]:
#check the new year, month and day columns
df.head(n=5)

In [None]:
#df = pd.DataFrame(df, columns = ['rev_per_day','year','month','day','i-1','i-2','i-3','i-4','i-5','i-6','i-7','i-8','i-9','i-10'])

In [None]:
#final look at the feature table before it is split into train and test
df.head(n=5)

In [None]:
#creating training and test sets: the first 900 rows train the models, the rest is held out
split_row = 900
df_train, df_test = df.iloc[:split_row], df.iloc[split_row:]
#separating the labels (rev_per_day) from the features
df_train_label, df_test_label = df_train['rev_per_day'], df_test['rev_per_day']
df_train, df_test = (
    part.drop(columns=['rev_per_day']) for part in (df_train, df_test)
)

In [None]:
#listing all feature columns of the training set (the label column was dropped above)
df_train.columns

In [None]:
#preview the training features
df_train.head(n=5)

In [None]:
#column dtypes, non-null counts and memory usage of the training features
df_train.info()

In [None]:
#defining models; the estimators that use randomness get a fixed seed so that
#re-running the notebook reproduces the same scores
SEED = 42
regr = RandomForestRegressor(random_state=SEED)
#NOTE(review): Lasso is left unseeded - with its default settings it is understood to be deterministic
lasso = linear_model.Lasso()
pas=PassiveAggressiveRegressor(random_state=SEED)
xgbr = xgb.XGBRegressor(random_state=SEED)

In [None]:
#cross validating the random forest; the score is the negated MAPE, so closer to 0 is better
#the rows are in time order and the features are lagged revenues, so TimeSeriesSplit is used:
#every validation fold lies strictly after the rows the model was trained on
random_forest_crossval=cross_val_score(regr, df_train, df_train_label,
                                       scoring='neg_mean_absolute_percentage_error',
                                       cv=TimeSeriesSplit(n_splits=10))
print(np.mean(random_forest_crossval))

In [None]:
#cross validating the lasso model (negated MAPE, closer to 0 is better)
#TimeSeriesSplit keeps every validation fold strictly after its training rows
lasso_crossval=cross_val_score(lasso, df_train, df_train_label,
                               scoring='neg_mean_absolute_percentage_error',
                               cv=TimeSeriesSplit(n_splits=10))
print(np.mean(lasso_crossval))

In [None]:
#cross validating the passive aggressive regressor (negated MAPE, closer to 0 is better)
#TimeSeriesSplit keeps every validation fold strictly after its training rows
passiveaggresive_crossval=cross_val_score(pas, df_train, df_train_label,
                                          scoring='neg_mean_absolute_percentage_error',
                                          cv=TimeSeriesSplit(n_splits=10))
print(np.mean(passiveaggresive_crossval))

In [None]:
#cross validating the xgboost regressor (negated MAPE, closer to 0 is better)
#TimeSeriesSplit keeps every validation fold strictly after its training rows
xgb_crossval=cross_val_score(xgbr, df_train, df_train_label,
                             scoring='neg_mean_absolute_percentage_error',
                             cv=TimeSeriesSplit(n_splits=10))
print(np.mean(xgb_crossval))

In [None]:
#hyperparameter optimisation of the random forest with cross validation
#NOTE(review): bootstrap=True looks like the estimator default, so the two grids
#overlap for max_features 2 and 4 - confirm whether bootstrap=False was intended
param_grid = [
{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
{'bootstrap': [True], 'n_estimators': [3, 10, 30], 'max_features': [2, 3, 4,10]},
]

#TimeSeriesSplit scores every candidate only on rows that come after its training rows,
#which matches how the model is used on the held-out test period
grid_search = GridSearchCV(regr, param_grid, cv=TimeSeriesSplit(n_splits=10), refit=True,
scoring='neg_mean_absolute_percentage_error',
return_train_score=True)
grid_search.fit(df_train, df_train_label)
#refit=True means the best parameter combination was refitted on all training rows
optimised_random_forest = grid_search.best_estimator_
print(grid_search.best_score_)

In [None]:
#hyperparameter optimisation of the xgboost regressor with cross validation
#(param_grid and grid_search from the random forest search are overwritten here)
#NOTE(review): xgboost is understood to treat max_depth=0 as "no depth limit" - confirm this is intended
param_grid = [
{'subsample': [0.1, 0.3, 0.75], 'gamma': [0, 2, 6, 8]},
{'max_depth': [0,1, 3 ,75], 'num_parallel_tree': [1,2,10,25]},
]
#TimeSeriesSplit scores every candidate only on rows that come after its training rows
grid_search = GridSearchCV(xgbr, param_grid, cv=TimeSeriesSplit(n_splits=10), refit=True,
scoring='neg_mean_absolute_percentage_error',
return_train_score=True)

grid_search.fit(df_train, df_train_label)
#refit=True means the best parameter combination was refitted on all training rows
optimised_xgbr = grid_search.best_estimator_
print(grid_search.best_score_)

In [None]:
#predicted revenue for every row of the test features, using the tuned xgboost model
prediction = optimised_xgbr.predict(X=df_test)

In [None]:
#joining year, month and day back into one 'year-month-day' string per row in order to plot
cols = ['year', 'month', 'day']
date_strings = df[cols].apply(lambda row: '-'.join(row.values.astype(str)), axis=1)
#the assignment aligns on the row index, so each frame only picks up its own rows
df_test['date'] = date_strings
df_train['date'] = date_strings



In [None]:
#putting the rev_per_day labels back next to their features (matched on the row index)
df_train = df_train.join(df_train_label)
df_test = df_test.join(df_test_label)

In [None]:
#converting the rebuilt date strings back to datetime values for plotting
for frame in (df_train, df_test):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
#check the training frame now that date and rev_per_day are back
df_train.head(n=5)

In [None]:
#storing the model output next to the true test values
df_test = df_test.assign(prediction=prediction)

In [None]:
#mean absolute percentage error of the prediction on the held-out test rows
mean_absolute_percentage_error(
    y_true=df_test['rev_per_day'],
    y_pred=df_test['prediction'],
)

In [None]:
#plotting the train set, the test set and the prediction over time on one shared axis

fig = make_subplots(specs=[[{"secondary_y": True}]])

#(source frame, column to draw, legend label) for every line of the chart
lines_to_draw = (
    (df_train, 'rev_per_day', "train data"),
    (df_test, 'rev_per_day', "test data"),
    (df_test, 'prediction', "prediction data"),
)
for frame, column, label in lines_to_draw:
    fig.add_trace(
        go.Scatter(x=frame['date'], y=frame[column], name=label),
        secondary_y=False,
    )

#the figure is the last expression of the cell, so the notebook renders it
fig
