# Import all required libraries for data analysis
In this step, we will import libraries and data to be analysed

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

Load the data into DataFrames

In [None]:
# Read the three competition CSV files, one DataFrame per file.
df_submission = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv'
)
df_test = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/test.csv'
)
df_sales_train = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv'
)

Get the length of all dataframes

In [None]:
# Report the number of rows in each loaded DataFrame.
print(f'submission {len(df_submission)}')
print(f'sales train {len(df_sales_train)}')
print(f'tests {len(df_test)}')

In [None]:
def get_basic_df_info(df):
    """Print a quick textual overview of a DataFrame.

    The sections are printed in this order: first three rows, ``df.info()``,
    ``df.describe()``, column labels, dtypes, missing-value counts per column
    (printed under two headers), and the shape.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("------ top 3 records ----- ")
    print(df.head(3))
    print("------- data information ----")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()
    print("--------- Describe ----------")
    print(df.describe())
    print ("-------- Columns values ---------")
    print(df.columns)
    print ("------- data types values -------")
    print(df.dtypes)
    print("-------- Missing values --------")
    print(df.isnull().sum())
    print("-------- Nan values ---------")
    # isna() performs the same check as isnull(); the section is kept so the
    # layout of the printed report stays the same.
    print(df.isna().sum())
    print("-------- Data shape values -------")
    print(df.shape)

In [None]:
# Print the overview (head, info, describe, columns, dtypes, missing counts, shape) of the training sales table.
get_basic_df_info(df_sales_train)

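Prepare the training data: convert the `date` column to datetime so it can be used for time-based analysis. Merging in the item, category and shop tables to get a complete dataset is done in the optional cell under "Data investigation - other".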

In [None]:

# Parse the raw date strings into datetime64 values.
# NOTE(review): the competition's sales_train.csv is understood to store dates
# as day.month.year (e.g. "02.01.2013" = 2 January 2013) - confirm against the file.
# Without an explicit format, pandas may read such strings month-first and swap
# day and month, which would corrupt every month-based aggregation below.
df_sales_train['date'] = pd.to_datetime(df_sales_train['date'], format='%d.%m.%Y')
df_sales_train.head(3)
#print(df_sales_train.date.value_counts().head(3))
#get train dates sorted
#train_dates = df_sales_train.date.value_counts()
#train_dates = train_dates.sort_index()
#print(train_dates.head(10))

In [None]:
import matplotlib as plot
import matplotlib.pyplot as plt
import seaborn as sns
### get seasonality per month
# Pull out the columns needed for the seasonality analysis.
month = df_sales_train['date'].dt.month
year = df_sales_train['date'].dt.year
dates = df_sales_train['date']
item = df_sales_train['item_id']
shop = df_sales_train['shop_id']
# Value of each record: number of items sold times the item price.
sales = df_sales_train['item_cnt_day'] * df_sales_train['item_price']

# df_sales_train is deliberately NOT deleted here: the pivot-table cell further
# down still reads it, and deleting it made that cell fail with a NameError
# when the notebook is run from top to bottom.
monthly_data = pd.concat([year, month, dates, item, shop, sales], axis=1)
monthly_data.columns = ['year', 'month', 'dates', 'item', 'shop', 'sales']
monthly_data.head(3)

In [None]:
#Plot data
# Mean sales value of the records on each date, drawn as a line.
trend = monthly_data.groupby('dates')['sales'].mean()
trend.plot.line()
plt.show()

In [None]:
#plotting months
#understand seasonality - is there a trend in sales across the months of the year?
# Mean sales value of the records in each calendar month (all years pooled).
m = monthly_data.groupby('month')['sales'].mean()
m.plot.line()
plt.show()

In [None]:
#reduce the data
def reduce_mem_usage(train_data, allow_float16=True):
    """Downcast the columns of a DataFrame to smaller dtypes to save memory.

    Integer columns (signed or unsigned) are cast to the smallest of
    int8/int16/int32/int64 that holds their min and max. Floating-point
    columns are cast to the smallest of float16/float32 that holds their range
    and are left unchanged if neither does. Object columns become ``category``.
    Every other dtype (bool, datetime, timedelta, pandas extension dtypes) is
    left untouched.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    allow_float16 : bool, default True
        When False, float columns are never cast below float32. float16 keeps
        only about three significant decimal digits, so pass False when the
        values feed further arithmetic.

    Returns
    -------
    pandas.DataFrame
        The same ``train_data`` object, after conversion.
    """
    start_mem = train_data.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in train_data.columns:
        col_type = train_data[col].dtype

        # Extension dtypes (category, nullable integers, tz-aware datetimes, ...)
        # have no numpy "kind" to dispatch on; leave them as they are.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'O':
            train_data[col] = train_data[col].astype('category')
            continue

        # Dispatch on the numpy kind instead of the dtype's name so that bool,
        # datetime and unsigned columns are not mistaken for floats.
        if col_type.kind in 'iu':
            candidates, limits = int_types, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits = float_types, np.finfo
        else:
            continue

        c_min = train_data[col].min()
        c_max = train_data[col].max()
        for candidate in candidates:
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the type's limit still fits.
            # NaN min/max (empty or all-NaN column) fails both comparisons, so
            # such a column keeps its dtype.
            if c_min >= bounds.min and c_max <= bounds.max:
                train_data[col] = train_data[col].astype(candidate)
                break

    end_mem = train_data.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return train_data

**Explaining the model.** **SARIMA** is used for non-stationary series, that is, series whose mean, variance and covariance do not stay constant over time. The model can capture both trend and seasonality, which is what makes it so useful.

The SARIMA consists of other forecasting models: 
* **AR**: Auto regressive model (can be a simple, multiple or non-linear regression) 
* **MA**: Moving averages model. The moving average models can use weighting factors, where the observations are weighted by a trim factor (for the oldest data in the series) and with a higher weight for the most recent observations.

In [None]:
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller

In [None]:
# Keep only date, item and sales, then index the rows by date in ascending order.
data = monthly_data[["dates", "item", "sales"]]
df_train = data.set_index('dates').sort_index().rename_axis('datetimeindex')
df_train.head(3)

In [None]:
#plot df_train
# One line per column of df_train, drawn against the date index.
df_train.plot.line(figsize=(19, 4))
plt.show()

In [None]:
#reduce the train data
# reduce_mem_usage converts the columns of the frame it is given and returns that same frame.
df_train = reduce_mem_usage(df_train)

In [None]:
#test if data is stationary or not
#H0: the series has a unit root (it is not stationary)
#H1: the series is stationary

def test_fuhler(train_sales, alpha=0.05):
    """Run the Augmented Dickey-Fuller test on a series and print the verdict.

    Parameters
    ----------
    train_sales : array-like
        One-dimensional numeric series to test. Non-finite entries (NaN, inf)
        are dropped before testing, so a differenced column with leading NaN
        values can be passed directly.
    alpha : float, default 0.05
        Significance level against which the p-value is compared.

    Returns
    -------
    None
        The statistic, the p-value and the conclusion are printed.
    """
    # Work in float64: the memory-reduction step may have stored the column in
    # a narrower float type, which is a poor basis for a regression-based test.
    values = np.asarray(train_sales, dtype=np.float64)
    # NOTE(review): adfuller is expected to reject or mishandle missing values - confirm.
    values = values[np.isfinite(values)]

    ad_fuller_result = adfuller(values)
    print(f'ADF Statistic: {ad_fuller_result[0]}')
    print(f'p-value: {ad_fuller_result[1]}')
    if ad_fuller_result[1] <= alpha:
        print('Strong evidence against the null hypothesis (H0), reject H0 - the data is stationary')
    else:
        print('Weak evidence against the null hypothesis (H0), cannot reject H0 - the data appears non-stationary')

In [None]:
#issue here - should investigate!
#test_fuhler(df_train['sales'])

In [None]:
# Preview the sales column moved down by one row, i.e. each row shows the previous row's value.
df_train['sales'].shift(1)

# Stationary and differencing

A **stationary** time series is one whose properties do not depend on the time at which the series is observed. Thus, time series with trends, or with seasonality, are not stationary — the trend and seasonality will affect the value of the time series at different times. On the other hand, a white noise series is stationary — it does not matter when you observe it, it should look much the same at any point in time.

Some cases can be confusing — a time series with cyclic behaviour (but with no trend or seasonality) is stationary. This is because the cycles are not of a fixed length, so before we observe the series we cannot be sure where the peaks and troughs of the cycles will be.

In general, **a stationary time series will have no predictable patterns in the long-term**. Time plots will show the series to be roughly horizontal (although some cyclic behaviour is possible), with constant variance. So, stationary time series is one whose mean and variance is constant over time.

Once we have determined that the series is stationary (possibly after differencing), we can model it using ARIMA (AutoRegressive Integrated Moving Average). When the data also shows a seasonal pattern, the seasonal variant of the model, SARIMA (Seasonal ARIMA), can be used.

In [None]:
### differencing
# NOTE(review): df_train holds one row per record of the sales file (it is never
# aggregated per day or month), so the lags below are counted in rows, not in
# days or months.
#assume data is not stationary - data is not seasonal
df_train['sales diff'] = df_train['sales'].sub(df_train['sales'].shift(1))
#if seasonal then you should shift by 12 (you have seen a pattern across months)
df_train['sales saison'] = df_train['sales'].sub(df_train['sales'].shift(12))

In [None]:
# Show the first 15 rows, including the two difference columns added above.
df_train.head(15)

In [None]:
#issue here - should investigate!
#test_fuhler(df_train['sales'])

In [None]:
# Plot the lag-1 difference of sales against the date index.
df_train['sales diff'].plot()

In [None]:
# Plot the lag-12 difference of sales against the date index.
df_train['sales saison'].plot()

# Data investigation - other

In [None]:
#data merge
#run only if necessary
# The code below is kept disabled inside a string literal; copy it into a code
# cell to run it. The previous quoting ('''' to open, '''''' to close) left an
# unterminated string, so the cell could not even be parsed.
'''
df_item_cat=pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv')  
df_item=pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/items.csv')
df_shop=pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/shops.csv')
df_item_cat_all = df_item.merge(df_item_cat, left_on='item_category_id', right_on='item_category_id')
df_train_item = df_sales_train.merge(df_item_cat_all, left_on='item_id', right_on='item_id')
df_sales_train = df_train_item.merge(df_shop, left_on='shop_id', right_on='shop_id')

#del df_sales_train, df_item_cat_all, df_train_item
def del_not_used_df(*args):
    for arg in args:
        del arg
        
def set_test_df (df):
    get_shop =  df['shop_id']==1
    return df[get_shop]

#print(len(df))
#df.head(10)

del df_item_cat_all
del df_train_item
'''

In [None]:
# Wide table: one row per (shop_id, item_id) and one column per date_block_num,
# holding the summed item_cnt_day (0 where the pair has no records in that block).
# values/columns are passed as plain strings so the result has single-level
# columns; with list arguments the columns become a MultiIndex, and merging a
# MultiIndex-column frame with the single-level df_sales_train is rejected by
# current pandas ("Not allowed to merge between different levels").
ds = df_sales_train.pivot_table(
    index=['shop_id', 'item_id'],
    values='item_cnt_day',
    columns='date_block_num',
    fill_value=0,
    aggfunc='sum',
)
ds = ds.reset_index()
#merge data set with shop and item
ds = pd.merge(df_sales_train, ds, on=['item_id', 'shop_id'], how='left')
ds.head(3)

In [None]:
#plot heat map: average number of sales records per day, by year (rows) and month (columns)
import seaborn as sns
# Count the records of each date, then average those daily counts per
# (year, month). The 'size' column is selected before taking the mean so that
# no other column (e.g. the datetime column) is averaged and then thrown away.
monthly_data_h = (
    monthly_data.groupby(['year', 'month', 'dates']).size()
    .reset_index(name='size')
    .groupby(['year', 'month'])['size'].mean()
    .unstack(level=1)
)
print(monthly_data_h)

sns.heatmap(monthly_data_h)
plt.show()

In [None]:
#get information by shops / months
# Mean sales value per (shop, month). Only the 'sales' column is aggregated;
# copying the frame and averaging every column (including the datetime one)
# just to keep 'sales' was wasted work.
monthly_shop_sales = monthly_data.groupby(['shop', 'month'])['sales'].mean()
print(monthly_shop_sales.head(10))
# Move the shop level to the columns: rows are months, columns are shops.
monthly_shop_sales = monthly_shop_sales.unstack(level=0)
print(monthly_shop_sales.head(3))
sns.heatmap(monthly_shop_sales)
plt.show()

In [None]:
#check the trend per quarter (jan-feb-mar), (apr-may-jun), (jul-aug-sep), (oct-nov-dec)
# total sales re-sampled per month
month_sales = monthly_data.resample('MS', on='dates')['sales'].sum()
# total sales re-sampled per quarter
quarterly_bins = monthly_data.resample('Q', on='dates')
quarter_sales = quarterly_bins['sales'].sum()


# aggregating multiple fields for each quarter:
# summed sales, number of item entries, number of distinct shops
quarterly_bins.agg({'sales': 'sum', 'item': 'count', 'shop': 'nunique'})

In [None]:
# Grouping data based on month and shop: total sales, first 15 rows.
# print() is required here because a notebook cell only displays its last
# expression; without it this result was computed and silently discarded.
print(monthly_data.groupby([pd.Grouper(key='dates', freq='M'), 'shop']).sales.sum().head(15))

# grouping data per month and item, with named aggregations
# NOTE(review): 'item' is itself a grouping key, so unique_items is 1 for every
# group, and total_quantity counts rows rather than units sold - confirm intent.
monthly_data.groupby([pd.Grouper(key='dates', freq='M'), 'item']).agg(unique_items=('item', 'nunique'),
         total_quantity=('item','count'),
         total_amount=('sales','sum'))

In [None]:
#let's normalize: share of each record's sales in the total sales of its date
# Only the 'sales' column is normalised. The previous version divided whole
# group frames (year, month, ids and the datetime column) by their column
# sums, which is meaningless for the ids and unsupported for datetimes.
# Dates whose total is 0 yield NaN or inf shares.
daily_totals = monthly_data.groupby('dates')['sales'].transform('sum')
month_sale_pct = monthly_data['sales'] / daily_totals
print(month_sale_pct)

In [None]:
#find first and last sale date of each shop - get month count from min date to max date
# The aggregations are named by string: passing np.min / np.max callables is
# deprecated in recent pandas and produced version-dependent column labels.
monthly_data.groupby(['shop']).agg({'dates': ['min', 'max']})

In [None]:
#run only if necessary
# Disabled snippet kept inside a string literal, so nothing in it is executed.
# If run, it would stack df_train's item and sales values into one 2-column
# array, show it, and delete it again.
'''
df_train_array = np.column_stack((df_train['item'].values,df_train['sales'].values))
df_train_array
del df_train_array
'''

### 