In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as  np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.graphics.tsaplots as sgt
import statsmodels.tsa.stattools as sts
import statsmodels.tsa.seasonal as seasonal_decompose
from statsmodels.tsa.arima_model import ARMA,ARIMA
import warnings
warnings.filterwarnings("ignore")

In [None]:
def reduce_mem_usage(df, allow_float16=False):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    * ``object`` columns are converted to ``category``.
    * Signed-integer columns are cast to the smallest of int8/int16/int32
      whose range (bounds inclusive) contains the column's min and max.
    * Float columns are cast to float32 when their range fits. float16 is
      only considered when ``allow_float16`` is true, because it keeps just
      ~3 significant decimal digits and silently rounds values such as prices.
    * Every other dtype (bool, unsigned int, datetime, category, pandas
      extension dtypes, ...) is left untouched.
    * Columns are never upcast, and columns whose min/max is NaN (empty or
      all-missing) are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    allow_float16 : bool, default False
        Opt in to the lossy float16 downcast for float columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    float_candidates = (np.float32,)
    if allow_float16:
        float_candidates = (np.float16,) + float_candidates

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed ints and floats are downcast; anything else
        # (bool, datetime64, uint, extension dtypes) must not be reinterpreted.
        if not isinstance(col_type, np.dtype):
            continue
        if col_type.kind == 'i':
            candidates, limits = (np.int8, np.int16, np.int32), np.iinfo
        elif col_type.kind == 'f':
            candidates, limits = float_candidates, np.finfo
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: there is no range to test against.
            continue

        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                # Candidates are ordered by size; nothing smaller is left.
                break
            bounds = limits(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting size.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df


def import_data(file):
    """Read the CSV at ``file`` into a DataFrame and return it after
    passing it through ``reduce_mem_usage`` to shrink its dtypes."""
    return reduce_mem_usage(pd.read_csv(file))

# **DATA EXPLORATION**

In [None]:
# Load the training CSV via import_data (read_csv + dtype downcasting) and
# preview six randomly sampled rows (unseeded, so the preview differs per run).
train= import_data("../input/store-sales-time-series-forecasting/train.csv")
train.sample(6)

In [None]:
# Parse the date column with an explicit year-first format instead of
# dayfirst=True: the competition CSVs are expected to hold YYYY-MM-DD dates
# (confirm against the file), and an explicit format raises on any other
# layout rather than guessing which field is the day.
train['date'] = pd.to_datetime(train['date'], format='%Y-%m-%d')

In [None]:
# Summarise the training frame: column dtypes, non-null counts and memory use.
train.info()

In [None]:
# Load the competition test CSV via import_data and preview six random rows.
# NOTE(review): the name `test` is rebound to a slice of train1 further down,
# after which this frame is no longer reachable.
test= import_data("../input/store-sales-time-series-forecasting/test.csv")
test.sample(6)

In [None]:
# Parse the date column with an explicit year-first format instead of
# dayfirst=True: the competition CSVs are expected to hold YYYY-MM-DD dates
# (confirm against the file), and an explicit format raises on any other
# layout rather than guessing which field is the day.
test['date'] = pd.to_datetime(test['date'], format='%Y-%m-%d')

In [None]:
# Summarise the test frame: column dtypes, non-null counts and memory use.
test.info()

In [None]:
# Load the holidays/events CSV via import_data and preview six random rows.
holidays= import_data("../input/store-sales-time-series-forecasting/holidays_events.csv")
holidays.sample(6)

In [None]:
# Parse the date column with an explicit year-first format instead of
# dayfirst=True: the competition CSVs are expected to hold YYYY-MM-DD dates
# (confirm against the file), and an explicit format raises on any other
# layout rather than guessing which field is the day.
holidays['date'] = pd.to_datetime(holidays['date'], format='%Y-%m-%d')

In [None]:
# Summarise the holidays frame: column dtypes, non-null counts and memory use.
holidays.info()

In [None]:
# Load the oil-price CSV via import_data and preview six random rows.
oil= import_data("../input/store-sales-time-series-forecasting/oil.csv")
oil.sample(6)

In [None]:
# Parse the date column with an explicit year-first format instead of
# dayfirst=True: the competition CSVs are expected to hold YYYY-MM-DD dates
# (confirm against the file), and an explicit format raises on any other
# layout rather than guessing which field is the day.
oil['date'] = pd.to_datetime(oil['date'], format='%Y-%m-%d')

In [None]:
# Summarise the oil frame: column dtypes, non-null counts and memory use.
oil.info()

In [None]:
# Load the sample submission CSV via import_data and preview six random rows.
sample= import_data("../input/store-sales-time-series-forecasting/sample_submission.csv")
sample.sample(6)

In [None]:
# Summarise the sample-submission frame: dtypes, non-null counts and memory use.
sample.info()

In [None]:
# Load the store metadata CSV via import_data and preview six random rows.
stores= import_data("../input/store-sales-time-series-forecasting/stores.csv")
stores.sample(6)

In [None]:
# Summarise the stores frame: column dtypes, non-null counts and memory use.
stores.info()

In [None]:
# Load the transactions CSV via import_data and preview six random rows.
transactions= import_data("../input/store-sales-time-series-forecasting/transactions.csv")
transactions.sample(6)

In [None]:
# Parse the date column with an explicit year-first format instead of
# dayfirst=True: the competition CSVs are expected to hold YYYY-MM-DD dates
# (confirm against the file), and an explicit format raises on any other
# layout rather than guessing which field is the day.
transactions['date'] = pd.to_datetime(transactions['date'], format='%Y-%m-%d')

In [None]:
# Summarise the transactions frame: dtypes, non-null counts and memory use.
transactions.info()

In [None]:
# Enrich every sales row with its store's metadata, the store's transaction
# count for that date and the oil price for that date, ordered by date.
# All three merges use pandas' default inner join, so rows without a matching
# transactions or oil record are dropped.
train1 = (
    train
    .merge(stores, on='store_nbr')
    .merge(transactions, on=['store_nbr', 'date'])
    .sort_values('date')
    .merge(oil, on='date')
)
train1

In [None]:
# Count missing values per column of the merged frame.
train1.isna().sum()

In [None]:
# Replace missing oil prices with 0.0; no other column is touched.
# NOTE(review): 0.0 is not a plausible price — confirm that a forward fill or
# interpolation is not what was intended here.
train1['dcoilwtico'] = train1['dcoilwtico'].fillna(0.0)

In [None]:
# Re-count missing values per column to check the result of the fill above.
train1.isna().sum()

In [None]:
# Derive calendar features from train1's OWN date column. train1 was produced
# by merging and re-sorting train, so its row index no longer matches train's;
# assigning from train['date'] aligns on index labels and would attach the
# calendar fields of unrelated rows.
train1['day'] = train1['date'].dt.day
train1['month'] = train1['date'].dt.strftime('%B')
train1['year'] = train1['date'].dt.year
train1['quarter'] = train1['date'].dt.quarter
train1['dayofweek'] = train1['date'].dt.dayofweek
train1['dayofyear'] = train1['date'].dt.dayofyear
# Series.dt.weekofyear is deprecated (removed in pandas 2.0); isocalendar().week
# gives the same ISO week number. Cast back to a plain integer dtype.
train1['weekofyear'] = train1['date'].dt.isocalendar().week.astype('int64')

In [None]:
# Swap the numeric quarter codes 1-4 for their ordinal names.
train1['quarter'] = train1['quarter'].replace(
    {1: 'First', 2: 'Second', 3: "Third", 4: 'Fourth'}
)

In [None]:
# Display the merged frame with the calendar feature columns added.
train1

In [None]:
# Total sales for each (family, year) pair, as a flat frame with the group
# keys as ordinary columns.
grp_fm_yr = train1.groupby(['family', 'year'])['sales'].sum().reset_index()
grp_fm_yr

In [None]:
# Total sales for each (family, month) pair, as a flat frame with the group
# keys as ordinary columns.
grp_fm_mn = train1.groupby(['family', 'month'])['sales'].sum().reset_index()
grp_fm_mn

In [None]:
# Total sales for each (family, quarter) pair, as a flat frame with the group
# keys as ordinary columns.
grp_fm_qt = train1.groupby(['family', 'quarter'])['sales'].sum().reset_index()
grp_fm_qt

# **Visualization**

In [None]:
# Grouped bar chart: one group of bars per product family, one bar per year,
# bar height = summed sales from grp_fm_yr.
plt.figure(figsize=(20,5))
# Vertical tick labels are requested because the x axis carries family names.
plt.xticks(rotation=90,fontsize=15)
plt.title("Sales per year",fontsize=20)
sns.barplot(x="family", y="sales",hue = 'year',data=grp_fm_yr)

In [None]:
# Grouped bar chart: one group of bars per product family, one bar per month,
# bar height = summed sales from grp_fm_mn.
# The month column holds month NAMES, which come out of the groupby in
# alphabetical order (April, August, ...). Build the calendar order with the
# same '%B' formatting used to create the column and pass it as hue_order so
# the bars and legend read January -> December.
month_order = list(pd.date_range('2000-01-01', periods=12, freq='MS').strftime('%B'))
plt.figure(figsize=(20,10))
plt.xticks(rotation=90,fontsize=13)
plt.title("Sales per month",fontsize=20)
sns.barplot(x="family", y="sales",hue = 'month',hue_order=month_order,data=grp_fm_mn)

In [None]:
# Grouped bar chart: one group of bars per product family, one bar per
# quarter, bar height = summed sales from grp_fm_qt.
# The quarter labels are strings, so without an explicit order they appear
# alphabetically (First, Fourth, Second, Third); hue_order restores the
# chronological sequence.
plt.figure(figsize=(20,5))
plt.xticks(rotation=90,fontsize=13)
plt.title("Sales per quarter",fontsize=20)
sns.barplot(x="family", y="sales",hue = 'quarter',
            hue_order=['First', 'Second', 'Third', 'Fourth'],data=grp_fm_qt)

In [None]:
# Use the date column as the row index of train1 from here on.
# Running this cell a second time fails: 'date' is no longer a column.
train1 = train1.set_index('date')

In [None]:
# Display the frame now that it is indexed by date.
train1

In [None]:
# Line plot of the sales column against the frame's index on a wide canvas.
train1['sales'].plot(figsize=(20, 5))

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of train1's columns.
train1.describe()

In [None]:
# Keep the first 70% of train1's rows (by position) as the fitting set.
# This rebinds `train`, replacing the raw frame loaded earlier.
train = train1.head(int(len(train1) * 0.7))
train

In [None]:
# Keep the remaining rows of train1 (everything after the first 70%) as the
# hold-out set. This rebinds `test`, replacing the frame loaded from test.csv.
test = train1.tail(len(train1) - int(len(train1) * 0.7))
test

# **Modeling**

In [None]:
# Fit a moving-average model of order 5 to the training sales.
# statsmodels.tsa.arima_model.ARMA was deprecated in statsmodels 0.12 and
# raises NotImplementedError from 0.13 on; its documented replacement is
# statsmodels.tsa.arima.model.ARIMA, where ARMA(p, q) is written
# ARIMA(p, 0, q). Imported here so this cell works regardless of which ARIMA
# the notebook's import cell bound.
from statsmodels.tsa.arima.model import ARIMA

model = ARIMA(train.sales, order=(0, 0, 5))
# `model` ends up holding the fitted results object, as it did before.
model = model.fit()
print(model.summary())

In [None]:
# Write the current `train` frame (the 70% slice of train1) to submission.csv.
# NOTE(review): this saves training rows, not model forecasts; a competition
# submission presumably needs predictions in the sample_submission layout —
# confirm and replace.
train.to_csv('submission.csv')