In [None]:
import datetime
import heapq

import matplotlib.dates  as mdates
import matplotlib.pyplot as plt
import numpy             as np
import pandas            as pd
import seaborn           as sns
import statsmodels.api   as sm

from statsmodels.tsa.stattools import adfuller

In [None]:
def downcast_dtypes(df):
    """Shrink the numeric columns of ``df`` in place to save memory.

    ``float64`` columns become ``float32``. ``int64``/``int32`` columns become
    ``int16`` when every value fits in that range, otherwise ``int32`` when
    that is sufficient, otherwise they are left untouched. (An unconditional
    cast to ``int16`` would silently wrap out-of-range values.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]

    for c in float_cols:
        df[c] = df[c].astype(np.float32)

    int16_range = np.iinfo(np.int16)
    int32_range = np.iinfo(np.int32)
    for c in int_cols:
        column = df[c]
        if column.empty:
            # Nothing can overflow; keep the historical target dtype.
            target = np.int16
        else:
            lowest, highest = column.min(), column.max()
            if int16_range.min <= lowest and highest <= int16_range.max:
                target = np.int16
            elif int32_range.min <= lowest and highest <= int32_range.max:
                target = np.int32
            else:
                # Values need the full 64-bit range; leave the column as is.
                continue
        df[c] = column.astype(target)
    return df

In [None]:
def check_stationarity(series):
    """Run ``adfuller`` on ``series.values`` and print a stationarity verdict.

    Prints the test statistic, the p-value and every critical value, then a
    coloured verdict line: "Stationary" when the p-value is at most 0.05 and
    the statistic lies below the 5% critical value, "Non-stationary" otherwise.

    Parameters
    ----------
    series : object exposing ``.values``
        The observations handed to ``adfuller``.

    Returns
    -------
    None
        All results are printed.
    """
    # Adapted from https://machinelearningmastery.com/time-series-data-stationary-python/
    adf_output = adfuller(series.values)
    statistic = adf_output[0]
    p_value = adf_output[1]
    critical_values = adf_output[4]

    print('ADF Statistic: %f' % statistic)
    print('p-value: %f' % p_value)
    print('Critical Values:')
    for level in critical_values:
        print('\t%s: %.3f' % (level, critical_values[level]))

    # Both criteria must hold for the series to be reported as stationary.
    is_stationary = p_value <= 0.05 and statistic < critical_values['5%']
    if not is_stationary:
        print("\x1b[31mNon-stationary\x1b[0m")
    else:
        print("\u001b[32mStationary\u001b[0m")

In [None]:
# Read the competition's training and test tables from the Kaggle input folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-jan-2022/train.csv",
        "../input/tabular-playground-series-jan-2022/test.csv",
    )
)

In [None]:
# Remove the `row_id` column from both frames.
# Non-inplace with errors='ignore' so that re-running this cell after the
# column is already gone does not raise KeyError.
train = train.drop(columns=['row_id'], errors='ignore')
test = test.drop(columns=['row_id'], errors='ignore')

In [None]:
# Shrink the numeric columns of both frames with the helper defined above.
train, test = (downcast_dtypes(frame) for frame in (train, test))

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
test.head(n=5)

In [None]:
# (rows, columns) of the training frame, then of the test frame.
for frame in (train, test):
    print(frame.shape)

In [None]:
# List every column with its dtype, first for the training frame, then for the test frame.
dtype_reports = (
    ("Training dataset column name, Training dataset column type", train),
    ("Test dataset column name, Test dataset column type", test),
)
for header, frame in dtype_reports:
    print(header)
    for column_name, column_dtype in frame.dtypes.items():
        print(column_name, column_dtype)

In [None]:
# Missing-value report: per-column null counts plus a heatmap for each frame,
# followed by the overall share of missing cells in the training set.
train_missing_count = train.isnull().sum()
print("Missing Data in Training Set:")
print(train_missing_count)
sns.heatmap(train.isnull())
plt.title("Missing Data in Training Set")
plt.show()

test_missing_count = test.isnull().sum()
print("Missing Data in Test Set:")
print(test_missing_count)
sns.heatmap(test.isnull())
plt.title("Missing Data in Test Set")
plt.show()

# Kept under its historical name (it ended up holding the test-set counts).
missing_data_count = test_missing_count

# The denominator is the training frame, so the numerator must be the
# training-set count as well; previously the test-set count was used here.
# DataFrame.size is rows * columns and replaces np.product, which NumPy 2.0 removed.
total_data = train.size
total_missing = train_missing_count.sum()
percent_missing = (total_missing/total_data)*100 if total_data else 0.0
print("\nPercentage of Missing Data:")
print(percent_missing)

In [None]:
# Split every column except 'date' into object-typed and all other columns.
feature_cols = [name for name in train.columns if name != 'date']
object_cols = [name for name in feature_cols if train[name].dtype == 'object']
number_cols = [name for name in feature_cols if not (train[name].dtype == 'object')]

print("Object Columns")
print(object_cols)
print("Numerical Columns")
print(number_cols)

In [None]:
# Frequency table of every object-typed column, each followed by a separator line.
separator = "#########################"
for col in object_cols:
    counts = train[col].value_counts()
    print(col, counts, separator, sep="\n")

In [None]:
# Sum the remaining columns per (date, country, store, product) combination.
# Passing a list to agg keeps the aggregation name as a second column level.
group_keys = ['date', "country", 'store', 'product']
groupedTrain = train.groupby(group_keys).agg(['sum'])

In [None]:
# One panel per (country, store) pair; each panel draws one line per product.
countries = train['country'].value_counts().index
stores = train['store'].value_counts().index
products = train['product'].value_counts().index

# The index level values do not change inside the loops, so fetch them once.
country_level = groupedTrain.index.get_level_values('country')
store_level = groupedTrain.index.get_level_values('store')
product_level = groupedTrain.index.get_level_values('product')

# Size the grid from the data (three panels per row) instead of assuming 2x3.
n_cols = 3
n_panels = len(countries) * len(stores)
n_rows = max(1, -(-n_panels // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, squeeze=False)
panels = axes.ravel()

m = 0
for country in countries:
    for store in stores:
        m = m + 1
        ax = panels[m - 1]
        for product in products:
            selected = groupedTrain.loc[(country_level == country)
                                        & (store_level == store)
                                        & (product_level == product)]
            # Convert to real datetimes so the MonthLocator below works on a
            # date axis rather than on a categorical axis of strings.
            dates = pd.to_datetime(selected.index.get_level_values('date'))
            ax.plot(dates, selected['num_sold'], label=product)
        ax.set_title(country + " & " + store)
        ax.xaxis.set_major_locator(mdates.MonthLocator())
        ax.tick_params(axis='x', rotation=90)
        if (m == 1):
            # The product legend is shown on the first panel only.
            ax.legend()

# Hide grid cells that did not receive a panel.
for spare in panels[m:]:
    spare.set_visible(False)
fig.set_size_inches(24,16)
fig.tight_layout()

In [None]:
# ADF stationarity report for every (country, store, product) series,
# first on the raw sales and then on their first difference.
countries = train['country'].value_counts().index
stores = train['store'].value_counts().index
products = train['product'].value_counts().index

# The index level values do not change inside the loops, so fetch them once.
country_level = groupedTrain.index.get_level_values('country')
store_level = groupedTrain.index.get_level_values('store')
product_level = groupedTrain.index.get_level_values('product')

for country in countries:
    for store in stores:
        for product in products:
            print(country,store,product)
            selected = groupedTrain.loc[(country_level == country)
                                        & (store_level == store)
                                        & (product_level == product)]
            check_stationarity(selected['num_sold'])
            print("##################################################")
            # The header now names the product too, matching the header above.
            print("diff ",country,store,product)
            # Keep the differenced series in a local variable rather than
            # assigning a new column onto a filtered slice of groupedTrain.
            # The first difference is undefined and is filled with 0.
            num_sold_diff = selected['num_sold'].diff().fillna(0)
            check_stationarity(num_sold_diff)
            print("##################################################")
            print("##################################################")
            print("##################################################")
            

In [None]:
# Partial autocorrelation (14 lags) of the sales of every
# (country, store, product) series, one panel per series.
# `sm` (statsmodels.api) comes from the notebook's top import cell.
countries = train['country'].value_counts().index
stores = train['store'].value_counts().index
products = train['product'].value_counts().index

# The index level values do not change inside the loops, so fetch them once.
country_level = groupedTrain.index.get_level_values('country')
store_level = groupedTrain.index.get_level_values('store')
product_level = groupedTrain.index.get_level_values('product')

# Size the grid from the data (three panels per row) instead of assuming 6x3.
n_cols = 3
n_panels = len(countries) * len(stores) * len(products)
n_rows = max(1, -(-n_panels // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, squeeze=False)
panels = axes.ravel()

m = 0
for country in countries:
    for store in stores:
        for product in products:
            m = m + 1
            ax = panels[m - 1]
            selected = groupedTrain.loc[(country_level == country)
                                        & (store_level == store)
                                        & (product_level == product)]
            sm.graphics.tsa.plot_pacf(selected['num_sold'], lags=14, method="ywm", ax=ax,
                                      title=country + " & " + store + " & " + product)
            ax.tick_params(axis='x', rotation=90)

# Hide grid cells that did not receive a panel.
for spare in panels[m:]:
    spare.set_visible(False)
fig.set_size_inches(24,16)
fig.tight_layout()

In [None]:
# Partial autocorrelation (56 lags) of the first-differenced sales of every
# (country, store, product) series, one panel per series.
# `sm` (statsmodels.api) comes from the notebook's top import cell.
countries = train['country'].value_counts().index
stores = train['store'].value_counts().index
products = train['product'].value_counts().index

# The index level values do not change inside the loops, so fetch them once.
country_level = groupedTrain.index.get_level_values('country')
store_level = groupedTrain.index.get_level_values('store')
product_level = groupedTrain.index.get_level_values('product')

# Size the grid from the data (three panels per row) instead of assuming 6x3.
n_cols = 3
n_panels = len(countries) * len(stores) * len(products)
n_rows = max(1, -(-n_panels // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, squeeze=False)
panels = axes.ravel()

m = 0
for country in countries:
    for store in stores:
        for product in products:
            m = m + 1
            ax = panels[m - 1]
            selected = groupedTrain.loc[(country_level == country)
                                        & (store_level == store)
                                        & (product_level == product)]
            # Keep the differenced series in a local variable rather than
            # assigning a new column onto a filtered slice of groupedTrain.
            # The first difference is undefined and is filled with 0.
            num_sold_diff = selected['num_sold'].diff().fillna(0)
            sm.graphics.tsa.plot_pacf(num_sold_diff, lags=56, method="ywm", ax=ax,
                                      title=country + " & " + store + " & " + product)
            ax.tick_params(axis='x', rotation=90)

# Hide grid cells that did not receive a panel.
for spare in panels[m:]:
    spare.set_visible(False)
fig.set_size_inches(24,16)
fig.tight_layout()

In [None]:
# Autocorrelation (56 lags) of the sales of every
# (country, store, product) series, one panel per series.
# `sm` (statsmodels.api) comes from the notebook's top import cell.
countries = train['country'].value_counts().index
stores = train['store'].value_counts().index
products = train['product'].value_counts().index

# The index level values do not change inside the loops, so fetch them once.
country_level = groupedTrain.index.get_level_values('country')
store_level = groupedTrain.index.get_level_values('store')
product_level = groupedTrain.index.get_level_values('product')

# Size the grid from the data (three panels per row) instead of assuming 6x3.
n_cols = 3
n_panels = len(countries) * len(stores) * len(products)
n_rows = max(1, -(-n_panels // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, squeeze=False)
panels = axes.ravel()

m = 0
for country in countries:
    for store in stores:
        for product in products:
            m = m + 1
            ax = panels[m - 1]
            selected = groupedTrain.loc[(country_level == country)
                                        & (store_level == store)
                                        & (product_level == product)]
            sm.graphics.tsa.plot_acf(selected['num_sold'], lags=56, ax=ax,
                                     title=country + " & " + store + " & " + product)
            ax.tick_params(axis='x', rotation=90)

# Hide grid cells that did not receive a panel.
for spare in panels[m:]:
    spare.set_visible(False)
fig.set_size_inches(24,16)
fig.tight_layout()

In [None]:
# Autocorrelation (56 lags) of the first-differenced sales of every
# (country, store, product) series, one panel per series.
# `sm` (statsmodels.api) comes from the notebook's top import cell.
countries = train['country'].value_counts().index
stores = train['store'].value_counts().index
products = train['product'].value_counts().index

# The index level values do not change inside the loops, so fetch them once.
country_level = groupedTrain.index.get_level_values('country')
store_level = groupedTrain.index.get_level_values('store')
product_level = groupedTrain.index.get_level_values('product')

# Size the grid from the data (three panels per row) instead of assuming 6x3.
n_cols = 3
n_panels = len(countries) * len(stores) * len(products)
n_rows = max(1, -(-n_panels // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, squeeze=False)
panels = axes.ravel()

m = 0
for country in countries:
    for store in stores:
        for product in products:
            m = m + 1
            ax = panels[m - 1]
            selected = groupedTrain.loc[(country_level == country)
                                        & (store_level == store)
                                        & (product_level == product)]
            # Keep the differenced series in a local variable rather than
            # assigning a new column onto a filtered slice of groupedTrain.
            # The first difference is undefined and is filled with 0.
            num_sold_diff = selected['num_sold'].diff().fillna(0)
            sm.graphics.tsa.plot_acf(num_sold_diff, lags=56, ax=ax,
                                     title=country + " & " + store + " & " + product)
            ax.tick_params(axis='x', rotation=90)

# Hide grid cells that did not receive a panel.
for spare in panels[m:]:
    spare.set_visible(False)
fig.set_size_inches(24,16)
fig.tight_layout()