In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import pandas as pd
from tqdm import tqdm
from statsmodels.stats.diagnostic import acorr_ljungbox
from pmdarima.arima import ndiffs
from pmdarima.arima import OCSBTest
from pmdarima.arima import CHTest

In [2]:
# To use this notebook for univariate time series analysis:
# 1) The time-dependent variable and the time column must not contain missing values or
#    categorical (string) data.
# 2) This cell requires the file_name (csv or xlsx), time_dependent_variable, time_column, date-time format (frmt)
#    and resample grain (X). After filling in the required information correctly, you can run all the cells (Cell ---> Run All)
# 3) Example :-
#   file_name               = "JetRail Avg Hourly Traffic Data - 2012-2013.csv"
#   time_dependent_variable = "Count"    (column name in your dataset)
#   time_column             = "Datetime" (column name in your dataset)
#   frmt                    = "%Y-%m-%d"
#   X                       = "D" 

file_name = "order_product_merged_file (1).csv"

def data(file_name):
    """Load the input table from a CSV file or an Excel workbook.

    Parameters
    ----------
    file_name : str or path-like
        Location of the file. The reader is picked from the name, ignoring
        case: names containing ".csv" go to ``pd.read_csv``, names containing
        ".xlsx" go to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The table exactly as returned by the pandas reader.

    Raises
    ------
    ValueError
        If the name contains neither ".csv" nor ".xlsx".
    """
    # str() lets path-like objects through; lower() accepts "DATA.CSV" as well.
    lowered = str(file_name).lower()
    if ".csv" in lowered:
        return pd.read_csv(file_name, parse_dates=True)
    if ".xlsx" in lowered:
        return pd.read_excel(file_name, parse_dates=True)
    # Fail with a clear message instead of reaching an unassigned local.
    raise ValueError(
        "Unsupported file type for %r: expected a .csv or .xlsx file" % (file_name,)
    )

In [3]:
# Load the raw file and drop rows that are exact duplicates.
df = data(file_name).drop_duplicates()

# Parse "Order Date" before filtering: the loader's parse_dates=True only tries
# to parse the index, so the column would otherwise be compared as plain text
# and the window below would be lexicographic rather than chronological.
order_date = pd.to_datetime(df["Order Date"])
in_window = (order_date >= pd.Timestamp("2020-07-27")) & (order_date <= pd.Timestamp("2021-12-31"))

# .copy() yields an independent frame, so the column assignments made by later
# cells do not operate on a slice of the de-duplicated data.
df = df[in_window].copy()

  df = data(file_name)


In [4]:
# EXAMPLE configuration used by every analysis cell below.
dataset = df  # frame that holds the time series of all products

# Column names inside `dataset`.
dt, value, product_id = "Order Date", "Total Sales Units consolidated", "Product ID"

# Resampling grain and the seasonal period handed to the seasonal tests.
# NOTE(review): with 7-day bins, m = 12 means a 12-bin cycle -- confirm this is
# the intended seasonal period.
frequency, m = "7D", 12

#### DEMAND CLASSIFICATION

In [6]:
# Demand Classification is done at different hierarchies of a time-series.
# The lists below are module-level accumulators that classification() appends to.

# SKU identifiers per demand class.
Non_Zero_Demand_SKU, Intermittant_Demand_SKU, Zero_Demand_SKU = [], [], []

# Number of resampled points per SKU.
Non_Zero_Demand_points, Intermittant_Demand_points = [], []

# Share of non-zero resampled points (intermittent SKUs only).
Intermittant_Demand_percentage = []

# Mean of the resampled values per SKU.
Non_Zero_Demand_mean_value, Intermittant_Demand_mean_value = [], []

def classification(dataset,dt,value,frequency,product_id):

    """ Classify every product_id by how often its resampled demand is non-zero.

        dataset: DataFrame-> It contains time-series for all product_ids. It is
                             modified in place: the dt column is converted to
                             datetime and is also set as the index.
        dt:      timestamp-> It is a date-time column.
        value:       float-> It is a time dependent variate on which univariate analysis is done.
        frequency:     str-> "7D","30D" Level at which time-series is analyzed.
        product_id:    SKU-> Level of grain column at which analysis is done.

        Each product_id is resampled to `frequency` (summing `value`) and placed
        in one of three classes:
          * Non_Zero_Demand      - every resampled point is non-zero
          * Zero_Demand          - every resampled point is zero
          * Intermittant_Demand  - a mix of zero and non-zero points

        The per-SKU details are written to the module-level accumulator lists
        (Non_Zero_Demand_SKU, Intermittant_Demand_points, ...). These lists are
        emptied first, so calling the function again does not double count.
        Rows whose product_id is missing are ignored.

        Returns a dict with the number of product_ids in each class.
                                                                          """
    # Reset the accumulators so that repeated calls stay consistent.
    for accumulator in (Non_Zero_Demand_SKU, Intermittant_Demand_SKU, Zero_Demand_SKU,
                        Non_Zero_Demand_points, Intermittant_Demand_points,
                        Intermittant_Demand_percentage,
                        Non_Zero_Demand_mean_value, Intermittant_Demand_mean_value):
        accumulator.clear()

    dataset[dt] = pd.to_datetime(dataset[dt])
    dataset.index = dataset[dt]

    # One groupby pass replaces a full-frame boolean filter per SKU.
    # sort=False keeps SKUs in order of first appearance; groupby also leaves
    # out missing product_ids, which previously produced an empty slice and a
    # division by zero.
    grouped = dataset.groupby(product_id, sort=False)
    for sku, sku_rows in tqdm(grouped, total=grouped.ngroups):
        demand = sku_rows[value].resample(frequency).sum()
        n_points = len(demand)
        if n_points == 0:
            # Nothing to classify for this SKU.
            continue

        percentage = int((demand != 0).sum()) / n_points
        mean_value = demand.mean()

        if percentage == 1:
            Non_Zero_Demand_SKU.append(sku)
            Non_Zero_Demand_points.append(n_points)
            Non_Zero_Demand_mean_value.append(mean_value)
        elif percentage == 0:
            Zero_Demand_SKU.append(sku)
        else:
            Intermittant_Demand_SKU.append(sku)
            Intermittant_Demand_points.append(n_points)
            Intermittant_Demand_percentage.append(percentage)
            Intermittant_Demand_mean_value.append(mean_value)

    return {"Non_Zero_Demand": len(Non_Zero_Demand_SKU),
            "Intermittant_Demand": len(Intermittant_Demand_SKU),
            "Zero_Demand": len(Zero_Demand_SKU)}

# Classify every SKU; the returned dict holds the number of SKUs per demand class.
classification(dataset,dt,value,frequency,product_id)

100%|██████████| 13700/13700 [00:52<00:00, 261.00it/s]


{'Non_Zero_Demand': 8005, 'Intermittant_Demand': 4488, 'Zero_Demand': 1207}

#### DATA POINTS DISTRIBUTION

In [7]:
# One row per intermittent-demand SKU, built from the accumulator lists.
Intermittant_Demand_df = pd.DataFrame.from_dict(
    {
        "SKU": Intermittant_Demand_SKU,
        "No_of_points": Intermittant_Demand_points,
        "%_of_non-zero_points": Intermittant_Demand_percentage,
        "Mean_value": Intermittant_Demand_mean_value,
    }
)

# One row per SKU whose resampled demand is never zero.
Non_Zero_Demand_df = pd.DataFrame.from_dict(
    {
        "SKU": Non_Zero_Demand_SKU,
        "No_of_points": Non_Zero_Demand_points,
        "Mean_value": Non_Zero_Demand_mean_value,
    }
)

In [8]:
# Flag SKUs whose resampled series has at least 30 points ("Y") or fewer ("N").
# A single vectorised comparison per frame replaces the row-by-row loop, which
# rebuilt a full-length boolean index mask for every row.
Non_Zero_Demand_df["Is_greater_30"] = (
    Non_Zero_Demand_df["No_of_points"].ge(30).map({True: "Y", False: "N"})
)

Intermittant_Demand_df["Is_greater_30"] = (
    Intermittant_Demand_df["No_of_points"].ge(30).map({True: "Y", False: "N"})
)

8005it [00:03, 2571.34it/s]
4488it [00:01, 3068.32it/s]


#### NON-ZERO % POINTS - INTERMITTENT DEMAND

In [9]:
def bins(number):
    """Map a share of non-zero points to its percentage-band label.

    number: share expressed as a fraction between 0 and 1.

    Returns "100%" for exactly 1, "70-100%" for [0.7, 1), "40-70%" for
    [0.4, 0.7), "0-40%" for [0, 0.4), and None for anything outside [0, 1]
    (including NaN).
    """
    if number == 1:
        return "100%"

    # Anything that is not inside [0, 1) has no band.
    if not (0 <= number < 1):
        return None

    # Lower bounds are checked from the highest band downwards.
    for lower_bound, label in ((0.7, "70-100%"), (0.4, "40-70%"), (0, "0-40%")):
        if number >= lower_bound:
            return label
    return None


# Label each intermittent SKU with the band of its non-zero share; SKUs with
# fewer than 30 points get a fixed placeholder instead. The whole column is
# computed at once rather than through a per-row boolean-mask assignment.
has_enough_points = Intermittant_Demand_df["No_of_points"] >= 30
Intermittant_Demand_df["%_non_zero_bin"] = (
    Intermittant_Demand_df["%_of_non-zero_points"]
    .map(bins)
    .where(has_enough_points, "Not_Applicable(<30 pts)")
)

4488it [00:01, 2952.60it/s]


#### WHITE NOISE

In [10]:
# Hypotheses of the check below:
#   H0 : It is a white noise
#   H1 : It is not a white noise

def Is_white_noise(dataset,dt,value,frequency,product_id,SKU):

    """ Ljung-Box check of one SKU's resampled series.

        dataset:     DataFrame -> It contains time-series for all product_ids.
                                  Modified in place: dt is converted to datetime
                                  and also set as the index.
        dt: column (timestamp) -> It is a date-time column.
        value: column  (float) -> It is a time dependent variable on which univariate analysis is done.
        frequency:         str -> "7D","30D" Level at which time-series is analyzed.
        product_id:     column -> Level of grain column at which analysis is done.
        SKU:         str/float -> number/name to identify unique SKU

        Returns True when the p-value at the largest tested lag is above 0.05
        (H0 is not rejected), otherwise False. """

    dataset[dt] = pd.to_datetime(dataset[dt])
    dataset.index = dataset[dt]

    # Sum the SKU's values per resampling bin.
    sku_rows = dataset[dataset[product_id] == SKU]
    series = sku_rows.resample(frequency)[value].sum().reset_index(drop=True)

    # Test lags 1..max_lag, where max_lag is one fifth of the series length.
    max_lag = len(series) // 5
    ljung_box = acorr_ljungbox(series, lags=max_lag, return_df=True)
    p_value = ljung_box['lb_pvalue'][max_lag]
    return p_value > 0.05

In [11]:
def _white_noise_flags(frame):
    """Build the "Is_white_noise" column for one demand summary frame.

    SKUs flagged "Y" in "Is_greater_30" get the result of Is_white_noise();
    every other SKU gets the placeholder "No_check_possible".
    """
    flags = [
        Is_white_noise(dataset,dt,value,frequency,product_id,sku) if enough == "Y" else "No_check_possible"
        for sku, enough in tqdm(zip(frame["SKU"], frame["Is_greater_30"]), total=len(frame))
    ]
    # dtype=object keeps the booleans and the placeholder string side by side.
    return pd.Series(flags, index=frame.index, dtype=object)


# The same rule applies to both frames, so it lives in one helper; the column is
# assigned once per frame instead of through a boolean mask rebuilt for each row.
Intermittant_Demand_df["Is_white_noise"] = _white_noise_flags(Intermittant_Demand_df)
Non_Zero_Demand_df["Is_white_noise"] = _white_noise_flags(Non_Zero_Demand_df)

4488it [00:50, 88.21it/s] 
8005it [00:03, 2482.57it/s]


#### TREND-STATIONARITY

In [12]:
def Is_stationary(dataset,dt,value,frequency,product_id,SKU):

    """ Estimate how many differences one SKU's resampled series needs.

        dataset:     DataFrame -> It contains time-series for all product_ids.
                                  Modified in place: dt is converted to datetime
                                  and also set as the index.
        dt: column (timestamp) -> It is a date-time column.
        value: column  (float) -> It is a time dependent variable on which univariate analysis is done.
        frequency:         str -> "7D","30D" Level at which time-series is analyzed.
        product_id:     column -> Level of grain column at which analysis is done.
        SKU:         str/float -> number/name to identify unique SKU

        Returns a dict with "Is_stationary" ("trend-stationary" when no
        differencing is needed, else "trend-non-stationary") and "ndiff"
        (the larger of the KPSS and ADF estimates). """

    dataset[dt] = pd.to_datetime(dataset[dt])
    dataset.index = dataset[dt]

    # Sum the SKU's values per resampling bin.
    sku_rows = dataset[dataset[product_id] == SKU]
    series = sku_rows.resample(frequency)[value].sum().reset_index(drop=True)

    # Ask both tests and keep the more conservative (larger) estimate.
    estimates = [
        ndiffs(series, alpha=0.05, test=test_name, max_d=6)
        for test_name in ('kpss', 'adf')
    ]
    n_diffs = max(estimates)

    label = "trend-stationary" if n_diffs == 0 else "trend-non-stationary"
    return {"Is_stationary" : label,"ndiff":n_diffs}


In [13]:
def _add_trend_stationarity(frame):
    """Add the "Is_stationary" and "ndiff" columns to one demand summary frame.

    Is_stationary() runs once per SKU whose "Is_white_noise" value is False,
    and both columns are filled from that single result. Every other SKU gets
    the placeholder "No_check_possible" in both columns.
    """
    labels, diffs = [], []
    for sku, white_noise in tqdm(zip(frame["SKU"], frame["Is_white_noise"]), total=len(frame)):
        # "== False" on purpose: the column mixes booleans with a placeholder
        # string, and only a real False should trigger the check.
        if white_noise == False:
            result = Is_stationary(dataset,dt,value,frequency,product_id,sku)
            labels.append(result["Is_stationary"])
            diffs.append(result["ndiff"])
        else:
            labels.append("No_check_possible")
            diffs.append("No_check_possible")

    # dtype=object keeps integers and the placeholder string side by side.
    frame["Is_stationary"] = pd.Series(labels, index=frame.index, dtype=object)
    frame["ndiff"] = pd.Series(diffs, index=frame.index, dtype=object)


# Previously every qualifying row ran the stationarity tests twice, once per column.
_add_trend_stationarity(Intermittant_Demand_df)
_add_trend_stationarity(Non_Zero_Demand_df)

4488it [00:05, 764.84it/s] 
8005it [00:07, 1131.03it/s]


#### SEASONAL-STATIONARITY

In [14]:
def Is_seasonal_stationary(dataset,dt,value,frequency,product_id,SKU,m):

    """ Check whether one SKU's resampled series needs seasonal differencing.

        dataset:     DataFrame -> It contains time-series for all product_ids.
                                  Modified in place: dt is converted to datetime
                                  and also set as the index.
        dt: column (timestamp) -> It is a date-time column.
        value: column  (float) -> It is a time dependent variable on which univariate analysis is done.
        frequency:         str -> "7D","30D" Level at which time-series is analyzed.
        product_id:     column -> Level of grain column at which analysis is done.
        SKU:         str/float -> number/name to identify unique SKU
        m:                 int -> seasonal period passed to CHTest and OCSBTest

        Returns a dict with "Is_seasonal_stationary" and "seasonal_ndiff":
        "seasonal_stationary" / 0 when both tests estimate a differencing term
        of 0, otherwise "seasonal_non_stationary" / 1. """

    dataset[dt] = pd.to_datetime(dataset[dt])
    dataset.index = dataset[dt]

    # Sum the SKU's values per resampling bin.
    sku_rows = dataset[dataset[product_id] == SKU]
    series = sku_rows.resample(frequency)[value].sum().reset_index(drop=True)

    non_stationary = {"Is_seasonal_stationary" : "seasonal_non_stationary","seasonal_ndiff":1}

    # The series counts as seasonally stationary only if both tests agree that
    # no seasonal differencing is needed; the OCSB test runs only when the CH
    # test already reports 0.
    if CHTest(m=m).estimate_seasonal_differencing_term(series) != 0:
        return non_stationary
    if OCSBTest(m=m, lag_method="aic", max_lag=3).estimate_seasonal_differencing_term(series) != 0:
        return non_stationary
    return {"Is_seasonal_stationary" : "seasonal_stationary","seasonal_ndiff":0}

In [15]:
def _add_seasonal_stationarity(frame):
    """Add "Is_seasonal_stationary" and "seasonalndiff" to one demand summary frame.

    Is_seasonal_stationary() runs once per SKU whose "Is_white_noise" value is
    False, and both columns are filled from that single result. Every other
    SKU gets the placeholder "No_check_possible" in both columns.
    """
    labels, seasonal_diffs = [], []
    for sku, white_noise in tqdm(zip(frame["SKU"], frame["Is_white_noise"]), total=len(frame)):
        # "== False" on purpose: the column mixes booleans with a placeholder
        # string, and only a real False should trigger the check.
        if white_noise == False:
            result = Is_seasonal_stationary(dataset,dt,value,frequency,product_id,sku,m)
            labels.append(result["Is_seasonal_stationary"])
            seasonal_diffs.append(result["seasonal_ndiff"])
        else:
            labels.append("No_check_possible")
            seasonal_diffs.append("No_check_possible")

    # dtype=object keeps integers and the placeholder string side by side.
    frame["Is_seasonal_stationary"] = pd.Series(labels, index=frame.index, dtype=object)
    frame["seasonalndiff"] = pd.Series(seasonal_diffs, index=frame.index, dtype=object)


# Previously every qualifying row ran both seasonal tests twice, once per column.
_add_seasonal_stationarity(Intermittant_Demand_df)
_add_seasonal_stationarity(Non_Zero_Demand_df)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi