In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt


# Load the preprocessed dataset from CSV.
data = pd.read_csv('data/istanbul_data_preprocessed.csv')

# Column groups; each group is cast to a single dtype further down.
name_cols = ['MainDistID', 'DistID', 'HcoID', 'SkuID']
categorical_cols = ['SalesRegionID_x', 'SalesRegionID_y', 'HcoType']
numerical_cols = ['PaidQty', 'FGQty', 'TotalQuantity', 'Week', 'Quarter', 'Month', 'Year']
boolean_cols = ['IsReturn', 'IsMrsOrder', 'Status']

# Parse the date column and derive the calendar features from it.
# NOTE(review): `Week` is the ISO week number while `Year` is the calendar
# year; around New Year an ISO week can belong to the neighbouring ISO year,
# so (Year, Week) pairs may be mislabelled for those days — confirm this is OK.
data['Date'] = pd.to_datetime(data['Date'])
date_parts = data['Date'].dt
data['Week'] = date_parts.isocalendar().week
data['Month'] = date_parts.month
data['Quarter'] = date_parts.quarter
data['Year'] = date_parts.year

# 'Active' maps to True, every other status value to False.
data['Status'] = data['Status'].eq('Active')

# Cast every column group in a single pass:
# categorical -> 'category', numerical -> 'int', names -> str, flags -> bool.
target_dtypes = {
    **dict.fromkeys(categorical_cols, 'category'),
    **dict.fromkeys(numerical_cols, 'int'),
    **dict.fromkeys(name_cols, str),
    **dict.fromkeys(boolean_cols, bool),
}
data = data.astype(target_dtypes)

data.info()

# Keep an untouched snapshot of the typed frame; `data` is filtered later on.
raw_data = data.copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4201059 entries, 0 to 4201058
Data columns (total 20 columns):
 #   Column           Dtype         
---  ------           -----         
 0   MainDistID       object        
 1   DistID           object        
 2   Date             datetime64[ns]
 3   HcoID            object        
 4   SalesRegionID_x  category      
 5   BrandID          int64         
 6   SkuID            object        
 7   IsReturn         bool          
 8   PaidQty          int64         
 9   FGQty            int64         
 10  IsMrsOrder       bool          
 11  HcoType          category      
 12  District         object        
 13  SalesRegionID_y  category      
 14  Status           bool          
 15  TotalQuantity    int64         
 16  Week             int64         
 17  Month            int64         
 18  Quarter          int64         
 19  Year             int64         
dtypes: bool(3), category(3), datetime64[ns](1), int64(8), object(5)
memo

In [2]:
# SKU identifiers of the two segments under study (SkuID is stored as str).
segment_1_sku_ids = ["568", "578", "1050"]
segment_2_sku_ids = ["319", "509", "615", "1018"]

all_sku_ids = [*segment_1_sku_ids, *segment_2_sku_ids]

# Keep only the rows that belong to one of the selected SKUs.
is_selected_sku = data['SkuID'].isin(all_sku_ids)
data = data.loc[is_selected_sku]

In [3]:
# Total quantity per (Year, Quarter, SkuID), returned as a flat frame with the
# grouping keys as ordinary columns.
quarterly_data = (
    data
    .groupby(['Year', 'Quarter', 'SkuID'])
    .agg(TotalQuantity=('TotalQuantity', 'sum'))
    .reset_index()
)
quarterly_data

Unnamed: 0,Year,Quarter,SkuID,TotalQuantity
0,2019,1,1018,13937
1,2019,1,1050,52981
2,2019,1,319,46823
3,2019,1,509,43084
4,2019,1,568,145879
...,...,...,...,...
79,2021,4,319,76375
80,2021,4,509,36852
81,2021,4,568,225966
82,2021,4,578,109939


In [None]:
# Quarterly totals for SKU "319" only.
is_sku_319 = quarterly_data['SkuID'].eq("319")
sku_319 = quarterly_data.loc[is_sku_319]
sku_319

In [None]:
import itertools
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

# Build the candidate (p, d, q) orders for the grid search.
p_values = range(0, 3)  # p in {0, 1, 2}
d_values = range(0, 2)  # d in {0, 1}
q_values = range(0, 3)  # q in {0, 1, 2}
param_combinations = list(itertools.product(p_values, d_values, q_values))

# Number of trailing quarters kept out of the fit and used to score forecasts.
n_holdout = 2

best_score, best_params, best_model, mape = float("inf"), None, None, None

# Index the SKU's quarterly totals by quarter start.  The labels are built as
# 'YYYYQn' on purpose: parsing 'YYYY-n' with pd.to_datetime reads n as a
# *month*, which collapses several quarters onto the same period.
df = sku_319.copy()
quarter_labels = df['Year'].astype(str) + 'Q' + df['Quarter'].astype(str)
df.index = pd.PeriodIndex(quarter_labels, freq='Q').to_timestamp(how='start')
df.index.name = 'Date'
# Give the index an explicit quarter-start frequency so statsmodels can date
# its forecasts; quarters missing from the data become NaN rows.
df = df.sort_index().asfreq('QS')

# Hold out the last `n_holdout` quarters: models are fitted on `train` only and
# their forecasts are compared with `test`, i.e. with the same periods.
series = df['TotalQuantity']
train, test = series.iloc[:-n_holdout], series.iloc[-n_holdout:]
actual = test.to_numpy()

# Grid search: keep the order with the lowest hold-out MSE.
failed_params = []  # (order, exception) for every candidate that could not be scored
for param in param_combinations:
    try:
        model_fit = ARIMA(train, order=param).fit()
        y_pred = model_fit.forecast(steps=n_holdout)
        mse = mean_squared_error(test, y_pred)
        # Mean absolute percentage error of the hold-out forecast, in percent.
        candidate_mape = (abs(actual - y_pred.to_numpy()) / abs(actual)).mean() * 100
    except Exception as err:
        # Some orders cannot be estimated on such a short series; record the
        # failure and move on instead of silently swallowing everything.
        failed_params.append((param, err))
        continue

    if mse < best_score:
        # `mape` is only updated together with the best model, so the value
        # printed below always belongs to `best_params`.
        best_score, best_params, best_model, mape = mse, param, model_fit, candidate_mape

if failed_params:
    print(f'Skipped {len(failed_params)} of {len(param_combinations)} candidates that could not be fitted.')

print('Best ARIMA Model:', best_params)
print('MSE:', best_score)
print('MAPE:', mape)
if best_model is not None:
    print(best_model.summary())
else:
    print('No ARIMA candidate could be fitted.')

In [None]:
import itertools
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error

# Index the SKU's quarterly totals by quarter start.  The labels are built as
# 'YYYYQn' on purpose: parsing 'YYYY-n' with pd.to_datetime reads n as a
# *month*, which collapses several quarters onto the same period.
dfSarimax = sku_319.copy()
quarter_labels = dfSarimax['Year'].astype(str) + 'Q' + dfSarimax['Quarter'].astype(str)
dfSarimax.index = pd.PeriodIndex(quarter_labels, freq='Q').to_timestamp(how='start')
dfSarimax.index.name = 'Date'
# Give the index an explicit quarter-start frequency so statsmodels can date
# its forecasts; quarters missing from the data become NaN rows.
dfSarimax = dfSarimax.sort_index().asfreq('QS')

# Build the candidate (p, d, q) x (P, D, Q, s) orders for the grid search.
p_values = range(0, 3)  # p in {0, 1, 2}
d_values = range(0, 2)  # d in {0, 1}
q_values = range(0, 3)  # q in {0, 1, 2}
P_values = range(0, 3)  # seasonal P in {0, 1, 2}
D_values = range(0, 2)  # seasonal D in {0, 1}
Q_values = range(0, 3)  # seasonal Q in {0, 1, 2}
s_values = range(4, 5)  # only s = 4: one seasonal cycle every four quarters
param_combinations = list(itertools.product(p_values, d_values, q_values, P_values, D_values, Q_values, s_values))

# Number of trailing quarters kept out of the fit and used to score forecasts.
n_holdout = 2

best_score, best_params, best_model, mape = float("inf"), None, None, None

# Hold out the last `n_holdout` quarters: models are fitted on `train` only and
# their forecasts are compared with `test`, i.e. with the same periods.
series = dfSarimax['TotalQuantity']
train, test = series.iloc[:-n_holdout], series.iloc[-n_holdout:]
actual = test.to_numpy()

# Grid search: keep the order with the lowest hold-out MSE.
failed_params = []  # (order, exception) for every candidate that could not be scored
for param in param_combinations:
    try:
        model = SARIMAX(train, order=param[:3], seasonal_order=param[3:])
        # disp=False keeps the optimiser from printing a log for every candidate.
        model_fit = model.fit(disp=False)
        y_pred = model_fit.forecast(steps=n_holdout)
        mse = mean_squared_error(test, y_pred)
        # Mean absolute percentage error of the hold-out forecast, in percent.
        candidate_mape = (abs(actual - y_pred.to_numpy()) / abs(actual)).mean() * 100
    except Exception as err:
        # Many seasonal orders cannot be estimated on such a short series;
        # record the failure and move on instead of silently swallowing it.
        failed_params.append((param, err))
        continue

    if mse < best_score:
        # `mape` is only updated together with the best model, so the value
        # printed below always belongs to `best_params`.
        best_score, best_params, best_model, mape = mse, param, model_fit, candidate_mape

if failed_params:
    print(f'Skipped {len(failed_params)} of {len(param_combinations)} candidates that could not be fitted.')

print('Best SARIMA Model:', best_params)
print('MSE:', best_score)
print('MAPE:', mape)
if best_model is not None:
    print(best_model.summary())
else:
    print('No SARIMA candidate could be fitted.')