In [248]:
import pandas as pd
import numpy as np

# get hco data
hco_data_istanbul = pd.read_csv('./data/hco_data_istanbul.csv')

# get sales data
df = pd.read_csv('./data/istanbul_data.csv')
drop_columns = ['Year_Month', 'Lat', 'Lon', 'City']
df = df.drop(columns=drop_columns)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4201059 entries, 0 to 4201058
Data columns (total 16 columns):
 #   Column           Dtype  
---  ------           -----  
 0   MainDistID       int64  
 1   DistID           int64  
 2   Date             object 
 3   HcoID            int64  
 4   SalesRegionID_x  int64  
 5   BrandID          int64  
 6   SkuID            int64  
 7   IsReturn         int64  
 8   PaidQty          float64
 9   FGQty            float64
 10  IsMrsOrder       int64  
 11  HcoType          object 
 12  District         object 
 13  SalesRegionID_y  int64  
 14  Status           object 
 15  TotalQuantity    int64  
dtypes: float64(2), int64(10), object(4)
memory usage: 512.8+ MB


In [249]:
name_cols = ['MainDistID', 'DistID', 'HcoID', 'SkuID']
categorical_cols = ['SalesRegionID_x','SalesRegionID_y', 'HcoType', 'District', 'Status', ]
numerical_cols = ['PaidQty', 'FGQty', 'TotalQuantity']
boolean_cols = ['IsReturn', 'IsMrsOrder']
df['Date'] = pd.to_datetime(df['Date'])

# convert categorical columns to 'category' data type
for col in categorical_cols:
    df[col] = df[col].astype('category')

# convert numerical columns to 'int' data type
for col in numerical_cols:
    df[col] = df[col].astype('int')
    
# convert name columns to str data type
for col in name_cols:
    df[col] = df[col].astype(str)
    
# convert boolean columns to 'bool' data type
for col in boolean_cols:
    df[col] = df[col].astype(bool)
    
df = df.set_index('Date')    
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4201059 entries, 2019-01-16 to 2021-12-30
Data columns (total 15 columns):
 #   Column           Dtype   
---  ------           -----   
 0   MainDistID       object  
 1   DistID           object  
 2   HcoID            object  
 3   SalesRegionID_x  category
 4   BrandID          int64   
 5   SkuID            object  
 6   IsReturn         bool    
 7   PaidQty          int64   
 8   FGQty            int64   
 9   IsMrsOrder       bool    
 10  HcoType          category
 11  District         category
 12  SalesRegionID_y  category
 13  Status           category
 14  TotalQuantity    int64   
dtypes: bool(2), category(5), int64(4), object(4)
memory usage: 316.5+ MB


In [250]:
# if IsReturn == 1, then make TotalQuantity negative
df['TotalQuantity'] = np.where(df['IsReturn'] == 1, -df['TotalQuantity'], df['TotalQuantity'])

In [251]:
# select isReturn column = 1
isReturn1 = df[df['IsReturn'] == 1]
isReturn1.head()

Unnamed: 0_level_0,MainDistID,DistID,HcoID,SalesRegionID_x,BrandID,SkuID,IsReturn,PaidQty,FGQty,IsMrsOrder,HcoType,District,SalesRegionID_y,Status,TotalQuantity
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2019-12-10,2879249,2878795,2885754,1000000,25,541,True,1,0,False,P,SANCAKTEPE,630101,Active,-1
2019-06-12,2879266,2878721,2860455,340502,107,29230,True,1,0,False,P,BAKIRKÖY,340502,Active,-1
2019-01-05,2879266,2878724,2862447,341903,21,522,True,1,0,False,P,KÜÇÜKÇEKMECE,341903,Active,-1
2019-03-28,2879266,2878724,2862447,341903,113,16870,True,3,0,False,P,KÜÇÜKÇEKMECE,341903,Active,-3
2019-04-12,2879266,2878724,2862447,341903,15,319,True,1,0,False,P,KÜÇÜKÇEKMECE,341903,Active,-1


### Seçilenler

- I. Segment: 568 (challenge olan), 578, 1050
- II. Segment: 319 (en stabil olan), 509/615, 1018 (düşük satışlı olan)

In [252]:
segment_1_sku_ids = ['568', '578', '1050']
segment_2_sku_ids = ['319', '509', '615', '1018']

In [253]:
# Create a new dataframe for the selected SKUs
selected_skus = ['319']
selected_skus_df = df[df['SkuID'].isin(selected_skus)]

In [254]:
# select features and target variable
selected_features = ['MainDistID', 'DistID', 'HcoID', 'HcoType', 'Status', 'IsReturn', 'IsMrsOrder', 'PaidQty', 'FGQty', 'TotalQuantity']
target_variable = 'TotalQuantity'

In [255]:
selected_skus_df = selected_skus_df[selected_features]
selected_skus_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 206690 entries, 2019-01-31 to 2021-12-28
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   MainDistID     206690 non-null  object  
 1   DistID         206690 non-null  object  
 2   HcoID          206690 non-null  object  
 3   HcoType        206690 non-null  category
 4   Status         206690 non-null  category
 5   IsReturn       206690 non-null  bool    
 6   IsMrsOrder     206690 non-null  bool    
 7   PaidQty        206690 non-null  int64   
 8   FGQty          206690 non-null  int64   
 9   TotalQuantity  206690 non-null  int64   
dtypes: bool(2), category(2), int64(3), object(3)
memory usage: 11.8+ MB


In [256]:
# Check for missing values
selected_skus_df.isnull().sum()

MainDistID       0
DistID           0
HcoID            0
HcoType          0
Status           0
IsReturn         0
IsMrsOrder       0
PaidQty          0
FGQty            0
TotalQuantity    0
dtype: int64

In [257]:
selected_skus_df.head()

Unnamed: 0_level_0,MainDistID,DistID,HcoID,HcoType,Status,IsReturn,IsMrsOrder,PaidQty,FGQty,TotalQuantity
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-31,2879249,2878795,2885754,P,Active,False,False,2,0,2
2019-05-07,2879266,2878720,2885754,P,Active,False,False,2,0,2
2019-06-11,2879249,2878795,2885754,P,Active,False,False,2,0,2
2019-08-26,2879249,2878795,2885754,P,Active,False,False,3,0,3
2019-10-07,2879249,2878795,2885754,P,Active,False,False,1,0,1


In [258]:
df_encoded = selected_skus_df.copy()

for col in df_encoded.columns:
    if df_encoded[col].dtype.name == 'category':
        df_encoded = pd.get_dummies(df_encoded, columns=[col], drop_first=True)
        
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 206690 entries, 2019-01-31 to 2021-12-28
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   MainDistID       206690 non-null  object
 1   DistID           206690 non-null  object
 2   HcoID            206690 non-null  object
 3   IsReturn         206690 non-null  bool  
 4   IsMrsOrder       206690 non-null  bool  
 5   PaidQty          206690 non-null  int64 
 6   FGQty            206690 non-null  int64 
 7   TotalQuantity    206690 non-null  int64 
 8   HcoType_P        206690 non-null  bool  
 9   HcoType_W        206690 non-null  bool  
 10  Status_Inactive  206690 non-null  bool  
dtypes: bool(5), int64(3), object(3)
memory usage: 12.0+ MB


In [259]:
for col in df_encoded.columns:
    if df_encoded[col].dtype.name == 'object':
        df_encoded[col] = df_encoded[col].astype('category')

In [260]:
df_encoded.head()

Unnamed: 0_level_0,MainDistID,DistID,HcoID,IsReturn,IsMrsOrder,PaidQty,FGQty,TotalQuantity,HcoType_P,HcoType_W,Status_Inactive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-01-31,2879249,2878795,2885754,False,False,2,0,2,True,False,False
2019-05-07,2879266,2878720,2885754,False,False,2,0,2,True,False,False
2019-06-11,2879249,2878795,2885754,False,False,2,0,2,True,False,False
2019-08-26,2879249,2878795,2885754,False,False,3,0,3,True,False,False
2019-10-07,2879249,2878795,2885754,False,False,1,0,1,True,False,False


In [261]:
# I am creating a model to predict the total quantity of sales for a given SKU
# The target variable is 'TotalQuantity'
# The features are all the other columns in the dataset

# Split the data into features and target variable
X = df_encoded.drop(columns=['TotalQuantity'])
y = df_encoded['TotalQuantity']

In [262]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [263]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_model_y_pred = lr_model.predict(X_test)

lr_model_rmse = mean_squared_error(y_test, lr_model_y_pred, squared=False)
lr_model_r2_score = r2_score(y_test, lr_model_y_pred)

print(f"RMSE: {lr_model_rmse}")
print(f"R2 Score: {lr_model_r2_score}")

RMSE: 1.137347861082006
R2 Score: 0.9071632227432751


In [264]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)

rf_rmse = mean_squared_error(y_test, rf_y_pred, squared=False)
rf_r2_score = r2_score(y_test, rf_y_pred)

print(f"RMSE: {rf_rmse}")
print(f"R2 Score: {rf_r2_score}")

RMSE: 0.8288333458989702
R2 Score: 0.9506976232186054


In [265]:
from sklearn.ensemble import GradientBoostingRegressor

gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train, y_train)
gb_y_pred = gb_model.predict(X_test)

gb_rmse = mean_squared_error(y_test, rf_y_pred, squared=False)
gb_r2_score = r2_score(y_test, gb_y_pred)

print(f"RMSE: {gb_rmse}")
print(f"R2 Score: {gb_r2_score}")

RMSE: 0.8288333458989702
R2 Score: 0.999253506489169


In [266]:
from xgboost import XGBRegressor

xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42, enable_categorical=True)
xgb_model.fit(X_train, y_train)

xgb_y_pred = xgb_model.predict(X_test)

xgb_rmse = mean_squared_error(y_test, xgb_y_pred, squared=False)
xgb_r2_score = r2_score(y_test, xgb_y_pred)

print(f"RMSE: {xgb_rmse}")
print(f"R2 Score: {xgb_r2_score}")

ValueError: Experimental support for categorical data is not implemented for current tree method yet.

In [267]:
# create table to compare models
model_comparison = pd.DataFrame({
    'Model': ['Linear Regression', 'Random Forest', 'Gradient Boosting'],
    'RMSE': [lr_model_rmse, rf_rmse, gb_rmse],
    'R2 Score': [lr_model_r2_score, rf_r2_score, gb_r2_score ]
})

model_comparison

Unnamed: 0,Model,RMSE,R2 Score
0,Linear Regression,1.137348,0.907163
1,Random Forest,0.828833,0.950698
2,Gradient Boosting,0.828833,0.999254


## Prophet Model

In [268]:
from prophet import Prophet
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error

# Step 1: Prepare the data
prophet_df = selected_skus_df.copy()
prophet_df = prophet_df.reset_index()
prophet_df = prophet_df.rename(columns={'Date': 'ds', 'TotalQuantity': 'y'})

# Ensure the data is sorted by date
prophet_df = prophet_df.sort_values(by='ds')
prophet_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 206690 entries, 126283 to 78860
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   ds          206690 non-null  datetime64[ns]
 1   MainDistID  206690 non-null  object        
 2   DistID      206690 non-null  object        
 3   HcoID       206690 non-null  object        
 4   HcoType     206690 non-null  category      
 5   Status      206690 non-null  category      
 6   IsReturn    206690 non-null  bool          
 7   IsMrsOrder  206690 non-null  bool          
 8   PaidQty     206690 non-null  int64         
 9   FGQty       206690 non-null  int64         
 10  y           206690 non-null  int64         
dtypes: bool(2), category(2), datetime64[ns](1), int64(3), object(3)
memory usage: 13.4+ MB


In [269]:
# Step 2: Split the data into training and testing sets
# Use a specified proportion for the training set, e.g., 80% training and 20% testing
train_size = int(len(prophet_df) * 0.8)
train_df = prophet_df.iloc[:train_size]
test_df = prophet_df.iloc[train_size:]

In [ ]:
# Step 3: Instantiate and fit the Prophet model on training data
model = Prophet()
model.fit(train_df[['ds', 'y']])

# Step 4: Make future predictions for the test period
# Create a dataframe with future dates based on the test period
future = model.make_future_dataframe(periods=len(test_df), freq='D')

# Predict future values
forecast = model.predict(future)

10:46:39 - cmdstanpy - INFO - Chain [1] start processing
10:46:52 - cmdstanpy - INFO - Chain [1] done processing


In [ ]:
# Step 5: Extract predictions corresponding to the test period
forecast_test = forecast.iloc[train_size:]

# Step 6: Evaluate the model
# Merge actual test values with the forecasted values
evaluation_df = test_df[['ds', 'y']].merge(forecast_test[['ds', 'yhat']], on='ds', how='left')

In [ ]:
# Calculate evaluation metrics, e.g., Mean Absolute Error (MAE)
mae = mean_absolute_error(evaluation_df['y'], evaluation_df['yhat'])
print(f'Mean Absolute Error: {mae}')

In [ ]:
# Step 7: Plot the comparison graph
plt.figure(figsize=(10, 6))
plt.plot(train_df['ds'], train_df['y'], label='Training Data')
plt.plot(test_df['ds'], test_df['y'], label='Actual Data')
plt.plot(forecast_test['ds'], forecast_test['yhat'], label='Predicted Data')

plt.xlabel('Date')
plt.ylabel('Total Quantity')
plt.title('Actual vs Predicted Data')
plt.legend()
plt.show()

In [None]:
from prophet.plot import plot_plotly, plot_components_plotly

plot_plotly(model, forecast)