In [110]:
import pandas as pd
import holidays
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [150]:
data = pd.read_csv("data/online_sales_2009_2010.csv",encoding='unicode_escape')

In [151]:
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], format='%d-%m-%Y %H:%M')
data = data.set_index('InvoiceDate')

In [152]:
def create_time_series_features(df: pd.DataFrame):
    features = {
        'quarter': df.index.quarter,
        'day_of_week': df.index.dayofweek,
        'month': df.index.month,
        'year': df.index.year,
        'day_of_month': df.index.day,
        'day_of_week' : df.index.day_name(),
        'week_of_year': df.index.isocalendar().week
    }

    for feature_name, feature_values in features.items():
        df[feature_name] = feature_values
    
    df.reset_index(inplace=True)

    return df

df = create_time_series_features(data)

#Feature Engineering

In [154]:
df['InvoiceDate'] = df['InvoiceDate'].dt.strftime('%Y-%m-%d')

#In the perspective of Inventory control, the goal is to focus on quantity attribute in the product level. Therefore, aggregating the quantity by product and other time related features.

#Additionally along with existing feature, ml models require some more features to understand the data clearly.

In [187]:
sales = df.groupby(['StockCode','InvoiceDate','year','month','day_of_week','day_of_month'])['Quantity'].sum().reset_index()

sales = sales.sort_values('InvoiceDate')

sales['rolling_avg_7days'] = sales.groupby('StockCode')['Quantity'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())
sales['cumulative_sum'] = sales.groupby('StockCode')['Quantity'].cumsum()

sales['lag_1'] = sales.groupby('StockCode')['Quantity'].shift(1)
sales['lag_7'] = sales.groupby('StockCode')['Quantity'].shift(7)

sales['lag_1'] = sales.groupby(['StockCode','year','month'])['lag_1'].transform(lambda x: x.fillna(x.mean()))
sales['lag_7'] = sales.groupby(['StockCode','year','month'])['lag_7'].transform(lambda x: x.fillna(x.mean()))

#Creating two new features to include the holidays. Because, usually most of the people go for shopping in holidays instead of working days. 

In [188]:
uk_holidays = holidays.UnitedKingdom()

sales['is_uk_holiday'] = sales['InvoiceDate'].isin(uk_holidays).astype(int)
sales['is_weekend'] = sales['day_of_week'].isin(['Saturday', 'Sunday']).astype(int)
sales['is_holiday_or_weekend'] = ((sales['is_uk_holiday'] == 1) | (sales['is_weekend'] == 1)).astype(int)

In [190]:
sales = sales.drop(['InvoiceDate'], axis=1)

In [191]:
def oneHotEncoding(df, index_column):
    df=df.set_index(index_column)
    
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded_categorical = encoder.fit_transform(df[categorical_cols])
    one_hot_df = pd.DataFrame(one_hot_encoded_categorical, columns=encoder.get_feature_names_out(categorical_cols))
    
    one_hot_encoded = pd.concat([df.reset_index(),one_hot_df], axis=1)
    one_hot_encoded = one_hot_encoded.drop(categorical_cols, axis=1)
    df = one_hot_encoded.set_index(index_column)

    return df

sales = oneHotEncoding(sales, 'StockCode')

In [203]:
split_point = int(len(sales) * 0.8)

# Split the data
train = sales.iloc[:split_point]
test = sales.iloc[split_point:]

In [204]:
X_train = train.drop(['Quantity'], axis=1)
y_train = train['Quantity']

X_test = test.drop(['Quantity'], axis=1)
y_test = test['Quantity']

In [205]:
# Convert to NumPy arrays
X_train = X_train.to_numpy()  
y_train = y_train.to_numpy() 

X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

In [206]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 8749.574064735005
R^2 Score: 0.3130624985863445


In [200]:
import xgboost as xgb

model_xgb = xgb.XGBRegressor(objective='reg:squarederror', max_depth=4, n_estimators=500)

# Training the model on the training data
model_xgb.fit(X_train, y_train)

# Making predictions on the test set
predictions_xgb = model_xgb.predict(X_test)

# Calculate the mean squared error and R-squared score
mse_xgb = mean_squared_error(y_test, predictions_xgb)
r2_xgb = r2_score(y_test, predictions_xgb)

print("XGBoost Mean Squared Error:", mse_xgb)
print("XGBoost R-squared Score:", r2_xgb)

XGBoost Mean Squared Error: 15435.33442377234
XGBoost R-squared Score: -0.21184277534484863


In [207]:
from sklearn.svm import SVR

# Creating an SVM regressor model
model_svr = SVR(kernel='rbf')  # You can also try 'linear' or 'poly' kernels

# Training the model on the training data
model_svr.fit(X_train, y_train)

# Making predictions on the test set
predictions_svr = model_svr.predict(X_test)

# Calculate the mean squared error and R-squared score
mse_svr = mean_squared_error(y_test, predictions_svr)
r2_svr = r2_score(y_test, predictions_svr)

print("SVM Mean Squared Error:", mse_svr)
print("SVM R-squared Score:", r2_svr)
