In [13]:
import pickle
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by
# pandas / scikit-learn; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Render floats with two decimal places in DataFrame output.
pd.set_option('display.float_format', '{:.2f}'.format)

In [14]:
# Load the raw order lines. The path is relative to the notebook's working
# directory. The file carries an unnamed leading column (shown as "Unnamed: 0"
# in the preview below) that is dropped during the clean-up step.
sales = pd.read_csv("data/sales_data.csv")

In [15]:
# Preview the first rows of the raw frame to check the columns that were read.
sales.head()

Unnamed: 0,order_number,order_date,sku_id,warehouse_id,customer_type,order_quantity,unit_sale_price,revenue,adjusted_order_quantity
0,SO - 018900,2021-01-01,3551CA,GUT930,Export,105.0,7.07,742,105.0
1,SO - 018901,2021-01-01,3079BA,AXW291,Wholesale,128.0,134.5,20310,128.0
2,SO - 018902,2021-01-01,3250CA,AXW291,Distributor,185.33,34.75,10426,185.33
3,SO - 018903,2021-01-01,1161AA,GUT930,Wholesale,167.0,136.59,6830,167.0
4,SO - 018904,2021-01-01,3512AA,GUT930,Distributor,450.0,0.1,103,450.0


In [16]:
# Clean-up in a single chain:
#   1. parse order_date into datetimes,
#   2. drop the leftover "Unnamed..." column(s),
#   3. keep orders dated strictly before 2022-12-31,
#   4. move order_date into the index for the date-based steps that follow.
# NOTE(review): the strict "<" excludes orders placed on 2022-12-31 itself -
# confirm that cut-off is intended.
sales = (
    sales
    .assign(order_date=pd.to_datetime(sales['order_date']))
    .loc[:, lambda frame: ~frame.columns.str.startswith('Unnamed')]
    .loc[lambda frame: frame['order_date'] < "2022-12-31"]
    .set_index('order_date')
)

In [17]:
# Preview the cleaned frame: order_date is now the index and the unnamed
# column is gone.
sales.head()

Unnamed: 0_level_0,order_number,sku_id,warehouse_id,customer_type,order_quantity,unit_sale_price,revenue,adjusted_order_quantity
order_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-01-01,SO - 018900,3551CA,GUT930,Export,105.0,7.07,742,105.0
2021-01-01,SO - 018901,3079BA,AXW291,Wholesale,128.0,134.5,20310,128.0
2021-01-01,SO - 018902,3250CA,AXW291,Distributor,185.33,34.75,10426,185.33
2021-01-01,SO - 018903,1161AA,GUT930,Wholesale,167.0,136.59,6830,167.0
2021-01-01,SO - 018904,3512AA,GUT930,Distributor,450.0,0.1,103,450.0


In [18]:
def create_time_series_features(df: pd.DataFrame):
    """Return a copy of ``df`` with calendar features derived from its index.

    The frame passed in is left untouched; previously the new columns were
    written into the caller's frame as a side effect.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose index is a ``DatetimeIndex`` (the code reads ``.quarter``,
        ``.dayofweek``, ``.isocalendar()`` and similar attributes from it).

    Returns
    -------
    pd.DataFrame
        A new frame holding the original columns plus ``quarter``,
        ``day_of_week`` (Monday = 0), ``date_and_month``, ``month``, ``year``,
        ``day_of_month`` and ``week_of_year``.
    """
    date_index = df.index
    features = {
        'quarter': date_index.quarter,
        'day_of_week': date_index.dayofweek,
        # Despite its name this is "<month number> <abbreviated month name>",
        # e.g. "01 Jan".
        'date_and_month': date_index.strftime('%m %b'),
        'month': date_index.month,
        'year': date_index.year,
        'day_of_month': date_index.day,
        # ISO week number: the first days of January can belong to week 52/53
        # of the previous ISO year.
        'week_of_year': date_index.isocalendar().week
    }

    # Work on a copy so the caller's frame is not mutated.
    enriched = df.copy()
    for feature_name, feature_values in features.items():
        enriched[feature_name] = feature_values

    return enriched

# Add the calendar feature columns derived from the order_date index.
sales = create_time_series_features(sales)

In [19]:
# Chronological hold-out: rows dated before the cut-off are used for training,
# rows dated on or after it are held back for evaluation.
split_date = '2022-08-31'

train, test = (
    sales.loc[sales.index < split_date],
    sales.loc[sales.index >= split_date],
)

Filter the attributes that are needed for training and testing:

In [21]:
# One shared column list so the train and test selections cannot drift apart:
# SKU id, customer type, calendar features and the target (order_quantity).
model_columns = [
    'sku_id', 'customer_type',
    'day_of_month', 'day_of_week', 'month', 'quarter', 'year', 'week_of_year',
    'order_quantity',
]
training_filtered_sales_data = train[model_columns]
testing_filtered_sales_data = test[model_columns]

In [22]:
def oneHotEncoding(df, index_column, encoder=None):
    """One-hot encode the object-dtype columns of ``df``.

    ``index_column`` is moved into the index first, so it is never encoded and
    the previous index of ``df`` is discarded. Every remaining object-dtype
    column is replaced by its one-hot indicator columns, which are appended
    after the untouched columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    index_column : str
        Column that becomes the index of the returned frame.
    encoder : OneHotEncoder, optional
        Encoder to use. An unfitted encoder is fitted on ``df``; an encoder
        that is already fitted is only applied, which lets a training and a
        test frame share the same categories and output columns. It must
        produce dense output. When omitted, a fresh
        ``OneHotEncoder(sparse_output=False)`` is fitted on ``df``.

    Returns
    -------
    pd.DataFrame
        Frame indexed by ``index_column`` with the categorical columns
        replaced by indicator columns. If there is nothing to encode, the
        frame is returned with only the index change applied.
    """
    df = df.set_index(index_column)

    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    if not categorical_cols:
        # Nothing to encode; fitting an encoder on zero columns would fail.
        return df

    if encoder is None:
        encoder = OneHotEncoder(sparse_output=False)

    # `categories_` only exists once the encoder has been fitted.
    if hasattr(encoder, 'categories_'):
        encoded_values = encoder.transform(df[categorical_cols])
    else:
        encoded_values = encoder.fit_transform(df[categorical_cols])
    one_hot_df = pd.DataFrame(
        encoded_values,
        columns=encoder.get_feature_names_out(categorical_cols),
    )

    # Both frames carry a fresh 0..n-1 index here, so the column-wise concat
    # pairs rows by position.
    one_hot_encoded = pd.concat([df.reset_index(), one_hot_df], axis=1)
    one_hot_encoded = one_hot_encoded.drop(categorical_cols, axis=1)

    return one_hot_encoded.set_index(index_column)

In [23]:
training_sales_encoded = oneHotEncoding(training_filtered_sales_data, 'sku_id')
testing_sales_encoded = oneHotEncoding(testing_filtered_sales_data, 'sku_id')

# Each call above fits its own encoder, so a category present in only one of
# the two splits would give the frames different indicator columns. Align the
# test frame to the training columns: indicators missing from the test split
# are filled with 0, indicators unseen during training are dropped.
testing_sales_encoded = testing_sales_encoded.reindex(
    columns=training_sales_encoded.columns, fill_value=0.0
)

In [24]:
# The target is the order quantity; every other encoded column is a feature.
target = 'order_quantity'
features = list(training_sales_encoded.columns)
features.remove(target)

In [25]:
# Split each encoded frame into its feature matrix and target vector.
X_train, y_train = training_sales_encoded[features], training_sales_encoded[target]
X_test, y_test = testing_sales_encoded[features], testing_sales_encoded[target]

In [29]:
# RandomForestRegressor, GridSearchCV and StandardScaler are imported in the
# notebook's top import cell.

# Standardize features.
# The scaler reads from the encoded frames instead of from X_train / X_test,
# so re-running this cell does not scale arrays that are already scaled.
scaler = StandardScaler()
X_train = scaler.fit_transform(training_sales_encoded[features])
X_test = scaler.transform(testing_sales_encoded[features])

# Initialize the Random Forest Regressor with hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}

# NOTE(review): cv=5 uses scikit-learn's default K-fold splitter, which does
# not respect time order, while the hold-out split above is chronological.
# Consider sklearn.model_selection.TimeSeriesSplit if the rows are date-sorted.
# n_jobs=-1 runs the 12 parameter combinations x 5 folds on all cores; the
# fixed random_state keeps the selected model reproducible.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best model from grid search
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 2187917.95052384
R^2 Score: -0.09932309221989821


In [31]:
import kagglehub

# Fetch the latest published version of the dataset and keep the local
# directory it was downloaded to.
path = kagglehub.dataset_download(
    "shashanks1202/retail-transactions-online-sales-dataset"
)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/shashanks1202/retail-transactions-online-sales-dataset?dataset_version_number=1...


100%|██████████| 43.3M/43.3M [00:03<00:00, 11.4MB/s]

Extracting files...





Path to dataset files: C:\Users\JASWANTH REDDY\.cache\kagglehub\datasets\shashanks1202\retail-transactions-online-sales-dataset\versions\1
