In [9]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [10]:
# Load the dataset
data = pd.read_csv('Transections.csv')

# Transpose the dataset to have dates as rows and stores as columns
data = data.transpose()

In [11]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,162,163,164,165,166,167,168,169,170,171
Store ID,10001,10002,10003,10004,10005,10006,10007,10008,10009,10010,...,10163,10164,10165,10166,10167,10168,10169,10170,10171,10172
01/01/2019,715,662,783,1278,553,1102,846,564,1010,655,...,0,0,0,0,0,0,0,0,0,0
01/02/2019,701,577,543,668,415,1007,760,556,991,476,...,0,0,0,0,0,0,0,0,0,0
01/03/2019,813,634,636,895,495,1295,865,608,1224,584,...,0,0,0,0,0,0,0,0,0,0
01/04/2019,806,558,681,953,529,1251,750,539,1131,587,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Reset the index and rename the columns
data = data.reset_index()
data.columns = ['Date'] + list(data.iloc[0, 1:])  # Assign the first row as column names
data = data.drop(data.index[0])
data.head()

Unnamed: 0,Date,10001,10002,10003,10004,10005,10006,10007,10008,10009,...,10163,10164,10165,10166,10167,10168,10169,10170,10171,10172
1,01/01/2019,715,662,783,1278,553,1102,846,564,1010,...,0,0,0,0,0,0,0,0,0,0
2,01/02/2019,701,577,543,668,415,1007,760,556,991,...,0,0,0,0,0,0,0,0,0,0
3,01/03/2019,813,634,636,895,495,1295,865,608,1224,...,0,0,0,0,0,0,0,0,0,0
4,01/04/2019,806,558,681,953,529,1251,750,539,1131,...,0,0,0,0,0,0,0,0,0,0
5,01/05/2019,693,560,552,647,372,962,931,481,964,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Convert the Date column to datetime format
data['Date'] = pd.to_datetime(data['Date'], format='%m/%d/%Y')
data.head()

Unnamed: 0,Date,10001,10002,10003,10004,10005,10006,10007,10008,10009,...,10163,10164,10165,10166,10167,10168,10169,10170,10171,10172
1,2019-01-01,715,662,783,1278,553,1102,846,564,1010,...,0,0,0,0,0,0,0,0,0,0
2,2019-01-02,701,577,543,668,415,1007,760,556,991,...,0,0,0,0,0,0,0,0,0,0
3,2019-01-03,813,634,636,895,495,1295,865,608,1224,...,0,0,0,0,0,0,0,0,0,0
4,2019-01-04,806,558,681,953,529,1251,750,539,1131,...,0,0,0,0,0,0,0,0,0,0
5,2019-01-05,693,560,552,647,372,962,931,481,964,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Melt the dataframe to have a single column for store ID
data_melted = pd.melt(data, id_vars=['Date'], value_vars=list(data.columns[1:]), var_name='Store ID', value_name='Transections')
data_melted.head()

Unnamed: 0,Date,Store ID,Transections
0,2019-01-01,10001,715
1,2019-01-02,10001,701
2,2019-01-03,10001,813
3,2019-01-04,10001,806
4,2019-01-05,10001,693


In [21]:
# Feature Engineering
data_melted['Month'] = data_melted['Date'].dt.month
data_melted['DayOfWeek'] = data_melted['Date'].dt.dayofweek
data_melted.head()

Unnamed: 0,Date,Store ID,Transections,Month,DayOfWeek
0,2019-01-01,10001,715,1,1
1,2019-01-02,10001,701,1,2
2,2019-01-03,10001,813,1,3
3,2019-01-04,10001,806,1,4
4,2019-01-05,10001,693,1,5


In [22]:
data_melted.to_csv('Transection_viz_data.csv', index=True)

In [25]:
features = ['Month', 'DayOfWeek', 'Store ID']

X = data_melted[['Month', 'DayOfWeek', 'Store ID']]
y = data_melted[['Transections']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# Forecasting for the period of Feb - Dec 2021
forecast_dates = pd.date_range(start=datetime(2021, 2, 1), end=datetime(2021, 12, 31))
forecast_data = pd.DataFrame({'Date': forecast_dates})
forecast_data['Month'] = forecast_data['Date'].dt.month
forecast_data['DayOfWeek'] = forecast_data['Date'].dt.dayofweek
forecast_data['Store ID'] = X['Store ID']

# Initialize an empty list to store the forecasted data for each store
forecasted_data = []

# Make predictions for each store
for store_id in data_melted['Store ID'].unique():
    forecast_data['Store ID'] = store_id
    forecast_data['Transections'] = model.predict(forecast_data[features])
    forecasted_data.append(forecast_data.copy())

# Concatenate the forecasted data for all stores into a single dataframe
forecasted_data = pd.concat(forecasted_data, ignore_index=True)

# Display the forecasted sales
print(forecasted_data[['Store ID', 'Date', 'Transections']])


  model.fit(X_train, y_train)


Mean Absolute Error: 87.59664887979594
       Store ID       Date  Transections
0         10001 2021-02-01    660.998754
1         10001 2021-02-02    633.006139
2         10001 2021-02-03    658.467749
3         10001 2021-02-04    873.793757
4         10001 2021-02-05    833.053132
...         ...        ...           ...
57443     10172 2021-12-27    202.522825
57444     10172 2021-12-28    156.485748
57445     10172 2021-12-29    230.961553
57446     10172 2021-12-30    306.489015
57447     10172 2021-12-31    272.725849

[57448 rows x 3 columns]


In [19]:
Forcast = forecasted_data.pivot(index='Date', columns='Store ID', values='Transections')
Forcast = Forcast.transpose()

In [20]:
Forcast.to_csv('TransectionsForcast.csv', index=True)