In [1]:
import pandas as pd

# Set display options to show maximum columns and rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', None)     # Show all rows

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the datasets
transactions = pd.read_csv('./input_data/transactions.csv')
users = pd.read_csv('./input_data/users.csv')
stores = pd.read_csv('./input_data/stores.csv')

In [3]:
# Convert 'event_occurrence' to datetime if needed
transactions['event_occurrence'] = pd.to_datetime(transactions['event_occurrence'])

In [5]:
# clean and transform the users data
# Identify IDs with both missing gender and age
missing_both_ids = users[users['gender'].isnull() & users['age'].isnull()]['id']

# Remove records where both gender and age are missing
cleaned_users = users[~users['id'].isin(missing_both_ids)]

# Display the cleaned DataFrame
# cleaned_users.count()

# Assign a static value 'Unknown' to missing gender
cleaned_users['gender'] = cleaned_users['gender'].fillna('Unknown')
# Display the cleaned DataFrame
cleaned_users.count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_users['gender'] = cleaned_users['gender'].fillna('Unknown')


id        9463
gender    9463
age       9463
dtype: int64

## 2) Forecast GMV for each user for the month of January 2022

### To forecast the Gross Merchandise Value (GMV) for each user for January 2022, you can follow these steps:

	1.	Prepare the Data: Ensure that the transaction data is cleaned and preprocessed.
	2.	Aggregate Historical Data: Use historical data to understand past transaction patterns.
	3.	Build a Forecasting Model: Use a time series forecasting model to predict future values.
	4.	Generate Predictions: Forecast GMV for each user for January 2022.
	5.	Evaluate and Visualize: Assess the model’s performance and visualize the forecast.

In [6]:

# Step1  Filter data up to December 2021 for training
train_data = transactions[transactions['event_occurrence'] < '2022-01-01']

In [7]:
# Step 2 Aggregate Historical Data

#Aggregate the GMV for each user by month to understand historical patterns.
# Aggregate GMV per user by month
train_data['year_month'] = train_data['event_occurrence'].dt.to_period('M')
monthly_gmv = train_data.groupby(['user_id', 'year_month']).agg({'amount': 'sum'}).reset_index()

In [10]:
monthly_gmv.head()

Unnamed: 0,user_id,year_month,amount
0,00073cc2-c801-c67c-d039-fca63c78c6a9,2020-11,309
1,00073cc2-c801-c67c-d039-fca63c78c6a9,2020-12,18318
2,00073cc2-c801-c67c-d039-fca63c78c6a9,2021-01,12986
3,00073cc2-c801-c67c-d039-fca63c78c6a9,2021-02,13430
4,00073cc2-c801-c67c-d039-fca63c78c6a9,2021-03,14961


In [8]:
## 3. Build a Forecasting Model

#For simplicity, let’s use a time series forecasting model like ARIMA. You might need to install statsmodels if not already installed.

In [9]:
from statsmodels.tsa.arima.model import ARIMA
import numpy as np

# Create a dictionary to store the forecasts
user_forecasts = {}

for user_id in monthly_gmv['user_id'].unique():
    user_data = monthly_gmv[monthly_gmv['user_id'] == user_id].set_index('year_month')
    
    # Ensure the index is in datetime format
    user_data.index = user_data.index.to_timestamp()
    
    # Fit ARIMA model (parameters p, d, q can be adjusted)
    model = ARIMA(user_data['amount'], order=(5, 1, 0))
    model_fit = model.fit()
    
    # Forecast GMV for January 2022
    forecast = model_fit.get_forecast(steps=1)
    forecast_value = forecast.predicted_mean[0]
    
    # Store the forecast
    user_forecasts[user_id] = forecast_value

  forecast_value = forecast.predicted_mean[0]
  warn('Non-stationary starting autoregressive parameters'
  forecast_value = forecast.predicted_mean[0]
  forecast_value = forecast.predicted_mean[0]
  forecast_value = forecast.predicted_mean[0]
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Too few observations to estimate starting parameters%s.'
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


LinAlgError: Schur decomposition solver error.

In [11]:
from pmdarima import auto_arima

for user_id in monthly_gmv['user_id'].unique():
    user_data = monthly_gmv[monthly_gmv['user_id'] == user_id].set_index('year_month')
    user_data.index = user_data.index.to_timestamp()
    
    # Fit ARIMA model with auto_arima
    model = auto_arima(user_data['amount'], seasonal=False, trace=True, error_action='ignore', suppress_warnings=True)
    
    try:
        # Forecast GMV for January 2022
        forecast = model.predict(n_periods=1)
        forecast_value = forecast[0]
    except Exception as e:
        print(f"Error forecasting for user {user_id}: {e}")
        forecast_value = np.nan
    
    # Store the forecast
    user_forecasts[user_id] = forecast_value

Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=inf, Time=0.03 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=313.699, Time=0.00 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=303.002, Time=0.01 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=311.191, Time=0.00 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=303.422, Time=0.01 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=inf, Time=0.02 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=302.958, Time=0.02 sec
 ARIMA(3,0,1)(0,0,0)[0]             : AIC=304.930, Time=0.01 sec
 ARIMA(1,0,2)(0,0,0)[0]             : AIC=inf, Time=0.02 sec
 ARIMA(3,0,0)(0,0,0)[0]             : AIC=303.747, Time=0.01 sec
 ARIMA(3,0,2)(0,0,0)[0]             : AIC=310.635, Time=0.04 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=299.307, Time=0.01 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=297.747, Time=0.01 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=295.773, Time=0.00 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=293.742, Time=0.0

  forecast_value = forecast[0]
  forecast_value = forecast[0]


Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=342.397, Time=0.02 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=363.497, Time=0.00 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=339.451, Time=0.01 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=inf, Time=0.01 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=341.448, Time=0.01 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=341.224, Time=0.01 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=inf, Time=0.02 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=335.346, Time=0.01 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=334.007, Time=0.00 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=334.203, Time=0.01 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=inf, Time=0.02 sec

Best model:  ARIMA(0,0,0)(0,0,0)[0] intercept
Total fit time: 0.108 seconds
Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=inf, Time=0.03 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=573.327, Time=0.00 sec
 ARIMA(1

  forecast_value = forecast[0]


 ARIMA(3,0,2)(0,0,0)[0]             : AIC=inf, Time=0.05 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=535.651, Time=0.02 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=536.130, Time=0.01 sec
 ARIMA(2,0,0)(0,0,0)[0] intercept   : AIC=533.126, Time=0.01 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=531.984, Time=0.00 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=532.884, Time=0.00 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=530.803, Time=0.00 sec
 ARIMA(0,0,2)(0,0,0)[0] intercept   : AIC=535.492, Time=0.01 sec
 ARIMA(1,0,2)(0,0,0)[0] intercept   : AIC=536.586, Time=0.01 sec

Best model:  ARIMA(0,0,1)(0,0,0)[0] intercept
Total fit time: 0.259 seconds
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=26.028, Time=0.00 sec
Total fit time: 0.013 seconds
Error forecasting for user 001cfd71-1996-2ffe-d382-4e7cd476ef88: 0
Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=inf, Time=0.04 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=388.777, Time=0.00 sec
 ARIMA(1,0,0

  forecast_value = forecast[0]
  return get_prediction_index(
  return get_prediction_index(


 ARIMA(1,0,2)(0,0,0)[0]             : AIC=inf, Time=0.02 sec
 ARIMA(3,0,2)(0,0,0)[0]             : AIC=inf, Time=0.04 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=352.816, Time=0.01 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=350.747, Time=0.01 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=348.416, Time=0.02 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=348.180, Time=0.00 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=348.634, Time=0.01 sec

Best model:  ARIMA(0,0,0)(0,0,0)[0] intercept
Total fit time: 0.240 seconds
Performing stepwise search to minimize aic
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=inf, Time=0.02 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=201.291, Time=0.00 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=197.674, Time=0.00 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=201.338, Time=0.00 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=195.370, Time=0.01 sec
 ARIMA(3,0,0)(0,0,0)[0]             : AIC=inf, Time=0.01 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=194.126, 

  forecast_value = forecast[0]


 ARIMA(1,0,2)(0,0,0)[0]             : AIC=inf, Time=0.03 sec
 ARIMA(3,0,2)(0,0,0)[0]             : AIC=inf, Time=0.04 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=189.595, Time=0.01 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=187.628, Time=0.01 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=inf, Time=0.02 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=185.893, Time=0.00 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=189.091, Time=0.00 sec
 ARIMA(2,0,0)(0,0,0)[0] intercept   : AIC=187.453, Time=0.01 sec

Best model:  ARIMA(1,0,0)(0,0,0)[0] intercept
Total fit time: 0.222 seconds
Performing stepwise search to minimize aic
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=137.962, Time=0.05 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=139.939, Time=0.01 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=136.832, Time=0.01 sec


  forecast_value = forecast[0]
  forecast_value = forecast[0]


 ARIMA(0,0,1)(0,0,0)[0]             : AIC=142.036, Time=0.01 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=137.338, Time=0.01 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=50271.128, Time=0.03 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=133.662, Time=0.00 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=131.591, Time=0.00 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=134.676, Time=0.01 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=inf, Time=0.03 sec

Best model:  ARIMA(0,0,0)(0,0,0)[0] intercept
Total fit time: 0.174 seconds
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.05 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=535.645, Time=0.00 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=532.178, Time=0.01 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=inf, Time=0.02 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=533.685, Time=0.00 sec
 ARIMA(2,1,0)(0,0,0)[0] intercept   : AIC=533.833, Time=0.01 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=527

  forecast_value = forecast[0]


IndexError: too many indices for array: array is 0-dimensional, but 1 were indexed

In [None]:
#4. Generate Predictions

#Compile the forecasts into a DataFrame.
