In [138]:
import pandas as pd
import datetime
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error

In [2]:
# Load both transaction exports; the first CSV column is used as the row index.
tr_1, tr_2 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("transactions_1.csv", "transactions_2.csv")
)

# Stack the two exports, order rows by the (still string-typed) date column,
# then discard rows whose column values are exact repeats.
tr_total = (
    pd.concat([tr_1, tr_2], axis=0)
    .sort_values(by=["date"])
    .drop_duplicates()
)

In [3]:
# Parse the ISO-8601 timestamp strings (values that do not match the format
# become NaT because of errors='coerce'), then derive one period column per
# time granularity from the parsed dates.
tr_total = tr_total.assign(
    date=lambda df: pd.to_datetime(
        df['date'], format='%Y-%m-%dT%H:%M:%S.%fZ', errors='coerce'
    ),
    year=lambda df: df['date'].dt.to_period('Y'),                 # YYYY
    year_month=lambda df: df['date'].dt.to_period('M'),           # YYYY-MM
    year_month_day=lambda df: df['date'].dt.to_period('D'),       # YYYY-MM-DD
    year_month_day_hour=lambda df: df['date'].dt.to_period('h'),  # YYYY-MM-DD HH
)

In [37]:
# Number of transactions (non-null product_id rows) per customer and calendar month.
monthly_data = (
    tr_total
    .groupby(['customer_id', 'year_month'])['product_id']
    .count()
    .rename("transactions")
    .reset_index()
)

In [76]:
# Preview the first five customer/month rows.
monthly_data.head(n=5)

Unnamed: 0,customer_id,year_month,transactions
0,1001614,2018-06,28
1,1001614,2018-09,3
2,1001614,2018-10,19
3,1001614,2019-01,4
4,1001614,2019-02,8


In [68]:
# Every calendar month between the first and the last month seen in the data.
first_month = monthly_data['year_month'].min()
last_month = monthly_data['year_month'].max()
all_dates = pd.period_range(start=first_month, end=last_month, freq='M')

# Cartesian product customer x month, so that months without any transaction
# get an explicit row.
customer_ids = monthly_data['customer_id'].unique()
all_combinations = pd.MultiIndex.from_product(
    [customer_ids, all_dates],
    names=['customer_id', 'year_month'],
).to_frame(index=False)

# Left-join the observed counts onto the full grid; grid rows with no match
# come back as NaN and are set to 0 transactions.
complete_data = pd.merge(
    all_combinations,
    monthly_data,
    how='left',
    on=['customer_id', 'year_month'],
)
complete_data = complete_data.fillna({'transactions': 0})

In [77]:
# Preview the first five rows of the zero-filled customer/month grid.
complete_data.head(n=5)

Unnamed: 0,customer_id,year_month,transactions
0,1001614,2017-01,0.0
1,1001614,2017-02,0.0
2,1001614,2017-03,0.0
3,1001614,2017-04,0.0
4,1001614,2017-05,0.0


In [93]:
window_size = 3  # number of consecutive rows (months) summed into one target

# Column layout of the result. Passing it to the DataFrame constructor keeps
# the sort below valid even when no window could be built (empty input),
# where a column-less frame would make sort_values raise a KeyError.
target_columns = ['customer_id', 'start_year', 'start_month',
                  'end_year', 'end_month', 'transactions']
targets = []

# Group once instead of re-scanning the whole frame with a boolean mask for
# every customer. sort=False keeps customers in first-appearance order, i.e.
# the order in which iterating over unique() would visit them.
# Assumes each customer's rows are already in chronological order.
for customer_id, customer_data in complete_data.groupby('customer_id', sort=False):
    months = customer_data['year_month']
    counts = customer_data['transactions']

    # One record per run of `window_size` consecutive rows. A customer with
    # fewer rows than the window yields an empty range and so no records.
    for i in range(len(customer_data) - window_size + 1):
        first_month = months.iloc[i]
        last_month = months.iloc[i + window_size - 1]

        targets.append({
            'customer_id': customer_id,
            'start_year': first_month.year,
            'start_month': first_month.month,
            'end_year': last_month.year,
            'end_month': last_month.month,
            'transactions': counts.iloc[i:i + window_size].sum()
        })

targets_df = pd.DataFrame(targets, columns=target_columns)
targets_df = targets_df.sort_values(by=['customer_id', 'start_year', 'start_month'])

In [110]:
# Preview the first five rolling-window targets.
targets_df.head(n=5)

Unnamed: 0,customer_id,start_year,start_month,end_year,end_month,transactions,train
0,1001614,2017,1,2017,3,0.0,True
1,1001614,2017,2,2017,4,0.0,True
2,1001614,2017,3,2017,5,0.0,True
3,1001614,2017,4,2017,6,0.0,True
4,1001614,2017,5,2017,7,0.0,True


### Train/test split

In [113]:
# Training cut-off: windows ending in or before this month are used for training.
train_year = 2019
train_month = 1

# The test window starts in the month right after the cut-off. Roll over to
# January of the next year when the cut-off is December, instead of asking
# for a non-existent month 13.
if train_month == 12:
    test_start_year, test_start_month = train_year + 1, 1
else:
    test_start_year, test_start_month = train_year, train_month + 1

ends_by_cutoff = (
    (targets_df['end_year'] < train_year) |
    ((targets_df['end_year'] == train_year) & (targets_df['end_month'] <= train_month))
)
starts_after_cutoff = (
    (targets_df['start_year'] == test_start_year) &
    (targets_df['start_month'] == test_start_month)
)

# No cell in this notebook creates a 'train' column on targets_df; it only
# exists when left over from an earlier kernel session. errors="ignore" drops
# it when present and keeps the cell working on a fresh Restart & Run All.
train = targets_df[ends_by_cutoff].drop(columns="train", errors="ignore")
test = targets_df[starts_after_cutoff].drop(columns="train", errors="ignore")

In [117]:
# Sanity check of the split: remaining columns and the shape of each partition.
(train.columns, train.shape, test.shape)

(Index(['customer_id', 'start_year', 'start_month', 'end_year', 'end_month',
        'transactions'],
       dtype='object'),
 (46046, 6),
 (2002, 6))

In [129]:
# Features and target for training
X_train = train[['customer_id', 'start_year', 'start_month', 'end_year', 'end_month']]
y_train = train['transactions']

# Features and target for testing
X_test = test[['customer_id', 'start_year', 'start_month', 'end_year', 'end_month']]
y_test = test['transactions']

### Benchmark

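* Mean benchmark: predict each customer's average three-month transaction total over the training windows
* Last-window benchmark: predict each customer's transaction total over the last three training months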

In [119]:
# Per-customer average of the training-window transaction totals.
mean_transactions = (
    train.groupby('customer_id')['transactions']
    .mean()
    .rename('mean_transactions')
    .reset_index()
)

# Attach that average to every test row; the left join keeps all test rows
# in their original order.
test_with_mean = pd.merge(test, mean_transactions, how='left', on='customer_id')

In [139]:
# Benchmark prediction: the customer's mean training-window total.
y_pred_benchmark = test_with_mean['mean_transactions']

# Score the benchmark against the true test-window totals.
mae_benchmark, rmse_benchmark = (
    metric(y_test, y_pred_benchmark)
    for metric in (mean_absolute_error, root_mean_squared_error)
)

# Report the benchmark results, one metric per line.
report_lines = [
    "Benchmark Results (Mean Prediction):",
    f"Mean Absolute Error (MAE): {mae_benchmark:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse_benchmark:.2f}",
]
print("\n".join(report_lines))

Benchmark Results (Mean Prediction):
Mean Absolute Error (MAE): 32.38
Root Mean Squared Error (RMSE): 98.33


In [144]:
# Second benchmark: predict each customer's test window with the transaction
# total of their last training window, i.e. the window ending at the training
# cut-off. The cut-off comes from train_year / train_month instead of
# repeating the literals, so it cannot drift from the split.
last_window = train[
    (train["end_year"] == train_year) & (train["end_month"] == train_month)
]

# Align predictions with the test rows by customer_id rather than relying on
# both frames happening to share the same row order. A test customer without
# a last training window becomes NaN and is rejected by the metric functions
# instead of being silently scored against another customer.
last_train_values = (
    last_window.set_index("customer_id")["transactions"]
    .reindex(test["customer_id"].to_numpy())
)

# Compute evaluation metrics for the benchmark
mae_benchmark_2 = mean_absolute_error(y_test, last_train_values)
rmse_benchmark_2 = root_mean_squared_error(y_test, last_train_values)

# Print the benchmark results (the header previously repeated the label of
# the mean benchmark)
print("Benchmark Results (Last Three Months Prediction):")
print(f"Mean Absolute Error (MAE): {mae_benchmark_2:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_benchmark_2:.2f}")

Benchmark Results (Mean Prediction):
Mean Absolute Error (MAE): 31.37
Root Mean Squared Error (RMSE): 100.75
