In [3]:
# : Importing dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt


In [5]:
#  Loading and clean events.csv
events = pd.read_csv("events.csv")
events["timestamp"] = pd.to_datetime(events["timestamp"], unit="ms")
events["date"] = events["timestamp"].dt.date
events["event_weight"] = events["event"].map({"view": 1, "addtocart": 2, "transaction": 3})
events.dropna(subset=["event_weight"], inplace=True)

#  Aggregate daily activity
daily_activity = events.groupby(["visitorid", "date"])["event_weight"]\
    .sum().reset_index().rename(columns={"event_weight": "activity_count"})
daily_activity["date"] = pd.to_datetime(daily_activity["date"])

In [11]:
#  Pad dates per visitor
all_dates = pd.date_range(daily_activity["date"].min(), daily_activity["date"].max())
active_users = daily_activity.groupby("visitorid")["activity_count"].sum().sort_values(ascending=False).head(200).index.tolist()

X_all = []
y_all = []

for visitor in active_users:
    user_data = daily_activity[daily_activity["visitorid"] == visitor]
    padded = pd.DataFrame({
        "date": all_dates,
        "activity_count": 0
    })
    padded = padded.merge(user_data, on="date", how="left", suffixes=("", "_real"))
    padded["activity_count"] = padded["activity_count_real"].fillna(padded["activity_count"])
    padded.drop(columns=["activity_count_real"], inplace=True)
    padded["visitorid"] = visitor
    padded["dayofweek"] = padded["date"].dt.dayofweek
    padded["rolling_mean"] = padded["activity_count"].rolling(3, min_periods=1).mean()
    padded["rolling_std"] = padded["activity_count"].rolling(3, min_periods=1).std().fillna(0)
    padded["prev_day"] = padded["activity_count"].shift(1).fillna(0)


In [17]:
# store raw and normalized activity
padded["raw_count"] = padded["activity_count"]
padded["activity_count"] = (padded["activity_count"] - padded["activity_count"].mean()) / (padded["activity_count"].std() + 1e-5)

# Select features
features = ["activity_count", "rolling_mean", "rolling_std", "prev_day"]
values = padded[features].values

# Define sequence builder
def create_sequences(data, target, input_window=60, forecast_window=5):
    X, y = [], []
    for i in range(len(data) - input_window - forecast_window + 1):
        X.append(data[i:i+input_window])
        y.append(target[i+input_window:i+input_window+forecast_window])
    return np.array(X), np.array(y)

# Prepare inputs
raw_target = padded["raw_count"].values
X_cust, y_cust = create_sequences(values, raw_target)
if len(X_cust) > 0:
    X_all.append(X_cust)
    y_all.append(y_cust)


In [19]:
# STEP 5: Concatenate and split
X = np.concatenate(X_all)
y = np.concatenate(y_all)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [21]:
#  Model
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.2),
    LSTM(64),
    Dense(64, activation='relu'),
    Dense(y.shape[1])
])
model.compile(optimizer='adam', loss='mae')

callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)
]

history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                    epochs=100, batch_size=64, callbacks=callbacks, verbose=1)


Epoch 1/100


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 1.9073 - val_loss: 1.2428 - learning_rate: 0.0010
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step - loss: 1.8658 - val_loss: 1.2319 - learning_rate: 0.0010
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 314ms/step - loss: 1.8537 - val_loss: 1.2310 - learning_rate: 0.0010
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step - loss: 1.8471 - val_loss: 1.2292 - learning_rate: 0.0010
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step - loss: 1.8460 - val_loss: 1.2253 - learning_rate: 0.0010
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step - loss: 1.8439 - val_loss: 1.2266 - learning_rate: 0.0010
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - loss: 1.8435 - val_loss: 1.2244 - learning_rate: 0.0010
Epoch 8/100
[

In [31]:
# spliting into 100 customers chunks
grouped = full_activity.groupby("customer_id")["activity_count"]
all_customers = list(grouped.groups.keys())

# Split into chunks of 100
batch_size = 100
chunks = [all_customers[i:i+batch_size] for i in range(0, len(all_customers), batch_size)]

print(f" Total customers: {len(all_customers)} split into {len(chunks)} batches of {batch_size}")

#groups the data by customer_id and then splits the list of customers into chunks of 100 customers.
#It uses these chunks to process batches of customer data, which can be useful for training machine learning models in smaller, more manageable chunks.

 Total customers: 1407580 split into 14076 batches of 100


In [15]:
import pandas as pd
import glob # helps in finding file when we have multiple files with a similar name

# Grab all batch forecast files
csv_files = sorted(glob.glob("forecast_batch_*.csv"))

print(f" Found {len(csv_files)} batch files. Merging now...")

# Read and merge
merged_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

# Save final merged file
merged_df.to_csv("forecast_per_customer.csv", index=False)

print(" All forecast batches merged into forecast_per_customer.csv")
print(f" Total rows: {len(merged_df)}")
print(" Preview:")
print(merged_df.head())
#collects all batch files, reads and combines their data
# and then saves the merged data into a new file.

 Found 2910 batch files. Merging now...
 All forecast batches merged into forecast_per_customer.csv
 Total rows: 2037000
 Preview:
   customer_id     day  predicted_activity
0            0  Day +1                0.02
1            0  Day +2                0.01
2            0  Day +3                0.00
3            0  Day +4                0.01
4            0  Day +5                0.01


In [3]:
import pandas as pd

# Load just a portion of the file to check structure
df = pd.read_csv("forecast_per_customer.csv", nrows=1000)

print(" Preview of Columns:", df.columns.tolist())
print("Sample Rows:")
print(df.head())

print("\n Null Values:")
print(df.isnull().sum())

print("\n Duplicate Rows:", df.duplicated().sum())
#loads the first 1000 rows

 Preview of Columns: ['customer_id', 'day', 'predicted_activity']
Sample Rows:
   customer_id     day  predicted_activity
0            0  Day +1                0.02
1            0  Day +2                0.01
2            0  Day +3                0.00
3            0  Day +4                0.01
4            0  Day +5                0.01

 Null Values:
customer_id           0
day                   0
predicted_activity    0
dtype: int64

 Duplicate Rows: 0


In [None]:
# Phase 2: Personalized Product Recommendations

In [9]:
 #  Load events and assign weights
import pandas as pd

events = pd.read_csv("events.csv", usecols=["timestamp", "visitorid", "event", "itemid"])
events.columns = ["timestamp", "customer_id", "action", "item_id"]

# Assign weights based on type of action
weights = {
    "view": 1,
    "addtocart": 2,
    "transaction": 3
}
events["weight"] = events["action"].map(weights)

print("Sample of weighted interaction data:")
print(events.head())
#assing a weight to each type of customer action view, add to cart, transaction.

Sample of weighted interaction data:
       timestamp  customer_id action  item_id  weight
0  1433221332117       257597   view   355908       1
1  1433224214164       992329   view   248676       1
2  1433221999827       111016   view   318965       1
3  1433221955914       483717   view   253185       1
4  1433221337106       951259   view   367447       1


In [11]:
# Aggregate scores per customer-item
user_item_scores = events.groupby(["customer_id", "item_id"])["weight"].sum().reset_index()

# Optional: sort by score for each customer
user_item_scores = user_item_scores.sort_values(by=["customer_id", "weight"], ascending=[True, False])

print("Sample of user-item scores:")
print(user_item_scores.head())
# the score of each customer item pair by summing the weight values

Sample of user-item scores:
   customer_id  item_id  weight
0            0    67045       1
1            0   285930       1
2            0   357564       1
3            1    72028       1
6            2   325215       3


In [13]:
##  Load item_properties and extract category info
item_props = pd.read_csv("item_properties_part1.csv", usecols=["timestamp", "itemid", "property", "value"])
item_props = item_props[item_props["property"] == "categoryid"]
item_props = item_props.drop_duplicates(subset=["itemid"])
item_props = item_props.rename(columns={"itemid": "item_id", "value": "category_id"})

# Merge with user-item scores
recs = pd.merge(user_item_scores, item_props[["item_id", "category_id"]], on="item_id", how="left")

print(" Sample of enriched recommendations:")
print(recs.head())


 Sample of enriched recommendations:
   customer_id  item_id  weight category_id
0            0    67045       1         333
1            0   285930       1         NaN
2            0   357564       1         256
3            1    72028       1         NaN
4            2   325215       3         299


In [15]:
# Renaming weight to score for clarity
recs = recs.rename(columns={"weight": "score"})

# Save the final recommendations
recs.to_csv("recs_multiuser.csv", index=False)
print(" Saved recs_multiuser.csv with", len(recs), "rows.")
# here we are renaming weight column to score for my clarity by the score it can make 
#better prediction

 Saved recs_multiuser.csv with 2145179 rows.


In [19]:
###phase3: creating marketing strategy logic to add in dashboard
import pandas as pd

# Load the forecast data
forecast_df = pd.read_csv("forecast_per_customer.csv")


In [21]:
# Example selected user (you can replace this with user input later)

selected_user = int(input("Enter customer ID: "))

# Filter forecast data for that customer
customer_forecast = forecast_df[forecast_df["customer_id"] == selected_user]

# Generate a smart marketing suggestion
if not customer_forecast.empty:
    avg_activity = customer_forecast["predicted_activity"].mean()

    if avg_activity >= 2:
        print("High predicted engagement. Offer a loyalty reward or bundle deal.")
    elif 1 <= avg_activity < 2:
        print("Moderate activity. Consider sending a product reminder or wishlist notification.")
    else:
        print("Low activity expected. Try offering a discount to re-engage this customer.")
else:
    print("No forecast found for this customer.")



 Low activity expected. Try offering a discount to re-engage this customer.


In [1]:
import pandas as pd

# Loading full forecast and recommendations files
forecast_full = pd.read_csv("forecast_per_customer.csv")
recs_full = pd.read_csv("recs_multiuser.csv")

# Step 1: Randomly sample 1000 unique customer IDs
sampled_customers = forecast_full['customer_id'].drop_duplicates().sample(1000, random_state=42)

# Step 2: Filter forecast data
forecast_demo = forecast_full[forecast_full['customer_id'].isin(sampled_customers)]

# Step 3: Filter recommendation data to same customers
recs_demo = recs_full[recs_full['customer_id'].isin(sampled_customers)]

# Step 4: Save demo versions for GitHub upload
forecast_demo.to_csv("forecast_per_customer_demo.csv", index=False)
recs_demo.to_csv("recs_multiuser_demo.csv", index=False)

print(" Saved forecast_per_customer_demo.csv and recs_multiuser_demo.csv")


 Saved forecast_per_customer_demo.csv and recs_multiuser_demo.csv


In [3]:
import pandas as pd

# Load demo forecast file
forecast_demo = pd.read_csv("forecast_per_customer_demo.csv")

# Show 10 unique customer IDs to test
sample_ids = forecast_demo['customer_id'].drop_duplicates().head(10).tolist()
print(" Sample Customer IDs:", sample_ids)


 Sample Customer IDs: [1009078, 1090330, 1090334, 1090464, 1090709, 1092214, 1092539, 1009305, 1093123, 1093222]
