In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load the dataset
file_path = '/content/Markov_data.csv'
data = pd.read_csv(file_path)

# Fill NaN values with a placeholder to indicate no purchase
data.fillna("None", inplace=True)

# Reshape data to track transitions by customer
# Here, we group the purchases of each customer across months into sequences
customer_data = data.groupby('Customer').apply(lambda x: x.drop('Customer', axis=1).values.flatten())
sequences = customer_data.tolist()

# Flatten sequences and encode the product categories
flattened_sequences = [item for sublist in sequences for item in sublist]
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(flattened_sequences)

# Reshape the encoded data back into the original sequence format
sequences_encoded = [integer_encoded[i:i + len(seq)] for i, seq in enumerate(sequences)]

# Define the maximum sequence length
max_sequence_len = max([len(seq) for seq in sequences_encoded])

# Pad sequences so that they all have the same length
padded_sequences = pad_sequences(sequences_encoded, maxlen=max_sequence_len, padding='post')

# Split the sequences into input (X) and target (y)
X, y = [], []
for seq in padded_sequences:
    for i in range(1, len(seq)):
        X.append(seq[:i])  # Input: sequence up to this point
        y.append(seq[i])   # Target: next item in the sequence

# Pad sequences to ensure all inputs are the same length
X = pad_sequences(X, maxlen=max_sequence_len, padding='post')

# One-hot encode the target values
y = to_categorical(y, num_classes=len(label_encoder.classes_))

# Build the LSTM model
model = Sequential()
model.add(Embedding(len(label_encoder.classes_), 10, input_length=max_sequence_len))
model.add(LSTM(50, return_sequences=False))
model.add(Dense(50, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=30, batch_size=32)

# Prediction example
def predict_next_purchase(sequence):
    encoded_sequence = label_encoder.transform(sequence)
    padded_input = pad_sequences([encoded_sequence], maxlen=max_sequence_len, padding='post')
    prediction = model.predict(padded_input)
    predicted_index = np.argmax(prediction)
    return label_encoder.inverse_transform([predicted_index])[0]

# Example prediction for a sequence starting with "Shoes"
example_sequence = ['Shoes']
next_purchase = predict_next_purchase(example_sequence)
print(f'Next predicted purchase: {next_purchase}')


Epoch 1/30


  data.fillna("None", inplace=True)


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.5268 - loss: 1.4585
Epoch 2/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.6328 - loss: 0.9585
Epoch 3/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.4046 - loss: 0.9426
Epoch 4/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.5641 - loss: 0.9722
Epoch 5/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.6209 - loss: 0.8799
Epoch 6/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.5867 - loss: 0.9474
Epoch 7/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5876 - loss: 0.9485
Epoch 8/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.6079 - loss: 0.9261
Epoch 9/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m