# Initial steps

### Libraries

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate, Dropout
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split, KFold
from google.cloud import storage
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import normalize
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
import os
import matplotlib.pyplot as plt

### Importing DataFrame

In [None]:
### Importing the file when it sits in the same relative location as in the GitHub repository
# Reads the CSV from a path relative to the current working directory and binds it to `df`.
df = pd.read_csv('data/psv_processed.csv')

In [None]:
### Importing the file through Google Drive (requires the google.colab package)
from google.colab import drive
# Mount Drive at /content/gdrive/ so the CSV below is reachable as a regular file path.
drive.mount('/content/gdrive/', force_remount=True)

# File without shipping costs, purchases that were returned, and returns
file_path = '/content/gdrive/MyDrive/DeepLearning/Data/psv_processed.csv'   # adjust to the CSV's location in your Drive
df = pd.read_csv(file_path)


In [None]:
### Importing the file through a Google Cloud Storage bucket
# NOTE(review): this cell loads 'three_negative.csv', while the two cells above load
# 'psv_processed.csv' — presumably a different export of the data; confirm which is intended.
client = storage.Client()
bucket = client.bucket('deeplearningbucket123')

# Download the blob into the working directory, then read that local copy into `df`.
blob = bucket.blob('three_negative.csv')
blob.download_to_filename('three_negative.csv')
df = pd.read_csv('three_negative.csv')


# Preparing the model

### Word2Vec & PCA

In [None]:
# The word2vec vectors could not be imported correctly from the CSV, so they are rebuilt here.
# NOTE(review): `preprocess_string` and `Word2Vec` are not imported in the Libraries cell —
# presumably they come from gensim; confirm and add the imports.
# Tokenise every product name
product_names = df['merchandise_product_name'].apply(preprocess_string)

# Fit the Word2Vec model on the tokenised product names
w2vmodel = Word2Vec(
    sentences=product_names,
    vector_size=100,
    window=5,
    min_count=3,
    workers=2,
    sg=1,
    epochs=20,
)

# Turn one product name into a single fixed-length vector
def product_name_to_vector(product_name):
    """Return the mean Word2Vec vector of the tokens in `product_name`.

    The name is tokenised with `preprocess_string`; tokens that are not in
    the vocabulary of the module-level `w2vmodel` are ignored.

    Args:
        product_name: Raw product name to embed.

    Returns:
        The element-wise mean of the known tokens' vectors, or a zero vector
        of length `w2vmodel.vector_size` when no token is in the vocabulary.
    """
    tokens = preprocess_string(product_name)
    vocabulary = w2vmodel.wv
    known_vectors = [vocabulary[token] for token in tokens if token in vocabulary]
    if known_vectors:
        return np.mean(np.array(known_vectors), axis=0)
    # Nothing to average: fall back to an all-zero vector
    return np.zeros(w2vmodel.vector_size)

# Embed every product name, then pass each result through np.array so each cell holds an ndarray
df['product_vector'] = (
    df['merchandise_product_name']
    .apply(product_name_to_vector)
    .apply(np.array)
)

In [None]:
# Collect the per-row product vectors as a plain list for the PCA fit
product_vectors = df['product_vector'].to_list()

# PCA restricted to a single component
pca = PCA(n_components=1)

# Fit on all product vectors and store the projection as one numeric column
df['product_pca'] = pca.fit_transform(product_vectors)

### Encoding and Normalizing

In [None]:
# Fitted LabelEncoder for every encoded column, keyed by column name
label_encoders = {}

# Columns to integer-encode: the two ID columns first, then the categorical attributes
encoding_features = [
    'fan_id',
    'merchandise_product_name',
] + [
    'gender',
    'is_fanclub_member',
    'is_clubcard_member',
    'is_supver_member',
    'is_scc_holder',
    'merchandise_product_description1',
    'is_kid_size',
]

# Replace each column by its integer codes and keep the encoder that produced them
for feature in encoding_features:
    le = LabelEncoder()
    encoded_column = le.fit_transform(df[feature])
    df[feature] = encoded_column
    label_encoders[feature] = le

# Numeric columns are standardised together with one scaler fitted on the whole frame
numerical_features = [
    'age',
    'distance_from_club',
    'total_spend_merchandise',
    'total_spend_ticket',
    'total_spend_other',
    'total_spend_all',
    'merchandise_product_price',
    'product_pca',
]
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

### The model itself, including 5-fold cross-validation

In [None]:
# Early-stopping callback that watches the validation loss
# NOTE(review): the training loop runs 25 epochs, the same number as the patience here —
# check whether early stopping can ever trigger with these settings.
early_stopping = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)

# Sizes of the embedding tables: one row per distinct product name and per distinct fan
num_products = df['merchandise_product_name'].nunique()
num_users = df['fan_id'].nunique()

# Build and compile the network for one hyperparameter configuration
def build_model(embedding_size, l2_regularization, dropout_rate, layers, learning_rate):
    """Create and compile the embedding-plus-dense classifier.

    Reads the module-level `num_users`, `num_products`, `user_features` and
    `product_features`, so those must be defined before this is called.

    Args:
        embedding_size: Output dimension of both embedding layers.
        l2_regularization: L2 factor applied to the kernel of every hidden Dense layer.
        dropout_rate: Rate of the Dropout layer placed after every hidden Dense layer.
        layers: Iterable of hidden layer widths, applied in order.
        learning_rate: Learning rate given to the Adam optimizer.

    Returns:
        A compiled Keras `Model` with four inputs (user ID, product ID, user
        side features, product side features) and one sigmoid output.
    """
    # Side-feature widths: every listed feature except the leading ID column
    user_extra_width = len(user_features) - 1
    product_extra_width = len(product_features) - 1

    user_input = Input(shape=(1,), name='user_input')
    product_input = Input(shape=(1,), name='product_input')
    user_additional_features = Input(shape=(user_extra_width,), name='user_additional_features')
    product_additional_features = Input(shape=(product_extra_width,), name='product_additional_features')
    model_inputs = [user_input, product_input, user_additional_features, product_additional_features]

    # Embed both IDs and flatten each embedding to a vector
    user_embedding = Embedding(num_users, embedding_size, name='user_embedding')(user_input)
    product_embedding = Embedding(num_products, embedding_size, name='product_embedding')(product_input)
    user_vec = Flatten(name='flatten_user')(user_embedding)
    product_vec = Flatten(name='flatten_product')(product_embedding)

    # Join the embeddings with the side features and run them through the hidden stack
    hidden = Concatenate()([user_vec, product_vec, user_additional_features, product_additional_features])
    for units in layers:
        hidden = Dense(units, activation='relu', kernel_regularizer=l2(l2_regularization))(hidden)
        hidden = Dropout(dropout_rate)(hidden)

    output = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=model_inputs, outputs=output)
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Hyperparameter configurations to try; each entry is one complete set for build_model
hyperparameters = [
    {
        'layers': [128, 64, 32],
        'embedding_size': 50,
        'l2_regularization': 0.04,
        'dropout_rate': 0.2,
        'learning_rate': 0.0001,
    },
]

# Data preparation
# In both lists the first entry is the ID column that feeds an embedding;
# the remaining entries are passed to the model as additional features.
user_features = [
    'fan_id',
    'age',
    'gender',
    'distance_from_club',
    'is_fanclub_member',
    'is_clubcard_member',
    'is_supver_member',
    'is_scc_holder',
    'total_spend_merchandise',
    'total_spend_ticket',
    'total_spend_other',
    'total_spend_all',
]
product_features = [
    'merchandise_product_name',
    'merchandise_product_description1',
    'merchandise_product_price',
    'is_kid_size',
    'product_pca',
]

# Cross-validation loop: 5 shuffled folds with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)


def _build_inputs(frame):
    """Return the four model inputs for `frame`, in the order build_model declares them."""
    return [
        frame['fan_id'].values.astype('int32'),  # Ensure integer type for ID inputs
        frame['merchandise_product_name'].values.astype('int32'),
        frame[user_features[1:]].values,  # excluding 'fan_id' which is already used in embedding
        frame[product_features[1:]].values,  # excluding 'merchandise_product_name'
    ]


# Loop through each set of hyperparameters
for params in hyperparameters:
    layers = params['layers']
    embedding_size = params['embedding_size']
    l2_regularization = params['l2_regularization']
    dropout_rate = params['dropout_rate']
    learning_rate = params['learning_rate']

    # Per-epoch metric dicts (`History.history`) of every fold for this configuration
    all_fold_histories = []
    for fold, (train_index, test_index) in enumerate(kf.split(df), start=1):
        train, test = df.iloc[train_index], df.iloc[test_index]

        train_inputs = _build_inputs(train)
        test_inputs = _build_inputs(test)

        train_labels = train['interaction'].values.astype('float32')
        test_labels = test['interaction'].values.astype('float32')

        # Build a new model for each fold using the current hyperparameters
        model = build_model(embedding_size, l2_regularization, dropout_rate, layers, learning_rate)

        # Train the model; the held-out fold doubles as the validation set
        history = model.fit(
            train_inputs,
            train_labels,
            batch_size=256,
            epochs=25,
            validation_data=(test_inputs, test_labels),
            callbacks=[early_stopping]
        )
        # Keep the fold's training curves; previously the list was created but never filled
        all_fold_histories.append(history.history)

        print(f'Finished fold {fold} with layers {layers}, embedding size {embedding_size}, L2 regularization {l2_regularization}, dropout rate {dropout_rate}, and learning rate {learning_rate}.')

# Only the model of the last fold of the last hyperparameter set is still bound to `model` here,
# so that is the one written to disk.
# NOTE(review): newer Keras versions may require a '.keras' or '.h5' suffix — confirm for the installed version.
model.save("psv_model")

# Model Validation


In [None]:
# Function to plot confusion matrix
def plot_confusion_matrix(model, test_inputs, test_labels, threshold=0.5):
    """Plot the confusion matrix of the model's thresholded predictions.

    Args:
        model: Object whose `predict(test_inputs)` returns one score per sample.
        test_inputs: Inputs forwarded unchanged to `model.predict`.
        test_labels: True binary labels, one per sample.
        threshold: Scores strictly greater than this count as class 1
            (default 0.5, the value that was previously hard-coded).

    Returns:
        None. The figure is displayed with `plt.show()`.
    """
    # Imported locally because the Libraries cell imports neither sklearn.metrics
    # nor seaborn; the original body raised NameError on `confusion_matrix` and `sns`.
    from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

    # Flatten the score column to 1-D so labels and predictions have the same shape
    scores = np.asarray(model.predict(test_inputs)).ravel()
    predictions = (scores > threshold).astype(int)
    cm = confusion_matrix(test_labels, predictions)

    _, ax = plt.subplots(figsize=(8, 6))
    # Integer-annotated heatmap without colour bar, drawn with scikit-learn's own display
    ConfusionMatrixDisplay(confusion_matrix=cm).plot(
        ax=ax, cmap="Blues", colorbar=False, values_format="d"
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix')
    plt.show()

# Function to plot precision-recall curve
def plot_precision_recall_curve(model, test_inputs, test_labels):
    """Plot precision against recall for the model's predicted scores.

    Args:
        model: Object whose `predict(test_inputs)` returns an array of scores.
        test_inputs: Inputs forwarded unchanged to `model.predict`.
        test_labels: True binary labels, one per sample.

    Returns:
        None. The figure is displayed with `plt.show()`.
    """
    # Imported locally because the Libraries cell does not import sklearn.metrics;
    # the original body raised NameError on `precision_recall_curve`.
    from sklearn.metrics import precision_recall_curve

    predictions = model.predict(test_inputs).ravel()
    precision, recall, _ = precision_recall_curve(test_labels, predictions)

    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.show()

def evaluate_model_performance(model, test_inputs, test_labels, threshold=0.5):
    """Compute, print and return classification metrics for the model.

    This single definition replaces two consecutive definitions of the same
    name; the first was a truncated copy that computed the metrics but neither
    printed nor returned them, and was shadowed by the second.

    Args:
        model: Object whose `predict(test_inputs)` returns an array of scores.
        test_inputs: Inputs forwarded unchanged to `model.predict`.
        test_labels: True binary labels, one per sample.
        threshold: Scores strictly greater than this count as class 1 for
            precision, recall and F1 (default 0.5, the previously hard-coded value).

    Returns:
        Dict with the keys 'precision', 'recall', 'f1', 'roc_auc' and 'pr_auc'.
    """
    # Imported locally because the Libraries cell does not import sklearn.metrics;
    # the original body raised NameError on the first metric call.
    from sklearn.metrics import (
        average_precision_score,
        f1_score,
        precision_score,
        recall_score,
        roc_auc_score,
    )

    # Generate predictions
    predictions = model.predict(test_inputs).ravel()
    predictions_binary = (predictions > threshold).astype(int)

    # Threshold-dependent metrics use the binarised predictions,
    # the two AUC metrics use the raw scores
    precision = precision_score(test_labels, predictions_binary)
    recall = recall_score(test_labels, predictions_binary)
    f1 = f1_score(test_labels, predictions_binary)
    roc_auc = roc_auc_score(test_labels, predictions)
    pr_auc = average_precision_score(test_labels, predictions)

    # Display the metrics
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1 Score: {f1:.2f}')
    print(f'ROC AUC: {roc_auc:.2f}')
    print(f'Precision-Recall AUC: {pr_auc:.2f}')

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc
    }


In [None]:
# Function to plot ROC curve (it is called below but was not defined anywhere in the notebook)
def plot_roc_curve(model, test_inputs, test_labels):
    """Plot the ROC curve of the model's predicted scores.

    Args:
        model: Object whose `predict(test_inputs)` returns an array of scores.
        test_inputs: Inputs forwarded unchanged to `model.predict`.
        test_labels: True binary labels, one per sample.

    Returns:
        None. The figure is displayed with `plt.show()`.
    """
    # Imported locally because the Libraries cell does not import sklearn.metrics
    from sklearn.metrics import roc_curve

    predictions = model.predict(test_inputs).ravel()
    false_positive_rate, true_positive_rate, _ = roc_curve(test_labels, predictions)

    plt.figure(figsize=(8, 6))
    plt.plot(false_positive_rate, true_positive_rate)
    plt.plot([0, 1], [0, 1], linestyle='--')  # diagonal reference line
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()


# Print the metrics and plot the graphs for the model and test fold left over from training
print(evaluate_model_performance(model, test_inputs, test_labels))
plot_confusion_matrix(model, test_inputs, test_labels)
plot_roc_curve(model, test_inputs, test_labels)
plot_precision_recall_curve(model, test_inputs, test_labels)
