# Data Preparation

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

In [3]:
# load the dataset
amazon_data = pd.read_csv('C:\\Repos\\MastersProject\\data-analysis\\Data\\amazon.csv')

In [34]:
data_agg = amazon_data.groupby(['user_id', 'product_id']).agg({'rating': 'sum'}).reset_index()

In [35]:
interaction_matrix = data_agg.pivot(index='user_id', columns='product_id', values='rating').fillna(0)

In [37]:
print(interaction_matrix)

product_id                                          B002PD61Y4  ...  B0BR4F878Q
user_id                                                         ...            
AE22Y3KIS7SE6LI3HE2VS6WWPU4Q,AHWEYO2IJ5I5GDWZAH...         0.0  ...         0.0
AE23RS3W7GZO7LHYKJU6KSKVM4MQ,AEQUNEY6GQOTEGUMS6...         0.0  ...         0.0
AE242TR3GQ6TYC6W4SJ5UYYKBTYQ                               0.0  ...         0.0
AE27UOZENYSWCQVQRRUQIV2ZM7VA,AGMYSLV6NNOAYES25J...         0.0  ...         0.0
AE2JTMRKTUOIVIZWS2WDGTMNTU4Q,AF4QXCB32VC2DVE7O3...         0.0  ...         0.0
...                                                        ...  ...         ...
AHZFKWGDBRQKNMNQ4ZPL52OZBRKA,AGBEFVJFOQIRF7C7KY...         0.0  ...         0.0
AHZJHJWFZLYD64GVP4PXVI2F4LXA,AEUCRZPOISXKHXMCZU...         0.0  ...         0.0
AHZNSNBVKQR4OGJAQHE4DCDA4YHA,AFBW6COTZXGHQMWVDU...         0.0  ...         0.0
AHZWJCVEIEI76H2VGMUSN5D735IQ,AH2DFUHFTG4CKQFVGZ...         0.0  ...         0.0
AHZWXUWE3RGLDH4JJUK3HT3VMBJA,AFWUWJMEO4I

In [38]:
#split data into training and testing sets - splitting the data
train_data, test_data = train_test_split(interaction_matrix, test_size= 0.2, random_state= 42)

In [39]:
#Normalize data - Data Normalization
max_rating = interaction_matrix.max().max()
train_data /= max_rating
test_data /= max_rating

In [40]:
#define input dimensions
input_dim = train_data.shape[1]

In [41]:
#Define autoencoder architecture
input_layer = Input(shape=(input_dim,))
encoder_layer1 = Dense(64, activation='relu')(input_layer) 
encoder_layer2 = Dense(32, activation='relu')(encoder_layer1) 
encoder_layer3 = Dense(16, activation='relu')(encoder_layer2)
decoder_layer1 = Dense(32, activation='relu')(encoder_layer3)
decoder_layer2 = Dense(64, activation='relu')(decoder_layer1)
decoder_layer3 = Dense(input_dim, activation='sigmoid')(decoder_layer2)
 
# Create the autoencoder model - model crreation
autoencoder = Model(input_layer, decoder_layer3)

# Compile the model - model compilation
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Print model summary
autoencoder.summary()

# Train the autoencoder - model training
autoencoder.fit(train_data, train_data,
                epochs=50,
                batch_size=64,
                shuffle=True,
                validation_data=(test_data, test_data))

Epoch 1/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - loss: 0.2488 - val_loss: 0.2446
Epoch 2/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.2413 - val_loss: 0.2223
Epoch 3/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.2044 - val_loss: 0.1184
Epoch 4/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0767 - val_loss: 0.0041
Epoch 5/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0016 - val_loss: 1.6347e-04
Epoch 6/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.3651e-04 - val_loss: 1.0767e-04
Epoch 7/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 1.0790e-04 - val_loss: 1.0205e-04
Epoch 8/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1.0402e-04 - val_loss: 1.0080e-04
Epoch 9/50
[1m15/15[0m [

<keras.src.callbacks.history.History at 0x22c99adc2d0>

In [42]:
# Extract embeddings
encoder_model = Model(input_layer, encoder_layer3)
user_embeddings = encoder_model.predict(train_data)

# Calculate cosine similarity between user embeddings - calculate user similarities
from sklearn.metrics.pairwise import cosine_similarity

user_similarity = cosine_similarity(user_embeddings)



[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [43]:
# Function to generate recommendations for a given user
def generate_recommendations(user_id, user_similarity, top_n=5):
    similar_users = np.argsort(-user_similarity[user_id])[1:]  # Exclude the user itself
    recommendations = []
    for similar_user in similar_users:
        unrated_items = np.where(train_data.iloc[similar_user] == 0)[0]  # Find unrated items by similar user
        rated_items = np.where(train_data.iloc[user_id] != 0)[0]  # Find rated items by target user
        recommended_items = np.intersect1d(unrated_items, rated_items)[:top_n]  # Find common unrated items
        recommendations.extend(recommended_items)
        if len(recommendations) >= top_n:
            break
    return recommendations

In [44]:
# Convert user ID to the corresponding index in the interaction matrix
user_id = 'AG3D6O4STAQKAY2UVGEUV46KN35Q'
if user_id in train_data.index:
    user_index = train_data.index.get_loc(user_id)
    recommendations = generate_recommendations(user_index, user_similarity, train_data)
    if recommendations:
        print("Recommended items for user", user_id, ":", recommendations)
    else:
        print("No recommendations found for user", user_id)
else:
    print("User ID", user_id, "not found in the dataset. Unable to generate recommendations.")

User ID AG3D6O4STAQKAY2UVGEUV46KN35Q not found in the dataset. Unable to generate recommendations.


In [25]:
# Step 1: Data Preparation
amazon_data = pd.read_csv('C:\\Repos\\MastersProject\\data-analysis\\Data\\amazon.csv')  # Load the Amazon dataset
interaction_matrix = pd.pivot_table(amazon_data, values='rating', index='user_id', columns='product_id', fill_value=0)  # Create interaction matrix
train_data, test_data = train_test_split(interaction_matrix, test_size=0.2, random_state=42)  # Split data into train and test sets
max_rating = interaction_matrix.max().max()  # Normalize data
train_data /= max_rating
test_data /= max_rating

# Step 2: Building the Deep Autoencoder Model
input_dim = train_data.shape[1]  # Define input dimension
input_layer = Input(shape=(input_dim,))
encoder_layer1 = Dense(64, activation='relu')(input_layer)
encoder_layer2 = Dense(32, activation='relu')(encoder_layer1)
encoder_layer3 = Dense(16, activation='relu')(encoder_layer2)
decoder_layer1 = Dense(32, activation='relu')(encoder_layer3)
decoder_layer2 = Dense(64, activation='relu')(decoder_layer1)
decoder_layer3 = Dense(input_dim, activation='sigmoid')(decoder_layer2)
autoencoder = Model(input_layer, decoder_layer3)  # Create autoencoder model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')  # Compile model

# Step 3: Training the Model
autoencoder.fit(train_data, train_data, epochs=50, batch_size=64, shuffle=True, validation_data=(test_data, test_data))  # Train the autoencoder

# Step 4: Generating Recommendations
encoder_model = Model(input_layer, encoder_layer3)  # Extract encoder model
user_embeddings = encoder_model.predict(train_data)  # Extract user embeddings
user_similarity = np.dot(user_embeddings, user_embeddings.T)  # Calculate user similarity
user_id = 'AG3D6O4STAQKAY2UVGEUV46KN35Q'  # Example user ID
try:
    user_index = train_data.index.get_loc(user_id)  # Convert user ID to index
    similar_users = np.argsort(-user_similarity[user_index])[1:]  # Get similar users
    recommendations = []
    for similar_user in similar_users:
        unrated_items = np.where(train_data.iloc[similar_user] == 0)[0]  # Find unrated items
        rated_items = np.where(train_data.iloc[user_index] != 0)[0]  # Find rated items by target user
        recommended_items = np.intersect1d(unrated_items, rated_items)  # Find common unrated items
        recommendations.extend(recommended_items)
        if len(recommendations) >= 5:
            break
    print("Recommended items for user", user_id, ":", recommendations)
except KeyError:
    print("User ID not found in the dataset.")

Epoch 1/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - loss: 0.2486 - val_loss: 0.2417
Epoch 2/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.2348 - val_loss: 0.1983
Epoch 3/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.1665 - val_loss: 0.0540
Epoch 4/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0254 - val_loss: 0.0011
Epoch 5/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 7.0871e-04 - val_loss: 5.6671e-04
Epoch 6/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 5.7326e-04 - val_loss: 5.5002e-04
Epoch 7/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 5.9102e-04 - val_loss: 5.4767e-04
Epoch 8/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 5.6730e-04 - val_loss: 5.4712e-04
Epoch 9/50
[1m15/15