Road Bike Recommendation Algorithm

In [23]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [24]:
from web_scraping import scrape_data

# Fetch the bike listings through the project's scraping helper; every later
# cell reads from the frame bound to `data_df` here.
data_df = scrape_data()

# Preview the first rows (left as the last expression so it renders as a table)
data_df.head(n=5)

Unnamed: 0,Article Content,label,title,price,average_rating,total_ratings,feature_bullets,reviews
0,Tommaso Forcellais by far one of the best deal...,Tommaso Forcella,"Tommaso Forcella Endurance Aluminum Road Bike,...",899.0,4.2 out of 5 stars,152 ratings,About this itemQUALITY CYCLING PRODUCTS SINCE ...,[Great value bike for the equipment you get on...
1,Tommaso Imola is an elegant entry-level road b...,Tommaso Imola,"Tommaso Imola Endurance Aluminum Road Bike, Sh...",799.0,4.2 out of 5 stars,325 ratings,About this itemQUALITY CYCLING PRODUCTS SINCE ...,[So far the bike has been an absolute blast to...
2,Schwinn Men’s Volare 1400 700c is the latest a...,Schwinn Volare 1400,"Schwinn Volare Hybrid Sports Road Bike, Men an...",799.0,4.0 out of 5 stars,416 ratings,About this itemQuick and agile riding with the...,[I needed a bike to commute to work because my...
3,"‍Schwinn hasn’t always had an easy road, but i...",Schwinn Phocus 1600,"Schwinn Phocus Adult Road Bike, Mens and Women...",799.0,3.9 out of 5 stars,239 ratings,About this itemAluminum road frame paired with...,"[For the price, this is an excellent entry-lev..."


Reformat the Different Data Components

In [39]:
import re  # retained so any later cell relying on this import keeps working

# Normalise the scraped columns of `data_df`:
#   average_rating  "4.2 out of 5 stars" -> 4.2   (float)
#   total_ratings   "1,152 ratings"      -> 1152  (int)
#   price           "899." / 899.0       -> 899   (int, whole currency units)
#   description     Article Content + reviews, joined with a space
# Every step starts from `astype(str)`, so the cell can be re-run on values
# that are already clean without changing them.
NUMERIC_COLUMNS = ['price', 'average_rating', 'total_ratings']

# Leading number of the rating text; the decimal part is optional so a bare
# "5 out of 5 stars" is still parsed.
data_df['average_rating'] = pd.to_numeric(
    data_df['average_rating']
    .astype(str)
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False),
    errors='coerce',
)

# Drop thousands separators first, otherwise "1,152 ratings" would parse as 1.
data_df['total_ratings'] = pd.to_numeric(
    data_df['total_ratings']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.extract(r'(\d+)', expand=False),
    errors='coerce',
)

# Keep only digits and the decimal point, then drop a trailing period.
# Parsing through to_numeric accepts both "899." and "899.0".
data_df['price'] = pd.to_numeric(
    data_df['price']
    .astype(str)
    .str.replace(r'[^\d.]', '', regex=True)
    .str.rstrip('.'),
    errors='coerce',
)

# Fail loudly, naming the column, instead of an opaque error further down.
unparsed = data_df[NUMERIC_COLUMNS].isna().any()
if unparsed.any():
    raise ValueError(
        f"Could not parse a number in column(s): {list(unparsed[unparsed].index)}"
    )

# Integer columns (price is truncated to whole units)
data_df['total_ratings'] = data_df['total_ratings'].astype(int)
data_df['price'] = data_df['price'].astype(int)

# Text columns: missing values become empty strings rather than the literal
# token "nan", which would otherwise end up in the tokenizer vocabulary.
data_df['Article Content'] = data_df['Article Content'].fillna('').astype(str)
data_df['reviews'] = data_df['reviews'].fillna('').astype(str)

# Concatenate columns
data_df['description'] = data_df['Article Content'] + " " + data_df['reviews']


Tokenization, Feature Encoding, and Model Architecture

In [40]:
# Show only the first rows rather than dumping the whole frame
print(data_df.head())

# Seed Python, NumPy and TensorFlow before any layer is created so the weight
# initialisation below is the same on every Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Upper bound on the vocabulary; the Tokenizer only emits indices below it.
MAX_VOCAB_SIZE = 100000

# Tokenize and pad text data
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(data_df['description'])
sequences = tokenizer.texts_to_sequences(data_df['description'])
max_text_length = max(len(seq) for seq in sequences)
text_data_padded = pad_sequences(sequences, maxlen=max_text_length)


# Encode labels as integer class indices
label_encoder = LabelEncoder()
label_data = label_encoder.fit_transform(data_df['label'])
num_bikes = len(label_encoder.classes_)


# Extract numerical data as a float matrix. The explicit cast avoids an
# object-dtype array when one of the columns still holds numeric text.
numerical_data = (
    data_df[['price', 'average_rating', 'total_ratings']]
    .astype('float64')
    .to_numpy()
)
# NOTE(review): these features are fed to the network unscaled although their
# magnitudes differ by orders of magnitude; consider normalising them.

# Numerical Input (width follows the feature matrix instead of a literal 3)
numerical_input = Input(shape=(numerical_data.shape[1],), name='numerical_input')
x_num = Dense(64, activation='relu')(numerical_input)
x_num = Dense(32, activation='relu')(x_num)

# Text Input (Description)
# Size the embedding table to the indices the tokenizer can actually produce:
# index 0 is padding, words are numbered from 1, and nothing reaches
# MAX_VOCAB_SIZE. This avoids allocating rows that can never be looked up.
vocab_size = min(MAX_VOCAB_SIZE, len(tokenizer.word_index) + 1)
text_input = Input(shape=(max_text_length,), name='text_input')
x_text = Embedding(input_dim=vocab_size, output_dim=128)(text_input)
x_text = LSTM(128)(x_text)

# Concatenate both branches and classify over the known bike labels
x = concatenate([x_num, x_text])
x = Dense(64, activation='relu')(x)
output = Dense(num_bikes, activation='softmax')(x)


                                     Article Content                label  \
0  Tommaso Forcellais by far one of the best deal...     Tommaso Forcella   
1  Tommaso Imola is an elegant entry-level road b...        Tommaso Imola   
2  Schwinn Men’s Volare 1400 700c is the latest a...  Schwinn Volare 1400   
3  ‍Schwinn hasn’t always had an easy road, but i...  Schwinn Phocus 1600   

                                               title  price average_rating  \
0  Tommaso Forcella Endurance Aluminum Road Bike,...    899            4.2   
1  Tommaso Imola Endurance Aluminum Road Bike, Sh...    799            4.2   
2  Schwinn Volare Hybrid Sports Road Bike, Men an...    799            4.0   
3  Schwinn Phocus Adult Road Bike, Mens and Women...    799            3.9   

   total_ratings                                    feature_bullets  \
0            152  About this itemQUALITY CYCLING PRODUCTS SINCE ...   
1            325  About this itemQUALITY CYCLING PRODUCTS SINCE ...   
2         

Compiling, Training, and Validating the Model

In [44]:
# Create Model
# NOTE(review): the layers behind `output` are created in an earlier cell, so
# re-running only this cell keeps training the already-fitted weights rather
# than starting from a fresh initialisation.
model = Model(inputs=[numerical_input, text_input], outputs=output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Split data into training and validation sets
# NOTE(review): in the frame displayed above every row has its own label, so
# the held-out row's class is never seen during training and validation
# accuracy cannot rise above zero - confirm the dataset has several rows per bike.
numerical_train, numerical_val, text_train, text_val, label_train, label_val = train_test_split(
    numerical_data, text_data_padded, label_data, test_size=0.25, random_state=42)

# Setting necessary data types for the Tensor model fitting:
# numeric features as floats, labels as integer class indices (which is what
# the sparse categorical loss takes; they were previously cast to float).
numerical_train = numerical_train.astype('float64')
numerical_val = numerical_val.astype('float64')
label_train = label_train.astype('int64')
label_val = label_val.astype('int64')

# Train the model
history = model.fit(
    [numerical_train, text_train], label_train,
    validation_data=([numerical_val, text_val], label_val),
    epochs=15, batch_size=32
)

# Evaluate the model on the validation split
loss, accuracy = model.evaluate([numerical_val, text_val], label_val)
print(f'Validation Loss: {loss}')
print(f'Validation Accuracy: {accuracy}')

Epoch 1/15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step - accuracy: 0.6667 - loss: 1.4492 - val_accuracy: 0.0000e+00 - val_loss: 34.4157
Epoch 2/15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 1.0000 - loss: 0.0957 - val_accuracy: 0.0000e+00 - val_loss: 36.8477
Epoch 3/15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.6667 - loss: 1.5647 - val_accuracy: 0.0000e+00 - val_loss: 35.1497
Epoch 4/15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.6667 - loss: 0.4768 - val_accuracy: 0.0000e+00 - val_loss: 32.1875
Epoch 5/15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 1.0000 - loss: 0.0239 - val_accuracy: 0.0000e+00 - val_loss: 32.2466
Epoch 6/15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 1.0000 - loss: 0.0196 - val_accuracy: 0.0000e+00 - val_loss: 32.7846
Epoch 7/15
[1m1/1[0m [32m

User Input Fields for Model to Give a Recommendation

In [45]:
# Sample shopper profile: a free-text wish plus one row of numeric preferences
# in the training feature order (price, average_rating, total_ratings).
user_text_input = ["Lightweight and affordable"]
user_numerical_input = np.array([[1100, 4.3, 180]])

# Encode the text with the fitted tokenizer and pad it to the training length
user_text_sequences = tokenizer.texts_to_sequences(user_text_input)
user_text_padded = pad_sequences(user_text_sequences, maxlen=max_text_length)

# Score every bike class, take the most probable one per row, and map the
# class index back to its bike name
predictions = model.predict([user_numerical_input, user_text_padded])
predicted_bike_index = predictions.argmax(axis=1)
predicted_bike_name = label_encoder.inverse_transform(predicted_bike_index)

print(f'Recommended Bike: {predicted_bike_name[0]}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 554ms/step
Recommended Bike: Tommaso Forcella
