In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
df = pd.read_csv('data/amazon_co-ecommerce_sample.csv') # from https://www.kaggle.com/datasets/yasserh/amazon-product-reviews-dataset
#df.head

In [None]:
df.columns

Index(['uniq_id', 'product_name', 'manufacturer', 'price',
       'number_available_in_stock', 'number_of_reviews',
       'number_of_answered_questions', 'average_review_rating',
       'amazon_category_and_sub_category',
       'customers_who_bought_this_item_also_bought', 'description',
       'product_information', 'product_description',
       'items_customers_buy_after_viewing_this_item',
       'customer_questions_and_answers', 'customer_reviews', 'sellers'],
      dtype='object')

In [None]:
df.product_description.isna().sum()

651

In [None]:
df.product_description[(df.product_description.str.len() == 0) | (df.product_description == "")].shape[0]

0

In [None]:
#dropping all NAs
print("Dimensions of dataset before dropping the NAs:", df.shape)
df = df.dropna(subset=['product_description'])
df = df.dropna(subset=['average_review_rating'])
df = df.dropna(subset=['price'])
print("Dimensions of dataset after dropping the NAs:", df.shape)

Dimensions of dataset before dropping the NAs: (10000, 17)
Dimensions of dataset after dropping the NAs: (8001, 17)


In [None]:
mask = df['price'].str.contains('-')

# index the DataFrame using the boolean mask to remove the rows
df = df[~mask]

In [None]:
df.price[0]
def format_price(x):
    x = x.replace(",", "")
    return float(x[1:])
        
df['float_price'] = df['price'].apply(lambda x: format_price(x))
df['float_rating'] = df['average_review_rating'].apply(lambda x: float(x[:3]))

In [None]:
df['float_rating']

0       4.9
1       4.5
2       3.9
3       5.0
4       4.7
       ... 
9993    4.0
9994    4.0
9995    5.0
9997    5.0
9998    5.0
Name: float_rating, Length: 7985, dtype: float64

In [None]:
print("Dimensions of dataset after dropping the NAs:", df.shape)

Dimensions of dataset after dropping the NAs: (7985, 19)


In [None]:
df['num_keywords'] = df['product_description'].apply(lambda x: len(x.split()))

X_train, X_test, y_train, y_test = train_test_split(df[['num_keywords','product_description', 'float_rating']], df['float_price'], test_size=0.2, random_state=42)

In [None]:
# vectorize the product descriptions using a count vectorizer
vectorizer = CountVectorizer()
train_X = vectorizer.fit_transform(X_train['product_description'])
test_X = vectorizer.transform(X_test['product_description'])

In [None]:
# select a machine learning algorithm (in this case, linear regression)
model = LinearRegression()

# train the model on the training set
model.fit(train_X, y_train)

LinearRegression()

In [None]:
# make predictions on the testing set
y_pred = model.predict(test_X)

# evaluate the performance of the model using mean squared error and R-squared
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean squared error:', mse)
print('R-squared:', r2)

Mean squared error: 44336.538072397656
R-squared: -26.87457248555067


In [None]:
df['num_keywords']

In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
!pip install transformers

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# load the data into a pandas dataframe
data = pd.read_csv('/content/drive/MyDrive/Coding/Colab Notebooks (1)/amazon_co-ecommerce_sample.csv')

In [7]:
#dropping all NAs
print("Dimensions of dataset before dropping the NAs:", data.shape)
data = data.dropna(subset=['product_description'])
data = data.dropna(subset=['average_review_rating'])
data = data.dropna(subset=['price'])

mask = data['price'].str.contains('-')

# index the DataFrame using the boolean mask to remove the rows
data = data[~mask]

print("Dimensions of dataset after dropping the NAs:", data.shape)


data.price[0]
def format_price(x):
    x = x.replace(",", "")
    return float(x[1:])
        
data['float_price'] = data['price'].apply(lambda x: format_price(x))
data['float_rating'] = data['average_review_rating'].apply(lambda x: float(x[:3]))

Dimensions of dataset before dropping the NAs: (10000, 17)
Dimensions of dataset after dropping the NAs: (7985, 17)


In [8]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

with tf.device('/device:GPU:0'):
  # initialize the BERT tokenizer and model
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  model = TFBertModel.from_pretrained('bert-base-uncased')

  # tokenize the product descriptions and encode them as input IDs and attention masks
  train_inputs = tokenizer(train_data['product_description'].tolist(), truncation=True, padding=True, max_length=128, return_tensors='tf')
  test_inputs = tokenizer(test_data['product_description'].tolist(), truncation=True, padding=True, max_length=128, return_tensors='tf')

  # create the TensorFlow datasets for training and testing
  train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_inputs), train_data['float_price']))
  test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_inputs), test_data['float_price']))

  # define the model architecture
  input_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name='input_ids')
  attention_mask = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name='attention_mask')
  bert_output = model({'input_ids': input_ids, 'attention_mask': attention_mask})[1]
  output = tf.keras.layers.Dense(1, activation=None)(bert_output)
  model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=output)

  # compile the model and define the loss function and optimizer
  model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(lr=2e-5))

  # train the model on the training dataset
  model.fit(train_dataset.batch(32), epochs=3, validation_data=test_dataset.batch(32))

  # evaluate the model on the testing dataset and print the mean squared error
  predicted_prices = model.predict(test_dataset.batch(32)).flatten()
  mse = mean_squared_error(test_data['float_price'], predicted_prices)
  print('Mean squared error: {:.2f}'.format(mse))

  # example usage: predict the price of a new product based on its description
  new_description = 'This is a high-quality product with a durable design.'
  new_inputs = tokenizer([new_description], truncation=True, padding=True, max_length=128, return_tensors='tf')
  predicted_price = model.predict(dict(new_inputs)).flatten()[0]
  print('Predicted price: {:.2f}'.format(predicted_price))

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/3


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/3
Epoch 3/3


ValueError: ignored