# import the necessary libraries

In [11]:
import pandas as pd
import re
from bs4 import BeautifulSoup


## Define the dataset paths and load the Amazon Fashion reviews


In [3]:
# Locations of the raw review dumps (JSON Lines: one JSON object per line).
amazon_fashion_dataset_path = "datasets/Amazon_Fashion.jsonl"
appliances_dataset_path = "datasets/Appliances.jsonl"

# Load the Amazon Fashion reviews into a DataFrame.
amazon_fashion_dataset = pd.read_json(amazon_fashion_dataset_path, lines=True)

# Preview the first 5 rows. Ending the cell with the expression (instead of
# wrapping it in print()) lets the notebook render the frame with its rich
# display.
amazon_fashion_dataset.head()

   rating                 title  \
0       5         Pretty locket   
1       5                     A   
2       2             Two Stars   
3       1       Won’t buy again   
4       5  I LOVE these glasses   

                                                text images        asin  \
0  I think this locket is really pretty. The insi...     []  B00LOPVX74   
1                                              Great     []  B07B4JXK8D   
2  One of the stones fell out within the first 2 ...     []  B007ZSEQ4Q   
3  Crappy socks. Money wasted. Bought to wear wit...     []  B07F2BTFS9   
4  I LOVE these glasses!  They fit perfectly over...     []  B00PKRFU4O   

  parent_asin                       user_id               timestamp  \
0  B00LOPVX74  AGBFYI2DDIKXC5Y4FARTYDTQBMFQ 2020-01-09 00:06:34.489   
1  B07B4JXK8D  AFQLNQNQYFWQZPJQZS6V3NZU4QBQ 2020-12-20 01:04:06.701   
2  B007ZSEQ4Q  AHITBJSS7KYUBVZPX7M2WJCOIVKQ 2015-05-23 01:33:48.000   
3  B07F2BTFS9  AFVNEEPDEIH5SPUN5BWC6NKL3WNQ 2018-12-31

## display the info of the dataset


In [4]:
# Print a summary of the frame: index range, each column's dtype, and the
# memory footprint.
amazon_fashion_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500939 entries, 0 to 2500938
Data columns (total 10 columns):
 #   Column             Dtype         
---  ------             -----         
 0   rating             int64         
 1   title              object        
 2   text               object        
 3   images             object        
 4   asin               object        
 5   parent_asin        object        
 6   user_id            object        
 7   timestamp          datetime64[ns]
 8   helpful_vote       int64         
 9   verified_purchase  bool          
dtypes: bool(1), datetime64[ns](1), int64(2), object(6)
memory usage: 174.1+ MB


In [5]:
# Load the Appliances reviews (JSON Lines) into a DataFrame.
appliances_dataset = pd.read_json(appliances_dataset_path, lines=True)

# Preview the first 5 rows; a bare last expression uses the notebook's rich
# display rather than print()'s plain-text dump.
appliances_dataset.head()

   rating              title  \
0       5         Work great   
1       5  excellent product   
2       5    Happy customer!   
3       5      Amazing value   
4       5        Dryer parts   

                                                text images        asin  \
0              work great. use a new one every month     []  B01N0TQ0OH   
1                            Little on the thin side     []  B07DD2DMXB   
2                   Quick delivery, fixed the issue!     []  B082W3Z9YK   
3  I wasn't sure whether these were worth it or n...     []  B078W2BJY8   
4  Easy to install got the product expected to re...     []  B08C9LPCQV   

  parent_asin                       user_id               timestamp  \
0  B01N0TQ0OH  AGKHLEW2SOWHNMFQIJGBECAF7INQ 2018-02-22 16:31:48.692   
1  B07DD37QPZ  AHWWLSPCJMALVHDDVSUGICL6RUCA 2022-10-02 21:41:03.446   
2  B082W3Z9YK  AHZIJGKEWRTAEOZ673G5B3SNXEGQ 2020-12-06 03:30:35.363   
3  B078W2BJY8  AFGUPTDFAWOHHL4LZDV27ERDNOYQ 2018-08-12 20:03:04.306   
4

## Handling missing values


In [8]:
# Drop reviews whose title is missing, so every remaining row has a title
# for the cleaning step to work on.
amazon_fashion_dataset = amazon_fashion_dataset.dropna(subset=['title'])
appliances_dataset = appliances_dataset.dropna(subset=['title'])

# No fillna('') on 'title' is needed here: dropna(subset=['title']) has
# already removed every row with a missing title, so filling would be a no-op
# (and assigning a column on the dropna result can raise
# SettingWithCopyWarning).

## data cleaning

In [12]:
def clean_text(text):
    """Normalize raw review text to lowercase, space-separated ASCII words.

    Steps, in order: strip HTML markup, replace every character outside
    a-z/A-Z with a space, lowercase, collapse runs of whitespace into a
    single space, and trim leading/trailing whitespace.

    Args:
        text: The raw text to clean. ``None`` and float NaN are treated as
            missing values.

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing.
    """
    # Missing values are not valid markup for BeautifulSoup; map them to an
    # empty string instead of raising. (NaN is the only float for which
    # ``x != x`` holds.)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Remove HTML tags, keeping only the text content.
    text = BeautifulSoup(text, "html.parser").get_text()
    # Digits, punctuation and non-ASCII characters all become spaces.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    # Collapse whitespace runs, then trim: without the strip, text such as
    # "Great!" would come back as "great " with a dangling space.
    return re.sub(r'\s+', ' ', text).strip()

In [13]:
# Run every review title through clean_text, one value at a time, and store
# the normalized titles back in the 'title' column of each dataset.
amazon_fashion_dataset['title'] = amazon_fashion_dataset['title'].map(clean_text)
appliances_dataset['title'] = appliances_dataset['title'].map(clean_text)



## Parsing Dates and Types
Ensure all data types are correct for efficient processing, storage, and querying.


In [14]:
def parse_dates(
    df,
    timestamp_column="timestamp",
    rating_column="rating",
    verified_purchase_column="verified_purchase",
):
    """Coerce the timestamp, rating and verified-purchase columns to fixed dtypes.

    The columns are overwritten on ``df`` itself (the frame is modified in
    place) and the same frame is returned, so both ``parse_dates(df)`` and
    ``df = parse_dates(df)`` work.

    Args:
        df: DataFrame containing the three columns named below.
        timestamp_column: Column converted with ``pd.to_datetime``.
        rating_column: Column cast with ``astype(int)``. The cast raises if
            the column holds missing values.
        verified_purchase_column: Column cast with ``astype(bool)``.

    Returns:
        ``df``, with the three columns converted.
    """
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])
    df[rating_column] = df[rating_column].astype(int)
    # NOTE(review): astype(bool) goes by truthiness, so a column of strings
    # such as "False" would presumably become True - confirm the input is
    # already boolean/0-1 before relying on this cast.
    df[verified_purchase_column] = df[verified_purchase_column].astype(bool)
    return df


In [None]:
# Normalize the timestamp / rating / verified_purchase dtypes of both
# datasets; the right-hand side is evaluated left to right, Fashion first.
amazon_fashion_dataset, appliances_dataset = (
    parse_dates(amazon_fashion_dataset),
    parse_dates(appliances_dataset),
)