In [1]:
import numpy as np
import pandas as pd

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any deprecation or chained-assignment warnings that the pandas
# calls in later cells may raise -- consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# Load the scraped processor search results into a DataFrame and display it.
# NOTE(review): the file carries a .jsonl extension but is read without
# lines=True, so it presumably holds a single JSON document rather than one
# record per line -- confirm against the scraper's output format.

df = pd.read_json('search_processor_output.jsonl')
df

Unnamed: 0,title,url,rating,review_count,price
0,Intel Core i5-6500 Desktop CPU Processor- SR2L...,https://www.amazon.com/gp/slredirect/picassoRe...,4.4 out of 5 stars,69,$139.99
1,"AMD Ryzen 5 5600X 6-core, 12-Thread Unlocked D...",https://www.amazon.com/AMD-Ryzen-5600X-12-Thre...,4.8 out of 5 stars,4431,
2,Intel Core i7-9700K Desktop Processor 8 Cores ...,https://www.amazon.com/Intel-i7-9700K-Desktop-...,4.8 out of 5 stars,8642,
3,AMD Ryzen 5 2600 Processor with Wraith Stealth...,https://www.amazon.com/AMD-Processor-Wraith-St...,4.8 out of 5 stars,21451,$190.00
4,Intel Core i9-9900K Desktop Processor 8 Cores ...,https://www.amazon.com/Intel-i9-9900K-Processo...,4.8 out of 5 stars,1318,
...,...,...,...,...,...
3032,"New 2020 HP 11.6"" HD Chromebook for Students M...",https://www.amazon.com/HP-Chromebook-Students-...,4.0 out of 5 stars,19,$175.75
3033,USB-C Laptop Charger Power Adapter: GX20M33579...,https://www.amazon.com/USB-C-Laptop-Charger-Po...,4.3 out of 5 stars,1060,$29.99
3034,"HP EliteBook 840 G3 14in Laptop, Core i5-6300U...",https://www.amazon.com/HP-EliteBook-i5-6300U-C...,4.2 out of 5 stars,157,$449.00
3035,Intel Core i5-6500 Desktop CPU Processor- SR2L...,https://www.amazon.com/gp/slredirect/picassoRe...,4.4 out of 5 stars,69,$139.99


## Performing the following operations on the DataFrame
- Checking data type for the columns
- Price column: remove '$' and ',', also change the type to float
- Review_count column: remove ',', also change the type to int
- Rating column: Extract ONLY the product rating from the column

- Check null values for the columns. Remove rows without price, rating or review_count
- Remove duplicate rows

- Add Product column and write 'processor' in it

In [3]:
# Inspect the dtype of every column before cleaning; any column reported as
# `object` has to be parsed before it can be used numerically.

df.dtypes

title           object
url             object
rating          object
review_count    object
price           object
dtype: object

In [4]:
# Count the missing values in each column.
df.isna().sum()

title             0
url               0
rating          434
review_count    434
price           496
dtype: int64

In [5]:
# Keep only rows that have a rating, a review count and a price. Restricting
# the check to these three columns means a gap in any other column cannot
# silently discard an otherwise usable row. .copy() makes the result an
# independent frame, so the column assignments in the cleaning cells that
# follow are not flagged as writes to a slice by pandas versions that track it.
df = df.dropna(subset=['rating', 'review_count', 'price']).copy()

In [6]:
# Re-count missing values per column to confirm none remain.
df.isna().sum()

title           0
url             0
rating          0
review_count    0
price           0
dtype: int64

In [7]:
# (rows, columns) remaining after the incomplete rows were dropped.
df.shape

(2162, 5)

In [8]:
# Price column: strip the thousands separator and the currency symbol, then
# convert to float.
# regex=False forces both patterns to be matched literally. '$' is a regex
# end-of-string anchor, and the default value of `regex` differs between
# pandas versions, so the literal match must be requested explicitly.

df['price'] = (
    df['price']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)

In [9]:
# Review_count column: drop the thousands separator and store the counts as
# integers. The text is parsed as float first and then narrowed to int.

df['review_count'] = (
    df['review_count']
    .str.replace(',', '')
    .astype(float)
    .astype(int)
)

In [10]:
# Rating column: keep only the numeric score from text such as
# "4.4 out of 5 stars" (the first number matched in each value).
#
# - The pattern is a raw string so that \d and \. reach the regex engine
#   directly instead of relying on Python's handling of invalid string
#   escapes, which emits a SyntaxWarning on Python 3.12+.
# - expand=False returns a Series, which is the natural right-hand side for
#   a single-column assignment.
# - The former conversion to 'category' had no effect on the extracted
#   values and has been removed.
#
# The extracted scores remain strings (object dtype); cast them with
# .astype(float) before doing arithmetic on them.

df['rating'] = df['rating'].str.extract(r"(\d*\.?\d+)", expand=False)

In [11]:
df.dtypes

title            object
url              object
rating           object
review_count      int64
price           float64
dtype: object

In [12]:
# Display the frame after the price, review_count and rating columns were cleaned.
df

Unnamed: 0,title,url,rating,review_count,price
0,Intel Core i5-6500 Desktop CPU Processor- SR2L...,https://www.amazon.com/gp/slredirect/picassoRe...,4.4,69,139.99
3,AMD Ryzen 5 2600 Processor with Wraith Stealth...,https://www.amazon.com/AMD-Processor-Wraith-St...,4.8,21451,190.00
6,Intel Core i9-10850K Desktop Processor 10 Core...,https://www.amazon.com/Intel-i9-10850K-Desktop...,4.9,977,399.91
7,Intel BX80684I99900KF Intel Core i9-9900KF Des...,https://www.amazon.com/Intel-BX80684I99900KF-i...,4.7,1704,349.29
8,"AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked De...",https://www.amazon.com/AMD-Ryzen-3600-12-Threa...,4.9,35328,299.99
...,...,...,...,...,...
3032,"New 2020 HP 11.6"" HD Chromebook for Students M...",https://www.amazon.com/HP-Chromebook-Students-...,4.0,19,175.75
3033,USB-C Laptop Charger Power Adapter: GX20M33579...,https://www.amazon.com/USB-C-Laptop-Charger-Po...,4.3,1060,29.99
3034,"HP EliteBook 840 G3 14in Laptop, Core i5-6300U...",https://www.amazon.com/HP-EliteBook-i5-6300U-C...,4.2,157,449.00
3035,Intel Core i5-6500 Desktop CPU Processor- SR2L...,https://www.amazon.com/gp/slredirect/picassoRe...,4.4,69,139.99


In [13]:
# Number of rating entries per title, largest first; titles with a count
# above 1 appear in more than one row.
(
    df.groupby(by=['title'])['rating']
    .count()
    .sort_values(ascending=False)
)

title
Intel Core i5-6500 Desktop CPU Processor- SR2L6 (Renewed)                                                                                                                                                  176
Mini PC Windows 10, Beelink Mini Computers Intel Broadwell Processor I3-5005U, Desktop Pc 8GB DDR3 256GB SSD, Mini Desktop Computers with HD Graphics Card 5500 4K HD Dual HDMI Use for Home, Business     153
Beelink Mini PC Windows 10, Intel Broadwell Processor i3-5005U，8GB DDR3 128GB SSD，Mini Desktop Computers with HD Graphics Card 5500,Bluetooth 4.0，4K HD Dual HDMI                                          148
Beelink Mini PC Windows 10 Pro, AMD Ryzen 5 3550H Processor, 8GB DDR4 256GB SSD 1TB Quadruple Display Desktop Computer, Fingerprint Login in Gaming Working Mini PC                                        137
Noctua NT-H1 3.5g, Pro-Grade Thermal Compound Paste (3.5g)                                                                                                            

In [14]:
# Keep a single row per product title (the first occurrence) and show the result.
df = df.drop_duplicates(subset='title', keep='first')
df

Unnamed: 0,title,url,rating,review_count,price
0,Intel Core i5-6500 Desktop CPU Processor- SR2L...,https://www.amazon.com/gp/slredirect/picassoRe...,4.4,69,139.99
3,AMD Ryzen 5 2600 Processor with Wraith Stealth...,https://www.amazon.com/AMD-Processor-Wraith-St...,4.8,21451,190.00
6,Intel Core i9-10850K Desktop Processor 10 Core...,https://www.amazon.com/Intel-i9-10850K-Desktop...,4.9,977,399.91
7,Intel BX80684I99900KF Intel Core i9-9900KF Des...,https://www.amazon.com/Intel-BX80684I99900KF-i...,4.7,1704,349.29
8,"AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked De...",https://www.amazon.com/AMD-Ryzen-3600-12-Threa...,4.9,35328,299.99
...,...,...,...,...,...
3030,"Dell Inspiron 15 FHD Touchscreen Laptop, AMD R...",https://www.amazon.com/Dell-Inspiron-Touchscre...,4.0,3,749.00
3031,"Newest Lenovo Flex 3 11.6"" 2-in-1 Touchscreen ...",https://www.amazon.com/Lenovo-Touchscreen-Chro...,4.7,7,230.00
3032,"New 2020 HP 11.6"" HD Chromebook for Students M...",https://www.amazon.com/HP-Chromebook-Students-...,4.0,19,175.75
3033,USB-C Laptop Charger Power Adapter: GX20M33579...,https://www.amazon.com/USB-C-Laptop-Charger-Po...,4.3,1060,29.99


In [15]:
# Insert a 'Product' column at the front of the frame, set to 'Processor' for
# every row.
# DataFrame.insert raises ValueError when the column already exists, so the
# membership check keeps this cell safe to run more than once.

if 'Product' not in df.columns:
    df.insert(0, 'Product', 'Processor')

In [16]:
# Display the frame with the new 'Product' column in first position.
df

Unnamed: 0,Product,title,url,rating,review_count,price
0,Processor,Intel Core i5-6500 Desktop CPU Processor- SR2L...,https://www.amazon.com/gp/slredirect/picassoRe...,4.4,69,139.99
3,Processor,AMD Ryzen 5 2600 Processor with Wraith Stealth...,https://www.amazon.com/AMD-Processor-Wraith-St...,4.8,21451,190.00
6,Processor,Intel Core i9-10850K Desktop Processor 10 Core...,https://www.amazon.com/Intel-i9-10850K-Desktop...,4.9,977,399.91
7,Processor,Intel BX80684I99900KF Intel Core i9-9900KF Des...,https://www.amazon.com/Intel-BX80684I99900KF-i...,4.7,1704,349.29
8,Processor,"AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked De...",https://www.amazon.com/AMD-Ryzen-3600-12-Threa...,4.9,35328,299.99
...,...,...,...,...,...,...
3030,Processor,"Dell Inspiron 15 FHD Touchscreen Laptop, AMD R...",https://www.amazon.com/Dell-Inspiron-Touchscre...,4.0,3,749.00
3031,Processor,"Newest Lenovo Flex 3 11.6"" 2-in-1 Touchscreen ...",https://www.amazon.com/Lenovo-Touchscreen-Chro...,4.7,7,230.00
3032,Processor,"New 2020 HP 11.6"" HD Chromebook for Students M...",https://www.amazon.com/HP-Chromebook-Students-...,4.0,19,175.75
3033,Processor,USB-C Laptop Charger Power Adapter: GX20M33579...,https://www.amazon.com/USB-C-Laptop-Charger-Po...,4.3,1060,29.99


In [17]:
# Write the cleaned table to CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='processor_output.csv', index=False)

In [18]:
#df[df['title']== 'Acer Predator Helios 300 PH315-54-760S Gaming Laptop | Intel i7-11800H | NVIDIA GeForce RTX 3060 Laptop GPU | 15.6" Full HD 144Hz 3ms IPS Display | 16GB DDR4 | 512GB SSD | Killer WiFi 6 | RGB Keyboard']