In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the scraped processor search results into a DataFrame.
# NOTE(review): the file name ends in .jsonl, yet it is read without lines=True;
# presumably the file holds a single JSON document rather than one record per
# line — confirm against the scraper that writes it.

df = pd.read_json('search_processor_output.jsonl')
df

Unnamed: 0,title,url,rating,review_count,price
0,Intel Core i9-10850K Desktop Processor 10 Core...,https://www.amazon.com/Intel-i9-10850K-Desktop...,4.9 out of 5 stars,975,
1,"AMD Ryzen 5 5600X 6-core, 12-Thread Unlocked D...",https://www.amazon.com/AMD-Ryzen-5600X-12-Thre...,4.8 out of 5 stars,4406,
2,"AMD Ryzen 7 5800X 8-core, 16-Thread Unlocked D...",https://www.amazon.com/AMD-Ryzen-5800X-16-Thre...,4.8 out of 5 stars,3923,
3,Intel Core i7-9700K Desktop Processor 8 Cores ...,https://www.amazon.com/Intel-i7-9700K-Desktop-...,4.8 out of 5 stars,8639,$255.00
4,"AMD Ryzen 7 3700X 8-Core, 16-Thread Unlocked D...",https://www.amazon.com/AMD-Ryzen-3700X-16-Thre...,4.9 out of 5 stars,22217,$279.49
...,...,...,...,...,...
2047,"AVITA 14"" Pura [CN6Q14] AMD A9 8GB RAM 128GB S...",https://www.amazon.com/CN6Q14-Screen-Windows-L...,4.1 out of 5 stars,437,$299.99
2048,LEVEN SINBA 32GB KIT (16GBx2) DDR4 3600MHz PC4...,https://www.amazon.com/LEVEN-3600MHz-PC4-28800...,4.6 out of 5 stars,537,$128.99
2049,"Windows 10 Pro Mini PC, T4 Upgraded 4GB/64GB e...",https://www.amazon.com/Upgraded-Intel-Quad-Cor...,4.1 out of 5 stars,167,$139.99
2050,"Android Tablet 10 Inch 2021, 32GB Storage, WiF...",https://www.amazon.com/Android-Storage-Tablets...,3.8 out of 5 stars,1518,$97.77


## Performing the following operations on the DataFrame
- Checking data type for the columns
- Price column: remove '$' and ',', also change the type to float
- Review_count column: remove ',', also change the type to int (via float)
- Rating column: Extract ONLY the product rating from the column

- Check null values for the columns. Remove rows without price, rating and review_count
- Remove duplicate rows

- Add Product column and write 'processor' in it

In [3]:
# Check the data type of every column before any cleaning is applied

df.dtypes

title           object
url             object
rating          object
review_count    object
price           object
dtype: object

In [4]:
# Number of missing values in each column
df.isna().sum()

title             0
url               0
rating          314
review_count    314
price           354
dtype: int64

In [5]:
# Drop every row that has a missing value in any column.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not run on an object pandas may flag as a copy of the
# original (SettingWithCopyWarning).
df = df.dropna().copy()

In [6]:
# Re-count missing values per column to confirm none are left
df.isna().sum()

title           0
url             0
rating          0
review_count    0
price           0
dtype: int64

In [7]:
# (rows, columns) of the frame at this point
df.shape

(1426, 5)

In [8]:
# Price column: strip the ',' thousands separator and the '$' sign, then
# convert the remaining text to float.
# regex=False makes both replacements literal. '$' is a regex metacharacter
# (end-of-string anchor), and the default of `regex` differs between pandas
# versions, so relying on the default can leave the sign in place and make the
# float conversion fail.

df['price'] = (
    df['price']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)

In [9]:
# Review_count column: remove the ',' separators, then cast the cleaned text
# to float and finally to int

df['review_count'] = (
    df['review_count']
    .str.replace(',', '')
    .astype(float)
    .astype(int)
)

In [10]:
# Rating column: keep ONLY the leading number of the rating text,
# e.g. '4.8 out of 5 stars' -> '4.8'.
# - The pattern is a raw string, so '\d' and '\.' are passed to the regex
#   engine as written instead of being invalid escapes in a normal string.
# - expand=False returns a Series for the single capture group, which is what
#   a column assignment expects (expand=True returns a one-column DataFrame).
# - The previous cast to 'category' was removed; the extraction does not need it.
# The column still holds strings after this step.

df['rating'] = df['rating'].str.extract(r"(\d*\.?\d+)", expand=False)

In [11]:
# Re-check the column types after the conversions above
df.dtypes

title            object
url              object
rating           object
review_count      int64
price           float64
dtype: object

In [12]:
# Display the frame after cleaning
df

Unnamed: 0,title,url,rating,review_count,price
3,Intel Core i7-9700K Desktop Processor 8 Cores ...,https://www.amazon.com/Intel-i7-9700K-Desktop-...,4.8,8639,255.00
4,"AMD Ryzen 7 3700X 8-Core, 16-Thread Unlocked D...",https://www.amazon.com/AMD-Ryzen-3700X-16-Thre...,4.9,22217,279.49
5,Intel Core i9-9900K Desktop Processor 8 Cores ...,https://www.amazon.com/Intel-i9-9900K-Processo...,4.8,1314,329.99
6,Intel BX80684I99900KF Intel Core i9-9900KF Des...,https://www.amazon.com/Intel-BX80684I99900KF-i...,4.7,1701,347.00
7,AMD Ryzen 5 2600 Processor with Wraith Stealth...,https://www.amazon.com/AMD-Processor-Wraith-St...,4.8,21439,211.99
...,...,...,...,...,...
2046,Dell Latitude 14 7000 7480 Business UltraBook ...,https://www.amazon.com/Dell-Latitude-7000-7480...,4.4,139,379.00
2047,"AVITA 14"" Pura [CN6Q14] AMD A9 8GB RAM 128GB S...",https://www.amazon.com/CN6Q14-Screen-Windows-L...,4.1,437,299.99
2048,LEVEN SINBA 32GB KIT (16GBx2) DDR4 3600MHz PC4...,https://www.amazon.com/LEVEN-3600MHz-PC4-28800...,4.6,537,128.99
2049,"Windows 10 Pro Mini PC, T4 Upgraded 4GB/64GB e...",https://www.amazon.com/Upgraded-Intel-Quad-Cor...,4.1,167,139.99


In [13]:
# Rows per title, largest groups first, to see how many listings are repeated
(
    df.groupby(['title'])['rating']
    .count()
    .sort_values(ascending=False)
)

title
AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked Desktop Processor(Tray) with Wraith Stealth Cooler                                                                                                             122
Noctua NT-H1 3.5g, Pro-Grade Thermal Compound Paste (3.5g)                                                                                                                                                  78
Kingston FURY Renegade 64GB (2x32GB) 3200MHz DDR4 CL16 Desktop Memory Kit of 2 KF432C16RBK2/64                                                                                                              72
MOVESPEED 256GB 3D NAND Internal PC SSD - SATA III 6 Gb/s, 2.5"/7mm, Up to 540 MB/s - YSSDJQB-256GSQ                                                                                                        21
MOVESPEED 512GB 3D NAND Internal PC SSD - SATA III 6 Gb/s, 2.5"/7mm, Up to 540 MB/s - YSSDJQB-512GSQ                                                                  

In [14]:
# Keep one row per title (the first occurrence is retained), then show the result
df = df.drop_duplicates(subset='title', keep='first')
df

Unnamed: 0,title,url,rating,review_count,price
3,Intel Core i7-9700K Desktop Processor 8 Cores ...,https://www.amazon.com/Intel-i7-9700K-Desktop-...,4.8,8639,255.00
4,"AMD Ryzen 7 3700X 8-Core, 16-Thread Unlocked D...",https://www.amazon.com/AMD-Ryzen-3700X-16-Thre...,4.9,22217,279.49
5,Intel Core i9-9900K Desktop Processor 8 Cores ...,https://www.amazon.com/Intel-i9-9900K-Processo...,4.8,1314,329.99
6,Intel BX80684I99900KF Intel Core i9-9900KF Des...,https://www.amazon.com/Intel-BX80684I99900KF-i...,4.7,1701,347.00
7,AMD Ryzen 5 2600 Processor with Wraith Stealth...,https://www.amazon.com/AMD-Processor-Wraith-St...,4.8,21439,211.99
...,...,...,...,...,...
2046,Dell Latitude 14 7000 7480 Business UltraBook ...,https://www.amazon.com/Dell-Latitude-7000-7480...,4.4,139,379.00
2047,"AVITA 14"" Pura [CN6Q14] AMD A9 8GB RAM 128GB S...",https://www.amazon.com/CN6Q14-Screen-Windows-L...,4.1,437,299.99
2048,LEVEN SINBA 32GB KIT (16GBx2) DDR4 3600MHz PC4...,https://www.amazon.com/LEVEN-3600MHz-PC4-28800...,4.6,537,128.99
2049,"Windows 10 Pro Mini PC, T4 Upgraded 4GB/64GB e...",https://www.amazon.com/Upgraded-Intel-Quad-Cor...,4.1,167,139.99


In [15]:
# Insert a 'Product' column at position 0, filled with 'Processor' on every row.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to run more than once.

if 'Product' not in df.columns:
    df.insert(0, 'Product', 'Processor')

In [16]:
# Display the frame with the new 'Product' column
df

Unnamed: 0,Product,title,url,rating,review_count,price
3,Processor,Intel Core i7-9700K Desktop Processor 8 Cores ...,https://www.amazon.com/Intel-i7-9700K-Desktop-...,4.8,8639,255.00
4,Processor,"AMD Ryzen 7 3700X 8-Core, 16-Thread Unlocked D...",https://www.amazon.com/AMD-Ryzen-3700X-16-Thre...,4.9,22217,279.49
5,Processor,Intel Core i9-9900K Desktop Processor 8 Cores ...,https://www.amazon.com/Intel-i9-9900K-Processo...,4.8,1314,329.99
6,Processor,Intel BX80684I99900KF Intel Core i9-9900KF Des...,https://www.amazon.com/Intel-BX80684I99900KF-i...,4.7,1701,347.00
7,Processor,AMD Ryzen 5 2600 Processor with Wraith Stealth...,https://www.amazon.com/AMD-Processor-Wraith-St...,4.8,21439,211.99
...,...,...,...,...,...,...
2046,Processor,Dell Latitude 14 7000 7480 Business UltraBook ...,https://www.amazon.com/Dell-Latitude-7000-7480...,4.4,139,379.00
2047,Processor,"AVITA 14"" Pura [CN6Q14] AMD A9 8GB RAM 128GB S...",https://www.amazon.com/CN6Q14-Screen-Windows-L...,4.1,437,299.99
2048,Processor,LEVEN SINBA 32GB KIT (16GBx2) DDR4 3600MHz PC4...,https://www.amazon.com/LEVEN-3600MHz-PC4-28800...,4.6,537,128.99
2049,Processor,"Windows 10 Pro Mini PC, T4 Upgraded 4GB/64GB e...",https://www.amazon.com/Upgraded-Intel-Quad-Cor...,4.1,167,139.99


In [17]:
# Write the cleaned table to CSV, leaving the index out of the file
df.to_csv(path_or_buf='processor_output.csv', index=False)

In [18]:
#df[df['title']== 'Acer Predator Helios 300 PH315-54-760S Gaming Laptop | Intel i7-11800H | NVIDIA GeForce RTX 3060 Laptop GPU | 15.6" Full HD 144Hz 3ms IPS Display | 16GB DDR4 | 512GB SSD | Killer WiFi 6 | RGB Keyboard']