In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Create DataFrame from output_json file for processor

df = pd.read_json('search_processor_output.jsonl')
df

Unnamed: 0,title,url,rating,review_count,price
0,"AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked De...",https://www.amazon.com/gp/slredirect/picassoRe...,4.4 out of 5 stars,19,$248.99
1,Intel Core i7-10700KF Desktop Processor 8 Core...,https://www.amazon.com/Intel-i7-10700KF-Proces...,4.8 out of 5 stars,461,$386.97
2,Intel Core i5-9400F Desktop Processor 6 Cores ...,https://www.amazon.com/Intel-i5-9400F-Desktop-...,4.8 out of 5 stars,5138,$169.99
3,Intel Core i3-10100 Desktop Processor 4 Cores ...,https://www.amazon.com/Intel-i3-10100-Desktop-...,4.8 out of 5 stars,1535,$188.99
4,Intel Core i5-10600K Desktop Processor 6 Cores...,https://www.amazon.com/Intel-i5-10600K-Desktop...,4.8 out of 5 stars,1279,
...,...,...,...,...,...
1869,MSI Z490-A PRO ProSeries ATX Motherboard (10th...,https://www.amazon.com/MSI-PRO-ProSeries-Mothe...,4.3 out of 5 stars,35,$169.95
1870,HP 14-inch Chromebook HD Touchscreen Laptop PC...,https://www.amazon.com/HP-14-inch-Chromebook-T...,4.5 out of 5 stars,1767,
1871,NETGEAR Nighthawk 12-Stream AX12 Wifi 6 Router...,https://www.amazon.com/NETGEAR-Nighthawk-12-St...,4.2 out of 5 stars,871,$489.97
1872,"Tablet 10 Inch, Android 10.0 Tablets 2021 with...",https://www.amazon.com/1280x800-Touchscreen-Mi...,4.3 out of 5 stars,90,


## Performing following operations on the DataFrame
- Checking data type for the columns
- Price column: remove '$' and ',', also change the type to float
- Review_count cloumn: remove ',', also change the type to float
- Rating column: Extract ONLY product rating form the column

- Check null values for the columns. Remove rows without price, rating and review_count
- Remove duplicate rows

- Add Product column and write 'processor' in it

In [3]:
# Checking the data type

df.dtypes

title           object
url             object
rating          object
review_count    object
price           object
dtype: object

In [4]:
df.isnull().sum()

title             0
url               0
rating          282
review_count    282
price           334
dtype: int64

In [5]:
df = df.dropna()

In [6]:
df.isnull().sum()

title           0
url             0
rating          0
review_count    0
price           0
dtype: int64

In [7]:
df.shape

(1307, 5)

In [8]:
# Price column: remove '$' and ',', also change the type to float

df['price'] = df['price'].str.replace(',', '')
df['price'] = df['price'].str.replace('$', '')
df['price'] = df['price'].astype(float)

In [8]:
# Review_count cloumn: remove ',', also change the type to float

df['review_count'] = df['review_count'].str.replace(',', '')
df['review_count'] = df['review_count'].astype(float)
df['review_count'] = df['review_count'].astype(int)

In [10]:
# Rating column: Extract ONLY product rating form the column

df.rating = df.rating.astype('category')
df['rating'] = df['rating'].str.extract("(\d*\.?\d+)", expand=True)

In [11]:
df.dtypes

title           object
url             object
rating          object
review_count     int64
price           object
dtype: object

In [11]:
df

Unnamed: 0,title,url,rating,review_count,price
0,Intel Core i7-9700K Desktop Processor 8 Cores ...,https://www.amazon.com/Intel-i7-9700K-Desktop-...,4.8,8634,255.00
2,AMD Ryzen 5 2600 Processor with Wraith Stealth...,https://www.amazon.com/AMD-Processor-Wraith-St...,4.8,21436,193.00
3,Intel Core i9-9900K Desktop Processor 8 Cores ...,https://www.amazon.com/Intel-i9-9900K-Processo...,4.8,1312,329.99
5,Intel Core i9-10850K Desktop Processor 10 Core...,https://www.amazon.com/Intel-i9-10850K-Desktop...,4.9,975,408.00
6,AMD Ryzen 5 1600 65W AM4 Processor with Wraith...,https://www.amazon.com/AMD-Processor-Wraith-St...,4.8,6637,152.99
...,...,...,...,...,...
3052,"AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked De...",https://www.amazon.com/gp/slredirect/picassoRe...,4.4,19,248.99
3055,OWC 32.0GB (4X 8GB) DDR3 ECC-R PC10600 1333MHz...,https://www.amazon.com/OWC-32-0GB-ECC-R-PC1060...,4.7,38,109.99
3056,Dell Latitude E7450 14in FHD Business Laptop C...,https://www.amazon.com/Dell-Latitude-E7450-Bus...,4.1,32,320.00
3058,ASUS CHROMEBOX3-N3299U Mini PC with Intel Core...,https://www.amazon.com/Asus-Core-i3-8130U-DDR4...,4.5,85,393.00


In [12]:
df.groupby(['title'])['rating'].count().sort_values(ascending = False)

title
AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked Desktop Processor(Tray) with Wraith Stealth Cooler                                                                                                              167
ASIAHORSE WD-001 120mm 5V ARGB Motherboard 12 Addressable LED 9 Blades Hydraulic Bearing White Case/Radiator Fan (5 Pack-White)                                                                              81
Noctua NT-H1 3.5g, Pro-Grade Thermal Compound Paste (3.5g)                                                                                                                                                   81
Gigabyte B550 AORUS ELITE AX V2 (AMD Ryzen 5000/B550/ATX/True 12+2 Phases Digital VRM/PCIe4.0/DDR4/USB3.2 Gen 1/Realtek ALC1200/Intel WiFi 6/2xM.2 Thermal Guard/2.5 GbE LAN/HDMI/DP/Gaming Motherboard)     55
Mini PC Windows 10, Beelink Mini Computers Intel Broadwell Processor I3-5005U, Desktop Pc 8GB DDR3 256GB SSD, Mini Desktop Computers with HD Graphics Card 5500 4K

In [13]:
df = df.drop_duplicates(subset=['title'])
df

Unnamed: 0,title,url,rating,review_count,price
0,Intel Core i7-9700K Desktop Processor 8 Cores ...,https://www.amazon.com/Intel-i7-9700K-Desktop-...,4.8,8634,255.00
2,AMD Ryzen 5 2600 Processor with Wraith Stealth...,https://www.amazon.com/AMD-Processor-Wraith-St...,4.8,21436,193.00
3,Intel Core i9-9900K Desktop Processor 8 Cores ...,https://www.amazon.com/Intel-i9-9900K-Processo...,4.8,1312,329.99
5,Intel Core i9-10850K Desktop Processor 10 Core...,https://www.amazon.com/Intel-i9-10850K-Desktop...,4.9,975,408.00
6,AMD Ryzen 5 1600 65W AM4 Processor with Wraith...,https://www.amazon.com/AMD-Processor-Wraith-St...,4.8,6637,152.99
...,...,...,...,...,...
3050,Microsoft Surface Pro (5th Gen) (Intel Core i5...,https://www.amazon.com/Microsoft-Surface-Intel...,4.4,1192,699.99
3055,OWC 32.0GB (4X 8GB) DDR3 ECC-R PC10600 1333MHz...,https://www.amazon.com/OWC-32-0GB-ECC-R-PC1060...,4.7,38,109.99
3056,Dell Latitude E7450 14in FHD Business Laptop C...,https://www.amazon.com/Dell-Latitude-E7450-Bus...,4.1,32,320.00
3058,ASUS CHROMEBOX3-N3299U Mini PC with Intel Core...,https://www.amazon.com/Asus-Core-i3-8130U-DDR4...,4.5,85,393.00


In [14]:
# Inserting 'product name' column at the start

df.insert(0, 'Product', 'Processor')

In [15]:
df

Unnamed: 0,Product,title,url,rating,review_count,price
0,Processor,Intel Core i7-9700K Desktop Processor 8 Cores ...,https://www.amazon.com/Intel-i7-9700K-Desktop-...,4.8,8634,255.00
2,Processor,AMD Ryzen 5 2600 Processor with Wraith Stealth...,https://www.amazon.com/AMD-Processor-Wraith-St...,4.8,21436,193.00
3,Processor,Intel Core i9-9900K Desktop Processor 8 Cores ...,https://www.amazon.com/Intel-i9-9900K-Processo...,4.8,1312,329.99
5,Processor,Intel Core i9-10850K Desktop Processor 10 Core...,https://www.amazon.com/Intel-i9-10850K-Desktop...,4.9,975,408.00
6,Processor,AMD Ryzen 5 1600 65W AM4 Processor with Wraith...,https://www.amazon.com/AMD-Processor-Wraith-St...,4.8,6637,152.99
...,...,...,...,...,...,...
3050,Processor,Microsoft Surface Pro (5th Gen) (Intel Core i5...,https://www.amazon.com/Microsoft-Surface-Intel...,4.4,1192,699.99
3055,Processor,OWC 32.0GB (4X 8GB) DDR3 ECC-R PC10600 1333MHz...,https://www.amazon.com/OWC-32-0GB-ECC-R-PC1060...,4.7,38,109.99
3056,Processor,Dell Latitude E7450 14in FHD Business Laptop C...,https://www.amazon.com/Dell-Latitude-E7450-Bus...,4.1,32,320.00
3058,Processor,ASUS CHROMEBOX3-N3299U Mini PC with Intel Core...,https://www.amazon.com/Asus-Core-i3-8130U-DDR4...,4.5,85,393.00
