# Product Data Cleaning

Input file: meta_Electronics.json.gz (access is granted after filling out this form: https://forms.gle/UEkkJs69e7Z5A5Ps9)

Output file: products.csv

In [1]:
import gzip
import json
import os
import re
from urllib.request import urlopen

import pandas as pd

In [3]:
### Load the product metadata: one JSON record per line of the gzip archive
data = []
with gzip.open('meta_Electronics.json.gz') as archive:
    # Each line is decoded on its own and appended in file order.
    data.extend(json.loads(raw_line.strip()) for raw_line in archive)

# number of parsed records, i.e. the total number of products
print(len(data))

# show the first record
print(data[0])

786445
{'category': ['Electronics', 'Camera &amp; Photo', 'Video Surveillance', 'Surveillance Systems', 'Surveillance DVR Kits'], 'tech1': '', 'description': ['The following camera brands and models have been tested for compatibility with GV-Software.\nGeoVision \tACTi \tArecont Vision \tAXIS \tBosch \tCanon\nCNB \tD-Link \tEtroVision \tHikVision \tHUNT \tIQEye\nJVC \tLG \tMOBOTIX \tPanasonic \tPelco \tSamsung\nSanyo \tSony \tUDP \tVerint \tVIVOTEK \t \n \nCompatible Standard and Protocol\nGV-System also allows for integration with all other IP video devices compatible with ONVIF(V2.0), PSIA (V1.1) standards, or RTSP protocol.\nONVIF \tPSIA \tRTSP \t  \t  \t \nNote: Specifications are subject to change without notice. Every effort has been made to ensure that the information on this Web site is accurate. No liability is assumed for incidental or consequential damages arising from the use of the information or products contained herein.'], 'fit': '', 'title': 'Genuine Geovision 1 Channe

In [4]:
# Build a DataFrame with one row per parsed product record
df = pd.DataFrame(data)
print(df.shape[0])

786445


In [5]:
# Split off rows whose title is unformatted (i.e. the 'title' may still contain html style content)
df3 = df.fillna('')
has_unformatted_title = df3['title'].str.contains('getTime')
df4 = df3[has_unformatted_title]   # the unformatted rows
df5 = df3[~has_unformatted_title]  # every other row
print(len(df4))
print(len(df5))

0
786445


In [6]:
# Inspect what the unformatted rows (titles containing 'getTime') look like
df4.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details


In [7]:
# Preview the rows that were kept (titles without 'getTime')
df5.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,"[Electronics, Camera &amp; Photo, Video Survei...",,[The following camera brands and models have b...,,Genuine Geovision 1 Channel 3rd Party NVR IP S...,[],,GeoVision,"[Genuine Geovision 1 Channel NVR IP Software, ...","[>#3,092 in Tools &amp; Home Improvement &gt; ...",[],Camera &amp; Photo,,"January 28, 2014",$65.00,11300000,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
1,"[Electronics, Camera &amp; Photo]",,[This second edition of the Handbook of Astron...,,"Books ""Handbook of Astronomical Image Processi...",[0999470906],,33 Books Co.,[Detailed chapters cover these fundamental top...,"[>#55,933 in Camera &amp; Photo (See Top 100 i...","[0943396670, 1138055360, 0999470906]",Camera &amp; Photo,,"June 17, 2003",,43396828,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
2,"[Electronics, eBook Readers &amp; Accessories,...",,[A zesty tale. (Publishers Weekly)<br /><br />...,,One Hot Summer,"[0425167798, 039914157X]",,Visit Amazon's Carolina Garcia Aguilera Page,[],"3,105,177 in Books (",[],Books,,,$11.49,60009810,[],[],
3,"[Electronics, eBook Readers & Accessories, eBo...",,[],,Hurray for Hattie Rabbit: Story and pictures (...,"[0060219521, 0060219580, 0060219394]",,Visit Amazon's Dick Gackenbach Page,[],"2,024,298 in Books (","[0060219521, 0060219475, 0060219394]",Books,,,.a-section.a-spacing-mini{margin-bottom:6px!im...,60219602,[],[],
4,"[Electronics, eBook Readers & Accessories, eBo...",,[&#8220;sex.lies.murder.fame. is brillllli&#82...,,sex.lies.murder.fame.: A Novel,[],,Visit Amazon's Lolita Files Page,[],"3,778,828 in Books (",[],Books,,,$13.95,60786817,[],[],


In [25]:
# Columns carried into the export, in the order they will be written.
cols = [
    "asin",
    "title",
    "brand",
    "main_cat",
    "price",
]
df_final = df5.loc[:, cols]

In [26]:
# Remove ';' and '"' from every string cell, then decode the ampersand entity.
# Order matters: '&amp;' has already lost its ';' by the time the third
# replacement runs, which is why the pattern there is '&amp' without ';'.
df_final = (
    df_final
    .replace(';', '', regex=True)
    .replace('"', '', regex=True)
    .replace('&amp', '&', regex=True)
)

# '$' is the end-of-string anchor in a regular expression, so with regex=True
# (pandas >= 2.0) nothing is removed and every price later fails to parse.
# Match the dollar sign literally instead.
df_final['price'] = df_final['price'].str.replace('$', '', regex=False)

df_final.head()

Unnamed: 0,asin,title,brand,main_cat,price
0,11300000,Genuine Geovision 1 Channel 3rd Party NVR IP S...,GeoVision,Camera & Photo,65.00
1,43396828,Books Handbook of Astronomical Image Processin...,33 Books Co.,Camera & Photo,
2,60009810,One Hot Summer,Visit Amazon's Carolina Garcia Aguilera Page,Books,11.49
3,60219602,Hurray for Hattie Rabbit: Story and pictures (...,Visit Amazon's Dick Gackenbach Page,Books,.a-section.a-spacing-mini{margin-bottom:6px!im...
4,60786817,sex.lies.murder.fame.: A Novel,Visit Amazon's Lolita Files Page,Books,13.95


In [27]:
# get only numbers in price

# A number written with US-style thousands separators, e.g. "1,299.00".
# float() rejects the commas, so such values are normalised before parsing.
_GROUPED_NUMBER = re.compile(r"\d{1,3}(?:,\d{3})+(?:\.\d+)?")


def isnumber(x):
    """Convert a price value to a float, or return -1 when it is not numeric.

    Despite the name, this returns the parsed value rather than a boolean.

    Parameters
    ----------
    x : object
        Usually a price string with the currency symbol already removed,
        e.g. "65.00" or "1,299.00". Any other object is passed to float().

    Returns
    -------
    float or int
        The parsed number, or the sentinel -1 when `x` cannot be interpreted
        as a number (empty string, leftover markup, None, ...).
    """
    if isinstance(x, str):
        text = x.strip()
        # Only drop commas when they form valid 3-digit groups, so that
        # something like "1,2" is still rejected instead of becoming 12.
        if _GROUPED_NUMBER.fullmatch(text):
            x = text.replace(",", "")
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric input: keep the notebook's -1 "no price" sentinel.
        return -1

# Parse every price; values that are not numeric come back as -1.
df_final['price'] = df_final['price'].map(isnumber)

df_final.head()

Unnamed: 0,asin,title,brand,main_cat,price
0,11300000,Genuine Geovision 1 Channel 3rd Party NVR IP S...,GeoVision,Camera & Photo,65.0
1,43396828,Books Handbook of Astronomical Image Processin...,33 Books Co.,Camera & Photo,-1.0
2,60009810,One Hot Summer,Visit Amazon's Carolina Garcia Aguilera Page,Books,11.49
3,60219602,Hurray for Hattie Rabbit: Story and pictures (...,Visit Amazon's Dick Gackenbach Page,Books,-1.0
4,60786817,sex.lies.murder.fame.: A Novel,Visit Amazon's Lolita Files Page,Books,13.95


In [28]:
# Show every row recorded for one specific ASIN
df_final[df_final["asin"] == "B000246V8O"]

Unnamed: 0,asin,title,brand,main_cat,price
27412,B000246V8O,"Dual XD7500 AM/FM/CD Receiver, Motorized 4X50W...",Dual Electronics,All Electronics,-1.0
57780,B000246V8O,"Dual XD7500 AM/FM/CD Receiver, Motorized 4X50W...",Dual Electronics,All Electronics,-1.0


In [29]:
# remove duplicates
df_final.drop_duplicates(inplace = True)
len(df_final)

756077

In [30]:
# Write the cleaned table as ';'-separated values, with no header row and no index column
df_final.to_csv(
    "products.csv",
    sep=";",
    header=False,
    index=False,
)