## Introduction

The dataset is sourced from:
https://data.world/datafiniti/electronic-products-and-pricing-data

The data schema can be found here:
https://developer.datafiniti.co/docs/product-data-schema

In [213]:
# Loading prerequisite libraries

import re
import numpy as np
import pandas as pd

In [214]:
# Load the raw product listing CSV into a pandas dataframe.
# The former `infer_datetime_format=True` / `parse_dates=True` arguments are
# dropped: `infer_datetime_format` is deprecated since pandas 2.0, and
# `parse_dates=True` only asks pandas to try parsing the *index* -- which here
# is presumably the default integer index (no `index_col` is given), so it
# never converted any column.
# NOTE(review): if the date columns should become datetimes, pass their names
# explicitly via `parse_dates=[...]` -- that would change their dtype and how
# they are written back out, so it is deliberately not done here.
df = pd.read_csv('ElectronicProducts.csv')

## Data Cleaning

In [215]:
# Trim surrounding whitespace from the identifier, URL and name-like text
# columns. Manufacturer and product names are additionally capitalised
# (first character upper-case, the rest lower-case).

text_columns = [
    'id',
    'asins',
    'imageurls',
    'keys',
    'manufacturer',
    'manufacturernumber',
    'name',
    'sourceurls',
]
for column in text_columns:
    df[column] = df[column].str.strip()

for column in ('manufacturer', 'name'):
    df[column] = df[column].str.capitalize()

In [206]:
# Trim whitespace from prices_availability and collapse its spellings into
# 'In Stock' / 'Out Of Stock'.
#
# Every rule below matches the WHOLE cell. Previously the first call on each
# line was a substring `.str.replace` while the rest were whole-value
# `.replace` calls, so fragments inside longer values were rewritten
# (e.g. 'unavailable' -> 'unIn Stock'), and 'undefined' was turned into the
# literal string 'NaN' rather than a real missing value.

availability = df['prices_availability'].str.strip()

# 'available', optionally preceded by a quantity such as '32 available'.
# NOTE(review): the quantity is dropped so the column stays categorical --
# confirm nothing downstream needs the count.
availability = availability.str.replace(
    r'^(?:\d+\s+)?available$', 'In Stock', regex=True
)

# Exact-value synonyms.
availability = availability.replace({
    'TRUE': 'In Stock',
    'yes': 'In Stock',
    'Yes': 'In Stock',
    'No': 'Out Of Stock',
    'sold': 'Out Of Stock',
    'FALSE': 'Out Of Stock',
})

# 'undefined' carries no information: store it as a missing value so that
# isna()/dropna() see it.
df['prices_availability'] = availability.mask(availability == 'undefined')

In [207]:
# Trim surrounding whitespace from prices_condition and standardize the values

condition = df['prices_condition'].str.strip()

# Substring replacement: every occurrence of 'new' inside a value is capitalised.
condition = condition.str.replace('new', 'New')

# Whole-value replacements, applied in this order.
exact_condition_rules = [
    ('pre-owned', 'Used'),
    ('Seller refurbished', 'Seller Refurbished'),
    ('Manufacturer refurbished', 'Manufacturer Refurbished'),
    ('refurbished', 'Refurbished'),
    ('New other (see details)', 'New'),
]
for old_label, new_label in exact_condition_rules:
    condition = condition.replace(old_label, new_label)

# Regex replacements: text from a leading 'New' or '5/16' onwards becomes 'New'.
for pattern in ('^New.*', '^5/16.*'):
    condition = condition.replace(pattern, 'New', regex=True)

df['prices_condition'] = condition

In [208]:
# Trim surrounding whitespace from prices_shipping and standardize the values

shipping = df['prices_shipping'].str.strip()

# Substring replacement: every occurrence of 'nan' inside a value becomes 'NaN'.
shipping = shipping.str.replace('nan', 'NaN')

# Whole-value replacements, applied in this order.
exact_shipping_rules = [
    ('Free Expedited Shipping', 'Free'),
    ('Expedited', 'Free'),
    ('Free Shipping', 'Free'),
    ('Free Standard Shipping', 'Free'),
    ('Freight', 'Standard'),
    ('Value', 'Free'),
    ('Free Shippingon orders 35 and up', 'Free on orders 35 and up'),
    ('Free Shipping on orders 35 and up', 'Free on orders 35 and up'),
    ('Free Expedited Shipping for most orders over $49', 'Free on orders 50 and up'),
    ('Free Shipping for this Item', 'Free'),
    ('FREE', 'Free'),
    ('Free Standard Shipping on Orders Over $49', 'Free on orders 50 and up'),
    ('Free Delivery', 'Free'),
    ('Free Next Day Delivery (USA)', 'Free Next Day Delivery(USA)'),
]
for old_label, new_label in exact_shipping_rules:
    shipping = shipping.replace(old_label, new_label)

df['prices_shipping'] = shipping

In [209]:
# Trim surrounding whitespace from the brand column, drop trademark symbols
# and capitalize the result

brand = df['brand'].str.strip()
brand = brand.str.replace('Insignia™', 'Insignia')  # substring match
brand = brand.replace('Bose®', 'Bose')  # whole-value match
df['brand'] = brand.str.capitalize()

In [210]:
# Remove junk characters from categories column
#
# Backslashes are stripped with an explicit literal match. The earlier
# `.str.replace('\\\\', '')` depended on the implicit `regex` default, which
# changed in pandas 2.0: as a regex it removed every single backslash, as a
# literal it only removes doubled backslashes.
categories = df['categories'].str.replace('\\', '', regex=False)

# Drop every all-lowercase word together with the space in front of it.
df['categories'] = categories.str.replace(r' \b[a-z]+\b', '', regex=True)

In [229]:
# Trim whitespace from weight, abbreviate the unit names and keep only the
# first two whitespace-separated tokens of each value.

weight = df['weight'].str.strip()

# Literal text substitutions (no regex involved).
literal_rules = (('pounds', 'lb'), ('ounces', 'oz'), ('Electronics', ''))
for old_text, new_text in literal_rules:
    weight = weight.str.replace(old_text, new_text, regex=False)

# Cut a URL, and anything after it, out of the value. The earlier pattern
# r'[http:].*' was a character class (any of h, t, p, ':') when treated as a
# regex and a never-matching literal on pandas >= 2.0, so it did not target
# URLs in either case.
weight = weight.str.replace(r'https?:.*', '', regex=True)

# Keep the first two tokens. The .str accessors pass missing values through,
# whereas the previous `apply(lambda x: x.split())` raised on them.
weight = weight.str.split().str[:2].str.join(' ')

# Values left empty by the clean-up (e.g. URL-only cells) become real missing
# values instead of '' or the literal string 'NaN'.
df['weight'] = weight.mask(weight == '')

In [235]:
# Give the columns presentation-friendly names (source name -> output name)

column_labels = {
    "id": "Id",
    "prices_amountmax": "Price-Max",
    "prices_amountmin": "Price-Min",
    "prices_availability": "Availability",
    "prices_condition": "Condition",
    "prices_currency": "Price-Currency",
    "prices_dateseen": "Date-Seen",
    "prices_issale": "On-Sale",
    "prices_merchant": "Merchant",
    "prices_shipping": "Shipping",
    "prices_sourceurls": "Price-Source-URLs",
    "asins": "ASINS",
    "brand": "Brand",
    "categories": "Category-Labels",
    "dateadded": "Date-Added",
    "dateupdated": "Date-Updated",
    "ean": "EAN",
    "imageurls": "Image URLs",
    "keys": "Keys",
    "manufacturer": "Manufacturer",
    "manufacturernumber": "Manufacturer-Id",
    "name": "Name",
    "primarycategories": "Primary-Category",
    "sourceurls": "Review-Source-URLs",
    "upc": "UPC",
    "weight": "Weight",
}
df = df.rename(columns=column_labels)

In [237]:
# Write the cleaned, renamed frame to disk (the dataframe index is written
# as the first column).
output_path = 'Products.csv'
df.to_csv(output_path)