## Data Cleaning and Concatenating Scrape CSVs
_Author: Rachel Koenig_
____

Imports 

In [1]:
import pandas as pd
from bs4 import BeautifulSoup

Define a function to remove html

In [2]:
def remove_html(text):
    """Strip HTML markup from one scraped cell and return its plain text.

    Parameters
    ----------
    text : str or bytes
        Raw markup taken from a scrape column. Any other value (``None``,
        the float ``NaN`` that pandas uses for empty CSV cells, numbers, ...)
        is treated as a missing cell.

    Returns
    -------
    str
        The text content with whitespace stripped by ``get_text(strip=True)``,
        or the literal string ``"missing"`` when ``text`` is not markup.

    Notes
    -----
    Genuine parser problems (for example the ``lxml`` parser not being
    installed) are allowed to propagate instead of being swallowed, so a broken
    environment cannot silently turn an entire column into ``"missing"``.
    """
    # Null cells arrive as None or float NaN; handle them explicitly rather
    # than relying on BeautifulSoup raising and a bare ``except`` catching it.
    if not isinstance(text, (str, bytes)):
        return "missing"

    soup = BeautifulSoup(text, 'lxml')  # Parse the markup
    return soup.get_text(strip=True)  # Keep only the text nodes

Define a function to clean a DataFrame from the scrape:

In [3]:
def EDA(df):
    """Clean the raw scrape columns of ``df`` in place and return a preview.

    The frame is expected to carry the scrape columns ``color``, ``category``,
    ``description``, ``details`` and ``size``. The function:

    * strips HTML, square brackets and the ``Color:`` / ``Size:`` labels from
      ``color``, ``details`` and ``size``;
    * removes newlines, tabs and double spaces from ``category`` and
      ``description``;
    * splits the ``category`` breadcrumb on ``›`` into ``department``,
      ``demographic``, ``division``, ``category``, ``subcategory``, ``type``
      and ``detail_type``. Note that ``category`` is overwritten with the
      fourth breadcrumb level.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw scrape frame. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        ``df.head()`` of the cleaned frame, for display only. Callers keep
        using the mutated ``df`` itself.
    """
    # Remove html and strip off brackets and 'Color:' from color column.
    # regex=False pins '[' and ']' to literal characters on every pandas version.
    df['color'] = (
        df['color']
        .apply(remove_html)
        .str.replace(']', '', regex=False)
        .str.replace('[', '', regex=False)
        .str.replace('Color:', '', regex=False)
    )

    # Replace escape characters and white space in category column
    df['category'] = (
        df['category']
        .str.replace('\n', '', regex=False)
        .str.replace('  ', '', regex=False)
    )

    # Split category column on the '›' symbol, up to 6 times, one column per level
    category = df['category'].str.split("›", n=6, expand=True)
    # expand=True only creates as many columns as the deepest breadcrumb found
    # in this frame; pad to 7 so shallow frames get NaN levels, not a KeyError.
    category = category.reindex(columns=range(7))

    # Rename each category split column and add it onto the original df
    df['department'] = category[0]
    df['demographic'] = category[1]
    df['division'] = category[2]
    df['category'] = category[3]
    df['subcategory'] = category[4]
    df['type'] = category[5]
    df['detail_type'] = category[6]

    # Remove special characters from the description column
    df['description'] = (
        df['description']
        .str.replace('\t', '', regex=False)
        .str.replace('\n', '', regex=False)
    )

    # Remove html and strip off brackets from details column
    df['details'] = (
        df['details']
        .apply(remove_html)
        .str.replace(']', '', regex=False)
        .str.replace('[', '', regex=False)
    )

    # Remove html and cut off 'Size:' from the size column
    df['size'] = df['size'].apply(remove_html).str.replace('Size:', '', regex=False)

    return df.head()

Read in csv of first 8000 scrapes.

In [4]:
df1 = pd.read_csv('data/product8000.csv', index_col=[0])

In [5]:
# Check first 5 rows 
df1.head()

Unnamed: 0,asin,category,color,description,details,name,size
0,0000031887,Sports & Outdoors\n \n\n\n ...,unavailable,This fits your .\n \n\n\n Make sure this fi...,[],Mystiqueshapes Girls Ballet Tutu Neon Lime Green,
1,0123456479,"Clothing, Shoes & Jewelry\n \n\n\n ...",unavailable,This fits your .\n \n\n\n Make sure this fi...,[],SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,
2,1608299953,Software\n \n\n\n ›\n ...,unavailable,Access for up to 5 family members\n\t\t\t\t\t\...,[],Learn French: Rosetta Stone French - Level 1,
3,1617160377,Software\n \n\n\n ›\n ...,unavailable,Access for up to 5 family members\n\t\t\t\t\t\...,[],Learn Italian: Rosetta Stone Italian - Level 1,
4,B00001W0KA,Toys & Games\n \n\n\n ›...,"<div class=""a-section a-spacing-small"" id=""var...",Lead Free\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n \n\t...,[],Buzz Lightyear Boy's Deluxe Toy Story Costume,"<div class=""a-section a-spacing-base variation..."


Read in csv of next 2000 scrapes followed by the rest of the chunks.  Each csv is a slightly different size just due to scraper cooperation and available time to run the scraper each time.

In [6]:
df2 = pd.read_csv('data/product20000.csv', index_col=[0])
df2.head()

Unnamed: 0,asin,category,color,description,details,name,size
0,B006W71WJI,"Clothing, Shoes & Jewelry\n \n\n\n ...","<div class=""a-section a-spacing-small"" id=""var...",100% Leather\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n \...,[],Dansko Women's Valerie Pull-Up Clog,"<div class=""a-section a-spacing-small"" id=""var..."
1,B006W8QR56,"Clothing, Shoes & Jewelry\n \n\n\n ...","<div class=""a-section a-spacing-small"" id=""var...",IDEAL SIZE -Drawstring backpacks come in one s...,[],Mato & Hash Drawstring Bulk Bags Cinch Sacks B...,"<div class=""a-section a-spacing-base variation..."
2,B006W8WLTW,"Clothing, Shoes & Jewelry\n \n\n\n ...","<div class=""a-section a-spacing-small"" id=""var...",100% Interlock Cotton\n\t\t\t\t\t\t\t\n\t\t\t\...,[],Luvable Friends Baby Sayings Bodysuit – Wild Boy,"<div class=""a-section a-spacing-small"" id=""var..."
3,B006W8WMP0,unknown category,unknown color,unknown details,,unknown product name,
4,B006WANIVK,"Clothing, Shoes & Jewelry\n \n\n\n ...","<div class=""a-section a-spacing-small"" id=""var...",90% Cotton | 10% Rayon\n\t\t\t\t\t\t\t\n\t\t\t...,[],Funfash Plus Size Women Open Angel Sleeve Cold...,"<div class=""a-section a-spacing-base variation..."


In [7]:
df3 = pd.read_csv('data/product21180.csv', index_col=[0])
df3.head()

Unnamed: 0,asin,category,color,description,details,name,size
0,B00B8ZJ8U4,"Clothing, Shoes & Jewelry\n \n\n\n ...",unavailable,THIS PRICE IS FOR TWO BRACELETS!\n\t\t\t\t\t\t...,"[<td class=""a-span7 a-size-base"">\n ...",JOTW 2 Pieces of Goldtone with Clear Iced Out ...,
1,B00B93TTHM,"Clothing, Shoes & Jewelry\n \n\n\n ...","<div class=""a-section a-spacing-small"" id=""var...",100% Polyester\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n...,[],Doublju Classic Slim Fit Sleeveless Sexy Bodyc...,"<div class=""a-section a-spacing-small"" id=""var..."
2,B00B93UDRW,"Clothing, Shoes & Jewelry\n \n\n\n ...","<div class=""a-section a-spacing-small"" id=""var...",65% Cotton / 35% Polyester\n\t\t\t\t\t\t\t\n\t...,[],Doublju Slim Fit Ribbed Knit Button Down Henle...,"<div class=""a-section a-spacing-small"" id=""var..."
3,B00B93W5R8,unknown category,unknown color,unknown details,,unknown product name,
4,B00B98MPWS,"Clothing, Shoes & Jewelry\n \n\n\n ...","<div class=""a-section a-spacing-small"" id=""var...",Synthetic\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n \n\t...,[],Merrell Men's Ascend Glove Minimal Running Shoe,"<div class=""a-section a-spacing-base variation..."


In [8]:
df4 = pd.read_csv('data/product23033.csv', index_col=[0])
df4.head()

Unnamed: 0,asin,category,color,description,details,name,size
0,B00CWU9H9G,"Clothing, Shoes & Jewelry\n \n\n\n ...","<div class=""a-section a-spacing-small"" id=""var...",100% Rayon\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n \n\...,[],Rayon Short Sleeves Flyaway Cardigan Shawl Col...,"<div class=""a-section a-spacing-small"" id=""var..."
1,B00CWUC9TQ,Sports & Outdoors\n \n\n\n ...,"<div class=""a-section a-spacing-small"" id=""var...",Imported\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n \n\t\...,[],"Plus Size Yoga Long Stretch Pants 92% Cotton, ...","<div class=""a-section a-spacing-small"" id=""var..."
2,B00CWVE9KM,"Clothing, Shoes & Jewelry\n \n\n\n ...","<div class=""a-section a-spacing-small"" id=""var...","Scoop neck short sleeve top, super comfy Plus ...",[],Hollywood Star Fashion Women's Short Cap Sleev...,"<div class=""a-section a-spacing-base variation..."
3,B00CWVUVMC,"Clothing, Shoes & Jewelry\n \n\n\n ...",unavailable,"CLASSIC DESIGN: PU leather band, roman number ...","[<td class=""a-span7 a-size-base"">\n ...","Mens Quartz Watch, Roman Numeral Business Casu...",
4,B00CWW1Q1Q,"Clothing, Shoes & Jewelry\n \n\n\n ...","<div class=""a-section a-spacing-small"" id=""var...",Chiffon Sheer Long Sleeves Blouse\n\t\t\t\t\t\...,[],Basic Long Sleeves Sheer Chiffon Blouse with P...,"<div class=""a-section a-spacing-small"" id=""var..."


Create a list of DataFrames called `dataframes` to be concatenated, then concat them along the row axis and call the new DataFrame `total_product`.

In [9]:
# Raw scrape chunks, kept in the order they were loaded.
dataframes = [df1, df2, df3, df4]

# Stack the chunks row-wise. The per-file index labels are kept as-is,
# so labels may repeat across chunks.
total_product = pd.concat(dataframes, axis='index')

In [10]:
# Check the shape of new combined DataFrame
total_product.shape

(17033, 7)

Call the EDA data cleaning function on the `total_product` DataFrame.

In [11]:
EDA(total_product)

Unnamed: 0,asin,category,color,description,details,name,size,department,demographic,division,subcategory,type,detail_type
0,0000031887,Dance,unavailable,This fits your . Make sure this fitsby ent...,,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,missing,Sports & Outdoors,Sports & Fitness,Other Sports,Clothing,Girls,Skirts
1,0123456479,Jewelry Boxes & Organizers,unavailable,This fits your . Make sure this fitsby ent...,,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,missing,"Clothing, Shoes & Jewelry","Shoe, Jewelry & Watch Accessories",Jewelry Accessories,Jewelry Boxes,,
2,1608299953,,unavailable,Access for up to 5 family members Download act...,,Learn French: Rosetta Stone French - Level 1,missing,Software,Education & Reference,Languages,,,
3,1617160377,,unavailable,Access for up to 5 family members Download act...,,Learn Italian: Rosetta Stone Italian - Level 1,missing,Software,Education & Reference,Languages,,,
4,B00001W0KA,,Buzz Lightyear,Lead Free Child (4-6 & 7-8) Includes: Bodysuit...,,Buzz Lightyear Boy's Deluxe Toy Story Costume,SelectM(3T-4T)3T-4TSmall (4-6)Medium (7-8)Medi...,Toys & Games,Dress Up & Pretend Play,Costumes,,,


Check for nulls.

In [12]:
total_product.isnull().sum()

asin               0
category       12260
color              0
description        0
details            0
name               0
size               0
department         0
demographic    12009
division       12077
subcategory    13867
type           16054
detail_type    16619
dtype: int64

Check value counts for `department` to see how many unknown categories exist.

In [13]:
total_product['department'].value_counts(dropna=False)

unknown category              12009
Clothing, Shoes & Jewelry      4690
Sports & Outdoors               226
Toys & Games                     36
Health & Household               18
Arts, Crafts & Sewing            13
Beauty & Personal Care           10
Electronics                       8
Home & Kitchen                    7
Tools & Home Improvement          6
Baby Products                     5
Software                          2
Purchase Circles                  1
Office Products                   1
Automotive                        1
Name: department, dtype: int64

Save the rows that were not unknown as a new DataFrame called `robot_free_df`.

In [14]:
# Keep only the rows whose department is not the 'unknown category' placeholder.
robot_free_df = total_product.loc[total_product['department'] != 'unknown category']

In [15]:
# Check shape with dropped rows
robot_free_df.shape

(5024, 13)

In [16]:
# Check data types 
robot_free_df.dtypes

asin           object
category       object
color          object
description    object
details        object
name           object
size           object
department     object
demographic    object
division       object
subcategory    object
type           object
detail_type    object
dtype: object

Check if there are still nulls elsewhere.  In this case, they are expected: some products have more category levels than others, so splitting the breadcrumb into columns leaves empty cells for products with fewer levels.

In [17]:
robot_free_df.isnull().sum()

asin              0
category        251
color             0
description       0
details           0
name              0
size              0
department        0
demographic       0
division         68
subcategory    1858
type           4045
detail_type    4610
dtype: int64

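Read in additional scrape csvs that were expected not to be affected by the anti-robot pages. Note that `product8534.csv` still contains at least one `unknown category` row (visible in its cleaned output below).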

In [18]:
norobotdf = pd.read_csv('data/product14000.csv', index_col=[0])
print(norobotdf.shape) #Check shape 
norobotdf.head()  # Check first 5 rows 

(5466, 7)


Unnamed: 0,asin,category,color,description,details,name,size
0,B003ZVQQVS,"Clothing, Shoes & Jewelry\n \n\n\n ...",unavailable,This fits your .\n \n\n\n Make sure this fi...,[],High Sierra A.T. Ultimate Access Carry-On Whee...,
1,B003ZVZDKS,"Clothing, Shoes & Jewelry\n \n\n\n ...","<div class=""a-section a-spacing-small"" id=""var...",90% cotton 10% polyester\n\t\t\t\t\t\t\t\n\t\t...,[],Champion Women's Favorite V-Neck Tee,"<div class=""a-section a-spacing-base variation..."
2,B003ZVZH5E,"Clothing, Shoes & Jewelry\n \n\n\n ...","<div class=""a-section a-spacing-small"" id=""var...",100% Cotton\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n \n...,[],Champion Women's Favorite V-Neck Tank,"<div class=""a-section a-spacing-base variation..."
3,B003ZW0X2K,Sports & Outdoors\n \n\n\n ...,"<div class=""a-section a-spacing-small"" id=""var...",100% Polyester\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n...,[],Champion Women's Absolute Workout Pant Short L...,"<div class=""a-section a-spacing-small"" id=""var..."
4,B003ZW163K,"Clothing, Shoes & Jewelry\n \n\n\n ...","<div class=""a-section a-spacing-small"" id=""var...",100% Cotton\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n \n...,[],Champion Women's Favorite Short,"<div class=""a-section a-spacing-base variation..."


In [19]:
# Clean DataFrame with EDA function 
EDA(norobotdf)

Unnamed: 0,asin,category,color,description,details,name,size,department,demographic,division,subcategory,type,detail_type
0,B003ZVQQVS,,unavailable,This fits your . Make sure this fitsby ent...,,High Sierra A.T. Ultimate Access Carry-On Whee...,missing,"Clothing, Shoes & Jewelry",Luggage & Travel Gear,Backpacks,,,
1,B003ZVZDKS,Active,Oxford Gray,90% cotton 10% polyester Imported Not applicab...,,Champion Women's Favorite V-Neck Tee,SelectX-SmallSmallMediumLargeX-LargeXX-LargeSe...,"Clothing, Shoes & Jewelry",Women,Clothing,Active Shirts & Tees,,
2,B003ZVZH5E,"Tops, Tees & Blouses",White,100% Cotton Imported tank-top-and-cami-shirts ...,,Champion Women's Favorite V-Neck Tank,SelectSmallMediumLargeX-LargeXX-LargeSelectif(...,"Clothing, Shoes & Jewelry",Women,Clothing,Tanks & Camis,,
3,B003ZW0X2K,Yoga,Black,100% Polyester pants closure Silky performance...,,Champion Women's Absolute Workout Pant Short L...,Small,Sports & Outdoors,Sports & Fitness,Exercise & Fitness,Clothing,Women,Pants
4,B003ZW163K,Active,Oxford Gray,100% Cotton Imported shorts closure Machine wa...,,Champion Women's Favorite Short,SelectSmallLargeX-LargeSelectif(typeof(Twister...,"Clothing, Shoes & Jewelry",Women,Clothing,Active Shorts,,


In [20]:
norobotdf2 = pd.read_csv('data/product8534.csv', index_col=[0])
print(norobotdf2.shape)  # Check shape 
norobotdf2.head()  # Check first 5 rows

(534, 7)


Unnamed: 0,asin,category,color,description,details,name,size
0,B003QOJB3O,"Clothing, Shoes & Jewelry\n \n\n\n ...","<div class=""a-section a-spacing-small"" id=""var...",100% polyster\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n ...,[],Sport-Tek Women's Side Blocked Performance Pol...,"<div class=""a-section a-spacing-base variation..."
1,B003QORIMA,"Clothing, Shoes & Jewelry\n \n\n\n ...","<div class=""a-section a-spacing-base variation...",Cotton & Polyester\n\t\t\t\t\t\t\t\n\t\t\t\t\t...,[],Port Authority® Ladies Silk Touch™ Polo. L500,"<div class=""a-section a-spacing-base variation..."
2,B003QOZ732,unknown category,unknown color,unknown details,,unknown product name,
3,B003QSIMRQ,"Clothing, Shoes & Jewelry\n \n\n\n ...","<div class=""a-section a-spacing-small"" id=""var...",100% Polyester\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n...,[],eVogues Women's Sexy O-Ring Necklace Sleeveles...,"<div class=""a-section a-spacing-small"" id=""var..."
4,B003QSVLWE,"Clothing, Shoes & Jewelry\n \n\n\n ...","<div class=""a-section a-spacing-small"" id=""var...",These soft and comfortable flannel pajamas hav...,[],Men's Flannel Pajamas,"<div class=""a-section a-spacing-base variation..."


In [21]:
# Clean DataFrame with EDA function 
EDA(norobotdf2)

Unnamed: 0,asin,category,color,description,details,name,size,department,demographic,division,subcategory,type,detail_type
0,B003QOJB3O,"Tops, Tees & Blouses",Light Pink/White,"100% polyster Button closure 3.8-ounce, 100% p...",,Sport-Tek Women's Side Blocked Performance Pol...,SelectX-SmallSmallMediumLargeX-LargeXX-Large3X...,"Clothing, Shoes & Jewelry",Women,Clothing,Polos,,
1,B003QORIMA,"Tops, Tees & Blouses",Deep BerryRedTropical PinkSunflower YellowTurq...,Cotton & Polyester Button closure An enduring ...,,Port Authority® Ladies Silk Touch™ Polo. L500,SelectXSX-SmallSSmallMMediumLLargeX-LargeXX-La...,"Clothing, Shoes & Jewelry",Women,Clothing,Polos,,
2,B003QOZ732,,unknown color,unknown details,missing,unknown product name,missing,unknown category,,,,,
3,B003QSIMRQ,"Tops, Tees & Blouses",Black,"100% Polyester Made in USA 1XL=Size 13/14, 2XL...",,eVogues Women's Sexy O-Ring Necklace Sleeveles...,2X,"Clothing, Shoes & Jewelry",Women,Clothing,Tanks & Camis,,
4,B003QSVLWE,Sleep & Lounge,Black,These soft and comfortable flannel pajamas hav...,,Men's Flannel Pajamas,SelectSmallMediumExtra Large (2X)Extra Large (...,"Clothing, Shoes & Jewelry",Men,Clothing,Sleep Sets,,


In [22]:
norobotdf3 = pd.read_csv('data/redo1.csv', index_col=[0])
print(norobotdf3.shape)  # Check shape 
norobotdf3.head()  # Check first 5 rows 

(598, 13)


Unnamed: 0,asin,category,color,description,details,name,size,department,demographic,division,subcategory,type,detail_type
177,B000H6AQ0Q,Women,unavailable,This fits your . Make sure this fitsby ent...,missing,Timex 1440 Sports Digital Mid Size Black/Pink,missing,Sports & Outdoors,Outdoor Recreation,Outdoor Clothing,Accessories,Sport Watches,
178,B000H6BYI4,Jewelry Boxes & Organizers,unavailable,This fits your . Make sure this fitsby ent...,missing,Caddy Bay Collection Wide Slot Jewelry Ring Di...,missing,"Clothing, Shoes & Jewelry","Shoe, Jewelry & Watch Accessories",Jewelry Accessories,Jewelry Boxes,,
179,B000H6DWZC,Jewelry Boxes & Organizers,unavailable,This fits your . Make sure this fitsby ent...,missing,Fairy Tale Jewelry Box,missing,"Clothing, Shoes & Jewelry","Shoe, Jewelry & Watch Accessories",Jewelry Accessories,Jewelry Boxes,,
180,B000H703V2,Oxfords,missing,"100% Full-grain leather,Leather,Nubuck Importe...",missing,Ecco Men's Track II Low GORE-TEX waterproof ou...,missing,"Clothing, Shoes & Jewelry",Men,Shoes,,,
181,B000H7YCE6,"Bags, Cases & Sleeves",missing,This fits your . Make sure this fitsby ent...,missing,High Sierra Loop Backpack,missing,Electronics,Computers & Accessories,Laptop Accessories,Backpacks,,


Create a list of all clean and robot-free DataFrames to be concatenated called `clean_dfs`, and call the new combined DataFrame `robot_free_df`.

In [23]:
# Frames to stack: the filtered first batch plus the three additional scrapes.
# norobotdf3 (redo1.csv) was loaded with all 13 cleaned columns already, so it
# was not passed through EDA(); norobotdf2 still shows an 'unknown category'
# row in its preview above.
# NOTE: robot_free_df is both an input and the output of this cell, so running
# the cell a second time appends the other frames again. Re-run from the cell
# that first creates robot_free_df instead.
clean_dfs = [robot_free_df, norobotdf, norobotdf2, norobotdf3]
robot_free_df = pd.concat(clean_dfs, axis=0)  # Axis = 0, stack on top of each other 
robot_free_df.shape  # Check that shape matches the size of all previous rows being added together

(11622, 13)

Save combined dfs to a csv in the data folder.

In [24]:
# robot_free_df.to_csv('data/robot_free.csv')

Read in the csv we just saved.

In [25]:
robot_free_df = pd.read_csv('data/robot_free.csv', index_col=[0])
robot_free_df.head()  # Check the first 5 rows

Unnamed: 0,Unnamed: 0.1,asin,category,color,description,details,name,size,department,demographic,division,subcategory,type,detail_type
0,0,0000031887,Dance,unavailable,This fits your . Make sure this fitsby ent...,,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,missing,Sports & Outdoors,Sports & Fitness,Other Sports,Clothing,Girls,Skirts
1,1,0123456479,Jewelry Boxes & Organizers,unavailable,This fits your . Make sure this fitsby ent...,,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,missing,"Clothing, Shoes & Jewelry","Shoe, Jewelry & Watch Accessories",Jewelry Accessories,Jewelry Boxes,,
2,2,1608299953,,unavailable,Access for up to 5 family members Download act...,,Learn French: Rosetta Stone French - Level 1,missing,Software,Education & Reference,Languages,,,
3,3,1617160377,,unavailable,Access for up to 5 family members Download act...,,Learn Italian: Rosetta Stone Italian - Level 1,missing,Software,Education & Reference,Languages,,,
4,4,B00001W0KA,,Buzz Lightyear,Lead Free Child (4-6 & 7-8) Includes: Bodysuit...,,Buzz Lightyear Boy's Deluxe Toy Story Costume,SelectM(3T-4T)3T-4TSmall (4-6)Medium (7-8)Medi...,Toys & Games,Dress Up & Pretend Play,Costumes,,,


Check what percentage of each column are null.

In [26]:
robot_free_df.isnull().mean()

Unnamed: 0.1    0.000000
asin            0.000000
category        0.122221
color           0.000069
description     0.000000
details         0.558057
name            0.000000
size            0.000000
department      0.000000
demographic     0.079673
division        0.092026
subcategory     0.408798
type            0.815056
detail_type     0.917925
dtype: float64

Read in new csvs from the rescrapes. 

In [27]:
redo2 = pd.read_csv('data/redo2.csv', index_col=[0])
redo3 = pd.read_csv('data/redo3.csv', index_col=[0])
redo4 = pd.read_csv('data/redo4.csv', index_col=[0])
redo5 = pd.read_csv('data/redo5.csv', index_col=[0])
redo6 = pd.read_csv('data/redo6.csv', index_col=[0]) 

Concatenate all the new DataFrames with the last saved DataFrame, stacked on top of one another.

In [28]:
# List of DataFrames to concatonate 
redo_dfs = [robot_free_df, redo2, redo3, redo4, redo5, redo6]

In [29]:
# Stack the saved frame and the rescrapes into one frame with a fresh 0..n-1 index.
robot_free_df6 = pd.concat(redo_dfs, axis=0, # Stack dataframes along the row index
                           sort=True,        # Sort the column labels (not the row values) when the frames' columns differ
                           ignore_index=True)  # Ignore the previous indexes so that new index goes in order 0 to n
robot_free_df6.shape  # Check that shape matches number of rows added together 

(17843, 14)

In [30]:
# Drop the leftover 'Unnamed: 0.1' index column. Rebinding with errors='ignore'
# (instead of inplace=True) keeps this cell safe to re-run: a second run finds
# the column already gone and is a no-op rather than a KeyError.
robot_free_df6 = robot_free_df6.drop(columns='Unnamed: 0.1', errors='ignore')

Save new DataFrame to a csv.  The line is commented out so as not to overwrite the saved file when the notebook is rerun later.

In [31]:
# robot_free_df6.to_csv('data/robot_free6.csv', index=False)

In [32]:
# Check first 5 rows 
robot_free_df6.head() 

Unnamed: 0,asin,category,color,demographic,department,description,detail_type,details,division,name,size,subcategory,type
0,0000031887,Dance,unavailable,Sports & Fitness,Sports & Outdoors,This fits your . Make sure this fitsby ent...,Skirts,,Other Sports,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,missing,Clothing,Girls
1,0123456479,Jewelry Boxes & Organizers,unavailable,"Shoe, Jewelry & Watch Accessories","Clothing, Shoes & Jewelry",This fits your . Make sure this fitsby ent...,,,Jewelry Accessories,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,missing,Jewelry Boxes,
2,1608299953,,unavailable,Education & Reference,Software,Access for up to 5 family members Download act...,,,Languages,Learn French: Rosetta Stone French - Level 1,missing,,
3,1617160377,,unavailable,Education & Reference,Software,Access for up to 5 family members Download act...,,,Languages,Learn Italian: Rosetta Stone Italian - Level 1,missing,,
4,B00001W0KA,,Buzz Lightyear,Dress Up & Pretend Play,Toys & Games,Lead Free Child (4-6 & 7-8) Includes: Bodysuit...,,,Costumes,Buzz Lightyear Boy's Deluxe Toy Story Costume,SelectM(3T-4T)3T-4TSmall (4-6)Medium (7-8)Medi...,,


Read in final csv from rescraping.

In [33]:
redo7 = pd.read_csv('data/redo7.csv', index_col=[0])
redo7.head()

Unnamed: 0,asin,name
0,B007WA2TCS,Allegra K Women V Neck Sleeveless Striped Pane...
2,B007WA2VB2,Allegra K Women Scoop Neck Drawstring Mid-Calf...
3,B007WA2ZTA,Allegra K Women Peter Pan 3/4 Sleeve Summer To...
4,B007WA3558,Allegra K Lady Ruffle Tie Neck Short Sleeves B...
10,B007WA3GMK,Allegra K Women Sweet Layered Tiered Crochet D...


Concatenate the final scrape with the rest of the scrapes, call it `final_df`.

In [34]:
# Append the last rescrape. ignore_index=True gives every row a unique label;
# without it the labels from redo7 would collide with those already present in
# robot_free_df6. sort=True orders the union of column labels.
# Cells with no value (e.g. columns absent from redo7) are filled with the
# integer 0; the department filter further down relies on that sentinel.
final_df = pd.concat([robot_free_df6, redo7], sort=True, ignore_index=True).fillna(0)

Check first 5 rows to make sure asins are in order and everything looks as expected.

In [35]:
final_df.head() 

Unnamed: 0,asin,category,color,demographic,department,description,detail_type,details,division,name,size,subcategory,type
0,0000031887,Dance,unavailable,Sports & Fitness,Sports & Outdoors,This fits your . Make sure this fitsby ent...,Skirts,0,Other Sports,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,missing,Clothing,Girls
1,0123456479,Jewelry Boxes & Organizers,unavailable,"Shoe, Jewelry & Watch Accessories","Clothing, Shoes & Jewelry",This fits your . Make sure this fitsby ent...,0,0,Jewelry Accessories,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,missing,Jewelry Boxes,0
2,1608299953,0,unavailable,Education & Reference,Software,Access for up to 5 family members Download act...,0,0,Languages,Learn French: Rosetta Stone French - Level 1,missing,0,0
3,1617160377,0,unavailable,Education & Reference,Software,Access for up to 5 family members Download act...,0,0,Languages,Learn Italian: Rosetta Stone Italian - Level 1,missing,0,0
4,B00001W0KA,0,Buzz Lightyear,Dress Up & Pretend Play,Toys & Games,Lead Free Child (4-6 & 7-8) Includes: Bodysuit...,0,0,Costumes,Buzz Lightyear Boy's Deluxe Toy Story Costume,SelectM(3T-4T)3T-4TSmall (4-6)Medium (7-8)Medi...,0,0


Check the final number of rows; the number of columns should still be the same.

In [36]:
final_df.shape

(19766, 13)

Check value counts for `name` column to see how many are still unknown.

In [37]:
final_df['name'].value_counts().head()

unknown product name                            1161
Birkenstock Women's Mayari Birko-Flor Sandal       7
MG Low Profile Dyed Cotton Twill Cap               7
ASICS Women's GEL-Noosa Tri 8 Running Shoe         6
Reebok Women's Easytone                            6
Name: name, dtype: int64

Inspect the rows with an unknown product name (the cell below only displays them; nothing is dropped here).

In [38]:
final_df[final_df['name'] == 'unknown product name'].head()

Unnamed: 0,asin,category,color,demographic,department,description,detail_type,details,division,name,size,subcategory,type
5038,B003ZYUD0K,0,unknown color,0,unknown category,unknown details,0,missing,0,unknown product name,missing,0,0
5046,B00400N6WU,0,unknown color,0,unknown category,unknown details,0,missing,0,unknown product name,missing,0,0
5047,B00400N6XE,0,unknown color,0,unknown category,unknown details,0,missing,0,unknown product name,missing,0,0
5048,B00400N6YS,0,unknown color,0,unknown category,unknown details,0,missing,0,unknown product name,missing,0,0
5050,B00400WLYY,0,unknown color,0,unknown category,unknown details,0,missing,0,unknown product name,missing,0,0


Check value counts for `asin` column.

In [39]:
final_df['asin'].value_counts().head(25)

B001A3Y912    2
B009SRN6T8    2
B007NNX43W    2
B002FQKARA    2
B0099JXX5M    2
B003MSO2RY    2
B007VFHOQ0    2
B001IOETPO    2
B007WGGPCC    2
B007ST73DS    2
B00A73AKBY    2
B009AFGZP0    2
B00A86Q2HQ    2
B001QCZ3E4    2
B0086H00Y0    2
B0083ZD34O    2
B001SASX14    2
B001BZ2DQC    2
B002U5CVC8    2
B009RV0PSA    2
B009ZDEXQK    2
B0085E311S    2
B00361E9OS    2
B00A7Y4HSK    2
B0087AY0VK    2
Name: asin, dtype: int64

In [40]:
final_df['asin'].value_counts().tail(25)

B006YFWL0M    1
B00005JHKE    1
B004ARQ5A8    1
B00FXE3154    1
B00BY2ER5W    1
B001HLE570    1
B004SUANZU    1
B005X4TUZY    1
B00028B4U0    1
B000AOZJZM    1
B003XDTOFS    1
B0007QMK2S    1
B006TQLQX4    1
B007IS5ITU    1
B0054403MM    1
B00B3OY2BU    1
B0041YZKYW    1
B00F4WT960    1
B008ELMWMG    1
B005FKVEN2    1
B003VPA2YU    1
B000TR4TI8    1
B00EHJ5FT6    1
B004HVB2V4    1
B00EFD44DC    1
Name: asin, dtype: int64

The value counts show me that some products got scraped twice, so I drop the duplicate rows (rows that are identical in every column).

In [41]:
# Keep the first occurrence of each fully identical row (all columns compared).
final_df = final_df.drop_duplicates()

Check that the row count reflects the drops.

In [42]:
final_df.shape

(16816, 13)

Check the value counts for the `department` column.

In [43]:
final_df['department'].value_counts(normalize=True, dropna=False)

Clothing, Shoes & Jewelry     0.765699
0                             0.114355
unknown category              0.069041
Sports & Outdoors             0.033837
Toys & Games                  0.004579
Health & Household            0.002973
Home & Kitchen                0.001725
Arts, Crafts & Sewing         0.001725
Electronics                   0.001665
Tools & Home Improvement      0.001130
Beauty & Personal Care        0.001070
Automotive                    0.000654
Baby Products                 0.000476
Office Products               0.000357
Cell Phones & Accessories     0.000297
Software                      0.000178
Industrial & Scientific       0.000119
Appliances                    0.000059
Purchase Circles              0.000059
Name: department, dtype: float64

Drop the rows that have unknown category or 0 in `department` column.

In [44]:
# Discard rows whose department is either the scraper's 'unknown category'
# placeholder or the 0 written by the earlier fillna(0).
final_df = final_df.loc[
    ~(
        (final_df['department'] == 'unknown category')
        | (final_df['department'] == 0)
    )
]

Check that the shape reflects dropped rows. 

In [45]:
final_df.shape

(13732, 13)

Save the final DataFrame to a csv.

In [46]:
# final_df.to_csv('data/final_df.csv')