In [1]:
import json
import pandas as pd
import gzip
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances
from bs4 import BeautifulSoup
import nltk

In [2]:
def parse(path):
    """Yield one record (normally a dict) per non-empty line of a gzip file.

    Parameters
    ----------
    path : str or path-like
        Location of the gzip-compressed, line-delimited dump.

    Yields
    ------
    object
        The decoded value of each line.

    Raises
    ------
    ValueError, SyntaxError
        If a line is neither valid JSON nor a plain Python literal.

    Notes
    -----
    Lines are decoded as data only. The previous implementation passed every
    line to ``eval``, which executes whatever expression the file contains.
    """
    import ast  # local import: only needed for the literal fallback below

    # The context manager closes the handle once the generator is exhausted
    # or discarded; the previous version never closed it.
    with gzip.open(path, 'rt', encoding='utf-8') as g:
        for l in g:
            l = l.strip()
            if not l:
                continue  # tolerate blank lines (e.g. a trailing newline)
            try:
                yield json.loads(l)
            except ValueError:
                # Fallback for lines written as Python dict literals
                # (single-quoted keys/strings) instead of strict JSON.
                # literal_eval accepts literals only, never calls or names.
                yield ast.literal_eval(l)

def getDF(path):
    """Collect every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file, so the result
    has one row per record and an integer index starting at 0.
    """
    records = {position: record for position, record in enumerate(parse(path))}
    return pd.DataFrame.from_dict(records, orient='index')

# Load the gzip-compressed review dump into a DataFrame (one row per record).
reviews = getDF('reviews_Clothing_Shoes_and_Jewelry_5.json.gz')

In [3]:
reviews.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1KLRMWW2FWPL4,31887,"Amazon Customer ""cameramom""","[0, 0]",This is a great tutu and at a really great pri...,5.0,Great tutu- not cheaply made,1297468800,"02 12, 2011"
1,A2G5TCU2WDFZ65,31887,Amazon Customer,"[0, 0]",I bought this for my 4 yr old daughter for dan...,5.0,Very Cute!!,1358553600,"01 19, 2013"
2,A1RLQXYNCMWRWN,31887,Carola,"[0, 0]",What can I say... my daughters have it in oran...,5.0,I have buy more than one,1357257600,"01 4, 2013"
3,A8U3FAMSJVHS5,31887,Caromcg,"[0, 0]","We bought several tutus at once, and they are ...",5.0,"Adorable, Sturdy",1398556800,"04 27, 2014"
4,A3GEOILWLK86XM,31887,CJ,"[0, 0]",Thank you Halo Heaven great product for Little...,5.0,Grammy's Angels Love it,1394841600,"03 15, 2014"


In [4]:
reviews.dtypes

reviewerID         object
asin               object
reviewerName       object
helpful            object
reviewText         object
overall           float64
summary            object
unixReviewTime      int64
reviewTime         object
dtype: object

In [5]:
# unixReviewTime is interpreted as seconds since the epoch (unit='s');
# store a datetime64 version next to it.
review_timestamps = pd.to_datetime(reviews['unixReviewTime'], unit='s')
reviews['review_date'] = review_timestamps
reviews.head(3)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,review_date
0,A1KLRMWW2FWPL4,31887,"Amazon Customer ""cameramom""","[0, 0]",This is a great tutu and at a really great pri...,5.0,Great tutu- not cheaply made,1297468800,"02 12, 2011",2011-02-12
1,A2G5TCU2WDFZ65,31887,Amazon Customer,"[0, 0]",I bought this for my 4 yr old daughter for dan...,5.0,Very Cute!!,1358553600,"01 19, 2013",2013-01-19
2,A1RLQXYNCMWRWN,31887,Carola,"[0, 0]",What can I say... my daughters have it in oran...,5.0,I have buy more than one,1357257600,"01 4, 2013",2013-01-04


In [6]:
reviews.dtypes

reviewerID                object
asin                      object
reviewerName              object
helpful                   object
reviewText                object
overall                  float64
summary                   object
unixReviewTime             int64
reviewTime                object
review_date       datetime64[ns]
dtype: object

In [7]:
reviews.isnull().sum()

reviewerID          0
asin                0
reviewerName      452
helpful             0
reviewText          0
overall             0
summary             0
unixReviewTime      0
reviewTime          0
review_date         0
dtype: int64

In [8]:
reviews['reviewerName'].value_counts(dropna=False).head(20)

Amazon Customer    4001
NaN                 452
Kindle Customer     282
Jennifer            254
Sarah               245
Stephanie           243
Chris               219
Karen               193
Mary                191
Lisa                188
Michelle            185
Jessica             174
Jen                 169
Ashley              162
Pen Name            158
Amanda              158
Susan               157
Melissa             153
Kat                 152
Rachel              151
Name: reviewerName, dtype: int64

In [9]:
# Check how many unique reviewers there are in the dataset 
len(set(reviews['reviewerID']))

39387

In [10]:
reviews.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime', 'review_date'],
      dtype='object')

In [11]:
reviews['helpful'].value_counts(normalize=True).head(20)
# Since 68% of the data does not have a helpful score, I do not think this column is useful.

[0, 0]    0.687735
[1, 1]    0.106668
[0, 1]    0.034883
[2, 2]    0.032389
[1, 2]    0.023382
[3, 3]    0.015218
[2, 3]    0.010187
[4, 4]    0.009053
[3, 4]    0.007726
[5, 5]    0.005756
[6, 6]    0.003890
[0, 2]    0.003660
[4, 5]    0.003592
[7, 7]    0.002828
[1, 3]    0.002551
[5, 6]    0.002304
[2, 4]    0.002264
[8, 8]    0.002175
[3, 5]    0.001995
[9, 9]    0.001669
Name: helpful, dtype: float64

In [12]:
# Drop the columns that are not used in the rest of the notebook.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises KeyError.
reviews = reviews.drop(columns=['reviewerName', 'helpful', 'reviewTime'], errors='ignore')

In [13]:
reviews['overall'].value_counts()

5.0    163240
4.0     58357
3.0     30425
2.0     15463
1.0     11192
Name: overall, dtype: int64

In [14]:
reviews['summary'].head()

0    Great tutu-  not cheaply made
1                      Very Cute!!
2         I have buy more than one
3                 Adorable, Sturdy
4          Grammy's Angels Love it
Name: summary, dtype: object

In [15]:
reviews['reviewText'][1]

'I bought this for my 4 yr old daughter for dance class, she wore it today for the first time and the teacher thought it was adorable. I bought this to go with a light blue long sleeve leotard and was happy the colors matched up great. Price was very good too since some of these go for over $15.00 dollars.'

Read in product info csv.

In [16]:
# Read ASINs explicitly as strings so all-digit IDs keep their leading zeros
# (e.g. '0000031887') regardless of type inference, and stay joinable with
# the object-dtype `asin` column of `reviews`.
product_info = pd.read_csv('data/final_df.csv', index_col=[0], dtype={'asin': str})
product_info.head()

Unnamed: 0,asin,category,color,demographic,department,description,detail_type,details,division,name,size,subcategory,type
0,0000031887,Dance,unavailable,Sports & Fitness,Sports & Outdoors,This fits your . Make sure this fitsby ent...,Skirts,0,Other Sports,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,missing,Clothing,Girls
1,0123456479,Jewelry Boxes & Organizers,unavailable,"Shoe, Jewelry & Watch Accessories","Clothing, Shoes & Jewelry",This fits your . Make sure this fitsby ent...,0,0,Jewelry Accessories,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,missing,Jewelry Boxes,0
2,1608299953,0,unavailable,Education & Reference,Software,Access for up to 5 family members Download act...,0,0,Languages,Learn French: Rosetta Stone French - Level 1,missing,0,0
3,1617160377,0,unavailable,Education & Reference,Software,Access for up to 5 family members Download act...,0,0,Languages,Learn Italian: Rosetta Stone Italian - Level 1,missing,0,0
4,B00001W0KA,0,Buzz Lightyear,Dress Up & Pretend Play,Toys & Games,Lead Free Child (4-6 & 7-8) Includes: Bodysuit...,0,0,Costumes,Buzz Lightyear Boy's Deluxe Toy Story Costume,SelectM(3T-4T)3T-4TSmall (4-6)Medium (7-8)Medi...,0,0


Check number of columns and rows.

In [17]:
product_info.shape

(16816, 13)

Check for null rows.

In [18]:
product_info.isnull().sum()

asin           0
category       0
color          0
demographic    0
department     0
description    0
detail_type    0
details        0
division       0
name           0
size           0
subcategory    0
type           0
dtype: int64

Look at column names.

In [19]:
product_info.columns

Index(['asin', 'category', 'color', 'demographic', 'department', 'description',
       'detail_type', 'details', 'division', 'name', 'size', 'subcategory',
       'type'],
      dtype='object')

Check the value counts of each column to explore the data and get an idea of how to bring it into a usable format.

In [20]:
product_info['name'].value_counts().head(10)

unknown product name                                                                                             1161
Simple 6MM Gemstone Round Ball Stud Earrings For Women For Teen 925 Sterling Silver 9 Birthstones More Colors       6
Birkenstock Women's Mayari Birko-Flor Sandal                                                                        5
uiphgjwexzv                                                                                                         5
Timex Ironman Classic 30 Mid-Size Watch                                                                             5
Simple 8MM Gemstone Round Ball Stud Earrings For Women For Teen 925 Sterling Silver 9 Birthstone More Colors        4
Birkenstock Women's Gizeh Thong Sandals                                                                             4
MG Low Profile Dyed Cotton Twill Cap                                                                                4
Hanes Men's Woven Plain-Weave Pajama Set                

In [21]:
# Remove rows whose name is the 'unknown product name' placeholder.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not operate on a slice of the original frame
# (which triggers SettingWithCopyWarning).
product_info = product_info[product_info['name'] != 'unknown product name'].copy()

In [22]:
product_info['department'].value_counts()

Clothing, Shoes & Jewelry     12876
0                              1923
Sports & Outdoors               569
Toys & Games                     77
Health & Household               50
Arts, Crafts & Sewing            29
Home & Kitchen                   29
Electronics                      28
Tools & Home Improvement         19
Beauty & Personal Care           18
Automotive                       11
Baby Products                     8
Office Products                   6
Cell Phones & Accessories         5
Software                          3
Industrial & Scientific           2
Purchase Circles                  1
Appliances                        1
Name: department, dtype: int64

In [23]:
# Turn department labels into identifier-style names: trim surrounding
# spaces, delete commas and ampersands, replace spaces with underscores, then
# replace each double underscore with a single one.
product_info['department'] = (
    product_info['department']
    .str.strip(' ')
    .str.replace(',', '')
    .str.replace('&', '')
    .str.replace(' ', '_')
    .str.replace('__', '_')
)

In [24]:
#Check that it worked 
product_info['department'].value_counts()

Clothing_Shoes_Jewelry     12876
0                           1923
Sports_Outdoors              569
Toys_Games                    77
Health_Household              50
Arts_Crafts_Sewing            29
Home_Kitchen                  29
Electronics                   28
Tools_Home_Improvement        19
Beauty_Personal_Care          18
Automotive                    11
Baby_Products                  8
Office_Products                6
Cell_Phones_Accessories        5
Software                       3
Industrial_Scientific          2
Appliances                     1
Purchase_Circles               1
Name: department, dtype: int64

In [25]:
# One-hot encode the department: one indicator column per department value
# (1 if the product belongs to it, 0 otherwise), with the first level omitted
# via drop_first=True.
department_labels = product_info['department']
departments = pd.get_dummies(department_labels, drop_first=True)
departments.shape

(15655, 17)

In [26]:
product_info['demographic'].value_counts().head(10)

 Women                                 7760
 Men                                   2969
0                                      1923
 Novelty & More                         503
 Luggage & Travel Gear                  355
 Sports & Fitness                       343
 Costumes & Accessories                 271
 Girls                                  224
 Shoe, Jewelry & Watch Accessories      220
 Baby                                   205
Name: demographic, dtype: int64

In [27]:
# Trim surrounding spaces, replace commas and spaces with underscores, delete
# ampersands, then replace each double underscore with a single one.
product_info['demographic'] = (
    product_info['demographic']
    .str.strip(' ')
    .str.replace(',', '_')
    .str.replace('&', '')
    .str.replace(' ', '_')
    .str.replace('__', '_')
)

In [28]:
product_info['demographic'].value_counts().head(10)

Women                             7844
Men                               2997
0                                 1923
Novelty_More                       513
Luggage_Travel_Gear                359
Sports_Fitness                     346
Costumes_Accessories               272
Girls                              240
Shoe_Jewelry_Watch_Accessories     220
Baby                               209
Name: demographic, dtype: int64

In [29]:
# One-hot encode the demographic: one indicator column per demographic value
# (1 if the product belongs to it, 0 otherwise), with the first level omitted
# via drop_first=True.
demographic_labels = product_info['demographic']
demographic = pd.get_dummies(demographic_labels, drop_first=True)
demographic.shape

(15655, 78)

In [30]:
product_info.head()

Unnamed: 0,asin,category,color,demographic,department,description,detail_type,details,division,name,size,subcategory,type
0,0000031887,Dance,unavailable,Sports_Fitness,Sports_Outdoors,This fits your . Make sure this fitsby ent...,Skirts,0,Other Sports,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,missing,Clothing,Girls
1,0123456479,Jewelry Boxes & Organizers,unavailable,Shoe_Jewelry_Watch_Accessories,Clothing_Shoes_Jewelry,This fits your . Make sure this fitsby ent...,0,0,Jewelry Accessories,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,missing,Jewelry Boxes,0
2,1608299953,0,unavailable,Education_Reference,Software,Access for up to 5 family members Download act...,0,0,Languages,Learn French: Rosetta Stone French - Level 1,missing,0,0
3,1617160377,0,unavailable,Education_Reference,Software,Access for up to 5 family members Download act...,0,0,Languages,Learn Italian: Rosetta Stone Italian - Level 1,missing,0,0
4,B00001W0KA,0,Buzz Lightyear,Dress_Up_Pretend_Play,Toys_Games,Lead Free Child (4-6 & 7-8) Includes: Bodysuit...,0,0,Costumes,Buzz Lightyear Boy's Deluxe Toy Story Costume,SelectM(3T-4T)3T-4TSmall (4-6)Medium (7-8)Medi...,0,0


In [31]:
product_info['detail_type'].value_counts().tail(20)

Trunks                       1
Sunglasses & Eyewear         1
Link                         1
Clasp                        1
Skirts & Skorts              1
 Panties ›Briefs             1
Pajama Sets                  1
Sets                         1
Unitards                     1
Polos                        1
Wear to Work                 1
Tanks & Racerbacks           1
Hosiery                      1
 Gloves & Mittens ›Gloves    1
Neckties                     1
Gun Holsters                 1
Boxer Briefs                 1
Tanks Tops                   1
Suspenders                   1
One-Piece Pajamas            1
Name: detail_type, dtype: int64

When checking out the `detail_type` value counts I noticed some additional categories that did not get split off in the first round of EDA so I did it now.

In [32]:
# Split detail types on the "›" separator into separate columns
# (column 0 = first piece, column 1 = second piece) and inspect the values
# found in the second piece.
extra_split = product_info['detail_type'].str.split(pat="›", expand=True)
second_piece = extra_split[1]
second_piece.value_counts()

Baseball Caps          5
Beanies & Knit Hats    2
Berets                 2
Arm Warmers            2
Gloves                 1
Briefs                 1
Name: 1, dtype: int64

In [33]:
# Keep the first piece as the detail type and store the second piece in its
# own column.
product_info['detail_type'] = extra_split[0]
product_info['extra_split'] = extra_split[1]

In [34]:
# Trim surrounding spaces, replace commas and spaces with underscores, delete
# ampersands, then replace each double underscore with a single one.
product_info['detail_type'] = (
    product_info['detail_type']
    .str.strip(' ')
    .str.replace(',', '_')
    .str.replace('&', '')
    .str.replace(' ', '_')
    .str.replace('__', '_')
)

In [35]:
product_info['detail_type'].value_counts().tail(10)

Suspenders           1
Pajama_Sets          1
Unitards             1
Neckties             1
Skirts_Skorts        1
Link                 1
Panties              1
One-Piece_Pajamas    1
Trunks               1
Clasp                1
Name: detail_type, dtype: int64

In [36]:
# Clean the split-off values: here '&' is replaced by '_' (not removed as in
# the other clean-up cells), spaces become '_', and each '__' becomes '_'.
# NOTE(review): ' & ' therefore turns into three underscores and the single
# '__' -> '_' pass leaves two behind (see 'Beanies__Knit_Hats' in the output
# below). Changing this would rename a column that later cells refer to.
product_info['extra_split'] = product_info['extra_split'].str.replace('&', '_').str.replace(' ', '_').str.replace('__', '_')
product_info['extra_split'].value_counts()

Baseball_Caps         5
Berets                2
Beanies__Knit_Hats    2
Arm_Warmers           2
Gloves                1
Briefs                1
Name: extra_split, dtype: int64

In [37]:
product_info['division'].value_counts()

 Clothing                                   4004
 Shoes                                      3946
0                                           2107
 Jewelry                                    1432
 Accessories                                 761
 Watches                                     742
 Uniforms, Work & Safety                     222
 Handbags & Wallets                          182
 Exercise & Fitness                          152
 Outdoor Clothing                            142
 Women                                       130
 Baby Girls                                  121
 Travel Accessories                           91
 Jewelry Accessories                          84
Shoes                                         84
 Baby Boys                                    82
 Surf, Skate & Street                         81
 Kids & Baby                                  77
Clothing                                      74
 Luggage                                      72
 Watch Accessories  

In [38]:
# Trim surrounding spaces, replace commas and spaces with underscores, delete
# ampersands, then replace each double underscore with a single one.
product_info['division'] = (
    product_info['division']
    .str.strip(' ')
    .str.replace(',', '_')
    .str.replace('&', '')
    .str.replace(' ', '_')
    .str.replace('__', '_')
)
product_info['division'].head(10)

0           Other_Sports
1    Jewelry_Accessories
2              Languages
3              Languages
4               Costumes
5              Kids_Baby
6     Travel_Accessories
7     Travel_Accessories
8     Travel_Accessories
9              Kids_Baby
Name: division, dtype: object

In [39]:
# Trim surrounding spaces, replace commas and spaces with underscores, delete
# ampersands, then replace each double underscore with a single one.
product_info['category'] = (
    product_info['category']
    .str.strip(' ')
    .str.replace(',', '_')
    .str.replace('&', '')
    .str.replace(' ', '_')
    .str.replace('__', '_')
)


In [40]:
product_info['category'].value_counts()

0                                 2559
Lingerie_Sleep_Lounge              896
Athletic                           839
Sandals                            754
Wrist_Watches                      704
Boots                              619
Earrings                           464
Clothing                           430
Necklaces                          367
Novelty                            359
Flats                              344
Pumps                              294
Active                             292
Bracelets                          268
Tops_Tees_Blouses                  268
Jeans                              266
Loafers_Slip-Ons                   256
Underwear                          244
Fashion_Sneakers                   232
Dresses                            208
Socks_Hosiery                      189
Slippers                           182
Pants                              179
Shirts                             174
Mules_Clogs                        164
Accessories              

In [41]:
product_info['subcategory'].value_counts().head(10)

0                     6826
 Lingerie              760
 Running               318
 Women                 305
Casual                 273
 Men                   248
Pendants               217
Platforms & Wedges     214
Stud                   194
Flip-Flops             191
Name: subcategory, dtype: int64

In [42]:
# Trim surrounding spaces, replace commas and spaces with underscores, delete
# ampersands, then replace each double underscore with a single one.
product_info['subcategory'] = (
    product_info['subcategory']
    .str.strip(' ')
    .str.replace(',', '_')
    .str.replace('&', '')
    .str.replace(' ', '_')
    .str.replace('__', '_')
)
product_info['subcategory'].value_counts().head(10)

0                   6826
Lingerie             767
Running              454
Women                321
Casual               273
Men                  269
Pendants             217
Platforms_Wedges     214
Stud                 194
Flip-Flops           191
Name: subcategory, dtype: int64

In [43]:
product_info['type'].value_counts().head(15)

0                12917
 Bras              391
Road Running       241
 Panties           179
 Shirts            144
Sport Watches      105
 Shapewear          92
 Tops & Tees        73
 Accessories        72
Trail Running       72
 Charms             68
Pants               58
 Women              53
Robes               50
Tops                42
Name: type, dtype: int64

In [44]:
# Trim surrounding spaces, replace commas and spaces with underscores, delete
# ampersands, then replace each double underscore with a single one.
product_info['type'] = (
    product_info['type']
    .str.strip(' ')
    .str.replace(',', '_')
    .str.replace('&', '')
    .str.replace(' ', '_')
    .str.replace('__', '_')
)
product_info['type'].value_counts().head(10)

0                12917
Bras               399
Road_Running       241
Panties            199
Shirts             147
Sport_Watches      105
Accessories         97
Shapewear           95
Charms              76
Tops_Tees           74
Name: type, dtype: int64

In [45]:
product_info.head()

Unnamed: 0,asin,category,color,demographic,department,description,detail_type,details,division,name,size,subcategory,type,extra_split
0,0000031887,Dance,unavailable,Sports_Fitness,Sports_Outdoors,This fits your . Make sure this fitsby ent...,Skirts,0,Other_Sports,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,missing,Clothing,Girls,
1,0123456479,Jewelry_Boxes_Organizers,unavailable,Shoe_Jewelry_Watch_Accessories,Clothing_Shoes_Jewelry,This fits your . Make sure this fitsby ent...,0,0,Jewelry_Accessories,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,missing,Jewelry_Boxes,0,
2,1608299953,0,unavailable,Education_Reference,Software,Access for up to 5 family members Download act...,0,0,Languages,Learn French: Rosetta Stone French - Level 1,missing,0,0,
3,1617160377,0,unavailable,Education_Reference,Software,Access for up to 5 family members Download act...,0,0,Languages,Learn Italian: Rosetta Stone Italian - Level 1,missing,0,0,
4,B00001W0KA,0,Buzz Lightyear,Dress_Up_Pretend_Play,Toys_Games,Lead Free Child (4-6 & 7-8) Includes: Bodysuit...,0,0,Costumes,Buzz Lightyear Boy's Deluxe Toy Story Costume,SelectM(3T-4T)3T-4TSmall (4-6)Medium (7-8)Medi...,0,0,


In [46]:
# Indicator columns for each detail_type value (first level dropped).
detail_type_labels = product_info['detail_type']
details = pd.get_dummies(detail_type_labels, drop_first=True)
details.head()

Unnamed: 0,Adhesive_Bras,Baby_Dolls,Bandanas,Bead,Belt_Buckles,Belts,Bikinis,Bodysuits,Boxer_Briefs,Boy_Shorts,...,Tanks_Sleeveless_Shirts,Tanks_Tops,Thigh_Slimmers,Tights,Tops,Trunks,Unitards,Waist_Cinchers,Wallets,Wear_to_Work
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
# Indicator columns for each category value (first level dropped).
category_labels = product_info['category']
category = pd.get_dummies(category_labels, drop_first=True)

In [48]:
# Indicator columns for each division value (first level dropped).
division_labels = product_info['division']
division = pd.get_dummies(division_labels, drop_first=True)

In [49]:
# Indicator columns for each subcategory value (first level dropped).
subcategory_labels = product_info['subcategory']
subcategory = pd.get_dummies(subcategory_labels, drop_first=True)

In [50]:
subcategory.head()

Unnamed: 0,Accessories,Accessory_Sets,Active,Active_Base_Layers,Active_Hoodies,Active_Leggings,Active_Pants,Active_Performance,Active_Shirts_Tees,Active_Shorts,...,Wool_Blends,Wool_Pea_Coats,Work,Work_Safety,Work_Utility_Safety,Wrap,Wraps_Pashminas,Wrestling,Wrist_Watches,Y-Necklaces
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
# Rows without a split-off value get the placeholder 0 before encoding;
# drop_first=True then omits the first level.
extra_values = product_info['extra_split'].fillna(0)
extra_dummy = pd.get_dummies(extra_values, drop_first=True)

In [74]:
extra_dummy.shape

(15655, 6)

In [54]:
# Frames to place side by side: the product table first, followed by each
# indicator frame.
dummies = [
    product_info,
    departments,
    demographic,
    details,
    category,
    division,
    subcategory,
    extra_dummy,
]


In [58]:
# Concatenate the product table with its indicator frames column-wise, then
# remove the original label columns.
# NOTE(review): the indicator frames carry no prefix, so a label that occurs
# in more than one source column yields several columns with the same name.
# NOTE(review): 'type' is dropped here although no indicator frame for it is
# in `dummies` — confirm that is intended.
label_columns = [
    'department', 'demographic', 'detail_type',
    'category', 'division', 'subcategory', 'type', 'extra_split',
]
side_by_side = pd.concat(dummies, axis=1)
product_dummies = side_by_side.drop(columns=label_columns)



In [59]:
product_dummies.shape

(15655, 936)

In [63]:
product_dummies.tail()

Unnamed: 0,asin,color,description,details,name,size,Appliances,Arts_Crafts_Sewing,Automotive,Baby_Products,...,Wraps_Pashminas,Wrestling,Wrist_Watches,Y-Necklaces,Arm_Warmers,Baseball_Caps,Beanies__Knit_Hats,Berets,Briefs,Gloves
3781,B00K5T4NHC,0,0,0,Mato & Hash Workout Shirts for Men | Moisture ...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3782,B00KA602SY,0,0,0,PAKULA Women's Silvering Cross Loose Vest Whit...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3783,B00KCWMG5S,0,0,0,Classic Designs Womens Stretch Poplin Cargo Re...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3784,B00KF9180W,0,0,0,"Marlrin Full Face Snorkel Mask, Snorkeling Mas...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3786,B00KKXCJQU,0,0,0,Shacke Pak - 4 Set Packing Cubes - Travel Orga...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
# Attach the product columns to every review. A left join keeps all reviews;
# reviews whose asin has no match in product_dummies get NaN in those columns.
df = reviews.merge(product_dummies, how='left', on='asin')

In [65]:
df.shape

(278677, 942)

In [66]:
#Checking if asin that were not able to be scraped have NaNs in the product info columns
df[df['asin'] == 'B007WA397M']

Unnamed: 0,reviewerID,asin,reviewText,overall,summary,unixReviewTime,review_date,color,description,details,...,Wraps_Pashminas,Wrestling,Wrist_Watches,Y-Necklaces,Arm_Warmers,Baseball_Caps,Beanies__Knit_Hats,Berets,Briefs,Gloves
196515,A1OFEEXTHPAROW,B007WA397M,it's kinda small when i already ordered the ri...,2.0,small,1395705600,2014-03-25,,,,...,,,,,,,,,,
196516,A1C8L5G5NIWQZ8,B007WA397M,I receive wonderful compliments when I wear th...,4.0,Pretty in Pink,1394323200,2014-03-09,,,,...,,,,,,,,,,
196517,A33O1AU7FTFOEV,B007WA397M,The color and shape is exactly the same as you...,4.0,Great.,1384905600,2013-11-20,,,,...,,,,,,,,,,
196518,A1EU9MOBPEHMUG,B007WA397M,I bought this from their website (and not from...,5.0,cute,1353888000,2012-11-26,,,,...,,,,,,,,,,
196519,A3VFQ0XO6TJ8BV,B007WA397M,I placed like 4 orders so far for this seller....,2.0,My last review and order for Allegra K,1350000000,2012-10-12,,,,...,,,,,,,,,,
196520,AO4N0QPLSUHMQ,B007WA397M,I asked for extra small and they sent me a sma...,1.0,extra small and they sent me a small size that...,1381622400,2013-10-13,,,,...,,,,,,,,,,
196521,ATHZKQ8IVYD61,B007WA397M,Allegra K Woman Stand Collar 3/4 Sleeve Flounc...,2.0,Not very good,1369699200,2013-05-28,,,,...,,,,,,,,,,
196522,A1HLW9QWAEAT6,B007WA397M,I got compliments on this all day... This shir...,5.0,I LOVE THIS SHIRT!,1342483200,2012-07-17,,,,...,,,,,,,,,,
196523,AGQ52R6CDMO4R,B007WA397M,I bought 10 items from Allegra K and this was ...,5.0,Favorite shirt from Allegra K,1392681600,2014-02-18,,,,...,,,,,,,,,,
196524,AUPTVTOI6SFKV,B007WA397M,I love this shirt.bits nice enough to wear to ...,5.0,Adorable shirt,1386460800,2013-12-08,,,,...,,,,,,,,,,


In [67]:
df.isnull().mean()

reviewerID                     0.000000
asin                           0.000000
reviewText                     0.000000
overall                        0.000000
summary                        0.000000
unixReviewTime                 0.000000
review_date                    0.000000
color                          0.319086
description                    0.319086
details                        0.319086
name                           0.319086
size                           0.319086
Appliances                     0.319086
Arts_Crafts_Sewing             0.319086
Automotive                     0.319086
Baby_Products                  0.319086
Beauty_Personal_Care           0.319086
Cell_Phones_Accessories        0.319086
Clothing_Shoes_Jewelry         0.319086
Electronics                    0.319086
Health_Household               0.319086
Home_Kitchen                   0.319086
Industrial_Scientific          0.319086
Office_Products                0.319086
Purchase_Circles               0.319086


In [68]:
# Keep only rows without any missing value.
df = df.dropna(axis=0, how='any')

In [69]:
df.isnull().sum().sum()

0

In [70]:
df.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,summary,unixReviewTime,review_date,color,description,details,...,Wraps_Pashminas,Wrestling,Wrist_Watches,Y-Necklaces,Arm_Warmers,Baseball_Caps,Beanies__Knit_Hats,Berets,Briefs,Gloves
0,A1KLRMWW2FWPL4,31887,This is a great tutu and at a really great pri...,5.0,Great tutu- not cheaply made,1297468800,2011-02-12,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A2G5TCU2WDFZ65,31887,I bought this for my 4 yr old daughter for dan...,5.0,Very Cute!!,1358553600,2013-01-19,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A1RLQXYNCMWRWN,31887,What can I say... my daughters have it in oran...,5.0,I have buy more than one,1357257600,2013-01-04,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A8U3FAMSJVHS5,31887,"We bought several tutus at once, and they are ...",5.0,"Adorable, Sturdy",1398556800,2014-04-27,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A3GEOILWLK86XM,31887,Thank you Halo Heaven great product for Little...,5.0,Grammy's Angels Love it,1394841600,2014-03-15,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
df['Beanies__Knit_Hats'].value_counts()

0.0    189742
1.0        13
Name: Beanies__Knit_Hats, dtype: int64

In [71]:
len(df.columns)

942

In [72]:
len(set(df.columns))

824

In [83]:
# Plain list of all column labels, duplicates included.
all_cols = df.columns.tolist()
all_cols

['reviewerID',
 'asin',
 'reviewText',
 'overall',
 'summary',
 'unixReviewTime',
 'review_date',
 'color',
 'description',
 'details',
 'name',
 'size',
 'Appliances',
 'Arts_Crafts_Sewing',
 'Automotive',
 'Baby_Products',
 'Beauty_Personal_Care',
 'Cell_Phones_Accessories',
 'Clothing_Shoes_Jewelry',
 'Electronics',
 'Health_Household',
 'Home_Kitchen',
 'Industrial_Scientific',
 'Office_Products',
 'Purchase_Circles',
 'Software',
 'Sports_Outdoors',
 'Tools_Home_Improvement',
 'Toys_Games',
 'Accessories',
 'Accessories_Supplies',
 'Antivirus_Security',
 'Arts_Crafts',
 'Baby',
 'Baby_Toddler_Toys',
 'Bath',
 'Beading_Jewelry_Making',
 'Bedding',
 'Boys',
 'Camera_Photo',
 'Car_Seats_Accessories',
 'Cases_Holsters_Sleeves',
 'Computers_Accessories',
 'Costumes_Accessories',
 'Crafting',
 'Dolls_Accessories',
 'Dress_Up_Pretend_Play',
 'Education_Reference',
 'Event_Party_Supplies',
 'Exterior_Accessories',
 'Fabric_Decorating',
 'Fan_Shop',
 'Feeding',
 'Foot_Hand_Nail_Care',
 'Ga

In [79]:
# Every column label except its first occurrence, in column order — i.e. the
# surplus copies of duplicated names.
surplus_mask = df.columns.duplicated(keep='first')
list_cols = df.columns[surplus_mask].tolist()

In [80]:
len(list_cols)

118

In [81]:
list_cols

['Accessories',
 'Baby',
 'Belts',
 'Boys',
 'Girls',
 'Gloves_Mittens',
 'Hats_Caps',
 'Leggings',
 'Men',
 'Pants',
 'Scarves',
 'Shirts',
 'Shorts',
 'Skirts',
 'Skirts_Skorts',
 'Socks',
 'Suspenders',
 'T-Shirts',
 'Women',
 'Accessories',
 'Backpacks',
 'Clothing',
 'Footwear',
 'Handbags_Wallets',
 'Men',
 'Shoes',
 'Smartwatches',
 'Tools_Accessories',
 'Uniforms_Work_Safety',
 'Watches',
 'Wigs',
 'Women',
 'Accessories',
 'Active',
 'Athletic',
 'Baby_Boys',
 'Baby_Girls',
 'Backpacks',
 'Basketball',
 'Bikinis',
 'Bodysuits',
 'Boots',
 'Boxer_Briefs',
 'Boys',
 'Briefs',
 'Button-Down_Shirts',
 'Clogs_Mules',
 'Clothing',
 'Clothing_Sets',
 'Costumes',
 'Cycling',
 'Dance',
 'Dresses',
 'Earmuffs',
 'Eyewear',
 'Fashion_Hoodies_Sweatshirts',
 'Fashion_Sneakers',
 'Flats',
 'Food_Service',
 'Girls',
 'Gloves_Mittens',
 'Golf',
 'Hats',
 'Hats_Caps',
 'Hosiery',
 'Hunting',
 'Jackets_Coats',
 'Jeans',
 'Link',
 'Masks',
 'Medical',
 'Men',
 'Military',
 'Mules_Clogs',
 'Neckt

In [86]:
# For each column name that occurs more than once, collapse its copies into a
# single Series holding the row-wise maximum across those copies.
# Iterating over the *unique* duplicated names processes each name exactly
# once (the previous loop ran once per surplus copy, repeating the same work)
# and removes the dependency on lists built in earlier cells.
duplicated_names = df.columns[df.columns.duplicated()].unique()
keep_list = [df[col].max(axis=1).rename(col) for col in duplicated_names]
    

118

In [None]:
# 'Backpacks' is one of the duplicated labels, so df['Backpacks'] selects
# several columns; store their row-wise maximum as one combined column.
df['new_backpacks'] = df['Backpacks'].max(axis=1)

In [85]:
df.columns.duplicated().sum()

118

In [None]:
# Remove the column(s) labelled 'Backpacks' from df in place.
# NOTE(review): with the default errors='raise', re-running this cell after
# the label is gone raises KeyError.
df.drop(columns='Backpacks', inplace=True)

In [None]:
df['new_backpacks']

In [None]:
df['Backpacks']