## Feature Engineering with NLTK  
_By: Rachel Koenig_ 
____

Imports 

In [113]:
#Import pandas 
import pandas as pd
#Import Natual Language Toolkit
import nltk
#Import Beautiful Soup
from bs4 import BeautifulSoup   
#Import string for list of punctuation
import string
# Import the stop word list
from nltk.corpus import stopwords 
# Import Tokenizer
from nltk.tokenize import RegexpTokenizer
#Import Lemmatizer
from nltk.stem import WordNetLemmatizer
# Import stemmer.
from nltk.stem.porter import PorterStemmer

Read in csv.

In [114]:
df = pd.read_csv('data/reviews_and_products.csv', index_col=[0], low_memory=False)

In [115]:
#check first 5 rows 
df.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,summary,unixReviewTime,review_date,color,description,details,...,Tops_Tees,Trunks,Umbrellas,Underwear,Wallets,Wear_to_Work,Wrist_Watches,Arm_Warmers,Baseball_Caps,Berets
0,A1KLRMWW2FWPL4,31887,This is a great tutu and at a really great pri...,5.0,Great tutu- not cheaply made,1297468800,2011-02-12,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A2G5TCU2WDFZ65,31887,I bought this for my 4 yr old daughter for dan...,5.0,Very Cute!!,1358553600,2013-01-19,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A1RLQXYNCMWRWN,31887,What can I say... my daughters have it in oran...,5.0,I have buy more than one,1357257600,2013-01-04,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A8U3FAMSJVHS5,31887,"We bought several tutus at once, and they are ...",5.0,"Adorable, Sturdy",1398556800,2014-04-27,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A3GEOILWLK86XM,31887,Thank you Halo Heaven great product for Little...,5.0,Grammy's Angels Love it,1394841600,2014-03-15,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [116]:
# Check data types 
df.dtypes.head(15)

reviewerID             object
asin                   object
reviewText             object
overall               float64
summary                object
unixReviewTime          int64
review_date            object
color                  object
description            object
details                object
name                   object
size                   object
Arts_Crafts_Sewing    float64
Automotive            float64
Baby_Products         float64
dtype: object

In [117]:
#Check column names 
df.columns

Index(['reviewerID', 'asin', 'reviewText', 'overall', 'summary',
       'unixReviewTime', 'review_date', 'color', 'description', 'details',
       ...
       'Tops_Tees', 'Trunks', 'Umbrellas', 'Underwear', 'Wallets',
       'Wear_to_Work', 'Wrist_Watches', 'Arm_Warmers', 'Baseball_Caps',
       'Berets'],
      dtype='object', length=823)

Check size of DataFrame: 168,995 rows, of different products and reviewers.

In [118]:
df.shape

(168995, 823)

In [119]:
#Check for nulls 
df.isnull().sum().head(25)

reviewerID                  0
asin                        0
reviewText                 16
overall                     0
summary                     1
unixReviewTime              0
review_date                 0
color                       0
description                 0
details                     0
name                        0
size                        0
Arts_Crafts_Sewing          0
Automotive                  0
Baby_Products               0
Beauty_Personal_Care        0
Cell_Phones_Accessories     0
Clothing_Shoes_Jewelry      0
Electronics                 0
Health_Household            0
Home_Kitchen                0
Industrial_Scientific       0
Office_Products             0
Purchase_Circles            0
Software                    0
dtype: int64

Fill nulls with the string `'none'`

In [120]:
df.fillna('none', inplace=True)

In [121]:
df.isnull().sum().sum()

0

Create a new DataFrame called `users` that includes all features except `reviewText`, `summary`, `unixReviewTime`, `review_date`, `description`, `details`, and `size` 

In [122]:
users = df.drop(columns=['reviewText', 'summary', 'unixReviewTime', 
                 'review_date', 'description', 'details', 'size'])
users.head()  

Unnamed: 0,reviewerID,asin,overall,color,name,Arts_Crafts_Sewing,Automotive,Baby_Products,Beauty_Personal_Care,Cell_Phones_Accessories,...,Tops_Tees,Trunks,Umbrellas,Underwear,Wallets,Wear_to_Work,Wrist_Watches,Arm_Warmers,Baseball_Caps,Berets
0,A1KLRMWW2FWPL4,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A2G5TCU2WDFZ65,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A1RLQXYNCMWRWN,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A8U3FAMSJVHS5,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A3GEOILWLK86XM,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Group the rest of the columns based on `reviewerID` column using the groupby function. 

In [123]:
users_cats = users.groupby(['reviewerID']).sum().iloc[:, 2:]  # Sum up all categories that a user has purchased 
users_cats.head(3)

Unnamed: 0_level_0,Automotive,Baby_Products,Beauty_Personal_Care,Cell_Phones_Accessories,Clothing_Shoes_Jewelry,Electronics,Health_Household,Home_Kitchen,Industrial_Scientific,Office_Products,...,Tops_Tees,Trunks,Umbrellas,Underwear,Wallets,Wear_to_Work,Wrist_Watches,Arm_Warmers,Baseball_Caps,Berets
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A001114613O3F18Q5NVR6,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00146182PNM90WNNAZ5Q,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00338282E99B8OR2JYTZ,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Check the output for one reviewerID to make sure there are some values higher than one, proving the above function worked.

In [124]:
user1 = users_cats.loc['A001114613O3F18Q5NVR6'].to_dict() 
{k:v for k,v in user1.items() if v != 0}

{'Clothing_Shoes_Jewelry': 4.0,
 'Sports_Outdoors': 1.0,
 'Novelty_More': 1.0,
 'Sports_Fitness': 1.0,
 'Novelty': 1.0,
 'Exercise_Fitness': 1.0,
 'Casual': 1.0,
 'Leggings': 1.0,
 'Men': 2.0,
 'Shorts': 1.0,
 'Skirts': 1.0,
 'T-Shirts': 1.0,
 'Women': 2.0,
 'Clothing': 4.0,
 'Shoes': 1.0,
 'Fashion_Sneakers': 1.0,
 'Running': 1.0}

In [125]:
num_reviews = users.groupby(['reviewerID']).count()['asin']  # Count up how many asins are related to each user.

In [126]:
users['number_of_reviews'] = users['reviewerID'].map(num_reviews)  # Create a new column for each user's total number of reviews
users['number_of_reviews'].head()

0    3
1    6
2    5
3    7
4    4
Name: number_of_reviews, dtype: int64

In [127]:
user_stars = users.groupby(['reviewerID']).mean().round(2)['overall'] # Find the average number of stars each user rates

users['average_stars_given'] = users['reviewerID'].map(user_stars)  # Create a new column for the users' average rating
users['average_stars_given'].head()

0    5.00
1    4.17
2    4.20
3    4.00
4    4.50
Name: average_stars_given, dtype: float64

In [128]:
users.head()

Unnamed: 0,reviewerID,asin,overall,color,name,Arts_Crafts_Sewing,Automotive,Baby_Products,Beauty_Personal_Care,Cell_Phones_Accessories,...,Umbrellas,Underwear,Wallets,Wear_to_Work,Wrist_Watches,Arm_Warmers,Baseball_Caps,Berets,number_of_reviews,average_stars_given
0,A1KLRMWW2FWPL4,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,5.0
1,A2G5TCU2WDFZ65,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,4.17
2,A1RLQXYNCMWRWN,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,4.2
3,A8U3FAMSJVHS5,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,4.0
4,A3GEOILWLK86XM,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,4.5


In [129]:
users['asin'] = users['asin'] + " "  # Add a space to the end of each asin string
all_asins = dict(users.groupby('reviewerID')['asin'].sum())  # add all the asin into one row 
users['asins'] = users['reviewerID'].map(all_asins) # create a new column for all asins to be grouped by reviewer 

In [130]:
users['color'] = users['color'] + " "  # Add a space to the end of each color string
all_colors = dict(users.groupby('reviewerID')['color'].sum()) # add all colors together in one row per user 
users['colors'] = users['reviewerID'].map(all_colors)  # create a new column for all colors to be grouped by reviewer 

In [131]:
users['name'] = users['name'] + " "  # Add a space to the end of each asin string
all_names = users.groupby('reviewerID')['name'].sum()  # add all names together in one row per user 
users['names'] = users['reviewerID'].map(all_names) # create a new column for all colors to be grouped by reviewer

In [132]:
users.head(3)

Unnamed: 0,reviewerID,asin,overall,color,name,Arts_Crafts_Sewing,Automotive,Baby_Products,Beauty_Personal_Care,Cell_Phones_Accessories,...,Wear_to_Work,Wrist_Watches,Arm_Warmers,Baseball_Caps,Berets,number_of_reviews,average_stars_given,asins,colors,names
0,A1KLRMWW2FWPL4,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3,5.0,0000031887 B000FH4JJQ B009H6NPBE,unavailable Black unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Gre...
1,A2G5TCU2WDFZ65,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,6,4.17,0000031887 B0019K9WDQ B005JJ2762 B005OZ9LB0 B0...,unavailable unavailable Black Black unavailabl...,Mystiqueshapes Girls Ballet Tutu Neon Lime Gre...
2,A1RLQXYNCMWRWN,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5,4.2,0000031887 B0007YVP1W B000LSWXWO B001GR05W4 B0...,unavailable Inspector missing missing missing,Mystiqueshapes Girls Ballet Tutu Neon Lime Gre...


Save to a csv to cluster in a separate notebook.

In [133]:
# pd.merge((users[['reviewerID', 'names', 'number_of_reviews', 
#                  'average_stars_given', 'asins', 'colors']].set_index('reviewerID')),  #merge newly engineered columns into one new DataFrame
#          users_cats, 
#          right_index=True,
#          left_index=True).drop_duplicates().to_csv('data/user_clustering.csv') # Drop duplicate rows 
         
         
         

Add all columns with text together based on `asin` into new columns.

In [134]:
df['summary'] = df['summary'] + " "  # add a space to the end of the summary string
df['summary'].head()

0    Great tutu-  not cheaply made 
1                      Very Cute!! 
2         I have buy more than one 
3                 Adorable, Sturdy 
4          Grammy's Angels Love it 
Name: summary, dtype: object

In [135]:
summary = dict(df.groupby('asin')['summary'].sum()) # add all review summaries together in one row per product 
df['one_sum'] = df['asin'].map(summary)  # create a new column for all the summaries 
df['one_sum'].head()

0    Great tutu-  not cheaply made Very Cute!! I ha...
1    Great tutu-  not cheaply made Very Cute!! I ha...
2    Great tutu-  not cheaply made Very Cute!! I ha...
3    Great tutu-  not cheaply made Very Cute!! I ha...
4    Great tutu-  not cheaply made Very Cute!! I ha...
Name: one_sum, dtype: object

In [136]:
df['reviewText'] = df['reviewText'] + " "    # add a space to the end of the reviewtext string
reviewText = dict(df.groupby('asin')['reviewText'].sum())   # add all review summaries together in one row per product
df['all_reviews'] = df['asin'].map(reviewText) # create a new column for all the review text 
df['all_reviews'].head()

0    This is a great tutu and at a really great pri...
1    This is a great tutu and at a really great pri...
2    This is a great tutu and at a really great pri...
3    This is a great tutu and at a really great pri...
4    This is a great tutu and at a really great pri...
Name: all_reviews, dtype: object

Create a new column for the average rating out of 5 stars for each `asin`.

In [137]:
overall_mean = dict(df.groupby(['asin']).mean().round(2)['overall'])
df['overall_mean'] = df['asin'].map(overall_mean)
df['overall_mean'].tail()

278637    4.6
278638    4.6
278639    4.6
278640    4.6
278641    4.6
Name: overall_mean, dtype: float64

Now that matching `asin` rows have the same values for summary and reviews and average overall score, we can drop all th duplicate asin rows.

In [138]:
df.drop_duplicates(subset='asin', inplace=True)

In [139]:
# Check shape to confirm rows were dropped 
df.shape

(13732, 826)

In [140]:
# Change types to string so that all rows are the same (some were integers before)
df['details'] = df['details'].astype(str)
df['description'] = df['description'].astype(str)

In [141]:
df['details'].value_counts()

0                                                                                                                                                                                                                                                                                                    8132
missing                                                                                                                                                                                                                                                                                              3869
Metal Factory,316L,Stainless steel,Cubic zirconia,channel,3 millimeters,Yes,Yes,Round brilliant,colorless,FL,ideal-cut,Simulated,Not treated                                                                                                                                                            6
BodySparkle Body Jewelry,Not stamped,Stainless steel,not-applicable,NA,NA                                 

Create a feature interaction column with all the text added together. 

In [142]:
df['all_text'] = df['all_reviews'] + " " + df['one_sum'] + " " + df['description']
df['all_text'].head()

0     This is a great tutu and at a really great pri...
23    The minute I saw this my heart skipped a beat....
29    Since learning a language from software is not...
42    If you really need to learn a new language, an...
58    We got this costume for my son to wear to Disn...
Name: all_text, dtype: object

Check the data types.

In [143]:
df.dtypes.head(15)

reviewerID             object
asin                   object
reviewText             object
overall               float64
summary                object
unixReviewTime          int64
review_date            object
color                  object
description            object
details                object
name                   object
size                   object
Arts_Crafts_Sewing    float64
Automotive            float64
Baby_Products         float64
dtype: object

Check value counts of size column 

In [144]:
df['size'].value_counts() 

missing                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

After reviewing size, I think it is not going to be helpful so I'll drop it.

In [145]:
df.drop(columns='size', inplace=True)

In [146]:
# Check remaining column names 
df.columns

Index(['reviewerID', 'asin', 'reviewText', 'overall', 'summary',
       'unixReviewTime', 'review_date', 'color', 'description', 'details',
       ...
       'Wallets', 'Wear_to_Work', 'Wrist_Watches', 'Arm_Warmers',
       'Baseball_Caps', 'Berets', 'one_sum', 'all_reviews', 'overall_mean',
       'all_text'],
      dtype='object', length=826)

In [147]:
# Check the shape after dropped column 
df.shape

(13732, 826)

In [148]:
df.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,summary,unixReviewTime,review_date,color,description,details,...,Wallets,Wear_to_Work,Wrist_Watches,Arm_Warmers,Baseball_Caps,Berets,one_sum,all_reviews,overall_mean,all_text
0,A1KLRMWW2FWPL4,0000031887,This is a great tutu and at a really great pri...,5.0,Great tutu- not cheaply made,1297468800,2011-02-12,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,Great tutu- not cheaply made Very Cute!! I ha...,This is a great tutu and at a really great pri...,4.61,This is a great tutu and at a really great pri...
23,A2WNN1DQVL4LH5,0123456479,The minute I saw this my heart skipped a beat....,5.0,Breathtaking 5 Stars,1383782400,2013-11-07,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,Breathtaking 5 Stars VERY NICE Amazing so many...,The minute I saw this my heart skipped a beat....,4.17,The minute I saw this my heart skipped a beat....
29,A1F7YU6O5RU432,1608299953,Since learning a language from software is not...,5.0,"Suggest getting this 1 level at a time, so you...",1365465600,2013-04-09,unavailable,Access for up to 5 family members Download act...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,"Suggest getting this 1 level at a time, so you...",Since learning a language from software is not...,4.31,Since learning a language from software is not...
42,A31ICLWQ9CSHRS,1617160377,"If you really need to learn a new language, an...",5.0,A real improvement over the last version!,1305936000,2011-05-21,unavailable,Access for up to 5 family members Download act...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,A real improvement over the last version! Nice...,"If you really need to learn a new language, an...",4.62,"If you really need to learn a new language, an..."
58,A1059SSXUZZS1S,B00001W0KA,We got this costume for my son to wear to Disn...,4.0,"Great Costume, Awkward To Put On",1383696000,2013-11-06,Buzz Lightyear,Lead Free Child (4-6 & 7-8) Includes: Bodysuit...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,"Great Costume, Awkward To Put On Perfectly fit...",We got this costume for my son to wear to Disn...,4.62,We got this costume for my son to wear to Disn...


Save changes to a csv called `category_only` and comment out the cell so it doesn't ever get overwritten on accident.

In [149]:
# df.drop(columns=['reviewerID','reviewText', 'overall', 'summary', 'unixReviewTime', 'review_date', 'color', 'description', 'details', 'all_text', 'one_sum', 'all_reviews']).to_csv('data/category_only.csv')


Start a new DataFrame with just the text columns.

In [150]:
df_text = df[['asin', 'name', 'color', 'details', 'description', 'all_text', 'one_sum', 'all_reviews']]

In [151]:
#Check the first 25 rows 
df_text.head(25)

Unnamed: 0,asin,name,color,details,description,all_text,one_sum,all_reviews
0,0000031887,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,unavailable,0,This fits your . Make sure this fitsby ent...,This is a great tutu and at a really great pri...,Great tutu- not cheaply made Very Cute!! I ha...,This is a great tutu and at a really great pri...
23,0123456479,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,unavailable,0,This fits your . Make sure this fitsby ent...,The minute I saw this my heart skipped a beat....,Breathtaking 5 Stars VERY NICE Amazing so many...,The minute I saw this my heart skipped a beat....
29,1608299953,Learn French: Rosetta Stone French - Level 1,unavailable,0,Access for up to 5 family members Download act...,Since learning a language from software is not...,"Suggest getting this 1 level at a time, so you...",Since learning a language from software is not...
42,1617160377,Learn Italian: Rosetta Stone Italian - Level 1,unavailable,0,Access for up to 5 family members Download act...,"If you really need to learn a new language, an...",A real improvement over the last version! Nice...,"If you really need to learn a new language, an..."
58,B00001W0KA,Buzz Lightyear Boy's Deluxe Toy Story Costume,Buzz Lightyear,0,Lead Free Child (4-6 & 7-8) Includes: Bodysuit...,We got this costume for my son to wear to Disn...,"Great Costume, Awkward To Put On Perfectly fit...",We got this costume for my son to wear to Disn...
66,B00001WRHJ,Woody Deluxe Child - Size: Child S(4-6),unavailable,0,"5.5"" high 12"" wide Quality materials used to m...",My grandson has been a costume lover for the p...,Birthday gift 5year old Great for costume-lovi...,My grandson has been a costume lover for the p...
72,B00004SR8W,Lewis N. Clark Stash,Neck Stash,0,"Nylon Nylon lining Zipper closure 25.5"" should...",better quality than I expected for the price -...,Get this one it works. A bit too large for me ...,better quality than I expected for the price -...
84,B00004SR8Z,"Lewis N. Clark Deluxe Neck Stash, Beige",Beige,0,This fits your . Make sure this fitsby ent...,This pouch is lightweight and comfortable to w...,Would like it to be a bit longer Small but fun...,This pouch is lightweight and comfortable to w...
106,B00004SR9P,"Lewis N. Clark Add-A-Bag Travel Luggage Strap,...",Black,0,This fits your . Make sure this fitsby ent...,It's too short to be particularly useful. If i...,too short It Works! great product. Made handl...,It's too short to be particularly useful. If i...
122,B00004U1J2,Buzz Lightyear Jet Pack,One Size Child,0,"Polyester Imported 16"" high 12"" wide Quality m...",The Inflatable buzz lightyear costume jet pack...,"good Buzz Lightyear Jet Pack To Infinity, And ...",The Inflatable buzz lightyear costume jet pack...


Compare what one cell of the new columns look like.

In [152]:
df_text['all_text'][0]

'This is a great tutu and at a really great price. It doesn\'t look cheap at all. I\'m so glad I looked on Amazon and found such an affordable tutu that isn\'t made poorly. A++ I bought this for my 4 yr old daughter for dance class, she wore it today for the first time and the teacher thought it was adorable. I bought this to go with a light blue long sleeve leotard and was happy the colors matched up great. Price was very good too since some of these go for over $15.00 dollars. What can I say... my daughters have it in orange, black, white and pink and I am thinking to buy for they the fuccia one. It is a very good way for exalt a dancer outfit: great colors, comfortable, looks great, easy to wear, durables and little girls love it. I think it is a great buy for costumer and play too. We bought several tutus at once, and they are got high reviews. Sturdy and seemingly well-made. The girls have been wearing them regularly, including out to play, and the tutus have stood up well. Fits t

In [153]:
df_text['one_sum'][0]

"Great tutu-  not cheaply made Very Cute!! I have buy more than one Adorable, Sturdy Grammy's Angels Love it It's ok Great for dress-up and for ballet practice Great value Good WOW !! ..is all I have to say! Wonderful and great shipping. Excellent quality! Wonderful tutu! Great Tutu Great tutu for little girls! Came apart in 2weeks! So cute! Never GOT IT.... Nice skirt must have for a fairy princess My daughter loved it Sassy! Practically Perfect in every way! "

In [154]:
df['description'][0]

'This fits your\xa0.     Make sure this fitsby entering your model number.    P.when("ReplacementPartsBulletLoader").execute(function(module){ module.initializeDPX(); }) Lime green Fit girls 2-10 years old Puffy skirt great for dance Trademarked By Mystiqueshapes'

Instantiate Tokenizer 

In [155]:
tokenizer = RegexpTokenizer(r'\w+')

In [156]:
# Tokenize the color column for products df and make all words lowercase 
df_text['color'] = df_text['color'].apply(lambda x: tokenizer.tokenize(x.lower()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [157]:
# Check first few rows 
df_text['color'].head()

0         [unavailable]
23        [unavailable]
29        [unavailable]
42        [unavailable]
58    [buzz, lightyear]
Name: color, dtype: object

In [159]:
# Apply tokenize funtion to colors column for user df 
users['colors'] = users['colors'].apply(lambda x: tokenizer.tokenize(x.lower()))

In [160]:
users['colors'].head()

0                    [unavailable, black, unavailable]
1    [unavailable, unavailable, black, black, unava...
2    [unavailable, inspector, missing, missing, mis...
3    [unavailable, ballet, pink, missing, standard,...
4           [unavailable, white, missing, unavailable]
Name: colors, dtype: object

In [161]:
# Tokenize the all_text column and make all lowercase 
df_text['all_text'] = df_text['all_text'].apply(lambda x: tokenizer.tokenize(x.lower()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [162]:
# Check one cell 
df_text['all_text'][0]

['this',
 'is',
 'a',
 'great',
 'tutu',
 'and',
 'at',
 'a',
 'really',
 'great',
 'price',
 'it',
 'doesn',
 't',
 'look',
 'cheap',
 'at',
 'all',
 'i',
 'm',
 'so',
 'glad',
 'i',
 'looked',
 'on',
 'amazon',
 'and',
 'found',
 'such',
 'an',
 'affordable',
 'tutu',
 'that',
 'isn',
 't',
 'made',
 'poorly',
 'a',
 'i',
 'bought',
 'this',
 'for',
 'my',
 '4',
 'yr',
 'old',
 'daughter',
 'for',
 'dance',
 'class',
 'she',
 'wore',
 'it',
 'today',
 'for',
 'the',
 'first',
 'time',
 'and',
 'the',
 'teacher',
 'thought',
 'it',
 'was',
 'adorable',
 'i',
 'bought',
 'this',
 'to',
 'go',
 'with',
 'a',
 'light',
 'blue',
 'long',
 'sleeve',
 'leotard',
 'and',
 'was',
 'happy',
 'the',
 'colors',
 'matched',
 'up',
 'great',
 'price',
 'was',
 'very',
 'good',
 'too',
 'since',
 'some',
 'of',
 'these',
 'go',
 'for',
 'over',
 '15',
 '00',
 'dollars',
 'what',
 'can',
 'i',
 'say',
 'my',
 'daughters',
 'have',
 'it',
 'in',
 'orange',
 'black',
 'white',
 'and',
 'pink',
 'and',

In [163]:
# Check value counts of name column
df_text['name'].value_counts().head(10)

Simple 6MM Gemstone Round Ball Stud Earrings For Women For Teen 925 Sterling Silver 9 Birthstones More Colors    6
Timex Ironman Classic 30 Mid-Size Watch                                                                          5
Birkenstock Women's Mayari Birko-Flor Sandal                                                                     5
Allegra K Women's Off Shoulder Self Tie Bowknot Colorblock Casual Tunic Shirt                                    4
Timex Women's Indiglo Easy Reader Quartz Analog Leather Strap Watch with Date Feature                            4
Wrangler Men's Riggs Workwear Carpenter Jean                                                                     4
Birkenstock Women's Gizeh Thong Sandals                                                                          4
Hanes Men's Woven Plain-Weave Pajama Set                                                                         4
Simple 8MM Gemstone Round Ball Stud Earrings For Women For Teen 925 Sterling Sil

In [164]:
# Tokenize the name column and give it a new column, do not override it 
df_text['name_split'] = df_text['name'].apply(lambda x: tokenizer.tokenize(x.lower()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [165]:
# Compare the tokenized vs untokenized cells 
print(df_text['name'][0])

print(df_text['name_split'][0])

Mystiqueshapes Girls Ballet Tutu Neon Lime Green
['mystiqueshapes', 'girls', 'ballet', 'tutu', 'neon', 'lime', 'green']


In [166]:
# Tokenize name column for reviewers 
users['names'] = users['names'].apply(lambda x: tokenizer.tokenize(x.lower()))

In [167]:
# Define a funtion to remove stop words 
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

In [168]:
# Apply the stop words function to the color column
df_text['color'] = df_text['color'].apply(lambda x : remove_stopwords(x))
df_text['color'].value_counts().head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


[missing]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       3135
[unavailable]                                                                                                                                                                                                                                                                                                                      

In [169]:
df_text['name_split'] = df_text['name_split'].apply(lambda x : remove_stopwords(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Instantiate lemmatizer

In [170]:
lemmatizer = WordNetLemmatizer()

# Create a function for lemmatizing 
def word_lemmatizer(text):
    lem_text = " ".join([lemmatizer.lemmatize(i) for i in text])
    return lem_text

In [171]:
# Apply lemmatizer to name_split column.
df_text['name_split'] = df_text['name_split'].apply(lambda x : word_lemmatizer(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [172]:
users['names'] = users['names'].apply(word_lemmatizer)

In [173]:
# Check column names 
df_text.columns

Index(['asin', 'name', 'color', 'details', 'description', 'all_text',
       'one_sum', 'all_reviews', 'name_split'],
      dtype='object')

Read in color csv from [dataworld](https://data.world/dilumr/color-names/workspace/file?filename=wikipedia_color_names.csv)

In [75]:
colors = pd.read_csv('data/wikipedia_color_names.csv')

In [76]:
colors.head() # Check first 5 rows 

Unnamed: 0,Name,Hex (24 bit),Red (8 bit),Green (8 bit),Blue (8 bit),Hue (degrees),HSL.S (%),"HSL.L (%), HSV.S (%), HSV.V (%)"
0,Absolute zero,#0048BA,0,72,186,217.0,100.0,37.0
1,Acid green,#B0BF1A,176,191,26,65.0,76.0,43.0
2,Aero,#7CB9E8,124,185,232,206.0,70.0,70.0
3,Aero blue,#C9FFE5,201,255,229,151.0,100.0,89.0
4,African violet,#B284BE,178,132,190,288.0,31.0,63.0


Turn the colors in the `Name` column into a list and make them all lowercase.

In [77]:
colors_list = list(colors['Name'])
[x.lower() for x in colors_list]

['absolute zero',
 'acid green',
 'aero',
 'aero blue',
 'african violet',
 'air force blue (raf)',
 'air force blue (usaf)',
 'air superiority blue',
 'alabama crimson',
 'alabaster',
 'alice blue',
 'alien armpit',
 'alizarin crimson',
 'alloy orange',
 'almond',
 'amaranth',
 'amaranth deep purple',
 'amaranth pink',
 'amaranth purple',
 'amaranth red',
 'amazon',
 'amazonite',
 'amber',
 'amber (sae/ece)',
 'american rose',
 'amethyst',
 'android green',
 'anti-flash white',
 'antique brass',
 'antique bronze',
 'antique fuchsia',
 'antique ruby',
 'antique white',
 'ao (english)',
 'apple green',
 'apricot',
 'aqua',
 'aquamarine',
 'arctic lime',
 'army green',
 'arsenic',
 'artichoke',
 'arylide yellow',
 'ash grey',
 'asparagus',
 'atomic tangerine',
 'auburn',
 'aureolin',
 'aurometalsaurus',
 'avocado',
 'awesome',
 'aztec gold',
 'azure',
 'azure (web color)',
 'azure mist',
 'azureish white',
 'baby blue',
 'baby blue eyes',
 'baby pink',
 'baby powder',
 'baker-miller pink

In [174]:
# check the first 25 rows of the color column
df_text['color'].head(25)

0                                          [unavailable]
23                                         [unavailable]
29                                         [unavailable]
42                                         [unavailable]
58                                     [buzz, lightyear]
66                                         [unavailable]
72                                         [neck, stash]
84                                               [beige]
106                                              [black]
122                                   [one, size, child]
135                                        [unavailable]
143                                        [unavailable]
149                                              [shown]
154                                            [1, pack]
159                  [blackfrom24, sellersfrom1, seller]
214                                        [unavailable]
219                                        [unavailable]
225    [4, year, watch, protect

In [175]:
def colors_only(words):
    colors = []
    for i in [x.lower() for x in colors_list]:
        if i in words:
            colors.append(i)
    return colors

In [176]:
# Test the function on a string 
colors_only('the sky blue table is flat')

['blue', 'sky blue']

In [177]:
# Apply the colors only function to the name column of the product df 
df_text['name_color'] = df_text['name_split'].apply(colors_only)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [178]:
# Apply the colors only function to the name column of the users df 
users['names_colors'] = users['names'].apply(colors_only)

In [179]:
# Lemmatize product color column and join list of words back together 
df_text['color'] = df_text['color'].apply(lambda x: word_lemmatizer(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [180]:
# Lemmatize users color column and join list of words back together 
users['colors'] = users['colors'].apply(word_lemmatizer)

In [181]:
df_text['color'] = df_text['color'].apply(colors_only)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [184]:
users['colors'] = users['colors'].apply(colors_only)

In [186]:
df_text[['color', 'name_color']].head(25)

Unnamed: 0,color,name_color
0,[],[lime green]
23,[],[pink]
29,[],"[french rose, rose]"
42,[],[rose]
58,[],[]
66,[],[]
72,[],[]
84,[beige],[beige]
106,[black],[black]
122,[],[jet]


In [187]:
df_text['colors_only'] = df_text['color'] + df_text['name_color']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [194]:
users['colors_only'] = users['colors']+ users['names_colors']

In [None]:
# df_text[['asin', 'name', 'name_split']].to_csv('data/names_to_vectorize.csv')

In [188]:
df_text['colors_only'][0]

['lime green']

In [195]:
# Generate list of colors.
list_of_colors = {color for product in df_text['colors_only'] for color in product}

In [196]:
list_of_colors

{'acid green',
 'aero',
 'african violet',
 'alabaster',
 'alice blue',
 'almond',
 'amaranth',
 'amazon',
 'amber',
 'amethyst',
 'antique brass',
 'antique white',
 'apple green',
 'apricot',
 'aqua',
 'aquamarine',
 'army green',
 'ash grey',
 'auburn',
 'avocado',
 'awesome',
 'azure',
 'azure mist',
 'baby blue',
 'baby pink',
 'banana yellow',
 'barn red',
 'begonia',
 'beige',
 'bisque',
 'bittersweet',
 'black',
 'black olive',
 'blond',
 'blue',
 'blue bell',
 'blue lagoon',
 'blue sapphire',
 'blueberry',
 'blush',
 'bole',
 'bondi blue',
 'bone',
 'boysenberry',
 'brass',
 'brick red',
 'bright green',
 'bright lavender',
 'bright pink',
 'bronze',
 'brown sugar',
 'brown yellow',
 'bubble gum',
 'buff',
 'burgundy',
 'burnished brown',
 'burnt orange',
 'byzantine',
 'cadet',
 'cadet blue',
 'camel',
 'camouflage green',
 'canary',
 'canary yellow',
 'candy pink',
 'capri',
 'cardinal',
 'carmine',
 'carnelian',
 'carolina blue',
 'ceil',
 'celeste',
 'celestial blue',
 'ce

In [197]:
import time

In [200]:
# Begin timer.
t0 = time.time()
# Instantiate counter (for progress statement).
count = 0
# Iterate through list of colors.
for color in list_of_colors:
    # Print out progress statement.
    if count % 100 == 0:
        print("Completed " + str(count) + " out of " + str(len(list_of_colors)) + \
            " columns in " + str(round(time.time() - t0,4)) + " seconds.")
        
        
# Create column for each color.
# Each value is 1 if the product is represented by that color .
# Each value is 0 if the product does not have that color in the description.        
    users[color] = [1 if color in users.loc[row, 'colors_only'] else 0 for row in users.index]
# Add one to counter (for progress statement).
    count += 1

# Code adapted from Matt Brems

Completed 0 out of 366 columns in 0.0001 seconds.
Completed 100 out of 366 columns in 205.8936 seconds.
Completed 200 out of 366 columns in 411.4965 seconds.
Completed 300 out of 366 columns in 615.3048 seconds.


In [205]:
users['cornsilk'].value_counts()

0    168900
1        95
Name: cornsilk, dtype: int64

In [228]:
(users.set_index('reviewerID').iloc[:, -366:]).drop_duplicates().to_csv('data/user_colors.csv')

In [229]:
# Begin timer.
t0 = time.time()
# Instantiate counter (for progress statement).
count = 0
# Iterate through list of colors.
for color in list_of_colors:
    # Print out progress statement.
    if count % 100 == 0:
        print("Completed " + str(count) + " out of " + str(len(list_of_colors)) + \
            " columns in " + str(round(time.time() - t0,4)) + " seconds.")
        
        
# Create column for each color.
# Each value is 1 if the product is represented by that color .
# Each value is 0 if the product does not have that color in the description.        
    df_text[color] = [1 if color in df_text.loc[row, 'colors_only'] else 0 for row in df_text.index]
# Add one to counter (for progress statement).
    count += 1

# Code adapted from Matt Brems

Completed 0 out of 366 columns in 0.0088 seconds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Completed 100 out of 366 columns in 165.5718 seconds.
Completed 200 out of 366 columns in 311.4057 seconds.
Completed 300 out of 366 columns in 467.4537 seconds.


In [230]:
df_text.iloc[:, -366:].sum()

thistle               2
sunny                10
steel blue            3
gray                106
blue                688
moccasin             46
cherry               29
vivid yellow          2
mulberry              2
mango tango           1
antique brass         1
deep red              7
brown sugar           2
kelly green           9
flame                10
electric yellow       2
begonia               1
white              1139
tomato               11
camel                16
sand                701
alabaster             2
beige                94
rose pink             4
burnt orange          9
candy pink            3
coconut               3
electric lime         1
sky blue             12
maize                 2
                   ... 
silver pink          14
crimson red           1
medium purple         2
ochre                 2
malachite             3
hot pink             42
dark green           11
turquoise green       1
rosewood              2
fawn                  4
mystic          

In [231]:
tutu = df_text.loc[29].to_dict()

{k:v for k,v in tutu.items() if v != 0}

{'asin': '1608299953',
 'name': 'Learn French: Rosetta Stone French - Level 1',
 'color': [],
 'details': '0',
 'description': 'Access for up to 5 family members Download activation key included Learn at your own pace with our course that never expires Proprietary speech-recognition technology compares your voice to a native speaker 100 times per second Access to award winning mobile app for 3-months - available on Kindle Fire HD, iOS, and Android Live online tutoring sessions with a Native Speaker - 3-month trial included Earbuds with microphone included in box',
 'all_text': ['since',
  'learning',
  'a',
  'language',
  'from',
  'software',
  'is',
  'not',
  'exactly',
  'the',
  'same',
  'as',
  'learning',
  'a',
  'language',
  'from',
  'going',
  'to',
  'live',
  'in',
  'the',
  'country',
  'itself',
  'i',
  'suggest',
  'picking',
  'up',
  'the',
  'rosetta',
  'stone',
  'software',
  '1',
  'level',
  'at',
  'a',
  'time',
  'like',
  'this',
  'french',
  'level',


In [None]:
df_text.iloc[:5, :12]

In [None]:
# df_text.drop(columns=['color', 'details', 'description', 'all_text', 
            #        'one_sum', 'all_reviews', 'name_color', 'name_split', 'colors_only']).to_csv('data/colors_split.csv')