## Feature Engineering with NLTK  
_By: Rachel Koenig_ 
____

Imports 

In [4]:
#Import pandas 
import pandas as pd
#Import Natual Language Toolkit
import nltk
#Import Beautiful Soup
from bs4 import BeautifulSoup   
#Import string for list of punctuation
import string
# Import the stop word list
from nltk.corpus import stopwords 
# Import Tokenizer
from nltk.tokenize import RegexpTokenizer
#Import Lemmatizer
from nltk.stem import WordNetLemmatizer
# Import stemmer.
from nltk.stem.porter import PorterStemmer

Read in csv.

In [5]:
df = pd.read_csv('data/reviews_and_products.csv', index_col=[0], low_memory=False)

In [6]:
#check first 5 rows 
df.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,summary,unixReviewTime,review_date,color,description,details,...,Tops_Tees,Trunks,Umbrellas,Underwear,Wallets,Wear_to_Work,Wrist_Watches,Arm_Warmers,Baseball_Caps,Berets
0,A1KLRMWW2FWPL4,31887,This is a great tutu and at a really great pri...,5.0,Great tutu- not cheaply made,1297468800,2011-02-12,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A2G5TCU2WDFZ65,31887,I bought this for my 4 yr old daughter for dan...,5.0,Very Cute!!,1358553600,2013-01-19,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A1RLQXYNCMWRWN,31887,What can I say... my daughters have it in oran...,5.0,I have buy more than one,1357257600,2013-01-04,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A8U3FAMSJVHS5,31887,"We bought several tutus at once, and they are ...",5.0,"Adorable, Sturdy",1398556800,2014-04-27,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A3GEOILWLK86XM,31887,Thank you Halo Heaven great product for Little...,5.0,Grammy's Angels Love it,1394841600,2014-03-15,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Check data types 
df.dtypes.head(15)

reviewerID             object
asin                   object
reviewText             object
overall               float64
summary                object
unixReviewTime          int64
review_date            object
color                  object
description            object
details                object
name                   object
size                   object
Arts_Crafts_Sewing    float64
Automotive            float64
Baby_Products         float64
dtype: object

In [8]:
df.columns

Index(['reviewerID', 'asin', 'reviewText', 'overall', 'summary',
       'unixReviewTime', 'review_date', 'color', 'description', 'details',
       ...
       'Tops_Tees', 'Trunks', 'Umbrellas', 'Underwear', 'Wallets',
       'Wear_to_Work', 'Wrist_Watches', 'Arm_Warmers', 'Baseball_Caps',
       'Berets'],
      dtype='object', length=823)

In [9]:
df.shape

(168995, 823)

In [10]:
df.isnull().sum()

reviewerID                         0
asin                               0
reviewText                        16
overall                            0
summary                            1
unixReviewTime                     0
review_date                        0
color                              0
description                        0
details                            0
name                               0
size                               0
Arts_Crafts_Sewing                 0
Automotive                         0
Baby_Products                      0
Beauty_Personal_Care               0
Cell_Phones_Accessories            0
Clothing_Shoes_Jewelry             0
Electronics                        0
Health_Household                   0
Home_Kitchen                       0
Industrial_Scientific              0
Office_Products                    0
Purchase_Circles                   0
Software                           0
Sports_Outdoors                    0
Tools_Home_Improvement             0
T

In [11]:
df.fillna('none', inplace=True)

In [12]:
df.isnull().sum().sum()

0

Add all columns with text together into one new column.

In [13]:
df['summary'] = df['summary'] + " "


In [14]:
df['summary']

0                            Great tutu-  not cheaply made 
1                                              Very Cute!! 
2                                 I have buy more than one 
3                                         Adorable, Sturdy 
4                                  Grammy's Angels Love it 
5                                                  It's ok 
6               Great for dress-up and for ballet practice 
7                                              Great value 
8                                                     Good 
9                           WOW !! ..is all I have to say! 
10                           Wonderful and great shipping. 
11                                      Excellent quality! 
12                                         Wonderful tutu! 
13                                              Great Tutu 
14                            Great tutu for little girls! 
15                                   Came apart in 2weeks! 
16                                      

In [15]:
summary = dict(df.groupby('asin')['summary'].sum())

In [16]:
df['one_sum'] = df['asin'].map(summary)
df['one_sum'].head()

0    Great tutu-  not cheaply made Very Cute!! I ha...
1    Great tutu-  not cheaply made Very Cute!! I ha...
2    Great tutu-  not cheaply made Very Cute!! I ha...
3    Great tutu-  not cheaply made Very Cute!! I ha...
4    Great tutu-  not cheaply made Very Cute!! I ha...
Name: one_sum, dtype: object

In [17]:
df['reviewText'] = df['reviewText'] + " "

reviewText = dict(df.groupby('asin')['reviewText'].sum())
df['all_reviews'] = df['asin'].map(reviewText)
df['all_reviews'].head()

0    This is a great tutu and at a really great pri...
1    This is a great tutu and at a really great pri...
2    This is a great tutu and at a really great pri...
3    This is a great tutu and at a really great pri...
4    This is a great tutu and at a really great pri...
Name: all_reviews, dtype: object

In [18]:
overall_mean = dict(df.groupby(['asin']).mean().round(2)['overall'])
overall_mean


{'0000031887': 4.61,
 '0123456479': 4.17,
 '1608299953': 4.31,
 '1617160377': 4.62,
 'B00001W0KA': 4.62,
 'B00001WRHJ': 3.5,
 'B00004SR8W': 4.67,
 'B00004SR8Z': 4.55,
 'B00004SR9P': 4.0,
 'B00004U1J2': 3.92,
 'B000051SEN': 4.5,
 'B000051SEP': 4.33,
 'B00005JHKE': 3.8,
 'B00005JSBK': 4.6,
 'B00005KJXN': 4.6,
 'B0000643Q8': 4.4,
 'B000067R84': 4.33,
 'B00006I551': 4.23,
 'B00006XXGO': 4.67,
 'B000072UMJ': 4.59,
 'B000074RL3': 4.15,
 'B000078CYM': 4.38,
 'B00007AS82': 5.0,
 'B00007FFL9': 4.17,
 'B00007GD8X': 4.38,
 'B00007GD9W': 4.2,
 'B00007GDAL': 4.12,
 'B00007GDD3': 3.77,
 'B00007GDG5': 4.63,
 'B00007IVVR': 3.8,
 'B000086211': 3.8,
 'B0000862FI': 4.32,
 'B0000865II': 4.6,
 'B000086778': 3.43,
 'B0000868O9': 4.05,
 'B000086910': 4.21,
 'B0000891IO': 3.71,
 'B0000891K0': 4.03,
 'B00008AALU': 4.0,
 'B00008ECKG': 4.0,
 'B00008I8YM': 4.35,
 'B00008ID0L': 4.3,
 'B00008IEUW': 3.5,
 'B00008IPCG': 4.4,
 'B00008KH9C': 3.95,
 'B00009ESZI': 4.8,
 'B00009OXE8': 4.8,
 'B00009QMR3': 4.4,
 'B00009R881

In [19]:
df['overall_mean'] = df['asin'].map(overall_mean)
df['overall_mean'].tail()

278637    4.6
278638    4.6
278639    4.6
278640    4.6
278641    4.6
Name: overall_mean, dtype: float64

In [20]:
df.drop_duplicates(subset='asin', inplace=True)

In [21]:
df.shape

(13732, 826)

In [22]:
# change types to string so that all rows are the same (some were integers before)
df['details'] = df['details'].astype(str)
df['description'] = df['description'].astype(str)

In [23]:
df['details'].value_counts()

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                             8132
missing                                                                                                                                                                                                                                                                                                                                                                                                                                                                       3869
Metal Factory,316L,Stainless steel,Cubic zirconia,channel,3 millim

In [24]:
df['all_text'] = df['all_reviews'] + " " + df['one_sum'] + " " + df['description']
df['all_text'].head()

0     This is a great tutu and at a really great pri...
23    The minute I saw this my heart skipped a beat....
29    Since learning a language from software is not...
42    If you really need to learn a new language, an...
58    We got this costume for my son to wear to Disn...
Name: all_text, dtype: object

In [26]:
df.dtypes.head(15)

reviewerID             object
asin                   object
reviewText             object
overall               float64
summary                object
unixReviewTime          int64
review_date            object
color                  object
description            object
details                object
name                   object
size                   object
Arts_Crafts_Sewing    float64
Automotive            float64
Baby_Products         float64
dtype: object

In [28]:
#Check value counts of size column 
df['size'].value_counts() 

missing                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [29]:
# After reviewing size, I think it is not going to be helpful so I'll drop it.
df.drop(columns='size', inplace=True)

In [30]:
df.columns

Index(['reviewerID', 'asin', 'reviewText', 'overall', 'summary',
       'unixReviewTime', 'review_date', 'color', 'description', 'details',
       ...
       'Wallets', 'Wear_to_Work', 'Wrist_Watches', 'Arm_Warmers',
       'Baseball_Caps', 'Berets', 'one_sum', 'all_reviews', 'overall_mean',
       'all_text'],
      dtype='object', length=826)

In [31]:
df.shape

(13732, 826)

In [34]:
df.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,summary,unixReviewTime,review_date,color,description,details,...,Wallets,Wear_to_Work,Wrist_Watches,Arm_Warmers,Baseball_Caps,Berets,one_sum,all_reviews,overall_mean,all_text
0,A1KLRMWW2FWPL4,0000031887,This is a great tutu and at a really great pri...,5.0,Great tutu- not cheaply made,1297468800,2011-02-12,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,Great tutu- not cheaply made Very Cute!! I ha...,This is a great tutu and at a really great pri...,4.61,This is a great tutu and at a really great pri...
23,A2WNN1DQVL4LH5,0123456479,The minute I saw this my heart skipped a beat....,5.0,Breathtaking 5 Stars,1383782400,2013-11-07,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,Breathtaking 5 Stars VERY NICE Amazing so many...,The minute I saw this my heart skipped a beat....,4.17,The minute I saw this my heart skipped a beat....
29,A1F7YU6O5RU432,1608299953,Since learning a language from software is not...,5.0,"Suggest getting this 1 level at a time, so you...",1365465600,2013-04-09,unavailable,Access for up to 5 family members Download act...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,"Suggest getting this 1 level at a time, so you...",Since learning a language from software is not...,4.31,Since learning a language from software is not...
42,A31ICLWQ9CSHRS,1617160377,"If you really need to learn a new language, an...",5.0,A real improvement over the last version!,1305936000,2011-05-21,unavailable,Access for up to 5 family members Download act...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,A real improvement over the last version! Nice...,"If you really need to learn a new language, an...",4.62,"If you really need to learn a new language, an..."
58,A1059SSXUZZS1S,B00001W0KA,We got this costume for my son to wear to Disn...,4.0,"Great Costume, Awkward To Put On",1383696000,2013-11-06,Buzz Lightyear,Lead Free Child (4-6 & 7-8) Includes: Bodysuit...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,"Great Costume, Awkward To Put On Perfectly fit...",We got this costume for my son to wear to Disn...,4.62,We got this costume for my son to wear to Disn...


In [35]:
df.drop(columns=['reviewerID','reviewText', 'overall', 'summary', 'unixReviewTime', 'review_date', 'color', 'description', 'details', 'all_text', 'one_sum', 'all_reviews']).to_csv('data/category_only.csv')


In [36]:
df_text = df[['asin', 'name', 'color', 'details', 'description', 'all_text', 'one_sum', 'all_reviews']]

In [37]:
df_text.head(25)

Unnamed: 0,asin,name,color,details,description,all_text,one_sum,all_reviews
0,0000031887,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,unavailable,0,This fits your . Make sure this fitsby ent...,This is a great tutu and at a really great pri...,Great tutu- not cheaply made Very Cute!! I ha...,This is a great tutu and at a really great pri...
23,0123456479,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,unavailable,0,This fits your . Make sure this fitsby ent...,The minute I saw this my heart skipped a beat....,Breathtaking 5 Stars VERY NICE Amazing so many...,The minute I saw this my heart skipped a beat....
29,1608299953,Learn French: Rosetta Stone French - Level 1,unavailable,0,Access for up to 5 family members Download act...,Since learning a language from software is not...,"Suggest getting this 1 level at a time, so you...",Since learning a language from software is not...
42,1617160377,Learn Italian: Rosetta Stone Italian - Level 1,unavailable,0,Access for up to 5 family members Download act...,"If you really need to learn a new language, an...",A real improvement over the last version! Nice...,"If you really need to learn a new language, an..."
58,B00001W0KA,Buzz Lightyear Boy's Deluxe Toy Story Costume,Buzz Lightyear,0,Lead Free Child (4-6 & 7-8) Includes: Bodysuit...,We got this costume for my son to wear to Disn...,"Great Costume, Awkward To Put On Perfectly fit...",We got this costume for my son to wear to Disn...
66,B00001WRHJ,Woody Deluxe Child - Size: Child S(4-6),unavailable,0,"5.5"" high 12"" wide Quality materials used to m...",My grandson has been a costume lover for the p...,Birthday gift 5year old Great for costume-lovi...,My grandson has been a costume lover for the p...
72,B00004SR8W,Lewis N. Clark Stash,Neck Stash,0,"Nylon Nylon lining Zipper closure 25.5"" should...",better quality than I expected for the price -...,Get this one it works. A bit too large for me ...,better quality than I expected for the price -...
84,B00004SR8Z,"Lewis N. Clark Deluxe Neck Stash, Beige",Beige,0,This fits your . Make sure this fitsby ent...,This pouch is lightweight and comfortable to w...,Would like it to be a bit longer Small but fun...,This pouch is lightweight and comfortable to w...
106,B00004SR9P,"Lewis N. Clark Add-A-Bag Travel Luggage Strap,...",Black,0,This fits your . Make sure this fitsby ent...,It's too short to be particularly useful. If i...,too short It Works! great product. Made handl...,It's too short to be particularly useful. If i...
122,B00004U1J2,Buzz Lightyear Jet Pack,One Size Child,0,"Polyester Imported 16"" high 12"" wide Quality m...",The Inflatable buzz lightyear costume jet pack...,"good Buzz Lightyear Jet Pack To Infinity, And ...",The Inflatable buzz lightyear costume jet pack...


In [38]:
df_text['all_text'][0]

'This is a great tutu and at a really great price. It doesn\'t look cheap at all. I\'m so glad I looked on Amazon and found such an affordable tutu that isn\'t made poorly. A++ I bought this for my 4 yr old daughter for dance class, she wore it today for the first time and the teacher thought it was adorable. I bought this to go with a light blue long sleeve leotard and was happy the colors matched up great. Price was very good too since some of these go for over $15.00 dollars. What can I say... my daughters have it in orange, black, white and pink and I am thinking to buy for they the fuccia one. It is a very good way for exalt a dancer outfit: great colors, comfortable, looks great, easy to wear, durables and little girls love it. I think it is a great buy for costumer and play too. We bought several tutus at once, and they are got high reviews. Sturdy and seemingly well-made. The girls have been wearing them regularly, including out to play, and the tutus have stood up well. Fits t

In [39]:
df_text['one_sum'][0]

"Great tutu-  not cheaply made Very Cute!! I have buy more than one Adorable, Sturdy Grammy's Angels Love it It's ok Great for dress-up and for ballet practice Great value Good WOW !! ..is all I have to say! Wonderful and great shipping. Excellent quality! Wonderful tutu! Great Tutu Great tutu for little girls! Came apart in 2weeks! So cute! Never GOT IT.... Nice skirt must have for a fairy princess My daughter loved it Sassy! Practically Perfect in every way! "

In [40]:
df['description'][0]

'This fits your\xa0.     Make sure this fitsby entering your model number.    P.when("ReplacementPartsBulletLoader").execute(function(module){ module.initializeDPX(); }) Lime green Fit girls 2-10 years old Puffy skirt great for dance Trademarked By Mystiqueshapes'

In [41]:
#Instantiate Tokenizer 
tokenizer = RegexpTokenizer(r'\w+')

In [42]:
# Tokenize the color column and make all words lowercase 
df_text['color'] = df_text['color'].apply(lambda x: tokenizer.tokenize(x.lower()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [43]:
df_text['color'].head()

0         [unavailable]
23        [unavailable]
29        [unavailable]
42        [unavailable]
58    [buzz, lightyear]
Name: color, dtype: object

In [44]:
# Tokenize the all_text column
df_text['all_text'] = df_text['all_text'].apply(lambda x: tokenizer.tokenize(x.lower()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [45]:
df_text['all_text'][0]

['this',
 'is',
 'a',
 'great',
 'tutu',
 'and',
 'at',
 'a',
 'really',
 'great',
 'price',
 'it',
 'doesn',
 't',
 'look',
 'cheap',
 'at',
 'all',
 'i',
 'm',
 'so',
 'glad',
 'i',
 'looked',
 'on',
 'amazon',
 'and',
 'found',
 'such',
 'an',
 'affordable',
 'tutu',
 'that',
 'isn',
 't',
 'made',
 'poorly',
 'a',
 'i',
 'bought',
 'this',
 'for',
 'my',
 '4',
 'yr',
 'old',
 'daughter',
 'for',
 'dance',
 'class',
 'she',
 'wore',
 'it',
 'today',
 'for',
 'the',
 'first',
 'time',
 'and',
 'the',
 'teacher',
 'thought',
 'it',
 'was',
 'adorable',
 'i',
 'bought',
 'this',
 'to',
 'go',
 'with',
 'a',
 'light',
 'blue',
 'long',
 'sleeve',
 'leotard',
 'and',
 'was',
 'happy',
 'the',
 'colors',
 'matched',
 'up',
 'great',
 'price',
 'was',
 'very',
 'good',
 'too',
 'since',
 'some',
 'of',
 'these',
 'go',
 'for',
 'over',
 '15',
 '00',
 'dollars',
 'what',
 'can',
 'i',
 'say',
 'my',
 'daughters',
 'have',
 'it',
 'in',
 'orange',
 'black',
 'white',
 'and',
 'pink',
 'and',

In [46]:
# Check value counts of name column
df_text['name'].value_counts().head(10)

Simple 6MM Gemstone Round Ball Stud Earrings For Women For Teen 925 Sterling Silver 9 Birthstones More Colors    6
Timex Ironman Classic 30 Mid-Size Watch                                                                          5
Birkenstock Women's Mayari Birko-Flor Sandal                                                                     5
Timex Women's Indiglo Easy Reader Quartz Analog Leather Strap Watch with Date Feature                            4
MG Low Profile Dyed Cotton Twill Cap                                                                             4
Hanes Men's Woven Plain-Weave Pajama Set                                                                         4
Wrangler Men's Riggs Workwear Carpenter Jean                                                                     4
Birkenstock Women's Gizeh Thong Sandals                                                                          4
Simple 8MM Gemstone Round Ball Stud Earrings For Women For Teen 925 Sterling Sil

In [47]:
# Tokenize the name column and give it a new column, do not override it 
df_text['name_split'] = df_text['name'].apply(lambda x: tokenizer.tokenize(x.lower()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [48]:
print(df_text['name'][0])

print(df_text['name_split'][0])

Mystiqueshapes Girls Ballet Tutu Neon Lime Green
['mystiqueshapes', 'girls', 'ballet', 'tutu', 'neon', 'lime', 'green']


In [49]:
# Define a funtion to remove stop words 
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

In [50]:
# Apply the stop words function to the color column
df_text['color'] = df_text['color'].apply(lambda x : remove_stopwords(x))
df_text['color'].value_counts().head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


[missing]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       3135
[unavailable]                                                                                                                                                                                                                                                                                                                      

In [51]:
df_text['name_split'] = df_text['name_split'].apply(lambda x : remove_stopwords(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [52]:
# Instantiate lemmatizer
lemmatizer = WordNetLemmatizer()

# Create a function for lemmatizing 
def word_lemmatizer(text):
    lem_text = " ".join([lemmatizer.lemmatize(i) for i in text])
    return lem_text

In [None]:
# Apply lemmatizer to all_text column 
# df_text['all_text'] = df['all_text'].apply(lambda x: word_lemmatizer(x))

In [53]:
df_text['name_split'] = df_text['name_split'].apply(lambda x : word_lemmatizer(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [54]:
df_text.columns

Index(['asin', 'name', 'color', 'details', 'description', 'all_text',
       'one_sum', 'all_reviews', 'name_split'],
      dtype='object')

In [55]:
colors = pd.read_csv('wikipedia_color_names.csv')

In [56]:
colors.head()

Unnamed: 0,Name,Hex (24 bit),Red (8 bit),Green (8 bit),Blue (8 bit),Hue (degrees),HSL.S (%),"HSL.L (%), HSV.S (%), HSV.V (%)"
0,Absolute zero,#0048BA,0,72,186,217.0,100.0,37.0
1,Acid green,#B0BF1A,176,191,26,65.0,76.0,43.0
2,Aero,#7CB9E8,124,185,232,206.0,70.0,70.0
3,Aero blue,#C9FFE5,201,255,229,151.0,100.0,89.0
4,African violet,#B284BE,178,132,190,288.0,31.0,63.0


In [57]:
colors_list = list(colors['Name'])
[x.lower() for x in colors_list]

['absolute zero',
 'acid green',
 'aero',
 'aero blue',
 'african violet',
 'air force blue (raf)',
 'air force blue (usaf)',
 'air superiority blue',
 'alabama crimson',
 'alabaster',
 'alice blue',
 'alien armpit',
 'alizarin crimson',
 'alloy orange',
 'almond',
 'amaranth',
 'amaranth deep purple',
 'amaranth pink',
 'amaranth purple',
 'amaranth red',
 'amazon',
 'amazonite',
 'amber',
 'amber (sae/ece)',
 'american rose',
 'amethyst',
 'android green',
 'anti-flash white',
 'antique brass',
 'antique bronze',
 'antique fuchsia',
 'antique ruby',
 'antique white',
 'ao (english)',
 'apple green',
 'apricot',
 'aqua',
 'aquamarine',
 'arctic lime',
 'army green',
 'arsenic',
 'artichoke',
 'arylide yellow',
 'ash grey',
 'asparagus',
 'atomic tangerine',
 'auburn',
 'aureolin',
 'aurometalsaurus',
 'avocado',
 'awesome',
 'aztec gold',
 'azure',
 'azure (web color)',
 'azure mist',
 'azureish white',
 'baby blue',
 'baby blue eyes',
 'baby pink',
 'baby powder',
 'baker-miller pink

In [58]:
df_text['color'].head(25)

0                                          [unavailable]
23                                         [unavailable]
29                                         [unavailable]
42                                         [unavailable]
58                                     [buzz, lightyear]
66                                         [unavailable]
72                                         [neck, stash]
84                                               [beige]
106                                              [black]
122                                   [one, size, child]
135                                        [unavailable]
143                                        [unavailable]
149                                              [shown]
154                                            [1, pack]
159                  [blackfrom24, sellersfrom1, seller]
214                                        [unavailable]
219                                        [unavailable]
225    [4, year, watch, protect

In [59]:
def colors_only(words):
    colors = []
    for i in [x.lower() for x in colors_list]:
        if i in words:
            colors.append(i)
    return colors

In [60]:
colors_only('the sky blue table is flat')

['blue', 'sky blue']

In [61]:
df_text['name_color'] = df_text['name_split'].apply(colors_only)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [62]:
# Lemmatize color column and join list of words back together 
df_text['color'] = df_text['color'].apply(lambda x: word_lemmatizer(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [63]:
df_text['color'] = df_text['color'].apply(colors_only)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [64]:
df_text[['color', 'name_color']].head(25)

Unnamed: 0,color,name_color
0,[],[lime green]
23,[],[pink]
29,[],"[french rose, rose]"
42,[],[rose]
58,[],[]
66,[],[]
72,[],[]
84,[beige],[beige]
106,[black],[black]
122,[],[jet]


In [65]:
df_text['colors_only'] = df_text['color'] + df_text['name_color']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [66]:
df_text[['asin', 'name', 'name_split']].to_csv('data/names_to_vectorize.csv')

In [67]:
df_text['colors_only'][0]

['lime green']

In [68]:
# Generate list of products.
list_of_colors = {color for product in df_text['colors_only'] for color in product}

In [69]:
list_of_colors

{'acid green',
 'aero',
 'african violet',
 'alabaster',
 'alice blue',
 'almond',
 'amaranth',
 'amazon',
 'amber',
 'amethyst',
 'antique brass',
 'antique white',
 'apple green',
 'apricot',
 'aqua',
 'aquamarine',
 'army green',
 'ash grey',
 'auburn',
 'avocado',
 'awesome',
 'azure',
 'azure mist',
 'baby blue',
 'baby pink',
 'banana yellow',
 'barn red',
 'begonia',
 'beige',
 'bisque',
 'bittersweet',
 'black',
 'black olive',
 'blond',
 'blue',
 'blue bell',
 'blue lagoon',
 'blue sapphire',
 'blueberry',
 'blush',
 'bole',
 'bondi blue',
 'bone',
 'boysenberry',
 'brass',
 'brick red',
 'bright green',
 'bright lavender',
 'bright pink',
 'bronze',
 'brown sugar',
 'brown yellow',
 'bubble gum',
 'buff',
 'burgundy',
 'burnished brown',
 'burnt orange',
 'byzantine',
 'cadet',
 'cadet blue',
 'camel',
 'camouflage green',
 'canary',
 'canary yellow',
 'candy pink',
 'capri',
 'cardinal',
 'carmine',
 'carnelian',
 'carolina blue',
 'ceil',
 'celeste',
 'celestial blue',
 'ce

In [71]:
import time

In [72]:
# Begin timer.
t0 = time.time()
# Instantiate counter (for progress statement).
count = 0
# Iterate through list of colors.
for color in list_of_colors:
    # Print out progress statement.
    if count % 100 == 0:
        print("Completed " + str(count) + " out of " + str(len(list_of_colors)) + \
            " columns in " + str(round(time.time() - t0,4)) + " seconds.")
        
        
# Create column for each color.
# Each value is 1 if the product is represented by that color .
# Each value is 0 if the product does not have that color in the description.        
    df_text[color] = [1 if color in df_text.loc[row, 'colors_only'] else 0 for row in df.index]
# Add one to counter (for progress statement).
    count += 1

# Code adapted from Matt Brems

Completed 0 out of 366 columns in 0.0004 seconds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Completed 100 out of 366 columns in 39.6449 seconds.
Completed 200 out of 366 columns in 79.3594 seconds.
Completed 300 out of 366 columns in 117.5041 seconds.


In [73]:
df_text.iloc[:, -366:].sum()

turquoise            72
nickel               30
diamond             157
cream                39
maize                 2
sea green             1
dark green           11
amber                24
jet                  18
lilac                17
raspberry rose        3
chinese red           2
purple heart          1
citron               16
eggshell              1
midnight             58
silver             1159
gold fusion           1
pistachio             1
sepia                 1
navy                298
bole                 15
gunmetal             10
ube                  59
oxford blue           1
wheat                 5
antique brass         1
wine                 19
crimson red           1
turquoise blue        1
                   ... 
green sheen           3
light blue           32
thistle               2
antique white         3
pale green            1
wild strawberry       1
tulip                 5
light pink           14
steel pink            2
capri                64
indigo          

In [77]:
tutu = df_text.loc[29].to_dict()

{k:v for k,v in tutu.items() if v != 0}

{'asin': '1608299953',
 'name': 'Learn French: Rosetta Stone French - Level 1',
 'color': [],
 'details': '0',
 'description': 'Access for up to 5 family members Download activation key included Learn at your own pace with our course that never expires Proprietary speech-recognition technology compares your voice to a native speaker 100 times per second Access to award winning mobile app for 3-months - available on Kindle Fire HD, iOS, and Android Live online tutoring sessions with a Native Speaker - 3-month trial included Earbuds with microphone included in box',
 'all_text': ['since',
  'learning',
  'a',
  'language',
  'from',
  'software',
  'is',
  'not',
  'exactly',
  'the',
  'same',
  'as',
  'learning',
  'a',
  'language',
  'from',
  'going',
  'to',
  'live',
  'in',
  'the',
  'country',
  'itself',
  'i',
  'suggest',
  'picking',
  'up',
  'the',
  'rosetta',
  'stone',
  'software',
  '1',
  'level',
  'at',
  'a',
  'time',
  'like',
  'this',
  'french',
  'level',


In [76]:
df_text.iloc[:5, :12]

Unnamed: 0,asin,name,color,details,description,all_text,one_sum,all_reviews,name_split,name_color,colors_only,turquoise
0,0000031887,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,[],0,This fits your . Make sure this fitsby ent...,"[this, is, a, great, tutu, and, at, a, really,...",Great tutu- not cheaply made Very Cute!! I ha...,This is a great tutu and at a really great pri...,mystiqueshapes girl ballet tutu neon lime green,[lime green],[lime green],0
23,0123456479,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,[],0,This fits your . Make sure this fitsby ent...,"[the, minute, i, saw, this, my, heart, skipped...",Breathtaking 5 Stars VERY NICE Amazing so many...,The minute I saw this my heart skipped a beat....,shining image huge pink leather jewelry box ca...,[pink],[pink],0
29,1608299953,Learn French: Rosetta Stone French - Level 1,[],0,Access for up to 5 family members Download act...,"[since, learning, a, language, from, software,...","Suggest getting this 1 level at a time, so you...",Since learning a language from software is not...,learn french rosetta stone french level 1,"[french rose, rose]","[french rose, rose]",0
42,1617160377,Learn Italian: Rosetta Stone Italian - Level 1,[],0,Access for up to 5 family members Download act...,"[if, you, really, need, to, learn, a, new, lan...",A real improvement over the last version! Nice...,"If you really need to learn a new language, an...",learn italian rosetta stone italian level 1,[rose],[rose],0
58,B00001W0KA,Buzz Lightyear Boy's Deluxe Toy Story Costume,[],0,Lead Free Child (4-6 & 7-8) Includes: Bodysuit...,"[we, got, this, costume, for, my, son, to, wea...","Great Costume, Awkward To Put On Perfectly fit...",We got this costume for my son to wear to Disn...,buzz lightyear boy deluxe toy story costume,[],[],0


In [78]:
df_text.drop(columns=['color', 'details', 'description', 'all_text', 
                    'one_sum', 'all_reviews', 'name_color', 'name_split', 'colors_only']).to_csv('data/colors_split.csv')

In [None]:
def parts_of_speech(text):
    #tagged = [nltk.pos_tag(i) for i in text]
    tagged = nltk.pos_tag(text)
    return tagged

In [None]:
[i for i in df_text['name_split']]

In [None]:
[i for i in ['the', 'quick', 'brown', 'fox']]

In [None]:
parts_of_speech(['the', 'quick', 'brown', 'fox'])

In [None]:
df_text['name_split']

In [None]:
df_text['name_split'].apply(parts_of_speech)

In [None]:
parts_of_speech(df_text['color'][225])

In [None]:
df_text.columns

In [79]:
df_text[['asin', 'name', 'color', 'description', 'details', 'all_text', 'one_sum','all_reviews','name_split']].to_csv('data/all_text.csv')
