## Feature Engineering with NLTK  
_By: Rachel Koenig_ 
____

Imports 

In [10]:
#Import pandas 
import pandas as pd
#Import Natural Language Toolkit
import nltk
#Import Beautiful Soup
from bs4 import BeautifulSoup   
#Import string for list of punctuation
import string
# Import the stop word list
from nltk.corpus import stopwords 
# Import Tokenizer
from nltk.tokenize import RegexpTokenizer
#Import Lemmatizer
from nltk.stem import WordNetLemmatizer
# Import stemmer.
from nltk.stem.porter import PorterStemmer

Read in csv.

In [11]:
df = pd.read_csv('data/reviews_and_products.csv', index_col=[0], low_memory=False)

In [12]:
#check first 5 rows 
df.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,summary,unixReviewTime,review_date,color,description,details,...,Tops_Tees,Trunks,Umbrellas,Underwear,Wallets,Wear_to_Work,Wrist_Watches,Arm_Warmers,Baseball_Caps,Berets
0,A1KLRMWW2FWPL4,31887,This is a great tutu and at a really great pri...,5.0,Great tutu- not cheaply made,1297468800,2011-02-12,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A2G5TCU2WDFZ65,31887,I bought this for my 4 yr old daughter for dan...,5.0,Very Cute!!,1358553600,2013-01-19,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A1RLQXYNCMWRWN,31887,What can I say... my daughters have it in oran...,5.0,I have buy more than one,1357257600,2013-01-04,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A8U3FAMSJVHS5,31887,"We bought several tutus at once, and they are ...",5.0,"Adorable, Sturdy",1398556800,2014-04-27,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A3GEOILWLK86XM,31887,Thank you Halo Heaven great product for Little...,5.0,Grammy's Angels Love it,1394841600,2014-03-15,unavailable,This fits your . Make sure this fitsby ent...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Check data types 
df.dtypes.head(15)

reviewerID             object
asin                   object
reviewText             object
overall               float64
summary                object
unixReviewTime          int64
review_date            object
color                  object
description            object
details                object
name                   object
size                   object
Arts_Crafts_Sewing    float64
Automotive            float64
Baby_Products         float64
dtype: object

In [15]:
#Check column names 
df.columns

Index(['reviewerID', 'asin', 'reviewText', 'overall', 'summary',
       'unixReviewTime', 'review_date', 'color', 'description', 'details',
       ...
       'Tops_Tees', 'Trunks', 'Umbrellas', 'Underwear', 'Wallets',
       'Wear_to_Work', 'Wrist_Watches', 'Arm_Warmers', 'Baseball_Caps',
       'Berets'],
      dtype='object', length=823)

Check size of DataFrame: 168,995 rows, of different products and reviewers.

In [16]:
df.shape

(168995, 823)

In [17]:
#Check for nulls 
df.isnull().sum().head(25)

reviewerID                  0
asin                        0
reviewText                 16
overall                     0
summary                     1
unixReviewTime              0
review_date                 0
color                       0
description                 0
details                     0
name                        0
size                        0
Arts_Crafts_Sewing          0
Automotive                  0
Baby_Products               0
Beauty_Personal_Care        0
Cell_Phones_Accessories     0
Clothing_Shoes_Jewelry      0
Electronics                 0
Health_Household            0
Home_Kitchen                0
Industrial_Scientific       0
Office_Products             0
Purchase_Circles            0
Software                    0
dtype: int64

Fill nulls with the string `'none'`

In [18]:
df.fillna('none', inplace=True)

In [19]:
df.isnull().sum().sum()

0

Groupby the reviewerID column for clustering. 

In [45]:
users = df.drop(columns=['reviewText', 'summary', 'unixReviewTime', 
                 'review_date', 'description', 'details', 'size'])
users.head()

Unnamed: 0,reviewerID,asin,overall,color,name,Arts_Crafts_Sewing,Automotive,Baby_Products,Beauty_Personal_Care,Cell_Phones_Accessories,...,Tops_Tees,Trunks,Umbrellas,Underwear,Wallets,Wear_to_Work,Wrist_Watches,Arm_Warmers,Baseball_Caps,Berets
0,A1KLRMWW2FWPL4,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A2G5TCU2WDFZ65,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A1RLQXYNCMWRWN,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A8U3FAMSJVHS5,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A3GEOILWLK86XM,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# Sum the one-hot category columns per reviewer.
# The non-category columns are dropped by name before aggregating, so the
# result does not depend on a positional slice (whose meaning changes with how
# a given pandas version treats non-numeric columns in groupby().sum()).
users_cats = (
    users
    .drop(columns=['asin', 'overall', 'color', 'name'])
    .groupby('reviewerID')
    .sum()
)
users_cats.head(3)

In [64]:
user1 = users_cats.loc['A001114613O3F18Q5NVR6'].to_dict()

{k:v for k,v in user1.items() if v != 0}

{'Clothing_Shoes_Jewelry': 4.0,
 'Sports_Outdoors': 1.0,
 'Novelty_More': 1.0,
 'Sports_Fitness': 1.0,
 'Novelty': 1.0,
 'Exercise_Fitness': 1.0,
 'Casual': 1.0,
 'Leggings': 1.0,
 'Men': 2.0,
 'Shorts': 1.0,
 'Skirts': 1.0,
 'T-Shirts': 1.0,
 'Women': 2.0,
 'Clothing': 4.0,
 'Shoes': 1.0,
 'Fashion_Sneakers': 1.0,
 'Running': 1.0}

In [47]:
# Number of reviews written by each reviewer (Series indexed by reviewerID).
# Select the single column before counting so pandas does not count every
# one of the several hundred category columns only to throw them away.
num_reviews = users.groupby('reviewerID')['asin'].count()

In [48]:
users['number_of_reviews'] = users['reviewerID'].map(num_reviews)
users['number_of_reviews'].head()

0    3
1    6
2    5
3    7
4    4
Name: number_of_reviews, dtype: int64

In [49]:
# Average star rating given by each reviewer, rounded to 2 decimals.
# Select 'overall' before aggregating: taking the mean of the whole frame
# also tries to average the string columns (asin, color, name), which raises
# a TypeError on recent pandas versions and is wasted work on older ones.
user_stars = users.groupby('reviewerID')['overall'].mean().round(2)

# Broadcast the per-reviewer average back onto every review row.
users['average_stars_given'] = users['reviewerID'].map(user_stars)
users['average_stars_given'].head()

0    5.00
1    4.17
2    4.20
3    4.00
4    4.50
Name: average_stars_given, dtype: float64

In [51]:
users.head()

Unnamed: 0,reviewerID,asin,overall,color,name,Arts_Crafts_Sewing,Automotive,Baby_Products,Beauty_Personal_Care,Cell_Phones_Accessories,...,Umbrellas,Underwear,Wallets,Wear_to_Work,Wrist_Watches,Arm_Warmers,Baseball_Caps,Berets,number_of_reviews,average_stars_given
0,A1KLRMWW2FWPL4,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,5.0
1,A2G5TCU2WDFZ65,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,4.17
2,A1RLQXYNCMWRWN,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,4.2
3,A8U3FAMSJVHS5,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,4.0
4,A3GEOILWLK86XM,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,4.5


In [55]:
# Append a ", " separator to every asin so that summing the strings per
# reviewer (next cells) yields a comma-separated list.
# NOTE(review): this overwrites users['asin'] in place and is not idempotent --
# re-running the cell appends another ", " to every value.
users['asin'] = users['asin'] + ", "

In [57]:
all_asins = dict(users.groupby('reviewerID')['asin'].sum())

In [59]:
users['asins'] = users['reviewerID'].map(all_asins)

In [68]:
# Build one space-separated string of colors per reviewer.
# The trailing-space separator is added on a temporary Series instead of being
# written back to users['color'], so re-running this cell no longer keeps
# appending spaces to the source column. The resulting values are unchanged.
all_colors = (users['color'] + " ").groupby(users['reviewerID']).agg(''.join).to_dict()
users['colors'] = users['reviewerID'].map(all_colors)

In [69]:
users.head(3)

Unnamed: 0,reviewerID,asin,overall,color,name,Arts_Crafts_Sewing,Automotive,Baby_Products,Beauty_Personal_Care,Cell_Phones_Accessories,...,Wallets,Wear_to_Work,Wrist_Watches,Arm_Warmers,Baseball_Caps,Berets,number_of_reviews,average_stars_given,asins,colors
0,A1KLRMWW2FWPL4,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3,5.0,"0000031887, B000FH4JJQ, B009H6NPBE,",unavailable Black unavailable
1,A2G5TCU2WDFZ65,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6,4.17,"0000031887, B0019K9WDQ, B005JJ2762, B005OZ9LB0...",unavailable unavailable Black Black unavailabl...
2,A1RLQXYNCMWRWN,31887,5.0,unavailable,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5,4.2,"0000031887, B0007YVP1W, B000LSWXWO, B001GR05W4...",unavailable Inspector missing missing missing


Save to a csv to cluster in a separate notebook.

In [72]:
# pd.merge((users[['reviewerID', 'name', 'number_of_reviews', 
#                  'average_stars_given', 'asins', 'colors']].set_index('reviewerID')), 
#          users_cats, 
#          right_index=True,
#          left_index=True).to_csv('data/user_clustering.csv')
         
         
         

Add all columns with text together into one new column.

In [None]:
# Append a trailing space to every summary so that summing the strings per
# asin (next cell) keeps the individual summaries separated.
# NOTE(review): this overwrites df['summary'] in place and is not idempotent --
# re-running the cell appends another space to every value.
df['summary'] = df['summary'] + " "
df['summary'].head()

In [None]:
summary = dict(df.groupby('asin')['summary'].sum())

In [None]:
df['one_sum'] = df['asin'].map(summary)
df['one_sum'].head()

In [None]:
# Concatenate every review of a product into one string per asin.
# The space separator is added on a temporary Series rather than written back
# to df['reviewText'], so the cell can be re-run without growing the source
# column. The concatenated values are the same as before.
reviewText = (df['reviewText'] + " ").groupby(df['asin']).agg(''.join).to_dict()

# Broadcast the combined review text back onto every row of that product.
df['all_reviews'] = df['asin'].map(reviewText)
df['all_reviews'].head()

In [None]:
# Mean star rating per product (asin), rounded to 2 decimals, as a dict.
# Select 'overall' before aggregating so pandas does not try to average the
# string columns (a TypeError on recent pandas versions) or the hundreds of
# category columns that are discarded anyway.
overall_mean = df.groupby('asin')['overall'].mean().round(2).to_dict()

# Show a small sample rather than dumping one entry per product.
list(overall_mean.items())[:5]


Create a new column for the average rating out of 5 stars for each `asin`.

In [None]:
df['overall_mean'] = df['asin'].map(overall_mean)
df['overall_mean'].tail()

Now that matching `asin` rows have the same values for summary, reviews, and average overall score, we can drop all the duplicate `asin` rows.

In [None]:
df.drop_duplicates(subset='asin', inplace=True)

In [None]:
# Check shape to confirm rows were dropped 
df.shape

In [None]:
# Cast both free-text columns to str in one step so every row has the same
# type (some values were integers before).
text_columns = ['details', 'description']
df[text_columns] = df[text_columns].astype(str)

In [None]:
df['details'].value_counts()

In [None]:
df['all_text'] = df['all_reviews'] + " " + df['one_sum'] + " " + df['description']
df['all_text'].head()

In [None]:
df.dtypes.head(15)

In [None]:
#Check value counts of size column 
df['size'].value_counts() 

In [None]:
# After reviewing size, I think it is not going to be helpful so I'll drop it.
df.drop(columns='size', inplace=True)

In [None]:
df.columns

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# df.drop(columns=['reviewerID','reviewText', 'overall', 'summary', 'unixReviewTime', 'review_date', 'color', 'description', 'details', 'all_text', 'one_sum', 'all_reviews']).to_csv('data/category_only.csv')


In [None]:
# Work on an independent copy of the text-related columns. Later cells assign
# new and overwritten columns on df_text; without .copy() those assignments
# target a slice of df and trigger pandas' SettingWithCopyWarning.
df_text = df[['asin', 'name', 'color', 'details', 'description', 'all_text', 'one_sum', 'all_reviews']].copy()

In [None]:
df_text.head(25)

In [None]:
df_text['all_text'][0]

In [None]:
df_text['one_sum'][0]

In [None]:
df['description'][0]

In [None]:
#Instantiate Tokenizer 
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
# Tokenize the color column and make all words lowercase 
df_text['color'] = df_text['color'].apply(lambda x: tokenizer.tokenize(x.lower()))

In [None]:
df_text['color'].head()

In [None]:
# Tokenize the all_text column
df_text['all_text'] = df_text['all_text'].apply(lambda x: tokenizer.tokenize(x.lower()))

In [None]:
df_text['all_text'][0]

In [None]:
# Check value counts of name column
df_text['name'].value_counts().head(10)

In [None]:
# Tokenize the name column and give it a new column, do not override it 
df_text['name_split'] = df_text['name'].apply(lambda x: tokenizer.tokenize(x.lower()))

In [None]:
print(df_text['name'][0])

print(df_text['name_split'][0])

In [None]:
# Define a function to remove stop words 
def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter (e.g. the output of a tokenizer).
    stop_words : iterable of str, optional
        Words to remove. Defaults to NLTK's English stop word list. Callers
        that filter many rows can pass a precomputed collection to avoid
        re-reading the corpus on every call.

    Returns
    -------
    list of str
        The tokens of ``text`` in their original order, minus stop words.
        Matching is exact (case-sensitive), as before.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call: the original re-read the stop word list
    # for every single token and then scanned it linearly.
    stop_set = frozenset(stop_words)
    return [w for w in text if w not in stop_set]

In [None]:
# Apply the stop words function to the color column
df_text['color'] = df_text['color'].apply(lambda x : remove_stopwords(x))
df_text['color'].value_counts().head(10)

In [None]:
df_text['name_split'] = df_text['name_split'].apply(lambda x : remove_stopwords(x))


In [None]:
# Instantiate lemmatizer
lemmatizer = WordNetLemmatizer()

# Create a function for lemmatizing 
def word_lemmatizer(text):
    """Lemmatize every token in ``text`` and join the results with single spaces.

    Uses the module-level ``lemmatizer`` instance. Takes an iterable of tokens
    and returns one string.
    """
    lemmas = map(lemmatizer.lemmatize, text)
    return " ".join(lemmas)

In [None]:
# Apply lemmatizer to all_text column 
# df_text['all_text'] = df['all_text'].apply(lambda x: word_lemmatizer(x))

In [None]:
df_text['name_split'] = df_text['name_split'].apply(lambda x : word_lemmatizer(x))


In [None]:
df_text.columns

In [None]:
colors = pd.read_csv('wikipedia_color_names.csv')

In [None]:
colors.head()

In [None]:
colors_list = list(colors['Name'])
[x.lower() for x in colors_list]

In [None]:
df_text['color'].head(25)

In [None]:
def colors_only(words, color_names=None):
    """Return the known color names that appear in ``words`` as whole words.

    Parameters
    ----------
    words : str or iterable of str
        Text to search. A string is split on whitespace; an iterable is
        treated as a sequence of tokens. Matching is case-insensitive.
    color_names : iterable of str, optional
        Color names to look for. Defaults to the module-level ``colors_list``.

    Returns
    -------
    list of str
        Lower-cased color names found, in the order they occur in
        ``color_names``. Multi-word names (e.g. "sky blue") match when their
        words appear consecutively.
    """
    if color_names is None:
        color_names = colors_list

    if isinstance(words, str):
        tokens = words.lower().split()
    else:
        tokens = [str(w).lower() for w in words]

    # Pad with spaces so a color only matches on word boundaries. A plain
    # substring test on the raw string reports "red" for "bored" and "tan"
    # for "tank".
    padded_text = " " + " ".join(tokens) + " "

    found = []
    for name in color_names:
        lowered = name.lower()
        needle = " " + " ".join(lowered.split()) + " "
        if needle in padded_text:
            found.append(lowered)
    return found

In [None]:
colors_only('the sky blue table is flat')

In [None]:
df_text['name_color'] = df_text['name_split'].apply(colors_only)

In [None]:
# Lemmatize color column and join list of words back together 
df_text['color'] = df_text['color'].apply(lambda x: word_lemmatizer(x))

In [None]:
df_text['color'] = df_text['color'].apply(colors_only)

In [None]:
df_text[['color', 'name_color']].head(25)

In [None]:
df_text['colors_only'] = df_text['color'] + df_text['name_color']

In [None]:
# df_text[['asin', 'name', 'name_split']].to_csv('data/names_to_vectorize.csv')

In [None]:
df_text['colors_only'][0]

In [None]:
# Collect every distinct color that appears in any product's 'colors_only' list.
# Despite its name, list_of_colors is a set (built with a set comprehension),
# so it holds each color once.
list_of_colors = {color for product in df_text['colors_only'] for color in product}

In [None]:
list_of_colors

In [None]:
import time

In [None]:
# Begin timer.
t0 = time.time()

# Create one indicator column per color:
#   1 if the color is in the product's 'colors_only' list, 0 otherwise.
# The values are read straight from df_text['colors_only'] instead of doing a
# scalar .loc lookup for every (row, color) pair over df.index; this keeps the
# loop on the frame it writes to and removes the slow per-cell indexing.
for count, color in enumerate(list_of_colors):
    # Print out progress statement every 100 columns.
    if count % 100 == 0:
        print("Completed " + str(count) + " out of " + str(len(list_of_colors)) + \
            " columns in " + str(round(time.time() - t0,4)) + " seconds.")

    df_text[color] = [1 if color in product_colors else 0
                      for product_colors in df_text['colors_only']]

# Code adapted from Matt Brems

In [None]:
# Number of products carrying each color. Select the indicator columns by
# name from list_of_colors rather than by a hard-coded count of trailing
# columns, which goes stale when the set of colors changes.
df_text[list(list_of_colors)].sum()

In [None]:
tutu = df_text.loc[29].to_dict()

{k:v for k,v in tutu.items() if v != 0}

In [None]:
df_text.iloc[:5, :12]

In [None]:
# Save the color indicator columns for use in another notebook (disabled).
# Both lines of the statement are commented out: with only the first line
# commented, the indented continuation raised an IndentationError.
# df_text.drop(columns=['color', 'details', 'description', 'all_text', 
#                     'one_sum', 'all_reviews', 'name_color', 'name_split', 'colors_only']).to_csv('data/colors_split.csv')