In [47]:
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from string import punctuation

from textblob import TextBlob

In [48]:
#load csv file data with headers

location = "WomensClothing.csv"
df = pd.read_csv(location)

In [49]:
df = df.drop('Unnamed: 0', axis = 1)
df.head

<bound method NDFrame.head of        Clothing ID  Age                                              Title  \
0              767   33                                                NaN   
1             1080   34                                                NaN   
2             1077   60                            Some major design flaws   
3             1049   50                                   My favorite buy!   
4              847   47                                   Flattering shirt   
5             1080   49                            Not for the very petite   
6              858   39                               Cagrcoal shimmer fun   
7              858   39               Shimmer, surprisingly goes with lots   
8             1077   24                                         Flattering   
9             1077   34                                  Such a fun dress!   
10            1077   53       Dress looks like it's made of cheap material   
11            1095   39           

In [50]:
df.dtypes


Clothing ID                 int64
Age                         int64
Title                      object
Review Text                object
Rating                      int64
Recommended IND             int64
Positive Feedback Count     int64
Division Name              object
Department Name            object
Class Name                 object
dtype: object

In [51]:
#look for missing data
df.isnull().sum()

Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

In [52]:
#drop rows with any missing data
df_no_missing = df.dropna()
df_no_missing.isnull().sum()

Clothing ID                0
Age                        0
Title                      0
Review Text                0
Rating                     0
Recommended IND            0
Positive Feedback Count    0
Division Name              0
Department Name            0
Class Name                 0
dtype: int64

In [53]:
df_no_missing = df_no_missing.drop(['Clothing ID','Recommended IND','Positive Feedback Count'], axis=1)

In [54]:
#list of english stopwords
nltk.download('stopwords')
eng_stopwords = stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pktra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [55]:
#create a function to clean up each review
#then it will analyze and assign a sentiment polarity
def reviewSentiment(review):
    
    #str(review)
    #make text lowercase
    review = str(review).lower()
    
    #tokenize the review
    tknz_review = word_tokenize(review)
    
    #remove puntuation
    for token in tknz_review:
        if token in punctuation:
            tknz_review.remove(token)
    
    clean_tokens = []
    #remove filler words
    for token in tknz_review:
        if token not in eng_stopwords:
            clean_tokens.append(token)
            
    #put sentence back together with remaining clean words
    clean_review = ' '.join(clean_tokens)
    
    #turn into textblob
    blob_rev = TextBlob(clean_review)
    
    #get sentiment polarity
    r_pol = blob_rev.sentiment.polarity
    
    return r_pol

In [56]:
#create a new column to hold sentiment value from function
df_no_missing['review_sentiment'] = df_no_missing['Review Text'].apply(reviewSentiment)

In [57]:
df_no_missing

Unnamed: 0,Age,Title,Review Text,Rating,Division Name,Department Name,Class Name,review_sentiment
2,60,Some major design flaws,I had such high hopes for this dress and reall...,3,General,Dresses,Dresses,0.082300
3,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,General Petite,Bottoms,Pants,0.500000
4,47,Flattering shirt,This shirt is very flattering to all due to th...,5,General,Tops,Blouses,0.500000
5,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,General,Dresses,Dresses,0.150000
6,39,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5,General Petite,Tops,Knits,0.075625
7,39,"Shimmer, surprisingly goes with lots","I ordered this in carbon for store pick up, an...",4,General Petite,Tops,Knits,0.196154
8,24,Flattering,I love this dress. i usually get an xs but it ...,5,General,Dresses,Dresses,-0.046875
9,34,Such a fun dress!,"I'm 5""5' and 125 lbs. i ordered the s petite t...",5,General,Dresses,Dresses,0.212554
10,53,Dress looks like it's made of cheap material,Dress runs small esp where the zipper area run...,3,General,Dresses,Dresses,-0.077249
12,53,Perfect!!!,More and more i find myself reliant on the rev...,5,General Petite,Dresses,Dresses,0.164286
