In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#this is sample data
from nltk.corpus import names  

from string import punctuation

In [13]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('names')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MR_SA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MR_SA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\MR_SA\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MR_SA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\MR_SA\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [20]:
#initilize function to do sentiment analysis
sid = SentimentIntensityAnalyzer()

In [26]:
eng_stopwords = stopwords.words('english')
eng_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [21]:
import pandas as pd

#load the data from the Reviews.csv file
filepath = "women_clothing_review1.csv"
df = pd.read_csv(filepath, encoding="latin-1") #this file is encoded differently

df.head()

Unnamed: 0,Review Text
0,Absolutely wonderful - silky and sexy and comf...
1,Love this dress! it's sooo pretty. i happene...
2,I had such high hopes for this dress and reall...
3,"I love, love, love this jumpsuit. it's fun, fl..."
4,This shirt is very flattering to all due to th...


In [22]:
df['Review Text'].isnull().sum()

0

In [23]:
df= df.dropna(subset=['Review Text'])

In [27]:
#create a function to clean up each review
#then it will analyze and assign a sentiment polarity
def reviewSentiment(review):
    
    #make text lowercase
    review = review.lower()
    
    #tokenize the review
    #tknz_review is a list
    tknz_review = word_tokenize(review)
    
    #remove puntuation
    for token in tknz_review:
        if token in punctuation:
            tknz_review.remove(token)
    
    #empty list to hold "cleaned" tokens
    clean_tokens = []
    
    #remove filler words
    for token in tknz_review:
        if token not in eng_stopwords:
            clean_tokens.append(token)
            
    #put sentence back together with remaining clean words
    clean_review = ' '.join(clean_tokens)
    
    #get the polarity scores dictionary
    sid_rev = sid.polarity_scores(clean_review)
    
    #get sentiment polarity from the "compound" key in the sid_rev dictionary
    r_comp = sid_rev['compound']
    
    #return the sentiment value
    return r_comp

In [28]:
#create a new column to hold sentiment value from function
df['review_sentiment'] = df['Review Text'].apply(reviewSentiment)

In [29]:
df.head()

Unnamed: 0,Review Text,review_sentiment
0,Absolutely wonderful - silky and sexy and comf...,0.8991
1,Love this dress! it's sooo pretty. i happene...,0.971
2,I had such high hopes for this dress and reall...,0.9062
3,"I love, love, love this jumpsuit. it's fun, fl...",0.9464
4,This shirt is very flattering to all due to th...,0.9117


In [30]:
df.dtypes

Review Text          object
review_sentiment    float64
dtype: object

In [31]:
#create a function to assign a polarity category to the sentiment
def sentimentCategory(sent_num):
    if sent_num >= 0.2:
        return "positive"
    if sent_num <= -0.2:
        return "negative"
    else:
        return "neutral"

In [32]:
#create a new column to hold sentiment category
df['sentiment_category'] = df['review_sentiment'].apply(sentimentCategory)

In [33]:
df.head()

Unnamed: 0,Review Text,review_sentiment,sentiment_category
0,Absolutely wonderful - silky and sexy and comf...,0.8991,positive
1,Love this dress! it's sooo pretty. i happene...,0.971,positive
2,I had such high hopes for this dress and reall...,0.9062,positive
3,"I love, love, love this jumpsuit. it's fun, fl...",0.9464,positive
4,This shirt is very flattering to all due to th...,0.9117,positive


In [34]:
#compare frequency of positive, negative, and neutral reviews
df['sentiment_category'].value_counts()

positive    21380
neutral       717
negative      544
Name: sentiment_category, dtype: int64

In [36]:
df['Review Text'].iloc[0]

'Absolutely wonderful - silky and sexy and comfortable'

In [37]:
df['review_sentiment'].iloc[0]

0.8991

In [38]:
df['sentiment_category'].iloc[0]

'positive'

In [39]:
df['Review Text'].iloc[4]

'This shirt is very flattering to all due to the adjustable front tie. it is the perfect length to wear with leggings and it is sleeveless so it pairs well with any cardigan. love this shirt!!!'

In [40]:
df['review_sentiment'].iloc[4]

0.9117

In [41]:
df['sentiment_category'].iloc[4]

'positive'