### import dataset

read csv file using pandas and get rid of unnamed index column

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("./dataset/WomensClothingE-CommerceReviews.csv")

In [3]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses


In [4]:
df = df.drop(df.columns[0], axis='columns')

fill na with empty string instead of drop the rows.

In [5]:
pd.DataFrame.info(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 10 columns):
Clothing ID                23486 non-null int64
Age                        23486 non-null int64
Title                      19676 non-null object
Review Text                22641 non-null object
Rating                     23486 non-null int64
Recommended IND            23486 non-null int64
Positive Feedback Count    23486 non-null int64
Division Name              23472 non-null object
Department Name            23472 non-null object
Class Name                 23472 non-null object
dtypes: int64(5), object(5)
memory usage: 1.8+ MB


In [6]:
df.fillna('', inplace=True)

### preprocess strings of "review text"

In [7]:
reviewDf = df[["Clothing ID", "Review Text"]]

In [8]:
reviewDf.head(2)

Unnamed: 0,Clothing ID,Review Text
0,767,Absolutely wonderful - silky and sexy and comf...
1,1080,Love this dress! it's sooo pretty. i happene...


In [9]:
pd.DataFrame.info(reviewDf)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 2 columns):
Clothing ID    23486 non-null int64
Review Text    23486 non-null object
dtypes: int64(1), object(1)
memory usage: 367.0+ KB


remove stopwords, tokenizing

In [10]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [11]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sieunbae/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
stopword = set(stopwords.words('english'))

In [13]:
def preprocessing(data):
    data = data.lower()
    words = RegexpTokenizer(r'[a-z]+').tokenize(data)
    words = [w for w in words if not w in stopword]
    
    for pos in [wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV]:
        words = [WordNetLemmatizer().lemmatize(x, pos) for x in words]
    
    return " ".join(words)

In [14]:
reviewDf['New Text'] = reviewDf["Review Text"].map(preprocessing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
reviewDf.head()

Unnamed: 0,Clothing ID,Review Text,New Text
0,767,Absolutely wonderful - silky and sexy and comf...,absolutely wonderful silky sexy comfortable
1,1080,Love this dress! it's sooo pretty. i happene...,love dress sooo pretty happen find store glad ...
2,1077,I had such high hopes for this dress and reall...,high hope dress really want work initially ord...
3,1049,"I love, love, love this jumpsuit. it's fun, fl...",love love love jumpsuit fun flirty fabulous ev...
4,847,This shirt is very flattering to all due to th...,shirt flat due adjustable front tie perfect le...


#### SentiWordNet

In [16]:
nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/sieunbae/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

In [17]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/sieunbae/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sieunbae/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [19]:
lemmatizer = WordNetLemmatizer()

In [20]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag

In [21]:
def penn_to_wn(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None

lemmatizer = WordNetLemmatizer()

def swn_polarity(text):
    sentiment = 0.0
    tokens_count = 0
    
    raw_sentences = sent_tokenize(text)
    for raw_sentnece in raw_sentences:
        tagged_sentence = pos_tag(word_tokenize(text))
        for word, tag in tagged_sentence:
            wn_tag = penn_to_wn(tag)
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue
            lemma = lemmatizer.lemmatize(word, pos = wn_tag)
            if not lemma:
                continue
            synsets = wn.synsets(lemma, pos = wn_tag)
            if not synsets:
                continue
            synset = synsets[0]
            swn_synset = swn.senti_synset(synset.name())
            sentiment += swn_synset.pos_score() - swn_synset.neg_score()
            tokens_count += 1
    if not tokens_count:
        return 0
    if sentiment > 0:
        return 1
    elif sentiment == 0:
        return 0
    
    return -1

In [22]:
reviewDf['SWN'] = reviewDf['New Text'].apply(swn_polarity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [23]:
reviewDf.head(2)

Unnamed: 0,Clothing ID,Review Text,New Text,SWN
0,767,Absolutely wonderful - silky and sexy and comf...,absolutely wonderful silky sexy comfortable,1
1,1080,Love this dress! it's sooo pretty. i happene...,love dress sooo pretty happen find store glad ...,1


#### Using TextBlob

In [24]:
from textblob import TextBlob

In [25]:
reviewDf['TextBlob'] = reviewDf['New Text'].apply(lambda text: TextBlob(text).sentiment.polarity)
reviewDf['TextBlob'] = reviewDf['TextBlob'].apply(lambda x: 1 if x>0 else (-1 if x<0 else 0) )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [26]:
reviewDf.head(3)

Unnamed: 0,Clothing ID,Review Text,New Text,SWN,TextBlob
0,767,Absolutely wonderful - silky and sexy and comf...,absolutely wonderful silky sexy comfortable,1,1
1,1080,Love this dress! it's sooo pretty. i happene...,love dress sooo pretty happen find store glad ...,1,1
2,1077,I had such high hopes for this dress and reall...,high hope dress really want work initially ord...,-1,1


#### Using AFINN dictionary

In [27]:
pip install afinn

Note: you may need to restart the kernel to use updated packages.


In [28]:
from afinn import Afinn

In [29]:
afn = Afinn(emoticons = True)

In [30]:
reviewDf['AFINN'] = reviewDf['New Text'].apply(lambda text: afn.score(text))
reviewDf['AFINN'] = reviewDf['AFINN'].apply(lambda x: 1 if x>0 else (-1 if x<0 else 0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [31]:
reviewDf.head(3)

Unnamed: 0,Clothing ID,Review Text,New Text,SWN,TextBlob,AFINN
0,767,Absolutely wonderful - silky and sexy and comf...,absolutely wonderful silky sexy comfortable,1,1,1
1,1080,Love this dress! it's sooo pretty. i happene...,love dress sooo pretty happen find store glad ...,1,1,1
2,1077,I had such high hopes for this dress and reall...,high hope dress really want work initially ord...,-1,1,1


#### Using 'rank' feature

In [32]:
def rank_polarity(rate):
    if rate >= 4:
        return 1
    elif rate >= 3:
        return 0
    else:
        return -1

In [33]:
reviewDf['Rank_Class'] = df['Rating'].map(rank_polarity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [34]:
reviewDf.head()

Unnamed: 0,Clothing ID,Review Text,New Text,SWN,TextBlob,AFINN,Rank_Class
0,767,Absolutely wonderful - silky and sexy and comf...,absolutely wonderful silky sexy comfortable,1,1,1,1
1,1080,Love this dress! it's sooo pretty. i happene...,love dress sooo pretty happen find store glad ...,1,1,1,1
2,1077,I had such high hopes for this dress and reall...,high hope dress really want work initially ord...,-1,1,1,0
3,1049,"I love, love, love this jumpsuit. it's fun, fl...",love love love jumpsuit fun flirty fabulous ev...,1,1,1,1
4,847,This shirt is very flattering to all due to th...,shirt flat due adjustable front tie perfect le...,1,1,1,1


### Classifier

In [35]:
reviewDf['recommend'] = df['Recommended IND']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [36]:
reviewDf.head(5)

Unnamed: 0,Clothing ID,Review Text,New Text,SWN,TextBlob,AFINN,Rank_Class,recommend
0,767,Absolutely wonderful - silky and sexy and comf...,absolutely wonderful silky sexy comfortable,1,1,1,1,1
1,1080,Love this dress! it's sooo pretty. i happene...,love dress sooo pretty happen find store glad ...,1,1,1,1,1
2,1077,I had such high hopes for this dress and reall...,high hope dress really want work initially ord...,-1,1,1,0,0
3,1049,"I love, love, love this jumpsuit. it's fun, fl...",love love love jumpsuit fun flirty fabulous ev...,1,1,1,1,1
4,847,This shirt is very flattering to all due to th...,shirt flat due adjustable front tie perfect le...,1,1,1,1,1


In [37]:
Df = reviewDf.iloc[:, 3:]

In [38]:
Df.head(3)

Unnamed: 0,SWN,TextBlob,AFINN,Rank_Class,recommend
0,1,1,1,1,1
1,1,1,1,1,1
2,-1,1,1,0,0


In [43]:
X = Df.iloc[:, :-1]
y = Df.iloc[:, -1]
print(X, y)

       SWN  TextBlob  AFINN  Rank_Class
0        1         1      1           1
1        1         1      1           1
2       -1         1      1           0
3        1         1      1           1
4        1         1      1           1
5        1         1      1          -1
6        1         1     -1           1
7        1         1      1           1
8       -1        -1      1           1
9        0         1      1           1
10       1         1      1           0
11      -1         1      1           1
12       1         1      1           1
13       1         1      1           1
14      -1        -1      1           0
15       1         1      1           1
16       1         1      1           0
17       1         1      1           1
18       0         1      1           1
19       1         1      1           1
20      -1         1      1           1
21      -1         1      1           1
22      -1        -1     -1          -1
23      -1         1      1           0


In [46]:
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

def classifier(X, y):
    X_train, X_test, y_train, y_test =  train_test_split(X, y,test_size=0.3)
    
    gnb = GaussianNB() 
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    print("Accuracy using GaussianNB",accuracy_score(y_test, y_pred))
    
    dt = DecisionTreeClassifier()
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    print("Accuracy using DecisionTree",accuracy_score(y_test, y_pred))
    
    svc = SVC(kernel='linear')
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    print("Accuracy using SVC",accuracy_score(y_test, y_pred))

In [47]:
classifier(X, y)

Accuracy using GaussianNB 0.9083167754754471
Accuracy using DecisionTree 0.9352824297473744
Accuracy using SVC 0.9351405052512064


- add avg feature of sum of the sentiment analyzed results

- add more features, review text's length

#### Furthermore, how about grouping by the clothing ID and then analyze...