### import dataset

read csv file using pandas and get rid of unnamed index column

In [39]:
import pandas as pd

In [40]:
df = pd.read_csv("./dataset/WomensClothingE-CommerceReviews.csv")

In [41]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses


In [42]:
df = df.drop(df.columns[0], axis='columns')

fill na with empty string instead of drop the rows.

In [43]:
pd.DataFrame.info(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 10 columns):
Clothing ID                23486 non-null int64
Age                        23486 non-null int64
Title                      19676 non-null object
Review Text                22641 non-null object
Rating                     23486 non-null int64
Recommended IND            23486 non-null int64
Positive Feedback Count    23486 non-null int64
Division Name              23472 non-null object
Department Name            23472 non-null object
Class Name                 23472 non-null object
dtypes: int64(5), object(5)
memory usage: 1.8+ MB


In [44]:
df.fillna('', inplace=True)

### preprocess strings of "review text"

In [45]:
reviewDf = df[["Clothing ID", "Review Text"]]

In [46]:
reviewDf.head(2)

Unnamed: 0,Clothing ID,Review Text
0,767,Absolutely wonderful - silky and sexy and comf...
1,1080,Love this dress! it's sooo pretty. i happene...


In [47]:
pd.DataFrame.info(reviewDf)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 2 columns):
Clothing ID    23486 non-null int64
Review Text    23486 non-null object
dtypes: int64(1), object(1)
memory usage: 367.0+ KB


remove stopwords, tokenizing

In [48]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [49]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sieunbae/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [50]:
stopword = set(stopwords.words('english'))

In [51]:
def preprocessing(data):
    data = data.lower()
    words = RegexpTokenizer(r'[a-z]+').tokenize(data)
    words = [w for w in words if not w in stopword]
    
    for pos in [wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV]:
        words = [WordNetLemmatizer().lemmatize(x, pos) for x in words]
    
    return " ".join(words)

In [52]:
reviewDf['New Text'] = reviewDf["Review Text"].map(preprocessing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [53]:
reviewDf.head()

Unnamed: 0,Clothing ID,Review Text,New Text
0,767,Absolutely wonderful - silky and sexy and comf...,absolutely wonderful silky sexy comfortable
1,1080,Love this dress! it's sooo pretty. i happene...,love dress sooo pretty happen find store glad ...
2,1077,I had such high hopes for this dress and reall...,high hope dress really want work initially ord...
3,1049,"I love, love, love this jumpsuit. it's fun, fl...",love love love jumpsuit fun flirty fabulous ev...
4,847,This shirt is very flattering to all due to th...,shirt flat due adjustable front tie perfect le...


#### SentiWordNet

In [16]:
nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/sieunbae/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

In [34]:
lemmatizer = WordNetLemmatizer()

In [24]:
from nltk.corpus import sentiwordnet as swn

In [25]:
def swn_polarity(text, tag):
    sentiment = 0.0
    
    lemma = lemmatizer.lemmatize(text, pos = )
    sentiment = swn.senti_synset(text)
    print(sentiment.all_senti_synsets())

In [None]:
for text in reviewDf['New Text'].astype('str')
reviewDf['New Text'].astype('str').split()
swn_polarity('')

In [26]:
reviewDf['SWN'] = reviewDf['New Text'].map(swn_polarity)

ValueError: not enough values to unpack (expected 3, got 1)

#### Using TextBlob

In [65]:
from textblob import TextBlob

In [68]:
reviewDf['TextBlob'] = reviewDf['New Text'].apply(lambda text: TextBlob(text).sentiment.polarity)
reviewDf['TextBlob'] = reviewDf['TextBlob'].apply(lambda x: 'positive' if x>0 else ('negative' if x<0 else 'neutral') )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [70]:
reviewDf.head(3)

Unnamed: 0,Clothing ID,Review Text,New Text,TextBlob
0,767,Absolutely wonderful - silky and sexy and comf...,absolutely wonderful silky sexy comfortable,positive
1,1080,Love this dress! it's sooo pretty. i happene...,love dress sooo pretty happen find store glad ...,positive
2,1077,I had such high hopes for this dress and reall...,high hope dress really want work initially ord...,positive


#### Using AFINN dictionary

In [74]:
pip install afinn

Collecting afinn
[?25l  Downloading https://files.pythonhosted.org/packages/86/e5/ffbb7ee3cca21ac6d310ac01944fb163c20030b45bda25421d725d8a859a/afinn-0.1.tar.gz (52kB)
[K     |████████████████████████████████| 61kB 871kB/s eta 0:00:01
[?25hBuilding wheels for collected packages: afinn
  Building wheel for afinn (setup.py) ... [?25ldone
[?25h  Stored in directory: /Users/sieunbae/Library/Caches/pip/wheels/b5/1c/de/428301f3333ca509dcf20ff358690eb23a1388fbcbbde008b2
Successfully built afinn
Installing collected packages: afinn
Successfully installed afinn-0.1
Note: you may need to restart the kernel to use updated packages.


In [75]:
from afinn import Afinn

In [76]:
afn = Afinn(emoticons = True)

In [78]:
reviewDf['AFINN'] = reviewDf['New Text'].apply(lambda text: afn.score(text))
reviewDf['AFINN'] = reviewDf['AFINN'].apply(lambda x: 'positive' if x>0 else ('negative' if x<0 else 'neutral'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [79]:
reviewDf.head(3)

Unnamed: 0,Clothing ID,Review Text,New Text,TextBlob,Rank_Class,AFINN
0,767,Absolutely wonderful - silky and sexy and comf...,absolutely wonderful silky sexy comfortable,positive,positive,positive
1,1080,Love this dress! it's sooo pretty. i happene...,love dress sooo pretty happen find store glad ...,positive,positive,positive
2,1077,I had such high hopes for this dress and reall...,high hope dress really want work initially ord...,positive,neutral,positive


#### Using 'rank' feature

In [71]:
def rank_polarity(rate):
    if rate >= 4:
        return 'positive'
    elif rate >= 3:
        return 'neutral'
    else:
        return 'negative'

In [72]:
reviewDf['Rank_Class'] = df['Rating'].map(rank_polarity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [73]:
reviewDf.head()

Unnamed: 0,Clothing ID,Review Text,New Text,TextBlob,Rank_Class
0,767,Absolutely wonderful - silky and sexy and comf...,absolutely wonderful silky sexy comfortable,positive,positive
1,1080,Love this dress! it's sooo pretty. i happene...,love dress sooo pretty happen find store glad ...,positive,positive
2,1077,I had such high hopes for this dress and reall...,high hope dress really want work initially ord...,positive,neutral
3,1049,"I love, love, love this jumpsuit. it's fun, fl...",love love love jumpsuit fun flirty fabulous ev...,positive,positive
4,847,This shirt is very flattering to all due to th...,shirt flat due adjustable front tie perfect le...,positive,positive


### Classifier

In [62]:
print(reviewDf['New Text'].astype('str')[0].split())

['absolutely', 'wonderful', 'silky', 'sexy', 'comfortable']


In [55]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(reviewDf['New Text'], df['Recommended ID'], test_size=0.2])

SyntaxError: invalid syntax (<ipython-input-55-22b03c7949d1>, line 3)

In [1]:
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

def classifier(data):
    train_set = data[1500:]
    test_set = data[:1500]
    
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('NaiveBayes:', nltk.classify.accuracy(classifier, test_set))
    
    classifier = nltk.DecisionTreeClassifier.train(train_set)
    print('DecisionTree:', nltk.classify.accuracy(classifier, test_set))
    
    classifier = SklearnClassifier(BernoulliNB()).train(train_set)
    print('BernoulliNB:', nltk.classify.accuracy(classifier, test_set))
    
    classifier = SklearnClassifier(SVC(), sparse=False).train(train_set)
    print('SVC:', nltk.classify.accuracy(classifier, test_set))