### import dataset

read csv file using pandas and get rid of unnamed index column

In [109]:
import pandas as pd

In [110]:
df = pd.read_csv("./dataset/WomensClothingE-CommerceReviews.csv")

In [111]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [112]:
df = df.drop(df.columns[0], axis='columns')

fill na with empty string instead of drop the rows.

In [113]:
pd.DataFrame.info(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 10 columns):
Clothing ID                23486 non-null int64
Age                        23486 non-null int64
Title                      19676 non-null object
Review Text                22641 non-null object
Rating                     23486 non-null int64
Recommended IND            23486 non-null int64
Positive Feedback Count    23486 non-null int64
Division Name              23472 non-null object
Department Name            23472 non-null object
Class Name                 23472 non-null object
dtypes: int64(5), object(5)
memory usage: 1.8+ MB


In [89]:
df.fillna('', inplace=True)

### preprocess strings of "review text"

In [90]:
reviewDf = df[["Clothing ID", "Review Text"]]

In [91]:
reviewDf.head()

Unnamed: 0,Clothing ID,Review Text
0,767,Absolutely wonderful - silky and sexy and comf...
1,1080,Love this dress! it's sooo pretty. i happene...
2,1077,I had such high hopes for this dress and reall...
3,1049,"I love, love, love this jumpsuit. it's fun, fl..."
4,847,This shirt is very flattering to all due to th...


In [92]:
pd.DataFrame.info(reviewDf)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 2 columns):
Clothing ID    23486 non-null int64
Review Text    23486 non-null object
dtypes: int64(1), object(1)
memory usage: 367.0+ KB


remove stopwords, tokenizing

In [93]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [94]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sieunbae/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [95]:
stopword = set(stopwords.words('english'))

In [98]:
def preprocessing(data):
    data = data.lower()
    words = RegexpTokenizer(r'[a-z]+').tokenize(data)
    words = [w for w in words if not w in stopword]
    
    for pos in [wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV]:
        words = [WordNetLemmatizer().lemmatize(x, pos) for x in words]
    
    return " ".join(words)

In [99]:
reviewDf['New Text'] = reviewDf["Review Text"].map(preprocessing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


#### SentiWordNet

In [100]:
nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/sieunbae/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.


True

In [101]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag

In [102]:
lemmatizer = WordNetLemmatizer()

In [None]:
def swn_polarity(text):
    sentiment = 0.0
    

#### Using 'rank' feature

In [105]:
def rank_polarity(rate):
    if rate >= 4:
        return 'Positive'
    elif rate >= 3:
        return 'Neutral'
    else:
        return 'Negative'

In [106]:
reviewDf['Rank_Class'] = df['Rating'].map(rank_polarity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [108]:
reviewDf.head()

Unnamed: 0,Clothing ID,Review Text,New Text,Rank_Class
0,767,Absolutely wonderful - silky and sexy and comf...,absolutely wonderful silky sexy comfortable,Positive
1,1080,Love this dress! it's sooo pretty. i happene...,love dress sooo pretty happen find store glad ...,Positive
2,1077,I had such high hopes for this dress and reall...,high hope dress really want work initially ord...,Neutral
3,1049,"I love, love, love this jumpsuit. it's fun, fl...",love love love jumpsuit fun flirty fabulous ev...,Positive
4,847,This shirt is very flattering to all due to th...,shirt flat due adjustable front tie perfect le...,Positive


### Classifier

In [None]:
train_set = featuresets[1500:]
test_set = featuresets[:1500]

- NaiveBayes

In [None]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
#classifier.show_most_informative_features(20)
nltk.classify.accuracy(classifier, test_set)

- DecisionTree

In [None]:
classifier = nltk.DecisionTreeClassifier.train(train_set)
#classifier.show_most_informative_features(20)
nltk.classify.accuracy(classifier, test_set)

- from sklearn, nernoulliNB, SVC

In [114]:
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

In [None]:
classifier = SklearnClassifier(BernoulliNB()).train(train_set)
nltk.classify.accuracy(classifier, test_set)

In [None]:
classifier = SklearnClassifier(SVC(), sparse=False).train(train_set)
nltk.classify.accuracy(classifier, test_set)