In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score, auc, precision_recall_fscore_support

In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# raw = pd.read_csv('../../amazon_review_polarity_data/train.csv',header=None)
# smallset = raw.sample(360000)
# smallset.to_csv('small_training_set.csv', index=False, header=False)

In [5]:
data = pd.read_csv('small_training_set.csv',header=None)

In [6]:
data.columns=['label','title','body']

In [7]:
data.head()

Unnamed: 0,label,title,body
0,1,Frustrating,Breathnach's Simple Abundance and gratitude jo...
1,2,Not what I expected... But Good,I was looking for a battery grip extended batt...
2,2,well worth your money,After buying and using seveeral different and ...
3,2,Wake up sisters....,I read this book in one day. I Could not put i...
4,1,Terrible help file,Was trying to use the alignment feature with t...


In [8]:
data.title.isnull().sum()

14

In [9]:
data = data.dropna()

In [10]:
data.shape

(359986, 3)

## Title

In [32]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

In [34]:
data.title

0                                   Frustrating
1               Not what I expected... But Good
2                         well worth your money
3                           Wake up sisters....
4                            Terrible help file
                          ...                  
359995                      Not what I expected
359996               Very average for the roots
359997                     This movie sucks ***
359998    Doesn't cover multi-carburetor tuning
359999             You have got be kidding me!!
Name: title, Length: 359986, dtype: object

In [33]:
data.title.apply(lambda x: stemmer.stem(x))

0                                   frustrat
1            not what i expected... but good
2                      well worth your money
3                        wake up sisters....
4                          terrible help fil
                         ...                
359995                     not what i expect
359996             very average for the root
359997                  this movie sucks ***
359998    doesn't cover multi-carburetor tun
359999          you have got be kidding me!!
Name: title, Length: 359986, dtype: object

In [38]:
data.title.apply(lambda x: lemmer.lemmatize(x))

0                                   Frustrating
1               Not what I expected... But Good
2                         well worth your money
3                           Wake up sisters....
4                            Terrible help file
                          ...                  
359995                      Not what I expected
359996               Very average for the roots
359997                     This movie sucks ***
359998    Doesn't cover multi-carburetor tuning
359999             You have got be kidding me!!
Name: title, Length: 359986, dtype: object

In [35]:
from nltk.stem import WordNetLemmatizer
lemmer=WordNetLemmatizer()

In [37]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shixi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

## Convert to count vector

In [11]:
count_vectorizer = CountVectorizer(stop_words='english')

In [12]:
X_train = data.title[:300000]
y_train = data.label[:300000]
X_test = data.title[300000:]
y_test = data.label[300000:]

In [13]:
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [14]:
count_train

<300000x53294 sparse matrix of type '<class 'numpy.int64'>'
	with 787149 stored elements in Compressed Sparse Row format>

In [15]:
count_test

<59986x53294 sparse matrix of type '<class 'numpy.int64'>'
	with 152161 stored elements in Compressed Sparse Row format>

In [16]:
sum_words = count_train.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in count_vectorizer.vocabulary_.items()]
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)

In [17]:
words_freq

[('great', 22981),
 ('good', 16702),
 ('book', 14291),
 ('best', 8008),
 ('don', 5750),
 ('read', 5184),
 ('movie', 5074),
 ('product', 5039),
 ('buy', 4991),
 ('bad', 4835),
 ('love', 4689),
 ('excellent', 4616),
 ('money', 4606),
 ('time', 3663),
 ('waste', 3563),
 ('work', 3510),
 ('just', 3476),
 ('like', 3472),
 ('quality', 3453),
 ('better', 3405),
 ('poor', 3278),
 ('worth', 2898),
 ('cd', 2844),
 ('nice', 2798),
 ('works', 2749),
 ('fun', 2655),
 ('disappointed', 2389),
 ('disappointing', 2374),
 ('review', 2370),
 ('price', 2350),
 ('really', 2321),
 ('worst', 2313),
 ('does', 2307),
 ('awesome', 2212),
 ('album', 2195),
 ('little', 2132),
 ('dvd', 2117),
 ('terrible', 2103),
 ('story', 2090),
 ('music', 2036),
 ('game', 2030),
 ('new', 1994),
 ('boring', 1948),
 ('horrible', 1917),
 ('wonderful', 1877),
 ('doesn', 1853),
 ('perfect', 1746),
 ('junk', 1708),
 ('classic', 1689),
 ('use', 1579),
 ('old', 1561),
 ('amazing', 1504),
 ('life', 1422),
 ('ve', 1415),
 ('big', 1388),


## Model

In [18]:
nb_classifier = MultinomialNB()
nb_classifier.fit(count_train, y_train)

MultinomialNB()

In [19]:
pred = nb_classifier.predict(count_test)
score = accuracy_score(y_test, pred)

In [20]:
score

0.7930683826226119

In [21]:
neg_class_prob_sorted = nb_classifier.feature_log_prob_[0, :].argsort()[::-1]
pos_class_prob_sorted = nb_classifier.feature_log_prob_[1, :].argsort()[::-1]

print(np.take(count_vectorizer.get_feature_names(), neg_class_prob_sorted[:20]))
print(np.take(count_vectorizer.get_feature_names(), pos_class_prob_sorted[:20]))

['good' 'book' 'don' 'bad' 'money' 'buy' 'waste' 'poor' 'great' 'work'
 'product' 'quality' 'disappointing' 'disappointed' 'worst' 'movie' 'time'
 'terrible' 'like' 'just']
['great' 'good' 'book' 'best' 'excellent' 'love' 'read' 'movie' 'product'
 'fun' 'awesome' 'works' 'nice' 'cd' 'wonderful' 'price' 'better'
 'perfect' 'album' 'just']


In [23]:
count_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [24]:
# from sklearn.inspection import permutation_importance

# imps = permutation_importance(nb_classifier, count_test.toarray(), y_test)
# importances = imps.importances_mean
# std = imps.importances_std
# indices = np.argsort(importances)[::-1]

## Body

### 1. convert to count vector

In [22]:
count_vectorizer = CountVectorizer(stop_words='english')

In [23]:
X_train = data.body[:300000]
y_train = data.label[:300000]
X_test = data.body[300000:]
y_test = data.label[300000:]

In [24]:
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [25]:
sum_words = count_train.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in count_vectorizer.vocabulary_.items()]
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)

In [26]:
words_freq

[('book', 150511),
 ('like', 82239),
 ('just', 75500),
 ('good', 70632),
 ('great', 67033),
 ('read', 53866),
 ('time', 52967),
 ('really', 45310),
 ('movie', 45184),
 ('don', 43240),
 ('love', 34628),
 ('buy', 32808),
 ('use', 32474),
 ('cd', 31943),
 ('product', 31870),
 ('better', 31038),
 ('bought', 30587),
 ('work', 29816),
 ('did', 29472),
 ('new', 27912),
 ('way', 27874),
 ('story', 27694),
 ('album', 27326),
 ('best', 27256),
 ('little', 26854),
 ('ve', 26797),
 ('think', 26618),
 ('does', 26225),
 ('make', 25119),
 ('know', 25039),
 ('got', 24615),
 ('music', 24424),
 ('money', 23556),
 ('people', 22957),
 ('want', 22725),
 ('books', 22166),
 ('years', 21944),
 ('recommend', 21886),
 ('old', 21776),
 ('used', 20823),
 ('bad', 20578),
 ('didn', 19662),
 ('dvd', 19220),
 ('reading', 19208),
 ('say', 19152),
 ('game', 19064),
 ('quality', 18386),
 ('life', 18239),
 ('songs', 18065),
 ('thing', 17547),
 ('thought', 17408),
 ('doesn', 16341),
 ('easy', 16274),
 ('lot', 16019),
 ('a

### 2. model creating, training and prediction

In [27]:
nb_classifier = MultinomialNB()
nb_classifier.fit(count_train, y_train)

MultinomialNB()

In [28]:
pred = nb_classifier.predict(count_test)
score = accuracy_score(y_test, pred)

In [29]:
score

0.8192911679391858

In [30]:
nb_classifier.classes_

array([1, 2], dtype=int64)

In [31]:
nb_classifier.class_count_

array([149781., 150219.])

In [32]:
nb_classifier.feature_log_prob_

array([[ -8.22789527,  -9.47230164, -14.10703063, ..., -14.3947127 ,
        -15.49332499, -15.49332499],
       [ -9.05581721,  -9.61924092, -13.04142857, ..., -14.05302949,
        -14.74617667, -14.74617667]])

In [33]:
neg_class_prob_sorted = nb_classifier.feature_log_prob_[0, :].argsort()[::-1]
pos_class_prob_sorted = nb_classifier.feature_log_prob_[1, :].argsort()[::-1]

print(np.take(count_vectorizer.get_feature_names(), neg_class_prob_sorted[:20]))
print(np.take(count_vectorizer.get_feature_names(), pos_class_prob_sorted[:20]))

['book' 'just' 'like' 'good' 'time' 'don' 'movie' 'read' 'really'
 'product' 'buy' 'money' 'did' 'better' 'bought' 'work' 'great' 'use'
 'bad' 'does']
['book' 'great' 'good' 'like' 'just' 'read' 'love' 'time' 'really' 'movie'
 'best' 'cd' 'album' 'use' 'story' 'don' 'little' 'music' 've' 'new']


In [34]:
nb_classifier.feature_log_prob_[0, :].argsort()[::-1]

array([ 28339, 102082, 109698, ...,  66345, 147578, 210062], dtype=int64)

In [35]:
nb_classifier.feature_log_prob_[0, :]

array([ -8.22789527,  -9.47230164, -14.10703063, ..., -14.3947127 ,
       -15.49332499, -15.49332499])

### 3. tf-tdf

In [25]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

In [26]:
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [38]:
nb_classifier = MultinomialNB()
nb_classifier.fit(tfidf_train, y_train)
pred = nb_classifier.predict(tfidf_test)
score = accuracy_score(y_test, pred)
score

0.8176407828493315

In [39]:
sum_words = tfidf_train.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in tfidf_vectorizer.vocabulary_.items()]
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)

In [40]:
tfidf_train

<300000x210063 sparse matrix of type '<class 'numpy.float64'>'
	with 8806272 stored elements in Compressed Sparse Row format>

In [41]:
words_freq

[('book', 9207.425534728178),
 ('great', 5124.245560837037),
 ('like', 5092.831873521934),
 ('good', 5045.761941366971),
 ('just', 4843.440688934946),
 ('movie', 4307.9019878267345),
 ('read', 4253.106772437303),
 ('time', 3905.2397302228546),
 ('really', 3598.7016772339316),
 ('don', 3305.9227812006793),
 ('product', 3223.6849968936476),
 ('love', 3169.234985967884),
 ('cd', 3127.535814181859),
 ('buy', 2910.04642110707),
 ('use', 2792.9315040430165),
 ('bought', 2785.8783028873704),
 ('better', 2627.0321996537537),
 ('work', 2614.2038143829623),
 ('did', 2567.61634993878),
 ('story', 2512.9172396537983),
 ('album', 2507.994734986436),
 ('best', 2422.573969702565),
 ('money', 2365.915500303173),
 ('little', 2365.441563946448),
 ('way', 2320.675215113895),
 ('new', 2319.775908554381),
 ('got', 2310.7199732163263),
 ('ve', 2310.693994924523),
 ('think', 2296.66410194374),
 ('music', 2276.391145501594),
 ('does', 2260.506527938294),
 ('recommend', 2204.366072494626),
 ('know', 2165.67081

In [42]:
neg_class_prob_sorted = nb_classifier.feature_log_prob_[0, :].argsort()[::-1] # argsore returns an array of index ordered by the value
pos_class_prob_sorted = nb_classifier.feature_log_prob_[1, :].argsort()[::-1]

print(np.take(count_vectorizer.get_feature_names(), neg_class_prob_sorted[:20]))
print(np.take(count_vectorizer.get_feature_names(), pos_class_prob_sorted[:20]))

['book' 'just' 'like' 'movie' 'don' 'good' 'time' 'product' 'money' 'read'
 'buy' 'did' 'really' 'work' 'bought' 'better' 'bad' 'waste' 'use' 'didn']
['book' 'great' 'good' 'read' 'love' 'like' 'just' 'movie' 'really' 'cd'
 'time' 'best' 'album' 'easy' 'story' 'music' 'use' 'recommend' 'little'
 've']


In [43]:
nb_classifier.feature_log_prob_[0, :]

array([ -8.36301008,  -9.65790296, -13.38936406, ..., -13.54020875,
       -13.73965496, -13.73965496])

In [44]:
nb_classifier.feature_log_prob_[0, :].argsort()

array([210062,  74527,  74526, ..., 109698, 102082,  28339], dtype=int64)

In [45]:
np.array([1,2,3,4])

array([1, 2, 3, 4])

In [46]:
np.array([1,4,5,2]).argsort()

array([0, 3, 1, 2], dtype=int64)