In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score, auc, precision_recall_fscore_support

In [4]:
# raw = pd.read_csv('../../amazon_review_polarity_data/train.csv',header=None)
# smallset = raw.sample(360000)
# smallset.to_csv('small_training_set.csv', index=False, header=False)

In [5]:
data = pd.read_csv('small_training_set.csv',header=None)

In [6]:
data.columns=['label','title','body']

In [7]:
data.head()

Unnamed: 0,label,title,body
0,1,Frustrating,Breathnach's Simple Abundance and gratitude jo...
1,2,Not what I expected... But Good,I was looking for a battery grip extended batt...
2,2,well worth your money,After buying and using seveeral different and ...
3,2,Wake up sisters....,I read this book in one day. I Could not put i...
4,1,Terrible help file,Was trying to use the alignment feature with t...


In [8]:
data.title.isnull().sum()

14

In [9]:
data = data.dropna()

In [10]:
data.shape

(359986, 3)

# 1. Title

In [106]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

In [107]:
from nltk.stem import WordNetLemmatizer
lemmer=WordNetLemmatizer()
# import nltk
# nltk.download('wordnet')

In [12]:
data.title

0                                   Frustrating
1               Not what I expected... But Good
2                         well worth your money
3                           Wake up sisters....
4                            Terrible help file
                          ...                  
359995                      Not what I expected
359996               Very average for the roots
359997                     This movie sucks ***
359998    Doesn't cover multi-carburetor tuning
359999             You have got be kidding me!!
Name: title, Length: 359986, dtype: object

In [13]:
data.title.apply(lambda x: stemmer.stem(x))

0                                   frustrat
1            not what i expected... but good
2                      well worth your money
3                        wake up sisters....
4                          terrible help fil
                         ...                
359995                     not what i expect
359996             very average for the root
359997                  this movie sucks ***
359998    doesn't cover multi-carburetor tun
359999          you have got be kidding me!!
Name: title, Length: 359986, dtype: object

In [108]:
data.title.apply(lambda x: lemmer.lemmatize(x))

0                                   Frustrating
1               Not what I expected... But Good
2                         well worth your money
3                           Wake up sisters....
4                            Terrible help file
                          ...                  
359995                      Not what I expected
359996               Very average for the roots
359997                     This movie sucks ***
359998    Doesn't cover multi-carburetor tuning
359999             You have got be kidding me!!
Name: title, Length: 359986, dtype: object

In [105]:
stemmer.stem("the boy's cars are different colors")

"the boy's cars are different color"

In [109]:
lemmer.lemmatize("the boy's cars are different colors")

"the boy's cars are different colors"

In [16]:
X_train = data.title[:300000]
y_train = data.label[:300000]
X_test = data.title[300000:]
y_test = data.label[300000:]

## 1.1 count vector

In [15]:
count_vectorizer = CountVectorizer(stop_words='english')

In [17]:
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [18]:
count_train

<300000x53294 sparse matrix of type '<class 'numpy.int64'>'
	with 787149 stored elements in Compressed Sparse Row format>

## 1.2 Model

In [21]:
model = LogisticRegression(max_iter=1000)
model.fit(count_train, y_train)

LogisticRegression(max_iter=1000)

In [22]:
pred = model.predict(count_test)

In [27]:
print('accuracy: ' , accuracy_score(y_test, pred))
print(('AUC: '), roc_auc_score(y_test, pred))

accuracy:  0.7970193045043844
AUC:  0.7970203504704927


In [43]:
feature_names = np.array(count_vectorizer.get_feature_names())
coef_index_sorted = model.coef_[0].argsort()
print('Smallest Coefs: {}'.format(feature_names[coef_index_sorted[:10]]))
print('Largest Coefs: {}'.format(feature_names[coef_index_sorted[-10:]]))

Smallest Coefs: ['worst' 'awful' 'disappointing' 'useless' 'horrible' 'terrible' 'junk'
 'disappointment' 'poorly' 'waste']
Largest Coefs: ['wonderful' 'amazing' 'delightful' 'fabulous' 'gem' 'fantastic'
 'underrated' 'awesome' 'excellent' 'outstanding']


## 1.3 2_gram

In [87]:
count_vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,2))
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [88]:
model = LogisticRegression(max_iter=1000)
model.fit(count_train, y_train)
pred = model.predict(count_test)

In [89]:
print('accuracy: ' , accuracy_score(y_test, pred))
print(('AUC: '), roc_auc_score(y_test, pred))

accuracy:  0.8101557029973661
AUC:  0.8101488386305544


In [90]:
feature_names = np.array(count_vectorizer.get_feature_names())
coef_index_sorted = model.coef_[0].argsort()
print('Smallest Coefs: {}'.format(feature_names[coef_index_sorted[:10]]))
print('Largest Coefs: {}'.format(feature_names[coef_index_sorted[-10:]]))

Smallest Coefs: ['worst' 'disappointing' 'awful' 'horrible' 'terrible' 'useless'
 'great works' 'junk' 'disappointment' 'poorly']
Largest Coefs: ['terrific' 'gem' 'underrated' 'fantastic' 'doesn better' 'awesome'
 'excellent' 'outstanding' 'won disappointed' 'better expected']


## 1.4 3_gram

In [91]:
count_vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,3))
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [92]:
model = LogisticRegression(max_iter=1000)
model.fit(count_train, y_train)
pred = model.predict(count_test)

In [93]:
print('accuracy: ' , accuracy_score(y_test, pred))
print(('AUC: '), roc_auc_score(y_test, pred))

accuracy:  0.8101723735538292
AUC:  0.8101655537711288


In [94]:
feature_names = np.array(count_vectorizer.get_feature_names())
coef_index_sorted = model.coef_[0].argsort()
print('Smallest Coefs: {}'.format(feature_names[coef_index_sorted[:10]]))
print('Largest Coefs: {}'.format(feature_names[coef_index_sorted[-10:]]))

Smallest Coefs: ['worst' 'disappointing' 'awful' 'horrible' 'great works' 'terrible'
 'useless' 'junk' 'disappointment' 'poorly']
Largest Coefs: ['fabulous' 'terrific' 'gem' 'fantastic' 'doesn better' 'awesome'
 'outstanding' 'excellent' 'won disappointed' 'better expected']


## 1.5 TF-IDF

In [110]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=5,ngram_range=(1,2))

In [111]:
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [112]:
tfidf_train

<300000x21639 sparse matrix of type '<class 'numpy.float64'>'
	with 870770 stored elements in Compressed Sparse Row format>

In [113]:
model = LogisticRegression(max_iter=1000)
model.fit(tfidf_train, y_train)
pred = model.predict(tfidf_test)

In [114]:
print('accuracy: ' , accuracy_score(y_test, pred))
print(('AUC: '), roc_auc_score(y_test, pred))

accuracy:  0.804337678791718
AUC:  0.8043503823220594


## 1.6 Stemming

In [115]:
data.title = data.title.apply(lambda x: stemmer.stem(x))

In [116]:
X_train = data.title[:300000]
y_train = data.label[:300000]
X_test = data.title[300000:]
y_test = data.label[300000:]

In [117]:
count_vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,2))
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [118]:
model = LogisticRegression(max_iter=1000)
model.fit(count_train, y_train)
pred = model.predict(count_test)

In [119]:
print('accuracy: ' , accuracy_score(y_test, pred))
print(('AUC: '), roc_auc_score(y_test, pred))

accuracy:  0.806621545027173
AUC:  0.8066096753796753


In [120]:
feature_names = np.array(count_vectorizer.get_feature_names())
coef_index_sorted = model.coef_[0].argsort()
print('Smallest Coefs: {}'.format(feature_names[coef_index_sorted[:10]]))
print('Largest Coefs: {}'.format(feature_names[coef_index_sorted[-10:]]))

Smallest Coefs: ['worst' 'disappointing' 'garbag' 'awful' 'useless' 'horrible' 'horribl'
 'terribl' 'terrible' 'junk']
Largest Coefs: ['fantastic' 'bad al' 'outstand' 'awesom' 'better expect' 'excel'
 'outstanding' 'awesome' 'excellent' 'doesn disappoint']


## 1.7 Lemmatizing

In [121]:
data.title = data.title.apply(lambda x: lemmer.lemmatize(x))

In [122]:
X_train = data.title[:300000]
y_train = data.label[:300000]
X_test = data.title[300000:]
y_test = data.label[300000:]

In [123]:
count_vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,2))
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [124]:
model = LogisticRegression(max_iter=1000)
model.fit(count_train, y_train)
pred = model.predict(count_test)

In [125]:
print('accuracy: ' , accuracy_score(y_test, pred))
print(('AUC: '), roc_auc_score(y_test, pred))

accuracy:  0.8065715333577835
AUC:  0.8065600635440578


In [126]:
feature_names = np.array(count_vectorizer.get_feature_names())
coef_index_sorted = model.coef_[0].argsort()
print('Smallest Coefs: {}'.format(feature_names[coef_index_sorted[:10]]))
print('Largest Coefs: {}'.format(feature_names[coef_index_sorted[-10:]]))

Smallest Coefs: ['worst' 'disappointing' 'garbag' 'useless' 'awful' 'horrible' 'horribl'
 'terribl' 'terrible' 'junk']
Largest Coefs: ['fantastic' 'bad al' 'outstand' 'awesom' 'better expect' 'excel'
 'outstanding' 'awesome' 'excellent' 'doesn disappoint']


## Body

### 1. convert to count vector

In [22]:
count_vectorizer = CountVectorizer(stop_words='english')

In [23]:
X_train = data.body[:300000]
y_train = data.label[:300000]
X_test = data.body[300000:]
y_test = data.label[300000:]

In [24]:
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [25]:
sum_words = count_train.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in count_vectorizer.vocabulary_.items()]
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)

In [26]:
words_freq

[('book', 150511),
 ('like', 82239),
 ('just', 75500),
 ('good', 70632),
 ('great', 67033),
 ('read', 53866),
 ('time', 52967),
 ('really', 45310),
 ('movie', 45184),
 ('don', 43240),
 ('love', 34628),
 ('buy', 32808),
 ('use', 32474),
 ('cd', 31943),
 ('product', 31870),
 ('better', 31038),
 ('bought', 30587),
 ('work', 29816),
 ('did', 29472),
 ('new', 27912),
 ('way', 27874),
 ('story', 27694),
 ('album', 27326),
 ('best', 27256),
 ('little', 26854),
 ('ve', 26797),
 ('think', 26618),
 ('does', 26225),
 ('make', 25119),
 ('know', 25039),
 ('got', 24615),
 ('music', 24424),
 ('money', 23556),
 ('people', 22957),
 ('want', 22725),
 ('books', 22166),
 ('years', 21944),
 ('recommend', 21886),
 ('old', 21776),
 ('used', 20823),
 ('bad', 20578),
 ('didn', 19662),
 ('dvd', 19220),
 ('reading', 19208),
 ('say', 19152),
 ('game', 19064),
 ('quality', 18386),
 ('life', 18239),
 ('songs', 18065),
 ('thing', 17547),
 ('thought', 17408),
 ('doesn', 16341),
 ('easy', 16274),
 ('lot', 16019),
 ('a

### 2. model creating, training and prediction

In [27]:
nb_classifier = MultinomialNB()
nb_classifier.fit(count_train, y_train)

MultinomialNB()

In [28]:
pred = nb_classifier.predict(count_test)
score = accuracy_score(y_test, pred)

In [29]:
score

0.8192911679391858

In [30]:
nb_classifier.classes_

array([1, 2], dtype=int64)

In [31]:
nb_classifier.class_count_

array([149781., 150219.])

In [32]:
nb_classifier.feature_log_prob_

array([[ -8.22789527,  -9.47230164, -14.10703063, ..., -14.3947127 ,
        -15.49332499, -15.49332499],
       [ -9.05581721,  -9.61924092, -13.04142857, ..., -14.05302949,
        -14.74617667, -14.74617667]])

In [33]:
neg_class_prob_sorted = nb_classifier.feature_log_prob_[0, :].argsort()[::-1]
pos_class_prob_sorted = nb_classifier.feature_log_prob_[1, :].argsort()[::-1]

print(np.take(count_vectorizer.get_feature_names(), neg_class_prob_sorted[:20]))
print(np.take(count_vectorizer.get_feature_names(), pos_class_prob_sorted[:20]))

['book' 'just' 'like' 'good' 'time' 'don' 'movie' 'read' 'really'
 'product' 'buy' 'money' 'did' 'better' 'bought' 'work' 'great' 'use'
 'bad' 'does']
['book' 'great' 'good' 'like' 'just' 'read' 'love' 'time' 'really' 'movie'
 'best' 'cd' 'album' 'use' 'story' 'don' 'little' 'music' 've' 'new']


In [34]:
nb_classifier.feature_log_prob_[0, :].argsort()[::-1]

array([ 28339, 102082, 109698, ...,  66345, 147578, 210062], dtype=int64)

In [35]:
nb_classifier.feature_log_prob_[0, :]

array([ -8.22789527,  -9.47230164, -14.10703063, ..., -14.3947127 ,
       -15.49332499, -15.49332499])

### 3. tf-tdf

In [81]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=5)

In [82]:
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [83]:
tfidf_train

<300000x12549 sparse matrix of type '<class 'numpy.float64'>'
	with 726559 stored elements in Compressed Sparse Row format>

In [84]:
model = LogisticRegression(max_iter=1000)
model.fit(tfidf_train, y_train)
pred = model.predict(tfidf_test)

In [85]:
print('accuracy: ' , accuracy_score(y_test, pred))
print(('AUC: '), roc_auc_score(y_test, pred))

accuracy:  0.7947854499383189
AUC:  0.7948052444564214


In [39]:
sum_words = tfidf_train.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in tfidf_vectorizer.vocabulary_.items()]
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)

In [40]:
tfidf_train

<300000x210063 sparse matrix of type '<class 'numpy.float64'>'
	with 8806272 stored elements in Compressed Sparse Row format>