In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score, auc, precision_recall_fscore_support
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import time
from scipy.sparse import hstack

In [2]:
# raw = pd.read_csv('../../amazon_review_polarity_data/train.csv',header=None)
# smallset = raw.sample(360000)
# smallset.to_csv('small_training_set.csv', index=False, header=False)

In [3]:
data = pd.read_csv('small_training_set_30k.csv',header=None)

In [4]:
data.columns=['label','title','body']

In [5]:
data.head()

Unnamed: 0,label,title,body
0,2,Awesome show. Great shipping.,Two Parts to my review.The TV SHOW First..... ...
1,2,One of the best films I've ever seen,"It is as light and fun as a ""let's change the ..."
2,1,Horribly flat and under developed,"I ruined my vacation read (to Italy, none the ..."
3,2,The Definitive Brisson,"""Robert Bresson: A Spiritual Style in Film"" by..."
4,2,Classic Motown Tech.,This a slamming yet funky set of 80's electro ...


In [6]:
data.title.isnull().sum()

1

In [7]:
data = data.dropna()

In [8]:
data.shape

(29999, 3)

# 1. Title

In [9]:
X_train = data.title[:20000]
y_train = data.label[:20000]
X_test = data.title[20000:]
y_test = data.label[20000:]

### 1.1 Remove stop words

In [10]:
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)
count_train

<20000x11775 sparse matrix of type '<class 'numpy.int64'>'
	with 52724 stored elements in Compressed Sparse Row format>

In [105]:
start_time = time.time()
model_1 = MultinomialNB()
model_1.fit(count_train, y_train)
pred_1 = model_1.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_1))
print(('AUC: '), roc_auc_score(y_test, pred_1))

--- 0.01299142837524414 seconds ---
accuracy:  0.7620762076207621
AUC:  0.7615496420258778


In [106]:
start_time = time.time()
model_2 = LogisticRegression(max_iter=1000)
model_2.fit(count_train, y_train)
pred_2 = model_2.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_2))
print(('AUC: '), roc_auc_score(y_test, pred_2))

--- 0.5496516227722168 seconds ---
accuracy:  0.7591759175917592
AUC:  0.7594351479251585


In [107]:
start_time = time.time()
model_3 = RandomForestClassifier()
model_3.fit(count_train, y_train)
pred_3 = model_3.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_3))
print(('AUC: '), roc_auc_score(y_test, pred_3))

--- 46.14600992202759 seconds ---
accuracy:  0.7434743474347435
AUC:  0.7437197178460493


### 1.2 Keep all words

In [108]:
count_vectorizer = CountVectorizer()
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)
count_train

<20000x12033 sparse matrix of type '<class 'numpy.int64'>'
	with 80968 stored elements in Compressed Sparse Row format>

In [109]:
start_time = time.time()
model_1 = MultinomialNB()
model_1.fit(count_train, y_train)
pred_1 = model_1.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_1))
print(('AUC: '), roc_auc_score(y_test, pred_1))

--- 0.011996030807495117 seconds ---
accuracy:  0.7932793279327933
AUC:  0.793125008252631


In [110]:
start_time = time.time()
model_2 = LogisticRegression(max_iter=1000)
model_2.fit(count_train, y_train)
pred_2 = model_2.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_2))
print(('AUC: '), roc_auc_score(y_test, pred_2))

--- 0.5570240020751953 seconds ---
accuracy:  0.8023802380238024
AUC:  0.8020754416507982


In [111]:
start_time = time.time()
model_3 = RandomForestClassifier()
model_3.fit(count_train, y_train)
pred_3 = model_3.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_3))
print(('AUC: '), roc_auc_score(y_test, pred_3))

--- 35.75369071960449 seconds ---
accuracy:  0.7986798679867987
AUC:  0.7985937917007941


### 1.3 Define min df, 3-gram

In [112]:
count_vectorizer = CountVectorizer(min_df=3,ngram_range=(1,3))
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)
count_train

<20000x6890 sparse matrix of type '<class 'numpy.int64'>'
	with 98435 stored elements in Compressed Sparse Row format>

In [113]:
start_time = time.time()
model_1 = MultinomialNB()
model_1.fit(count_train, y_train)
pred_1 = model_1.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_1))
print(('AUC: '), roc_auc_score(y_test, pred_1))

--- 0.014990091323852539 seconds ---
accuracy:  0.7998799879987999
AUC:  0.7993949871218945


In [114]:
start_time = time.time()
model_2 = LogisticRegression(max_iter=1000)
model_2.fit(count_train, y_train)
pred_2 = model_2.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_2))
print(('AUC: '), roc_auc_score(y_test, pred_2))

--- 0.45305633544921875 seconds ---
accuracy:  0.8056805680568057
AUC:  0.8053074720220806


In [115]:
start_time = time.time()
model_3 = RandomForestClassifier()
model_3.fit(count_train, y_train)
pred_3 = model_3.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_3))
print(('AUC: '), roc_auc_score(y_test, pred_3))

--- 25.30671787261963 seconds ---
accuracy:  0.7961796179617961
AUC:  0.7963145850897265


### 1.4 tf-tdf

In [116]:
tfidf_vectorizer = TfidfVectorizer(min_df=3,ngram_range=(1,3))
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)
tfidf_train

<20000x6890 sparse matrix of type '<class 'numpy.float64'>'
	with 98435 stored elements in Compressed Sparse Row format>

In [117]:
start_time = time.time()
model_1 = MultinomialNB()
model_1.fit(tfidf_train, y_train)
pred_1 = model_1.predict(tfidf_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_1))
print(('AUC: '), roc_auc_score(y_test, pred_1))

--- 0.010986328125 seconds ---
accuracy:  0.7987798779877988
AUC:  0.7983656189593243


In [118]:
start_time = time.time()
model_2 = LogisticRegression(max_iter=1000)
model_2.fit(tfidf_train, y_train)
pred_2 = model_2.predict(tfidf_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_2))
print(('AUC: '), roc_auc_score(y_test, pred_2))

--- 0.2836265563964844 seconds ---
accuracy:  0.8071807180718071
AUC:  0.8072908243147915


In [119]:
start_time = time.time()
model_3 = RandomForestClassifier()
model_3.fit(tfidf_train, y_train)
pred_3 = model_3.predict(tfidf_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_3))
print(('AUC: '), roc_auc_score(y_test, pred_3))

--- 24.346248865127563 seconds ---
accuracy:  0.7933793379337933
AUC:  0.7933504201139322


## 2. Body

In [130]:
X_train = data.body[:20000]
y_train = data.label[:20000]
X_test = data.body[20000:]
y_test = data.label[20000:]

### 2.1 Remove stop words

In [131]:
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)
count_train

<20000x48195 sparse matrix of type '<class 'numpy.int64'>'
	with 586790 stored elements in Compressed Sparse Row format>

In [132]:
start_time = time.time()
model_1 = MultinomialNB()
model_1.fit(count_train, y_train)
pred_1 = model_1.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_1))
print(('AUC: '), roc_auc_score(y_test, pred_1))

--- 0.028980255126953125 seconds ---
accuracy:  0.8131813181318132
AUC:  0.8134360634170174


In [133]:
start_time = time.time()
model_2 = LogisticRegression(max_iter=1000)
model_2.fit(count_train, y_train)
pred_2 = model_2.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_2))
print(('AUC: '), roc_auc_score(y_test, pred_2))

--- 2.146034002304077 seconds ---
accuracy:  0.8287828782878288
AUC:  0.8287472646279633


In [134]:
start_time = time.time()
model_3 = RandomForestClassifier()
model_3.fit(count_train, y_train)
pred_3 = model_3.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_3))
print(('AUC: '), roc_auc_score(y_test, pred_3))

--- 51.998783588409424 seconds ---
accuracy:  0.8263826382638264
AUC:  0.8264535934055778


### 2.2 no stop words

In [135]:
count_vectorizer = CountVectorizer()
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)
count_train

<20000x48498 sparse matrix of type '<class 'numpy.int64'>'
	with 1051428 stored elements in Compressed Sparse Row format>

In [136]:
start_time = time.time()
model_1 = MultinomialNB()
model_1.fit(count_train, y_train)
pred_1 = model_1.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_1))
print(('AUC: '), roc_auc_score(y_test, pred_1))

--- 0.04297208786010742 seconds ---
accuracy:  0.8176817681768177
AUC:  0.8180646190005374


In [137]:
start_time = time.time()
model_2 = LogisticRegression(max_iter=1000)
model_2.fit(count_train, y_train)
pred_2 = model_2.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_2))
print(('AUC: '), roc_auc_score(y_test, pred_2))

--- 6.62457013130188 seconds ---
accuracy:  0.8491849184918492
AUC:  0.8491604523522099


In [138]:
start_time = time.time()
model_3 = RandomForestClassifier()
model_3.fit(count_train, y_train)
pred_3 = model_3.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_3))
print(('AUC: '), roc_auc_score(y_test, pred_3))

--- 49.96270418167114 seconds ---
accuracy:  0.8266826682668267
AUC:  0.8270207942292003


### 2.3 min df, ngram

In [139]:
count_vectorizer = CountVectorizer(min_df=3,ngram_range=(1,3))
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)
count_train

<20000x125847 sparse matrix of type '<class 'numpy.int64'>'
	with 2203932 stored elements in Compressed Sparse Row format>

In [140]:
start_time = time.time()
model_1 = MultinomialNB()
model_1.fit(count_train, y_train)
pred_1 = model_1.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_1))
print(('AUC: '), roc_auc_score(y_test, pred_1))

--- 0.1435389518737793 seconds ---
accuracy:  0.8661866186618662
AUC:  0.8662579430322388


In [141]:
start_time = time.time()
model_2 = LogisticRegression(max_iter=1000)
model_2.fit(count_train, y_train)
pred_2 = model_2.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_2))
print(('AUC: '), roc_auc_score(y_test, pred_2))

--- 10.883943319320679 seconds ---
accuracy:  0.8744874487448745
AUC:  0.8744409117626699


In [142]:
start_time = time.time()
model_3 = RandomForestClassifier()
model_3.fit(count_train, y_train)
pred_3 = model_3.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_3))
print(('AUC: '), roc_auc_score(y_test, pred_3))

--- 76.82256865501404 seconds ---
accuracy:  0.8359835983598359
AUC:  0.8362846675520156


### 2.4. tf-tdf

In [87]:
tfidf_vectorizer = TfidfVectorizer(min_df=3,ngram_range=(1,3))
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)
tfidf_train

<20000x125847 sparse matrix of type '<class 'numpy.float64'>'
	with 2203932 stored elements in Compressed Sparse Row format>

In [88]:
model_1 = MultinomialNB()
model_1.fit(tfidf_train, y_train)
pred_1 = model_1.predict(tfidf_test)
print('accuracy: ' , accuracy_score(y_test, pred_1))
print(('AUC: '), roc_auc_score(y_test, pred_1))

accuracy:  0.8667866786678667
AUC:  0.8669954581520588


In [89]:
model_2 = LogisticRegression(max_iter=1000)
model_2.fit(tfidf_train, y_train)
pred_2 = model_2.predict(tfidf_test)
print('accuracy: ' , accuracy_score(y_test, pred_2))
print(('AUC: '), roc_auc_score(y_test, pred_2))

accuracy:  0.8731873187318732
AUC:  0.8733096311103979


In [90]:
model_3 = RandomForestClassifier()
model_3.fit(tfidf_train, y_train)
pred_3 = model_3.predict(tfidf_test)
print('accuracy: ' , accuracy_score(y_test, pred_3))
print(('AUC: '), roc_auc_score(y_test, pred_3))

accuracy:  0.8344834483448345
AUC:  0.8347810682045436


## 3. Title + Body

In [15]:
X_train = data[:20000]
y_train = data.label[:20000]
X_test = data[20000:]
y_test = data.label[20000:]

In [16]:
count_vectorizer = CountVectorizer(min_df=3,ngram_range=(1,3))
count_train_title = count_vectorizer.fit_transform(X_train.title)
count_test_title = count_vectorizer.transform(X_test.title)
count_train_title

<20000x6890 sparse matrix of type '<class 'numpy.int64'>'
	with 98435 stored elements in Compressed Sparse Row format>

In [17]:
count_vectorizer = CountVectorizer(min_df=3,ngram_range=(1,3))
count_train_body = count_vectorizer.fit_transform(X_train.body)
count_test_body = count_vectorizer.transform(X_test.body)
count_train_body

<20000x125847 sparse matrix of type '<class 'numpy.int64'>'
	with 2203932 stored elements in Compressed Sparse Row format>

In [18]:
count_train = hstack((count_train_title,count_train_body))
count_test =  hstack((count_test_title,count_test_body))
count_train

<20000x132737 sparse matrix of type '<class 'numpy.int64'>'
	with 2302367 stored elements in COOrdinate format>

In [19]:
start_time = time.time()
model_1 = MultinomialNB()
model_1.fit(count_train, y_train)
pred_1 = model_1.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_1))
print(('AUC: '), roc_auc_score(y_test, pred_1))

--- 0.25185656547546387 seconds ---
accuracy:  0.8838883888388839
AUC:  0.8839018479091134


In [20]:
start_time = time.time()
model_2 = LogisticRegression(max_iter=1000)
model_2.fit(count_train, y_train)
pred_2 = model_2.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_2))
print(('AUC: '), roc_auc_score(y_test, pred_2))

--- 13.08782696723938 seconds ---
accuracy:  0.8953895389538954
AUC:  0.8953901903926972


In [21]:
start_time = time.time()
model_3 = RandomForestClassifier()
model_3.fit(count_train, y_train)
pred_3 = model_3.predict(count_test)
print("--- %s seconds ---" % (time.time() - start_time))
print('accuracy: ' , accuracy_score(y_test, pred_3))
print(('AUC: '), roc_auc_score(y_test, pred_3))

--- 84.34293007850647 seconds ---
accuracy:  0.8565856585658566
AUC:  0.8567888642899356


## X. Lemmatizing

a quick practice

In [121]:
lemmer=WordNetLemmatizer()

In [126]:
lemmer.lemmatize("Finaly, I'm so disappointed. It's disappointing.",'v')

"Finaly, I'm so disappointed. It's disappointing."

In [129]:
WordNetLemmatizer().lemmatize('running')

'running'

In [127]:
WordNetLemmatizer().lemmatize("bought",'v')

'buy'

In [None]:
nltk.pos_tag(['taking']), nltk.pos_tag(['disappointed']), nltk.pos_tag(['finaly']),nltk.pos_tag(['bought']),nltk.pos_tag(['wrote'])

In [10]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


# 1. Init Lemmatizer
lemmatizer = WordNetLemmatizer()

# 2. Lemmatize Single Word with the appropriate POS tag
word = 'feet'
print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

# 3. Lemmatize a Sentence with the appropriate POS tag
sentence = "The striped bats are hanging on their feet for best"
print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])
#> ['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']

foot
['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']


In [11]:
sentence = "Finaly I became disappointed at him. It's disappointing. He bought cars quickly"
print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])

['Finaly', 'I', 'become', 'disappointed', 'at', 'him', '.', 'It', "'s", 'disappoint', '.', 'He', 'bought', 'car', 'quickly']


In [39]:
X_train = data.title[:1000].apply(lemmatize_sentence)

In [45]:
data.title[50:1000]

50                                    Worst. Item. Ever.
51                                              Twilight
52                      Another love story gone wrong...
53                        Some good laughs, but not many
54                 Garmin Forerunner 350 Charging Cradle
                             ...                        
995                                          Bridge Book
996                                            Too small
997                                           Compelling
998    Another first person account of life in the fa...
999                       an inspiration to a new mother
Name: title, Length: 950, dtype: object

In [47]:
X_train.sample(10)

426                   Not 32 oz . !
671    ABSOULUTE GARBAGE HORRIBBILE
289                    Good quality
549                     Cool Device
691    No eye irritation , high SPF
579                   Inspirational
647       Expo should be pay me ! !
696             confidence reassure
468            Completely worthless
911        Just one minor criticism
Name: title, dtype: object

In [30]:
def lemmatize_sentence(sentence):
    return " ".join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])

In [18]:
data.title[:10]

0           Awesome show. Great shipping.
1    One of the best films I've ever seen
2       Horribly flat and under developed
3                  The Definitive Brisson
4                    Classic Motown Tech.
5             A surprising disappointment
6                    A disgrace to Oneida
7          This movie was poorly created.
8                  Prophet Muhammad (saw)
9                            great boots!
Name: title, dtype: object

In [24]:
data.title[:10].apply(lemmatize_sentence)

0           [Awesome, show, ., Great, shipping, .]
1    [One, of, the, best, film, I, 've, ever, see]
2          [Horribly, flat, and, under, developed]
3                       [The, Definitive, Brisson]
4                       [Classic, Motown, Tech, .]
5                  [A, surprising, disappointment]
6                        [A, disgrace, to, Oneida]
7             [This, movie, be, poorly, create, .]
8                   [Prophet, Muhammad, (, saw, )]
9                                 [great, boot, !]
Name: title, dtype: object

In [31]:
X_train = data.title[:10].apply(lambda x: lemmatize_sentence(x))

In [32]:
count_vectorizer = CountVectorizer()
count_vectorizer.fit_transform(X_train)

<10x36 sparse matrix of type '<class 'numpy.int64'>'
	with 38 stored elements in Compressed Sparse Row format>

In [19]:
data.title.apply(lambda x: lemmatize_sentence(x))

KeyboardInterrupt: 

In [None]:
X_train = data.title[:20000]
y_train = data.label[:20000]
X_test = data.title[20000:]
y_test = data.label[20000:]

In [None]:
count_vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,2),min_df=3)
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [None]:
model = LogisticRegression(max_iter=1000)
model.fit(count_train, y_train)
pred = model.predict(count_test)

In [None]:
print('accuracy: ' , accuracy_score(y_test, pred))
print(('AUC: '), roc_auc_score(y_test, pred))

In [None]:
feature_names = np.array(count_vectorizer.get_feature_names())
coef_index_sorted = model.coef_[0].argsort()
print('Smallest Coefs: {}'.format(feature_names[coef_index_sorted[:10]]))
print('Largest Coefs: {}'.format(feature_names[coef_index_sorted[-10:]]))

In [None]:
idx = count_vectorizer.vocabulary_['dissapoint']
count_train.sum(axis=0)[0,idx]