## Introduction to NLP   at Karakun             Part 2     

In [46]:
import nltk              
import random
import numpy as np
random.seed(0)

# Sentiment analysis with logistic 

We'll use the IMDB dataset that contains the text of 50,000 movie reviews from the Internet Movie Database. These are split into 25,000 reviews for training and 25,000 reviews for testing/evaluation. The training and testing sets are balanced, meaning they contain an equal number of positive and negative reviews.

In [47]:
train_input = []
test_input  = []

with open('./Sentiment-Analysis-Data/imdb/train-pos.txt','r') as f:
    for line in f:
        train_input.append((line,1))
f.close()

with open('./Sentiment-Analysis-Data/imdb/train-neg.txt','r') as f:
    for line in f:
        train_input.append((line,0))
f.close()

with open('./Sentiment-Analysis-Data/imdb/test-pos.txt','r') as f:
    for line in f:
        test_input.append((line,1))
f.close()

with open('./Sentiment-Analysis-Data/imdb/test-neg.txt','r') as f:
    for line in f:
        test_input.append((line,0))
f.close()


random.shuffle(train_input)
random.shuffle(test_input)

In [48]:
train_reviews = []
train_sent    = []
test_reviews  = []
test_sent     = []

for (words, sentiment) in train_input:
    review_filtered = ' '.join(e.lower() for e in words.split() if len(e) >= 2)
    train_reviews.append(review_filtered)
    train_sent.append(sentiment)

for (words, sentiment) in test_input:
    review_filtered = ' '.join(e.lower() for e in words.split() if len(e) >= 2)
    test_reviews.append(review_filtered)
    test_sent.append(sentiment)


In [49]:
print( len(train_reviews) , len(test_reviews))

25000 25000


In [50]:
for i in range(10):
    print(test_sent[i])   #  should be  0 1 1 0 1 1 0 0 1 1

0
1
1
0
1
1
0
0
1
1


In [51]:
N_test= int(len(test_reviews)/2)

In [52]:
eval_reviews = test_reviews[:N_test]
eval_sent    = test_sent[:N_test]
test_reviews = test_reviews[N_test:]
test_sent    = test_sent[N_test:]
print(len(train_reviews),len(test_reviews),len(eval_reviews))

25000 12500 12500


### TF and TF-IDF Vectorizer

In [53]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_df=7000, min_df=4, ngram_range=[1,4])

## Logistic regression: regression and sigmoid activation
Now we use the document-term matrix to extract features for logistics regression.

In [54]:
X_train = vectorizer.fit(train_reviews).transform(train_reviews)
y_train = train_sent

In [55]:
X_train.shape

(25000, 373319)

In [56]:
X_eval  = vectorizer.transform(eval_reviews)
X_test  = vectorizer.transform(test_reviews)

feature_names = vectorizer.get_feature_names()
print('Number of features: ', len(feature_names), '\n')

print('X_train is a very sparse matrix with ',X_train.shape[0]*X_train.shape[1], 'elements in total')
print('         ', repr(X_train))
print('X_eval:  ', repr(X_eval) )
print('X_test:  ', repr(X_test) )

from sklearn import preprocessing
mm_scaler = preprocessing.MaxAbsScaler()
X_train_scaled = mm_scaler.fit_transform(X_train)
X_test_scaled = mm_scaler.transform(X_test)
X_eval_scaled = mm_scaler.transform(X_eval)

from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(max_iter=10000)
LR.fit(X_train_scaled, y_train)

cm = nltk.ConfusionMatrix(test_sent, LR.predict(X_test_scaled))

print("accuracy with logistic regression on test set: %5.2f %%" % 
            ((cm[1,1]+cm[0,0])/ (1.0*cm[1,1]+cm[0,0]+cm[1,0]+cm[0,1])*100) )
print(cm.pretty_format(sort_by_count=True, show_percents=True) )

Number of features:  373319 

X_train is a very sparse matrix with  9332975000 elements in total
          <25000x373319 sparse matrix of type '<class 'numpy.int64'>'
	with 8079943 stored elements in Compressed Sparse Row format>
X_eval:   <12500x373319 sparse matrix of type '<class 'numpy.int64'>'
	with 3771564 stored elements in Compressed Sparse Row format>
X_test:   <12500x373319 sparse matrix of type '<class 'numpy.int64'>'
	with 3720934 stored elements in Compressed Sparse Row format>
accuracy with logistic regression on test set: 90.34 %
  |      1      0      2 |
--+----------------------+
1 | <45.5%>  4.5%      . |
0 |   5.1% <44.9%>     . |
2 |      .      .     <.>|
--+----------------------+
(row = reference; col = test)



In [67]:
import numpy as np
if True:
    bestCval = 0
    bestCvalParam = 0
    for Cval in np.linspace(start=0.0250, stop=0.0350, num=11):
        LR = LogisticRegression(max_iter=10000, C=Cval, solver='liblinear')
        LR.fit(X_train_scaled, y_train)
        accuracy = sum(LR.predict(X_test_scaled) == test_sent) / len(eval_sent)
        print('C = ',Cval,'  --> ', accuracy)
        if accuracy > bestCvalParam :
            bestCvalParam = accuracy
            bestCval = Cval
    print('bestCval = ',bestCval,'  --> ', bestCvalParam)

C =  0.025   -->  0.90424
C =  0.026000000000000002   -->  0.90464
C =  0.027000000000000003   -->  0.90504
C =  0.028000000000000004   -->  0.90512
C =  0.029   -->  0.90544
C =  0.030000000000000002   -->  0.90552
C =  0.031000000000000003   -->  0.90544
C =  0.032   -->  0.90576
C =  0.033   -->  0.90584
C =  0.034   -->  0.906
C =  0.035   -->  0.90584
bestCval =  0.034   -->  0.906


In [68]:
LR = LogisticRegression(max_iter=10000, C=bestCval, solver='liblinear')
LR.fit(X_train_scaled, y_train)

cm = nltk.ConfusionMatrix(test_sent, LR.predict(X_test_scaled))

print("accuracy with logistic regression on test set: %5.2f %%" % 
            ((cm[1,1]+cm[0,0])/ (1.0*cm[1,1]+cm[0,0]+cm[1,0]+cm[0,1])*100) )
print(cm.pretty_format(sort_by_count=True, show_percents=True) )

accuracy with logistic regression on test set: 90.60 %
  |      1      0      2 |
--+----------------------+
1 | <45.8%>  4.3%      . |
0 |   5.1% <44.8%>     . |
2 |      .      .     <.>|
--+----------------------+
(row = reference; col = test)



In [69]:
?LogisticRegression

In [70]:
from sklearn.model_selection import RandomizedSearchCV