## Introduction to NLP   at Karakun             Part 2     

In [1]:
import nltk              
import random
random.seed(0)

# Sentiment analysis with logistic 

We'll use the IMDB dataset that contains the text of 50,000 movie reviews from the Internet Movie Database. These are split into 25,000 reviews for training and 25,000 reviews for testing/evaluation. The training and testing sets are balanced, meaning they contain an equal number of positive and negative reviews.

In [203]:
train_input = []
test_input  = []

with open('./Sentiment-Analysis-Data/IMDb/train-pos.txt','r') as f:
    for line in f:
        train_input.append((line,1))
f.close()

with open('./Sentiment-Analysis-Data/IMDb/train-neg.txt','r') as f:
    for line in f:
        train_input.append((line,0))
f.close()

with open('./Sentiment-Analysis-Data/IMDb/test-pos.txt','r') as f:
    for line in f:
        test_input.append((line,1))
f.close()

with open('./Sentiment-Analysis-Data/IMDb/test-neg.txt','r') as f:
    for line in f:
        test_input.append((line,0))
f.close()


random.shuffle(train_input)
random.shuffle(test_input)

In [204]:
train_reviews = []
train_sent    = []
test_reviews  = []
test_sent     = []

for (words, sentiment) in train_input:
    review_filtered = ' '.join(e.lower() for e in words.split() if len(e) >= 2)
    train_reviews.append(review_filtered)
    train_sent.append(sentiment)

for (words, sentiment) in test_input:
    review_filtered = ' '.join(e.lower() for e in words.split() if len(e) >= 2)
    test_reviews.append(review_filtered)
    test_sent.append(sentiment)


In [205]:
print(len(train_reviews), len(test_reviews))

25000 25000


In [206]:
for i in range(10):
    print(test_sent[i])   #  should be  0 1 1 0 1 1 0 0 1 1

1
1
0
0
0
0
1
0
1
1


In [207]:
N_test= int(len(test_reviews)/2)

In [208]:
eval_reviews = test_reviews[:N_test]
eval_sent    = test_sent[:N_test]
test_reviews = test_reviews[N_test:]
test_sent    = test_sent[N_test:]
print(len(train_reviews),len(test_reviews),len(eval_reviews))

25000 12500 12500


In [209]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
stopwords_E = stopwords.words('english')
porter = PorterStemmer()

def tokenizer_stemmer(text):
    return [porter.stem(word) for word in text.split()]

def preprocessor(text):
    return " ".join([word for word in text.split() if word not in stopwords_E])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/christianr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### TF and TF-IDF Vectorizer

In [211]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer = CountVectorizer(min_df=1)
vectorizer = TfidfVectorizer(use_idf=True, norm="l2", min_df=10,
                             smooth_idf=True,
                             tokenizer=tokenizer_stemmer, preprocessor=preprocessor)

In [212]:
corpus = [
...     'This is the first document.',
...     'This is the second document.',
...     'And the third one.',
...     'Is this the first document?',
... ]

In [213]:
term_doc_matrix = vectorizer.fit_transform(corpus)
term_doc_matrix.shape

ValueError: max_df corresponds to < documents than min_df

In [189]:
term_doc_matrix.toarray()

array([[0.        , 0.57735027, 0.        , 0.57735027, 0.        ,
        0.        , 0.57735027, 0.        ],
       [0.        , 0.52640543, 0.        , 0.        , 0.        ,
        0.66767854, 0.52640543, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.70710678,
        0.        , 0.        , 0.70710678],
       [0.61761437, 0.        , 0.61761437, 0.48693426, 0.        ,
        0.        , 0.        , 0.        ]])

In [190]:
vocabulary = vectorizer.vocabulary_
vocabulary

{'thi': 6,
 'first': 3,
 'document.': 1,
 'second': 5,
 'third': 7,
 'one.': 4,
 'Is': 0,
 'document?': 2}

In [192]:
vocabulary['document?']

2

In [193]:
vocab = vectorizer.get_feature_names()
vocab

['Is', 'document.', 'document?', 'first', 'one.', 'second', 'thi', 'third']

In [194]:
mask= term_doc_matrix.toarray()[2]
mask

array([0.        , 0.        , 0.        , 0.        , 0.70710678,
       0.        , 0.        , 0.70710678])

In [195]:
' '.join(vocab[i] for i in range(len(vocab)) if mask[i] )

'one. third'

In [196]:
vectorizer.transform(['This is a test text to check the output of CountVectorizer']).toarray()

array([[0., 0., 0., 0., 0., 0., 1., 0.]])

## Logistic regression: regression and sigmoid activation
Now we use the document-term matrix to extract features for logistics regression.

In [214]:
X_train = vectorizer.fit(train_reviews).transform(train_reviews)
y_train = train_sent



In [215]:
X_train.shape

(25000, 12863)

In [None]:
X_eval  = vectorizer.transform(eval_reviews)
X_test  = vectorizer.transform(test_reviews)

feature_names = vectorizer.get_feature_names()
print('Number of features: ', len(feature_names), '\n')

print('X_train is a very sparse matrix with ',X_train.shape[0]*X_train.shape[1], 'elements in total')
print('         ', repr(X_train))
print('X_eval:  ', repr(X_eval) )
print('X_test:  ', repr(X_test) )

Logistic regression takes a regular linear regression, and applies a sigmoid to the output of the linear regression.

rienear Regression:
$$z = \theta_0 x_0 + \theta_1 x_1 + \theta_2 x_2 + ... \theta_N x_N$$

Note that the $\theta$ values are "weights" to be learned, 'z' is refered to as 'logits' and is input for the activation function (sigmoid): 

$$ h(z) = \frac{1}{1+\exp^{-z}}$$

### The cost function

The cost function used for logistic regression is the average of the log loss across all training examples:

$$J(\theta) = -\frac{1}{m} \sum_{i=1}^m y^{(i)}\log (h(z(\theta)^{(i)})) + (1-y^{(i)})\log (1-h(z(\theta)^{(i)}))\tag{5} $$
* $m$ is the number of training examples
* $y^{(i)}$ is the actual sentiment label of the i-th training example, hence $0$ or $1$.
* $h(z(\theta)^{(i)})$ is the model's prediction for the i-th training example.


* All the $h$ values are between 0 and 1, so the logs will be negative. That is the reason for the factor of -1 applied to the sum of the two loss terms.
* Note that when the model predicts 1 ($h(z(\theta)) = 1$) and the label $y$ is also 1, the loss for that training example is 0. 
* Similarly, when the model predicts 0 ($h(z(\theta)) = 0$) and the actual label is also 0, the loss for that training example is 0. 
* However, when the model prediction is close to 1 ($h(z(\theta)) = 0.9999$) and the label is 0, the second term of the log loss becomes a large negative number, which is then multiplied by the overall factor of -1 to convert it to a positive loss value. $-1 \times (1 - 0) \times log(1 - 0.9999) \approx 9.2$ The closer the model prediction gets to 1, the larger the loss.

In [200]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
LR.fit(X_train, y_train)

cm = nltk.ConfusionMatrix(test_sent, LR.predict(X_test))

print("accuracy with logistic regression on test set: %5.2f %%" % 
            ((cm[1,1]+cm[0,0])/ (1.0*cm[1,1]+cm[0,0]+cm[1,0]+cm[0,1])*100) )
print(cm.pretty_format(sort_by_count=True, show_percents=True) )

accuracy with logistic regression on test set: 81.62 %
  |      1      0      2 |
--+----------------------+
1 | <43.5%>  6.6%      . |
0 |  11.8% <38.1%>     . |
2 |      .      .     <.>|
--+----------------------+
(row = reference; col = test)



In [202]:
for Cval in [0.01,0.03,0.1,0.3,1.0]:
    LR = LogisticRegression(C=Cval, dual=True)
    LR.fit(X_train, y_train)
    print('C = ',Cval,'  --> ',sum(LR.predict(X_eval) == eval_sent) / len(eval_sent) )

ValueError: Solver lbfgs supports only dual=False, got dual=True

In [164]:
LR = LogisticRegression(C=1.0) 
LR.fit(X_train, y_train)

cm = nltk.ConfusionMatrix(test_sent, LR.predict(X_test))

print("accuracy with logistic regression on test set: %5.2f %%" % 
            ((cm[1,1]+cm[0,0])/ (1.0*cm[1,1]+cm[0,0]+cm[1,0]+cm[0,1])*100) )
print(cm.pretty_format(sort_by_count=True, show_percents=True) )

accuracy with logistic regression on test set: 88.28 %
  |      1      0      2 |
--+----------------------+
1 | <44.5%>  5.8%      . |
0 |   5.9% <43.7%>     . |
2 |      .      .     <.>|
--+----------------------+
(row = reference; col = test)

