Intro

# www.imdb.com

<img src="review.png"> 

Get archives for labeled training and test data from: http://ai.stanford.edu/~amaas/data/sentiment/

## Load the data




In [1]:
import io
import pandas as pd
import numpy as np

def _read_reviews(path):
    """Read a UTF-8 text file into a one-column DataFrame, one row per line.

    Each line (including its trailing newline) becomes one entry of the
    'review' column.
    """
    with io.open(path, encoding='utf-8') as f:
        return pd.DataFrame({'review': list(f)})


train_pos = _read_reviews('data/aclImdb/train-pos.txt')
train_neg = _read_reviews('data/aclImdb/train-neg.txt')
# negatives first, then positives - the label vectors below rely on this order
train_reviews = pd.concat([train_neg, train_pos], ignore_index=True)

test_pos = _read_reviews('data/aclImdb/test-pos.txt')
test_neg = _read_reviews('data/aclImdb/test-neg.txt')
test_reviews = pd.concat([test_neg, test_pos], ignore_index=True)

X_train = train_reviews['review']
X_test = test_reviews['review']

# Labels: 0 = negative, 1 = positive. Sized from the frames actually loaded
# (instead of a hard-coded 12500) so features and labels can never get out of
# step if a file is truncated or replaced by a different sample.
y_train = np.append(np.zeros(len(train_neg)), np.ones(len(train_pos)))
y_test = np.append(np.zeros(len(test_neg)), np.ones(len(test_pos)))

## First review - good or bad?




In [2]:
# first training review; the negative file was concatenated first when building train_reviews
X_train[0]

u"a reasonable effort is summary for this film .  a good sixties film but lacking any sense of achievement .  maggie smith gave a decent performance which was believable enough but not as good as she could have given ,  other actors were just dreadful !  a terrible portrayal .  it wasn't very funny and so it didn't really achieve its genres as it wasn't particularly funny and it wasn't dramatic .  the only genre achieved to a satisfactory level was romance .  target audiences were not hit and the movie sent out confusing messages .  a very basic plot and a very basic storyline were not pulled off or performed at all well and people were left confused as to why the film wasn't as good and who the target audiences were etc .  however maggie was quite good and the storyline was alright with moments of capability .   4 . \n"

## What people thought




In [3]:
# label of that first review: the zeros in y_train line up with the negative reviews, the ones with the positive
y_train[0]

0.0

# A naive approach: word counts  

Basic principle: count and weight positive words, count and weight negative words, highest score wins

Two types of challenge if we want to do this in an automated way:
- how do I obtain the weights (how do I even know if something's positive or negative?)
- how do I deal with complexity introduced by this being <i>language</i>?


# Challenge no. 1: handling language (syntax / semantics)

Some examples from our review above:
> performance which was believable enough but not as good as she could have given

> lacking any sense of achievement 

> it wasn't very funny

> the only genre achieved to a satisfactory level was romance

# So what's best: unigrams, bigrams, trigrams...?

Instead of hypothesizing let's check what works best on our data set.

First, let's inspect the most frequent unigrams, bigrams and trigrams and their actual frequencies.

But there's another question to be answered before...



## How to handle stopwords?

In [4]:
from nltk.corpus import stopwords
# NLTK's English stopword list, held as a set so that words can be removed from it in the next cell
stopwords_nltk = set(stopwords.words("english"))
print(stopwords_nltk)

set([u'all', u'just', u'being', u'over', u'both', u'through', u'yourselves', u'its', u'before', u'o', u'hadn', u'herself', u'll', u'had', u'should', u'to', u'only', u'won', u'under', u'ours', u'has', u'do', u'them', u'his', u'very', u'they', u'not', u'during', u'now', u'him', u'nor', u'd', u'did', u'didn', u'this', u'she', u'each', u'further', u'where', u'few', u'because', u'doing', u'some', u'hasn', u'are', u'our', u'ourselves', u'out', u'what', u'for', u'while', u're', u'does', u'above', u'between', u'mustn', u't', u'be', u'we', u'who', u'were', u'here', u'shouldn', u'hers', u'by', u'on', u'about', u'couldn', u'of', u'against', u's', u'isn', u'or', u'own', u'into', u'yourself', u'down', u'mightn', u'wasn', u'your', u'from', u'her', u'their', u'aren', u'there', u'been', u'whom', u'too', u'wouldn', u'themselves', u'weren', u'was', u'until', u'more', u'himself', u'that', u'but', u'don', u'with', u'than', u'those', u'he', u'me', u'myself', u'ma', u'these', u'up', u'will', u'below', u'ain

In [5]:
# Negations (and a few intensifiers / contrast words) carry sentiment, so they
# must survive stopword removal: take them out of the stopword set.
relevant_words = {'not', 'nor', 'no', 'wasn', 'ain', 'aren', 'very', 'only', 'but', 'don', 'isn', 'weren'}
stopwords_filtered = list(stopwords_nltk - relevant_words)

## Most frequent unigrams

In [None]:
# The counts were computed offline; done live, this would be a CountVectorizer fit along the lines of:
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer(stop_words = stopwords_filtered, max_features = 5000, ngram_range = (1,1)) ...
unigram_counts_file = 'word_counts_sorted_ngram_1_stopwords_removed.csv'
word_counts_unigram = pd.read_csv(unigram_counts_file, usecols=['word', 'count'])

# total number of counted unigram occurrences, then the first ten rows of the table
unigram_total = word_counts_unigram['count'].sum()
print('overall word count: {}'.format(unigram_total))
word_counts_unigram.head(10)

## Most frequent bigrams

In [None]:
# Precomputed bigram counts. Kept in their own variable so the unigram table
# loaded earlier is not silently overwritten by bigram data.
word_counts_bigram = pd.read_csv('word_counts_sorted_ngram_2_stopwords_removed.csv',
                                 usecols=['word', 'count'])
print('overall bigram count: {}'.format(word_counts_bigram['count'].sum()))
word_counts_bigram.head(10)

## Most frequent trigrams

In [None]:
# Precomputed trigram counts. Kept in their own variable so the unigram table
# loaded earlier is not silently overwritten by trigram data.
word_counts_trigram = pd.read_csv('word_counts_sorted_ngram_3_stopwords_removed.csv',
                                  usecols=['word', 'count'])
print('overall trigram count: {}'.format(word_counts_trigram['count'].sum()))
word_counts_trigram.head(10)

## Most frequent four-grams

In [None]:
# Precomputed four-gram counts. Kept in their own variable (not the unigram
# one), and the printed label now names four-grams instead of bigrams.
word_counts_fourgram = pd.read_csv('word_counts_sorted_ngram_4_stopwords_removed.csv',
                                   usecols=['word', 'count'])
print('overall four-gram count: {}'.format(word_counts_fourgram['count'].sum()))
word_counts_fourgram.head(10)

# Challenge no. 2: Determine word sentiment and weight

This is a classical classification task.
Can use one of the usual suspects:
- Logistic regression
- Decision trees
- Support Vector Machines
- ...

TBD: Logistic Regression explained

# The right combination

- Which classifier works best?
- With what input (unigrams, bigrams, trigrams ...)?
- Using which parameter settings?

==> Perform a grid search

In [None]:
# logistic regression with 1-grams, 1-2-grams, 1-3-grams and different complexity penalty values

# vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None,
#                             stop_words = None, max_features = 10000)

#logistic_model = LogisticRegression() 
#logistic_pipeline = Pipeline([("vectorizer", vectorizer), ("logistic", logistic_model)])
#search_params = dict(vectorizer__ngram_range = [(1,1), (1,2), (1,3)],
#                     vectorizer__stop_words = [stopwords_filtered, None],
#                     logistic__C = [0.01, 0.03, 0.05, 0.1])

#best_logistic = GridSearchCV(logistic_pipeline, param_grid=search_params, cv=5, verbose=1)
#best_logistic.fit(X_train, y_train)
#print(best_logistic.best_params_)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def assess_classification_performance(model, X_train, y_train, X_test, y_test):
    """Print accuracy and confusion matrices for a fitted classifier.

    Parameters
    ----------
    model : object with a ``predict(X)`` method (an already fitted classifier)
    X_train, X_test : feature matrices in the form ``model.predict`` accepts
    y_train, y_test : true labels for the corresponding feature matrices

    Returns
    -------
    None. The report is written to stdout.
    """
    # Predict once per data set and reuse the result for every metric;
    # the previous version ran each prediction twice.
    predictions_train = model.predict(X_train)
    predictions_test = model.predict(X_test)

    accuracy_train = accuracy_score(y_train, predictions_train)
    accuracy_test = accuracy_score(y_test, predictions_test)
    print('\nClassification performance overview:\n************************************')
    print('accuracy (train/test): {} / {}\n'.format(accuracy_train, accuracy_test))
    # confusion matrix
    # rows: actual group
    # columns: predicted group
    print('Confusion_matrix (training data):')
    print(confusion_matrix(y_train, predictions_train))
    print('Confusion_matrix (test data):')
    print(confusion_matrix(y_test, predictions_test))


# Logistic Regression best fit

- unigrams and bigrams
- C = 0.03

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
# Bag-of-words features over unigrams and bigrams, restricted to the 10000
# most frequent terms, with the negation-preserving stopword list applied.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=stopwords_filtered,
    max_features=10000,
    ngram_range=(1, 2))

# The vocabulary is learned from the training reviews only; the test reviews
# are merely projected onto it.
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

# C = 0.03 is the setting named as the best fit in the heading above.
logistic_model = LogisticRegression(C=0.03)
logistic_model.fit(X_train_features, y_train)

## Logistic Regression - how well did it work?

In [None]:
# report train/test accuracy and confusion matrices for the fitted logistic model
assess_classification_performance(logistic_model, X_train_features, y_train, X_test_features, y_test)

## Which words make it positive?

In [None]:
# Terms in column order of the feature matrix. Newer scikit-learn releases
# replaced get_feature_names() with get_feature_names_out(); use whichever
# the installed version offers so the cell runs on both.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = list(vectorizer.get_feature_names_out())
else:
    vocabulary = vectorizer.get_feature_names()

# coef_ holds one row of weights for this binary model; pair each weight with its term
coefs = logistic_model.coef_
word_importances = pd.DataFrame({'word': vocabulary, 'coef': coefs[0]})

# largest (most positive) coefficients first
word_importances_sorted = word_importances.sort_values(by='coef', ascending = False)
word_importances_sorted[:10]

## Which words make it negative?

In [None]:
# The frame is sorted by descending coefficient, so the ten most negative terms
# are the last ten rows. The previous slice [-11:-1] left out the final row,
# i.e. the single most negative term.
word_importances_sorted.tail(10)

## How about the bigrams? (1)

In [None]:
# Keep only the multi-word terms: anything made of at least two
# whitespace-separated tokens.
has_several_tokens = word_importances_sorted['word'].map(lambda term: len(term.split()) > 1)
word_importances_bigrams = word_importances_sorted[has_several_tokens]
word_importances_bigrams.head(10)

## How about the bigrams? (2)

In [None]:
# The bigram frame inherits the descending-coefficient order, so the ten most
# negative bigrams are the last ten rows. The previous slice [-11:-1] left out
# the final row, i.e. the most negative bigram.
word_importances_bigrams.tail(10)

## So that's Logistic Regression with bigrams - how about other classifiers?

### best accuracies per classifier
<table border="1">
<tr>
<th></th><th>1-grams</th><th>1-2-grams, with stopword filtering</th><th>1-3-grams, without stopword filtering</th>
</tr>
<tr>
<th>Logistic Regression</th><td></td><td>0.89</td><td></td>
</tr>
<tr>
<th>Support Vector Machine</th><td></td><td></td><td>0.84</td>
</tr>
<tr>
<th>Random Forest</th><td>tbd</td><td></td><td></td>
</tr>
</table>

# Beyond word counts: Word embeddings

## If we had time to train the model, we'd actually execute this...

In [None]:
# can use train_reviews and test_reviews from above
#import nltk.data
#tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

#sentences_train = []  
#for review in train_reviews["review"]:
#    sentences_train += [s.split() for s in tokenizer.tokenize(review)]
#sentences_test = []  
#for review in test_reviews["review"]:
#    sentences_test += [s.split() for s in tokenizer.tokenize(review)]
#all_sentences = sentences_train + sentences_test

# model parameters
#num_features = 100    # Word vector dimensionality                      
#min_word_count = 20   # Minimum word count                        
#num_workers = 2       # Number of threads to run in parallel
#context = 10          # Context window size                                                                                    
#downsampling = 1e-3   # Downsample setting for frequent words

#model = word2vec.Word2Vec(all_sentences, workers=num_workers, size=num_features, min_count = min_word_count,
#                          window = context, sample = downsampling)
#model.init_sims(replace=True)
#model_name = "models/word2vec_100features"
#model.save(model_name)

Let's instead load the model from disk and inspect it:

In [None]:
from gensim.models import word2vec
# load the saved word2vec model (same path the commented-out training cell above saves to)
model = word2vec.Word2Vec.load('models/word2vec_100features')
# shape of the embedding matrix - presumably (vocabulary size, 100) given the model name; confirm
print(model.syn0.shape)
# the embedding vector stored for the word 'movie'
print(model['movie'])


## Which words are similar to <i>awesome</i>?

In [None]:
# the ten words the model ranks as most similar to 'awesome'
model.most_similar('awesome', topn=10)

## ... and to <i> awful</i>?

In [None]:
# the ten words the model ranks as most similar to 'awful'
model.most_similar('awful', topn=10)

## Can we "subtract out" <i>awful</i>?

In [None]:
# 'awesome' counts positively and 'awful' negatively towards the query; topn is left at the library default
model.most_similar(positive=['awesome'], negative=['awful'])

## Let's try this again with <i>good</i> - <i>bad</i>: <i>Good</i> ...

In [None]:
# the ten words the model ranks as most similar to 'good'
model.most_similar('good', topn=10)

## ... and <i>bad</i>:

In [None]:
# the ten words the model ranks as most similar to 'bad'
model.most_similar('bad', topn=10)

## So <i>good</i> minus <i>bad</i> is ...

In [None]:
# 'good' counts positively and 'bad' negatively towards the query; topn is left at the library default
model.most_similar(positive=['good'], negative=['bad'])

## Which word doesn't match?

In [None]:
# ask the model which of these four words fits least with the others (one positive word among negatives)
model.doesnt_match("good bad awful terrible".split())

In [None]:
# same check with 'awesome' as the only positive word in the list
model.doesnt_match("awesome bad awful terrible".split())

In [None]:
# all four words are positive here, so there is no obvious odd one out
model.doesnt_match("nice pleasant fine excellent".split())

## Visualize in 2d

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
from sklearn.manifold import TSNE
words = ['story', 'movie','plot', 'film', 'good', 'bad', 'awful', 'awesome', 'man', 'woman', 'like', 'actor', 'actress']
vectors = model.syn0

# Rows of `vectors` follow the model's vocabulary order, not the order of
# `words`, so each word's row has to be looked up explicitly. Zipping the
# projection directly with `words` (as before) labelled the first 13 vocabulary
# rows with unrelated words.
# NOTE(review): assumes the pre-4.0 gensim layout that `model.syn0` already
# implies - `vocab[word].index` on the model itself or on `model.wv`; confirm
# against the installed gensim version.
keyed_vectors = getattr(model, 'wv', model)
word_rows = [keyed_vectors.vocab[word].index for word in words]

# Both projections are still fitted on the full vocabulary, as before;
# only the rows that get plotted change.
pca = PCA(n_components=2)
pca_2d = pca.fit_transform(vectors)

tsne = TSNE(n_components=2, random_state=0, verbose=10, init='pca')
tsne_2d = tsne.fit_transform(vectors)

for name, transform in zip(['PCA', 'TSNE'], [pca_2d, tsne_2d]):
#for name, transform in zip(['PCA'], [pca_2d]):
    plt.figure(figsize=(6,6))
    for position, (row, word) in enumerate(zip(word_rows, words)):
        # Alternate label side and colour by position to reduce overlap.
        # Derived from the position (not a toggle carried across figures),
        # so each word looks the same in both plots.
        label_left = position % 2 == 0
        point = transform[row]
        plt.scatter(point[0], point[1], c='r' if label_left else 'g')
        plt.annotate(
            word,
            xy = (point[0], point[1]),
            xytext = (-7, -6) if label_left else (7, -6),
            textcoords = 'offset points',
            ha = 'right' if label_left else 'left',
            va = 'bottom',
            size = "medium"
            )
    plt.title(name)
    plt.tight_layout()
plt.show()
