Intro

# www.imdb.com

<img src="review.png"> 

Get archives for labeled training and test data from: http://ai.stanford.edu/~amaas/data/sentiment/'

## Load the data




In [1]:
import io
import pandas as pd
import numpy as np

with io.open('data/aclImdb/train-pos.txt', encoding='utf-8') as f:
    train_pos = pd.DataFrame({'review': list(f)})    
with io.open('data/aclImdb/train-neg.txt', encoding='utf-8') as f:
    train_neg = pd.DataFrame({'review': list(f)}) 
train_reviews = pd.concat([train_neg, train_pos], ignore_index=True)

with io.open('data/aclImdb/test-pos.txt', encoding='utf-8') as f:
    test_pos = pd.DataFrame({'review': list(f)})
with io.open('data/aclImdb/test-neg.txt', encoding='utf-8') as f:
    test_neg = pd.DataFrame({'review': list(f)})    
test_reviews = pd.concat([test_neg, test_pos], ignore_index=True)
  
X_train = train_reviews['review']
X_test = test_reviews['review']

y_train = np.append(np.zeros(12500), np.ones(12500))
y_test = np.append(np.zeros(12500), np.ones(12500)) 

## First review - good or bad?




In [2]:
X_train[0]

"a reasonable effort is summary for this film .  a good sixties film but lacking any sense of achievement .  maggie smith gave a decent performance which was believable enough but not as good as she could have given ,  other actors were just dreadful !  a terrible portrayal .  it wasn't very funny and so it didn't really achieve its genres as it wasn't particularly funny and it wasn't dramatic .  the only genre achieved to a satisfactory level was romance .  target audiences were not hit and the movie sent out confusing messages .  a very basic plot and a very basic storyline were not pulled off or performed at all well and people were left confused as to why the film wasn't as good and who the target audiences were etc .  however maggie was quite good and the storyline was alright with moments of capability .   4 . \n"

## What people thought




In [3]:
y_train[0]

0.0

# A naive approach: word counts  

Basic principle: count and weight positive words, count and weight negative words, highest score wins

Two types of challenge if we want to do this in an automated way:
- how do I obtain the weights (how do I even know if something's positive or negative?)
- how do I deal with complexity introduced by this being <i>language</i>?


# Challenge no. 1: handling language (syntax / semantics)

Some examples rom our review above:
> performance which was believable enough but not as good as she could have given

> lacking any sense of achievement 

> it wasn't very funny

> the only genre achieved to a satisfactory level was romance

# So what's best: unigrams, bigrams, trigrams...?

Instead of hypothesizing let's check what works best on our data set.

First, let's inspect the most frequent unigrams, bigrams and trigrams and their actual frequencies.

But there's another question to be answered before...



## How to handle stopwords?

In [6]:
from nltk.corpus import stopwords
stopwords_nltk = set(stopwords.words("english"))
print(stopwords_nltk)

{'shouldn', 'my', 'should', 'on', 'we', 've', 'a', 'hers', 'before', 'she', 'the', 'our', 'wouldn', 'as', 'hadn', 'here', 'against', 'yourself', 'between', 'own', 'than', 'which', 'by', 'their', 'all', 'can', 'through', 'when', 'mustn', 'do', 'an', 'it', 'have', 'over', 'only', 'couldn', 'shan', 'weren', 'just', 'both', 's', 'any', 'having', 'while', 'himself', 'ain', 'mightn', 're', 'has', 'had', 'this', 'yourselves', 'its', 'his', 'will', 'that', 'or', 'there', 'won', 'were', 'doesn', 'down', 'to', 'isn', 'you', 'am', 'but', 'been', 'up', 'whom', 'about', 'further', 'then', 'into', 'theirs', 'off', 'why', 'few', 'is', 'where', 'no', 'him', 'being', 'ours', 'because', 'until', 'with', 'doing', 'more', 'o', 'very', 'aren', 'from', 'hasn', 'below', 'and', 'did', 'out', 'nor', 'some', 'so', 'your', 'was', 'too', 'same', 'once', 'they', 'under', 'for', 'during', 'i', 'at', 'of', 'them', 'myself', 'each', 'are', 'needn', 'd', 'most', 'y', 'what', 'after', 'how', 'he', 'themselves', 'didn',

In [7]:
# don't want to exclude the negations
relevant_words = set(['not', 'nor', 'no', 'wasn', 'ain', 'aren', 'very', 'only', 'but', 'don', 'isn', 'weren'])
stopwords_filtered = list(stopwords_nltk.difference(relevant_words))

## Most frequent unigrams

In [8]:
# if we had the time to do this live ...
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer(stop_words = stopwords_filtered, max_features = 5000, ngram_range = (1,1)) ...
word_counts_unigram = pd.read_csv('word_counts_sorted_ngram_1_stopwords_removed.csv', 
                                  usecols=['word', 'count'])
print('overall word count: {}'.format(word_counts_unigram['count'].sum()))
word_counts_unigram.head(10)

overall word count: 2827541


Unnamed: 0,count,word
0,44047,movie
1,42623,but
2,40159,film
3,30632,not
4,26795,one
5,20281,like
6,15147,good
7,14067,very
8,12727,time
9,12716,no


## Most frequent bigrams

In [9]:
word_counts_unigram = pd.read_csv('word_counts_sorted_ngram_2_stopwords_removed.csv', 
                                  usecols=['word', 'count'])
print('overall bigram count: {}'.format(word_counts_unigram['count'].sum()))
word_counts_unigram.head(10)

overall bigram count: 324066


Unnamed: 0,count,word
0,1925,but not
1,1321,ever seen
2,1284,not only
3,1200,very good
4,1113,special effects
5,1043,even though
6,1032,movie but
7,1024,don know
8,1007,movie not
9,888,one best


## Most frequent trigrams

In [10]:
word_counts_unigram = pd.read_csv('word_counts_sorted_ngram_3_stopwords_removed.csv', 
                                  usecols=['word', 'count'])
print('overall trigram count: {}'.format(word_counts_unigram['count'].sum()))
word_counts_unigram.head(10)

overall trigram count: 6176


Unnamed: 0,count,word
0,262,movie ever seen
1,243,worst movie ever
2,205,don waste time
3,177,movies ever seen
4,164,new york city
5,162,don get wrong
6,160,one worst movies
7,141,worst movies ever
8,120,film ever seen
9,114,movie ever made


## Most frequent four-grams

In [11]:
word_counts_unigram = pd.read_csv('word_counts_sorted_ngram_4_stopwords_removed.csv', 
                                  usecols=['word', 'count'])
print('overall bigram count: {}'.format(word_counts_unigram['count'].sum()))
word_counts_unigram.head(10)

overall bigram count: 456


Unnamed: 0,count,word
0,132,worst movie ever seen
1,121,one worst movies ever
2,86,worst movies ever seen
3,61,worst film ever seen
4,56,one worst films ever


# Challenge no. 2: Determine word sentiment and weight

This is a classical classification task.
Can use one of the usual suspects:
- Logistic regression
- Decision trees
- Support Vector Machines
- ...

TBD : Logsitic Regression explained

# The right combination

- Which classifier works best?
- With what input (unigrams, bigrams, trigrams ...)?
- Using which parameter settings?

==> Perform a grid search

In [14]:
# logistic regression with 1-grams, 1-2-grams, 1-3-grams and different complexity penalty values

# vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None,
#                             stop_words = None, max_features = 10000)

#logistic_model = LogisticRegression() 
#logistic_pipeline = Pipeline([("vectorizer", vectorizer), ("logistic", logistic_model)])
#search_params = dict(vectorizer__ngram_range = [(1,1), (1,2), (1,3)],
#                     vectorizer__stop_words = [stopwords_filtered, None],
#                     logistic__C = [0.01, 0.03, 0.05, 0.1])

#best_logistic = GridSearchCV(logistic_pipeline, param_grid=search_params, cv=5, verbose=1)
#best_logistic.fit(X_train, y_train)
#print(best_logistic.best_params_)

In [16]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def assess_classification_performance(model, X_train, y_train, X_test, y_test):
  
    accuracy_train = accuracy_score(y_train, model.predict(X_train))
    accuracy_test = accuracy_score(y_test, model.predict(X_test))
    print('\nClassification performance overview:\n************************************')
    print('accuracy (train/test): {} / {}\n'.format(accuracy_train, accuracy_test))
    # confusion matrix
    # rows: actual group
    # columns: predicted group
    print('Confusion_matrix (training data):')
    print(confusion_matrix(y_train, model.predict(X_train)))
    print('Confusion_matrix (test data):')
    print(confusion_matrix(y_test, model.predict(X_test)))


# Logistic Regression best fit

- unigrams and bigrams
- C = 0.03

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None,
                             stop_words = stopwords_filtered, max_features = 5000, ngram_range = (1,2))
X_train = vectorizer.fit_transform(X_train)

logistic_model = LogisticRegression(C=0.03) 
logistic_model.fit(words_array, y_train)
assess_classification_performance(logistic_model, X_train, y_train, X_test, y_test)

ValueError: could not convert string to float: 'the  " silver screen "  gets freshly polished with this beautiful film about aging happily and enjoying life\'s rainbows .  there\'s plenty of silver hair on this silver screen ,  but the film\'s namesake is more like 85-going on-25 with his energy ,  humor and lust for life .  the story of entertainer extraordinaire uncle frank ,  his devoted wife aunt tillie ,  and the zippy residents of the local area nursing homes inspires us to  " live each day as if it\'s your last "  and brings a glimmer of hope to those often-dreaded golden years .  a great movie for young and old audiences ! \n'

In [None]:
from sklearn.feature_extraction.text import CountVectorizer