In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import movie_reviews, stopwords
import nltk
from collections import Counter
from collections import defaultdict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import ensemble

# Goals

* Clean data, process it through spacy    X
* create bag of words    X
* create tf_idf 
* create a gradient boosted model  X
* create a logistic regression model  X
* create a support vector model
* run cross-validation on each model
* try to improve the best model by 5 percentage points

In [2]:
nltk.download()
#download the movie reviews corpus then close

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
# Create a dictionary style list to hold positive and negative reviews
movie_dict = defaultdict(list)
movies = movie_reviews.fileids()
for i in movies:
    movie_dict[i.split('/')[0]].append(i)

    
#Analyzing the first 10 positive and negative reviews
negatives = movie_dict['neg'][:40]
positives = movie_dict['pos'][:40]
reviews = ''

In [30]:
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'([()])',' ',text) #remove parenthesis
    text = re.sub(r'([\\/])', '', text) #remove backslashes and forwards slashes
    text = re.sub(r'([-_])', " ", text) #remove forward slashes
    text = re.sub(r'([<>,`+$=\*])', '', text)
    text = re.sub(r'([\d])', '', text)
    text = ' '.join(text.split())
    return text

In [31]:
positive_reviews = ''
negative_reviews = ''
for x in positives:
    positive_reviews = positive_reviews + movie_reviews.raw(x)
for x in negatives:
    negative_reviews = negative_reviews + movie_reviews.raw(x)

In [32]:
positive_clean = text_cleaner(positive_reviews)
negative_clean = text_cleaner(negative_reviews)

In [33]:
negative_clean



In [80]:
nlp = spacy.load('en')
positive_docs = nlp(positive_clean)
negative_docs = nlp(negative_clean)

In [81]:
pos_sentences = [[sents, 'positive'] for sents in positive_docs.sents]
neg_sentences = [[sents, 'negative'] for sents in negative_docs.sents]


In [83]:
df_sents = pd.DataFrame(pos_sentences + neg_sentences)
df_sents

Unnamed: 0,0,1
0,"(films, adapted, from, comic, books, have, had...",positive
1,"(but, there, 's, never, really, been, a, comic...",positive
2,"(for, starters, it, was, created, by, alan, mo...",positive
3,"(to, say, moore, and, campbell, thoroughly, re...",positive
4,"(the, book, or, "", graphic, novel, "", if, you,...",positive
5,"(in, other, words, do, n't, dismiss, this, fil...",positive
6,"(if, you, can, get, past, the, whole, comic, b...",positive
7,"(getting, the, hughes, brothers, to, direct, t...",positive
8,"(the, ghetto, in, question, is, of, course, wh...",positive
9,"(it, 's, a, filthy, sooty, place, where, the, ...",positive


## Bag of words

In [37]:
def bag_of_words(text):
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]

    return [item for item in Counter(allwords).most_common(2500)]
positive_words = bag_of_words(positive_docs)
negative_words = bag_of_words(negative_docs)
common_words = set(positive_words + negative_words)

In [38]:
sorted(common_words)

[("'d", 1),
 ("'s", 188),
 ("'s", 195),
 ('-PRON-', 3),
 ('-PRON-', 5),
 ('abandon', 2),
 ('abberline', 2),
 ('aberdeen', 9),
 ('abigail', 7),
 ('ability', 3),
 ('able', 5),
 ('able', 6),
 ('ably', 1),
 ('aboard', 1),
 ('abound', 2),
 ('abraham', 2),
 ('abroad', 1),
 ('absinthe', 1),
 ('absolutely', 2),
 ('absolutely', 7),
 ('abstract', 2),
 ('abuse', 3),
 ('academy', 3),
 ('accent', 3),
 ('accent', 4),
 ('accentuate', 1),
 ('accept', 2),
 ('acceptable', 2),
 ('access', 1),
 ('accident', 3),
 ('accidentally', 1),
 ('accompany', 2),
 ('account', 2),
 ('accurate', 1),
 ('accuse', 4),
 ('achieve', 1),
 ('achievement', 2),
 ('achiever', 1),
 ('acme', 2),
 ('act', 10),
 ('act', 20),
 ('acting', 9),
 ('acting', 10),
 ('action', 16),
 ('action', 32),
 ('actor', 24),
 ('actor', 31),
 ('actress', 3),
 ('actual', 2),
 ('actual', 5),
 ('actually', 20),
 ('actually', 24),
 ('ad', 4),
 ('adam', 1),
 ('adam', 3),
 ('adapt', 2),
 ('adaptation', 2),
 ('adaptation', 4),
 ('add', 9),
 ('add', 13),
 ('ad

In [39]:
len(common_words)

4799

In [40]:
def bow_features(sentences, common_words):

    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_sentiment'] = sentences[1]
    df.loc[:, common_words] = 0

    for i, sentence in enumerate(df['text_sentence']):
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]

        for word in words:
            df.loc[i, word] += 1

        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

In [41]:
bow_data = bow_features(df_sents, common_words)

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500


## Tf-idf

In [91]:
vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=3, # only use words that appear at least three times
                             stop_words='english', 
                             lowercase=True, 
                             use_idf=True, #using inverse term frequency
                             norm=u'l2', 
                             smooth_idf=True, #avoiding divide by 0 errors
                             token_pattern=r"\b\w[\w']+\b"
                            )

In [92]:
reviews_tf_idf = vectorizer.fit_transform(df_sents[0].astype(str))
reviews_csr = reviews_tf_idf.tocsr()

#number of sentences
n = reviews_csr.shape[0]
tfidf_bysent = [{} for _ in range(0,n)]


print("Number of features: %d" % reviews_tf_idf.get_shape()[1])
#List of features
terms = vectorizer.get_feature_names()
#for each sentence, lists the feature words and their tf-idf scores
for i, j in zip(*reviews_csr.nonzero()):
    tfidf_bysent[i][terms[j]] = reviews_csr[i, j]

print('Original sentence:', [df_sents.iloc[5]])
print('Tf_idf vector:', tfidf_bysent[5])

Number of features: 2289
Original sentence: [0    (in, other, words, do, n't, dismiss, this, fil...
1                                             positive
Name: 5, dtype: object]
Tf_idf vector: {'words': 0.74158913175852426, "don't": 0.57235027296195962, 'film': 0.34994388792777736}


In [95]:
reviews_tf_idf.toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [84]:
full_sents = [sents for sents in positive_docs.sents] + [sents for sents in negative_docs.sents]
full_sents

[films adapted from comic books have had plenty of success whether they're about superheroes batman superman spawn or geared toward kids casper or the arthouse crowd ghost world,
 but there's never really been a comic book like from hell before .,
 for starters it was created by alan moore and eddie campbell who brought the medium to a whole new level in the mid 's with a part series called the watchmen .,
 to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd .,
 the book or " graphic novel " if you will is over pages long and includes nearly more that consist of nothing but footnotes .,
 in other words don't dismiss this film because of its source .,
 if you can get past the whole comic book thing you might find another stumbling block in from hell's directors albert and allen hughes .,
 getting the hughes brothers to direct this seems almost as ludicrous as casting carrot top in well anyth

In [96]:
#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
sents_by_component = pd.DataFrame(reviews_tf_idf.toarray(), index=df_sents.index, columns=terms)

In [107]:
df_sents.columns=['text_source', 'text_sentiment']
tfidf_df = pd.concat([sents_by_component, df_sents], axis=1)
tfidf_df

Unnamed: 0,aberdeen,abigail,ability,able,absolutely,abstract,abuse,academy,accent,acceptable,...,young,younger,youth,yuen,zany,zero,zombie,zoolander,text_source,text_sentiment
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(films, adapted, from, comic, books, have, had...",positive
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(but, there, 's, never, really, been, a, comic...",positive
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(for, starters, it, was, created, by, alan, mo...",positive
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(to, say, moore, and, campbell, thoroughly, re...",positive
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(the, book, or, "", graphic, novel, "", if, you,...",positive
5,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(in, other, words, do, n't, dismiss, this, fil...",positive
6,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(if, you, can, get, past, the, whole, comic, b...",positive
7,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(getting, the, hughes, brothers, to, direct, t...",positive
8,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(the, ghetto, in, question, is, of, course, wh...",positive
9,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(it, 's, a, filthy, sooty, place, where, the, ...",positive


# Bag of Words evaluation
## Gradient Boosting Classifier

In [101]:
gbc = ensemble.GradientBoostingClassifier()
Y = bow_data['text_sentiment']
X = np.array(bow_data.drop(['text_sentence','text_sentiment'], 1))

X_train, X_test, Y_train, Y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.35,
                                                    random_state=0)
train = gbc.fit(X_train, Y_train)

print('\nTest set score:', gbc.score(X_test, Y_test))

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)



Test set score: 0.492401215805


In [102]:
cross_val_score(gbc, X, Y, cv=10)

array([ 0.5035461 ,  0.5035461 ,  0.5035461 ,  0.5035461 ,  0.5035461 ,
        0.5035461 ,  0.5035461 ,  0.5035461 ,  0.5035461 ,  0.50357143])

## Logistic Regression model

In [103]:
lr = LogisticRegression(tol=.01)

lr.fit(X_train, Y_train)

print('Training set score:', lr.score(X_train, Y_train))
print('\nTest set score:', lr.score(X_test, Y_test))

Training set score: 0.509557618788

Test set score: 0.492401215805


In [104]:
cross_val_score(lr, X, Y, cv=10)

array([ 0.5035461 ,  0.5035461 ,  0.5035461 ,  0.5035461 ,  0.5035461 ,
        0.5035461 ,  0.5035461 ,  0.5035461 ,  0.5035461 ,  0.50357143])

In [105]:
pd.crosstab(Y_test, lr.predict(X_test))

col_0,negative
text_sentiment,Unnamed: 1_level_1
negative,486
positive,501


# TF_IDF Evaluation
## Gradient Boosted Classifier

In [109]:
gbc = ensemble.GradientBoostingClassifier()
Y = tfidf_df['text_sentiment']
X = np.array(tfidf_df.drop(['text_source','text_sentiment'], 1))

X_train, X_test, Y_train, Y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.35,
                                                    random_state=0)
train = gbc.fit(X_train, Y_train)

print('\nTest set score:', gbc.score(X_test, Y_test))


Test set score: 0.589665653495


In [110]:
cross_val_score(gbc, X, Y, cv=10)

array([ 0.5248227 ,  0.56382979,  0.56382979,  0.53546099,  0.54609929,
        0.54255319,  0.5141844 ,  0.52836879,  0.46808511,  0.51071429])

## Logistic Regression model

In [111]:
lr = LogisticRegression(tol=.01)

lr.fit(X_train, Y_train)

print('Training set score:', lr.score(X_train, Y_train))
print('\nTest set score:', lr.score(X_test, Y_test))

Training set score: 0.898962315674

Test set score: 0.682877406282


In [115]:
cross_val_score(lr, X, Y, cv=6)

array([ 0.63057325,  0.52340426,  0.57021277,  0.54371002,  0.55010661,
        0.52665245])

In [113]:
pd.crosstab(Y_test, lr.predict(X_test))

col_0,negative,positive
text_sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,346,140
positive,173,328
