# Import required packages

In [12]:
# https://scikit-learn.org/
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# http://www.numpy.org/
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
# https://www.nltk.org/
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
import string
import nltk

# Download movie reviews and stopwords from Natural Language Toolkit

In [13]:
nltk.download('movie_reviews')
nltk.download('stopwords')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\scook\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\scook\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Get Data Here
This function transforms the downloaded movie_reviews package into a dataset containing only the data we need

In [14]:
def get_data():
    """
    Get movie review data
    """
    dataset = []
    y_labels = []
    # Extract categories
    for cat in movie_reviews.categories():
        # for files in each cateogry    
        for fileid in movie_reviews.fileids(cat):
            # Get the words in that category
            words = list(movie_reviews.words(fileid))
            # the resulting dataset stores sentences
            sentences = " ".join(word for word in words)
            dataset.append((sentences))
            y_labels.append(cat)
    return dataset,y_labels


In [15]:
dataset, labels = get_data()
target_labels = ['neg','pos']

In [16]:
print(labels[1001])
print(dataset[1001])

pos
every now and then a movie comes along from a suspect studio , with every indication that it will be a stinker , and to everybody ' s surprise ( perhaps even the studio ) the film becomes a critical darling . mtv films ' _election , a high school comedy starring matthew broderick and reese witherspoon , is a current example . did anybody know this film existed a week before it opened ? the plot is deceptively simple . george washington carver high school is having student elections . tracy flick ( reese witherspoon ) is an over - achiever with her hand raised at nearly every question , way , way , high . mr . " m " ( matthew broderick ) , sick of the megalomaniac student , encourages paul , a popular - but - slow jock to run . and paul ' s nihilistic sister jumps in the race as well , for personal reasons . the dark side of such sleeper success is that , because expectations were so low going in , the fact that this was quality stuff made the reviews even more enthusiastic than the

### Create data structures for stopwords and punctuation

In [17]:
stopwords = stopwords.words('english')
punctuation = string.punctuation

In [18]:
# count of words that have been negated 
count = 0
# container for sentences that have undergone first transformation
secondsents = []
# traverse sentences in the dataset
for sent in dataset:
    final_words = []
    # split sentence back into words
    words = sent.split(" ")
    # negation flag and negation words to look at
    negate = False
    negate_words = ['no','not']
    # go through all the words in the sentence to see if the sentence is negated
    for word in words:
        # if the negate flag is true append the current word with Not_, increase the negation count, and reset the flag
        if negate:
            word = 'Not_' + word
            count += 1
            print(word)
            negate = False
        # if the flag isn't set and the current word isn't a negation add it to the final word list
        if word not in negate_words:
            final_words.append(word)
        # if the negation flag was false and the word was a negation flip the flag to true
        else:
            negate = True
    # overwrite words with modified words
    words = final_words
    # remove stopwords and punctuation from words list
    words = [word for word in words if word.lower() not in stopwords and word.lower() not in punctuation]
    # overwrite sentence with sentence constructed from new words list
    sent = " ".join(word for word in words)
    # put new sentence into data structure
    secondsents.append(sent)
print(count)

Not_idea
Not_explained
Not_really
Not_a
Not_substance
Not_?
Not_.
Not_enough
Not_justify
Not_one
Not_reason
Not_contest
Not_special
Not_big
Not_provide
Not_stalked
Not_guesswork
Not_exception
Not_anything
Not_all
Not_one
Not_.
Not_so
Not_as
Not_a
Not_make
Not_really
Not_great
Not_talking
Not_.
Not_,
Not_the
Not_me
Not_just
Not_originality
Not_even
Not_caught
Not_after
Not_only
Not_just
Not_bound
Not_original
Not_as
Not_sure
Not_something
Not_speak
Not_that
Not_content
Not_surprises
Not_ethics
Not_points
Not_points
Not_the
Not_no
Not_have
Not_nearly
Not_that
Not_the
Not_return
Not_a
Not_problem
Not_*
Not_.
Not_appeal
Not_only
Not_that
Not_for
Not_nudity
Not_familiar
Not_,
Not_.
Not_safe
Not_a
Not_only
Not_showing
Not_found
Not_because
Not_,
Not_stranger
Not_explanation
Not_idea
Not_explained
Not_even
Not_made
Not_nearly
Not_exaggeration
Not_problem
Not_problem
Not_idea
Not_even
Not_redeeming
Not_just
Not_even
Not_subtlety
Not_be
Not_the
Not_emotional
Not_to
Not_only
Not_surprisingly
Not

Not_way
Not_way
Not_give
Not_a
Not_actors
Not_matter
Not_matter
Not_denying
Not_worth
Not_going
Not_all
Not_so
Not_so
Not_a
Not_like
Not_fun
Not_going
Not_served
Not_men
Not_only
Not_one
Not_matter
Not_saying
Not_life
Not_simply
Not_even
Not_the
Not_waste
Not_,
Not_care
Not_interesting
Not_exist
Not_given
Not_a
Not_:
Not_positive
Not_real
Not_sense
Not_as
Not_woman
Not_material
Not_,
Not_character
Not_way
Not_much
Not_bothering
Not_very
Not_our
Not_one
Not_one
Not_one
Not_.
Not_wanting
Not_help
Not_-
Not_very
Not_idea
Not_to
Not_worth
Not_like
Not_good
Not_-
Not_cease
Not_directly
Not_know
Not_able
Not_know
Not_hichock
Not_scary
Not_enough
Not_matter
Not_feel
Not_in
Not_less
Not_humor
Not_entirely
Not_that
Not_since
Not_humorous
Not_,
Not_one
Not_the
Not_seen
Not_better
Not_marketing
Not_soul
Not_murphy
Not_,
Not_one
Not_sure
Not_have
Not_one
Not_one
Not_to
Not_copy
Not_a
Not_enough
Not_mistaken
Not_starred
Not_mention
Not_purpose
Not_counting
Not_money
Not_accept
Not_to
Not_only
Not_o

Not_,
Not_around
Not_ready
Not_reason
Not_room
Not_men
Not_so
Not_recourse
Not_sure
Not_prone
Not_trey
Not_inherent
Not_reason
Not_means
Not_only
Not_be
Not_amount
Not_amount
Not_one
Not_commentary
Not_.
Not_depth
Not_need
Not_going
Not_one
Not_very
Not_funny
Not_just
Not_create
Not_worth
Not_the
Not_the
Not_reason
Not_unlike
Not_the
Not_with
Not_exaggeration
Not_too
Not_all
Not_matter
Not_something
Not_creatures
Not_enough
Not_what
Not_have
Not_from
Not_sense
Not_sympathy
Not_enough
Not_been
Not_wit
Not_the
Not_way
Not_--
Not_--
Not_different
Not_reason
Not_hope
Not_self
Not_supposed
Not_overtly
Not_be
Not_that
Not_humour
Not_up
Not_up
Not_croissant
Not_tell
Not_attempt
Not_"
Not_longer
Not_longer
Not_count
Not_deliver
Not_sympathy
Not_to
Not_done
Not_i
Not_sooner
Not_really
Not_longer
Not_ready
Not_just
Not_,
Not_sequel
Not_because
Not_-
Not_draw
Not_-
Not_,
Not_as
Not_of
Not_the
Not_look
Not_characters
Not_,
Not_because
Not_only
Not_joking
Not_-
Not_to
Not_harm
Not_only
Not_convinci

Not_popular
Not_do
Not_trim
Not_to
Not_one
Not_even
Not_very
Not_'
Not_problem
Not_only
Not_read
Not_that
Not_other
Not_because
Not_the
Not_make
Not_a
Not_a
Not_joke
Not_special
Not_know
Not_know
Not_point
Not_much
Not_like
Not_much
Not_that
Not_nearly
Not_to
Not_so
Not_end
Not_a
Not_in
Not_deliver
Not_captured
Not_the
Not_sure
Not_enough
Not_enough
Not_,
Not_the
Not_enough
Not_necessarily
Not_nearly
Not_to
Not_too
Not_tarnish
Not_a
Not_-
Not_very
Not_much
Not_get
Not_always
Not_know
Not_shown
Not_bad
Not_work
Not_his
Not_work
Not_maintain
Not_doubt
Not_time
Not_sense
Not_unlike
Not_being
Not_a
Not_?
Not_suspense
Not_intended
Not_have
Not_the
Not_denying
Not_doubt
Not_something
Not_shoes
Not_limbs
Not_torsos
Not_charred
Not_possibly
Not_emotionally
Not_entirely
Not_likely
Not_recaptured
Not_likable
Not_a
Not_more
Not_just
Not_in
Not_only
Not_cleaner
Not_enough
Not_even
Not_movie
Not_women
Not_cry
Not_even
Not_outrageous
Not_,
Not_more
Not_less
Not_that
Not_joke
Not_the
Not_bored
Not_-


Not_trying
Not_and
Not_experience
Not_prominent
Not_attention
Not_they
Not_matter
Not_only
Not_have
Not_to
Not_recommending
Not_go
Not_making
Not_get
Not_a
Not_only
Not_matter
Not_longer
Not_seen
Not_remembering
Not_only
Not_to
Not_observe
Not_.
Not_exception
Not_really
Not_entirely
Not_very
Not_only
Not_do
Not_the
Not_stark
Not_outright
Not_-
Not_hindrance
Not_easy
Not_enough
Not_get
Not_recommended
Not_claus
Not_do
Not_matter
Not_be
Not_effort
Not_hundreds
Not_the
Not_goddamned
Not_harm
Not_see
Not_be
Not_book
Not_conclusion
Not_connection
Not_so
Not_bad
Not_worth
Not_limits
Not_have
Not_unlike
Not_idea
Not_right
Not_idea
Not_idea
Not_have
Not_.
Not_playing
Not_and
Not_feel
Not_means
Not_-
Not_lover
Not_directly
Not_one
Not_sincere
Not_bible
Not_considering
Not_recently
Not_meat
Not_substance
Not_as
Not_jake
Not_nearly
Not_.
Not_deep
Not_meaningful
Not_profound
Not_matter
Not_been
Not_afraid
Not_wanted
Not_afraid
Not_completely
Not_refer
Not_have
Not_the
Not_feel
Not_needed
Not_so
No

Not_less
Not_a
Not_humans
Not_scared
Not_made
Not_one
Not_only
Not_,
Not_other
Not_nearly
Not_a
Not_at
Not_subtle
Not_one
Not_seen
Not_reply
Not_much
Not_quite
Not_have
Not_exception
Not_writing
Not_the
Not_perfect
Not_part
Not_be
Not_what
Not_longer
Not_tell
Not_bad
Not_very
Not_embarrassingly
Not_amused
Not_wonder
Not_way
Not_alienating
Not_aware
Not_laugh
Not_is
Not_being
Not_miss
Not_really
Not_just
Not_the
Not_to
Not_longer
Not_reason
Not_room
Not_making
Not_telling
Not_to
Not_too
Not_mistake
Not_top
Not_have
Not_think
Not_clear
Not_to
Not_long
Not_.
Not_mean
Not_only
Not_thanks
Not_much
Not_doubt
Not_christmas
Not_to
Not_the
Not_always
Not_going
Not_reinvent
Not_wonder
Not_the
Not_insight
Not_original
Not_greater
Not_idea
Not_need
Not_quite
Not_to
Not_somehow
Not_include
Not_happen
Not_matter
Not_a
Not_question
Not_conceived
Not_introduced
Not_better
Not_third
Not_much
Not_get
Not_action
Not_true
Not_good
Not_big
Not_intense
Not_properly
Not_battleship
Not_too
Not_too
Not_much
No

Not_that
Not_a
Not_called
Not_only
Not_biases
Not_singing
Not_as
Not_confined
Not_be
Not_sure
Not_surprising
Not_revealing
Not_gore
Not_special
Not_criticism
Not_need
Not_do
Not_color
Not_superhuman
Not_rock
Not_ballet
Not_a
Not_to
Not_part
Not_be
Not_more
Not_fewer
Not_mentioned
Not_problem
Not_matter
Not_only
Not_view
Not_as
Not_a
Not_the
Not_discernible
Not_so
Not_real
Not_an
Not_easy
Not_reveal
Not_many
Not_particular
Not_only
Not_fun
Not_reasons
Not_in
Not_exist
Not_longer
Not_to
Not_for
Not_only
Not_out
Not_superb
Not_saying
Not_up
Not_soon
Not_the
Not_of
Not_as
Not_about
Not_in
Not_wonder
Not_discernable
Not_doubt
Not_sure
Not_more
Not_doubt
Not_a
Not_least
Not_that
Not_fake
Not_only
Not_frame
Not_only
Not_have
Not_only
Not_too
Not_stand
Not_be
Not_plan
Not_recommending
Not_,
Not_distinct
Not_just
Not_so
Not_real
Not_necessarily
Not_,
Not_full
Not_have
Not_of
Not_only
Not_concrete
Not_being
Not_being
Not_been
Not_sure
Not_rich
Not_that
Not_closure
Not_travolta
Not_about
Not_that

Not_-
Not_just
Not_acting
Not_-
Not_that
Not_remember
Not_be
Not_,
Not_a
Not_your
Not_even
Not_reachable
Not_=
Not_do
Not_see
Not_many
Not_say
Not_for
Not_imagine
Not_read
Not_as
Not_much
Not_at
Not_apparent
Not_mentioned
Not_recommend
Not_work
Not_before
Not_ordinary
Not_wanting
Not_dead
Not_heart
Not_longer
Not_likeable
Not_that
Not_quite
Not_too
Not_miss
Not_matter
Not_a
Not_very
Not_likely
Not_known
Not_an
Not_man
Not_one
Not_the
Not_reveal
Not_be
Not_see
Not_way
Not_only
Not_by
Not_more
Not_the
Not_just
Not_backing
Not_play
Not_hard
Not_a
Not_to
Not_only
Not_knowing
Not_sense
Not_hard
Not_fancy
Not_only
Not_,
Not_sure
Not_an
Not_acknowledged
Not_claiming
Not_means
Not_the
Not_only
Not_worth
Not_only
Not_so
Not_such
Not_tell
Not_real
Not_they
Not_.
Not_wholly
Not_chance
Not_movie
Not_try
Not_,
Not_resolved
Not_entirely
Not_given
Not_by
Not_sufficiently
Not_only
Not_on
Not_to
Not_as
Not_least
Not_a
Not_better
Not_my
Not_a
Not_about
Not_good
Not_convincing
Not_all
Not_wasted
Not_mind

In [20]:
# container for sentences that have undergone second transformation(bigram detection)
thirdsents = []
for sent in secondsents:
    print(sent)
    words = sent.split(" ")
    # find bigrams within current sentence by using BigramCollationFinder package and choosing the top 400 bigrams by frequency
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.raw_freq,400)
    # combine bigrams into single string and add to word list
    for bigram in bigrams:
        BG = bigram[0] + "_" + bigram[1]
        words.append(BG)
    # create new sentence w/ bigrams included
    sent = " ".join(word for word in words)
    print(sent)
    thirdsents.append(sent)

IndexError: tuple index out of range

# Test Results Here

In [8]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])

In [9]:
def build_test_model_NB(data,labels):
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.33, random_state=42)
    text_clf.fit(X_train,y_train)
    predicted = text_clf.predict(X_test)
    print("Accuracy is: %0.4f"%np.mean(predicted == y_test) + "%")
    print(metrics.classification_report(y_test,predicted))

In [10]:
build_test_model_NB(dataset,labels)

Accuracy is: 0.7803%
             precision    recall  f1-score   support

        neg       0.72      0.89      0.80       325
        pos       0.87      0.67      0.76       335

avg / total       0.80      0.78      0.78       660

