In [1]:
# https://scikit-learn.org/
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# http://www.numpy.org/
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
# https://www.nltk.org/
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
import string
import nltk

# Download movie reviews and stopwords from Natural Language Toolkit

In [2]:
nltk.download('movie_reviews')
nltk.download('stopwords')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\scook\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\scook\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def get_data():
    """
    Get movie review data
    """
    dataset = []
    y_labels = []
    # Extract categories
    for cat in movie_reviews.categories():
        # for files in each cateogry    
        for fileid in movie_reviews.fileids(cat):
            # Get the words in that category
            words = list(movie_reviews.words(fileid))
            # the resulting dataset stores sentences
            sentences = " ".join(word for word in words)
            dataset.append((sentences))
            y_labels.append(cat)
    return dataset,y_labels


In [4]:
dataset, labels = get_data()
target_labels = ['neg','pos']

### Create data structures for stopwords and punctuation

In [5]:
stopwords = stopwords.words('english')
punctuation = string.punctuation

In [7]:
# count of words that have been negated 
count = 0
# container for sentences that have undergone first transformation
secondsents = []
# traverse sentences in the dataset
for sent in dataset:
    final_words = []
    # split sentence back into words
    words = sent.split(" ")
    # negation flag and negation words to look at
    negate = False
    negate_words = ['no','not']
    # go through all the words in the sentence to see if the sentence is negated
    for word in words:
        # if the negate flag is true append the current word with Not_, increase the negation count, and reset the flag
        if negate:
            word = 'Not_' + word
            count += 1
            print(word)
            negate = False
        # if the flag isn't set and the current word isn't a negation add it to the final word list
        if word not in negate_words:
            final_words.append(word)
        # if the negation flag was false and the word was a negation flip the flag to true
        else:
            negate = True
    # overwrite words with modified words
    words = final_words
    # remove stopwords and punctuation from words list
    words = [word for word in words if word.lower() not in stopwords and word.lower() not in punctuation]
    # overwrite sentence with sentence constructed from new words list
    sent = " ".join(word for word in words)
    # put new sentence into data structure
    secondsents.append(sent)
print(count)

8047


In [8]:
# container for sentences that have undergone second transformation(bigram detection)
thirdsents = []
for sent in secondsents:
    print(sent)
    words = sent.split(" ")
    # find bigrams within current sentence by using BigramCollationFinder package and choosing the top 400 bigrams by frequency
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.raw_freq,400)
    # combine bigrams into single string and add to word list
    for bigram in bigrams:
        BG = bigram[0] + "_" + bigram[1]
        words.append(BG)
    # create new sentence w/ bigrams included
    sent = " ".join(word for word in words)
    print(sent+"\n\n")
    thirdsents.append(sent)

In [9]:
'''
Count Vectorizer: Convert a collection of text documents to a matrix of token counts
tfid: Transform a count matrix to a normalized tf or tf-idf representation. Tf means term-frequency while 
      tf-idf means term-frequency times inverse document-frequency.
MultinomialNB: Naive Bayes classifier for multinomial models
               The multinomial Naive Bayes classifier is suitable for classification with discrete features 
               (e.g., word counts for text classification). The multinomial distribution normally requires 
               integer feature counts. However, in practice, fractional counts such as tf-idf may also work.
'''

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])

In [11]:
'''
Stochastic Gradient Descent (SGD): is a simple yet very efficient approach 
to discriminative learning of linear classifiers under convex loss functions
such as (linear) Support Vector Machines and Logistic Regression. Even
though SGD has been around in the machine learning community for a long time,
it has received a considerable amount of attention just recently in the context of large-scale learning.
'''
text_sgd = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42)),
])

In [67]:
def build_test_model_NB(data,labels):
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.33, random_state=5)
    text_clf.fit(X_train,y_train)
    predicted = text_clf.predict(X_test)
    print("Accuracy is: %0.4f"%np.mean(predicted == y_test) + "%")
    print(metrics.classification_report(y_test,predicted))

In [68]:
def build_test_model_SGD(data,labels):
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.33, random_state=5)
    text_sgd.fit(X_train,y_train)
    predicted = text_sgd.predict(X_test)
    print("Accuracy is: %0.4f"%np.mean(predicted == y_test) + "%")
    print(metrics.classification_report(y_test,predicted))

In [69]:
build_test_model_NB(dataset,labels)

Accuracy is: 0.8030%
             precision    recall  f1-score   support

        neg       0.76      0.88      0.82       329
        pos       0.86      0.73      0.79       331

avg / total       0.81      0.80      0.80       660



In [70]:
build_test_model_NB(secondsents,labels)

Accuracy is: 0.8197%
             precision    recall  f1-score   support

        neg       0.80      0.85      0.83       329
        pos       0.84      0.79      0.81       331

avg / total       0.82      0.82      0.82       660



In [71]:
build_test_model_NB(thirdsents,labels)

Accuracy is: 0.8061%
             precision    recall  f1-score   support

        neg       0.85      0.74      0.79       329
        pos       0.77      0.87      0.82       331

avg / total       0.81      0.81      0.81       660



In [74]:
build_test_model_SGD(secondsents,labels)

Accuracy is: 0.8424%
             precision    recall  f1-score   support

        neg       0.84      0.85      0.84       329
        pos       0.85      0.84      0.84       331

avg / total       0.84      0.84      0.84       660



In [75]:
build_test_model_SGD(thirdsents,labels)

Accuracy is: 0.8303%
             precision    recall  f1-score   support

        neg       0.82      0.84      0.83       329
        pos       0.84      0.82      0.83       331

avg / total       0.83      0.83      0.83       660



In [108]:
X_train, X_test, y_train, y_test = train_test_split(secondsents, labels, test_size=0.33, random_state=5)

In [109]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)
X_train_counts.shape


(1340, 34733)

In [110]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
X_train_tfidf.shape

(1340, 34733)

In [111]:
clf = SGDClassifier().fit(X_train_tfidf, y_train)

In [112]:
predicted = clf.predict(X_test_tfidf)
np.mean(predicted == y_test)

0.8196969696969697

In [113]:
feature_names = list(count_vect.get_feature_names())
top10 = np.argsort(clf.coef_[0])[-10:]
#print(top10)
print(" ".join(feature_names[j] for j in top10))


time also well good story life like one movie film
