In [1]:
import nltk
nltk.download('wordnet', quiet = True)
nltk.download('stopwords', quiet = True)

[nltk_data] Downloading package wordnet to /Users/stefan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stefan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [90]:
from nltk.corpus import movie_reviews, stopwords 
from nltk import NaiveBayesClassifier
from nltk import DecisionTreeClassifier
from nltk import ConditionalExponentialClassifier
from nltk import classify 
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.metrics import ConfusionMatrix
from sklearn.metrics import confusion_matrix
from collections import defaultdict
import random
import string
import nltk.stem  as stem
import pandas as pd

In [91]:
documents = []

for category in movie_reviews.categories():
    for file_id in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(file_id), category))

In [None]:
documents

In [92]:
random.seed(1)
random.shuffle(documents)

In [93]:
def remove_stop_words_and_stem(review):
    # initialize stopwords list
    stopWords = set(stopwords.words("english"))
    # initialize stemmer
    wordnet_lemmatizer = stem.WordNetLemmatizer()
    # remove punctuation
    no_punctuation = [word.lower() for word in review if word not in string.punctuation]
    # remove stop words
    no_stop_words = [word for word in no_punctuation if word not in stopWords]
    # stem words
    stemmed = [wordnet_lemmatizer.lemmatize(word) for word in no_stop_words]
    # remove words less than 3 characters long
    final = [word for word in stemmed if len(word)>2]
    return final


In [94]:
docs = []
for pair in documents:
    document = list(pair[0])
    sentiment = pair[1]
    new = remove_stop_words_and_stem(document)
    docs.append((new, sentiment))

In [None]:
# docs[0]

In [95]:
all_words = [word.lower() for word in movie_reviews.words()]
all_words = remove_stop_words_and_stem(all_words)

In [104]:
freqdist = FreqDist(all_words)

In [105]:
most_common_words = freqdist.most_common(5000)
most_common_words

[('film', 11053),
 ('movie', 6977),
 ('one', 6028),
 ('character', 3879),
 ('like', 3789),
 ('time', 2979),
 ('get', 2814),
 ('scene', 2671),
 ('make', 2634),
 ('even', 2568),
 ('good', 2429),
 ('story', 2345),
 ('would', 2109),
 ('much', 2049),
 ('also', 1967),
 ('well', 1921),
 ('life', 1913),
 ('two', 1911),
 ('see', 1885),
 ('way', 1882),
 ('first', 1836),
 ('year', 1732),
 ('thing', 1661),
 ('take', 1579),
 ('plot', 1574),
 ('really', 1558),
 ('come', 1510),
 ('little', 1505),
 ('know', 1494),
 ('people', 1470),
 ('could', 1427),
 ('man', 1404),
 ('bad', 1395),
 ('work', 1379),
 ('never', 1374),
 ('director', 1347),
 ('best', 1334),
 ('end', 1328),
 ('performance', 1317),
 ('new', 1292),
 ('look', 1278),
 ('many', 1268),
 ('action', 1260),
 ('actor', 1252),
 ('love', 1209),
 ('play', 1205),
 ('star', 1160),
 ('role', 1155),
 ('show', 1151),
 ('great', 1150),
 ('another', 1121),
 ('find', 1119),
 ('made', 1084),
 ('audience', 1079),
 ('back', 1070),
 ('give', 1068),
 ('big', 1064),

In [106]:
word_features = [item[0] for item in most_common_words]

In [107]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features[word] = (word in document_words)
    return features

In [108]:
feature_set = [(document_features(doc), category) for (doc, category) in docs]
# print (feature_set[0])

In [109]:
# our dataset contains 2000 reviews,
# so we will do a 80-20 train-test split
# Note: the reviews have already been shuffled previously
test_set = feature_set[:400]
train_set = feature_set[400:]

In [110]:
NB = NaiveBayesClassifier.train(train_set)

In [19]:
accuracy = classify.accuracy(NB, test_set)
print (accuracy)

0.785


In [42]:
def get_confusion_matrix(classifier, testset):
    refsets = defaultdict(set)
    testsets = defaultdict(set)
    labels = []
    tests = []
    for i, (feats, label) in enumerate(testset):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
        labels.append(label)
        tests.append(observed)

    print(nltk.ConfusionMatrix(labels, tests))

In [41]:
get_confusion_matrix(NB, test_set)

    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |<158> 30 |
pos |  56<156>|
----+---------+
(row = reference; col = test)



In [81]:
def predict_sentiment(text) :
    tokens = list(word_tokenize(text))
    processed_tokens = remove_stop_words_and_stem(tokens)
    features = document_features(processed_tokens)
    return NB.classify(features)

In [86]:
# FOR TESTING PURPOSES ONLY
import pandas as pd
import glob

def load_reviews (dirname):
    negative_path = dirname + '/neg/*.txt'
    negative_file_list = glob.glob(negative_path)
    kind = ["neg"] * len(negative_file_list)
    texts = [str(open(file).read()) for file in negative_file_list]
    positive_path = dirname + '/pos/*.txt'
    positive_file_list = glob.glob(positive_path)
    kind2 = ["pos"] * len(positive_file_list)
    kind.extend(kind2)
    text2 = [str(open(file).read()) for file in positive_file_list]
    texts.extend(text2)
    filename = negative_file_list + positive_file_list
    filename = [x.split("/")[-1:][0] for x in filename]
    df = pd.DataFrame(list(zip(filename,kind,texts)), columns = ['filename', 'kind', 'text'])
    return df

from pathlib import Path

dataset = load_reviews(str(Path.home()) + '/nltk_data/corpora/movie_reviews')

In [89]:
%%timeit
for x in list(dataset['text'])[:1000]:
    predict_sentiment(x)

17.6 s ± 170 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [34]:
maxent = MaxentClassifier.train(train_set)
accuracy = classify.accuracy(maxent, test_set)
print (accuracy)

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.492


  exp_nf_delta = 2 ** nf_delta
  sum1 = numpy.sum(exp_nf_delta * A, axis=0)
  sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)
  deltas -= (ffreq_empirical - sum1) / -sum2


         Final               nan        0.507
0.47


In [37]:
dec_tree = DecisionTreeClassifier.train(train_set)

In [38]:
accuracy = classify.accuracy(dec_tree, test_set)
print (accuracy)

0.555


In [39]:
get_confusion_matrix(dec_tree, test_set)

[[110  78]
 [100 112]]
    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |<110> 78 |
pos | 100<112>|
----+---------+
(row = reference; col = test)



In [50]:
c_expo = ConditionalExponentialClassifier.train(train_set)

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.492


  exp_nf_delta = 2 ** nf_delta
  sum1 = numpy.sum(exp_nf_delta * A, axis=0)
  sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)
  deltas -= (ffreq_empirical - sum1) / -sum2


         Final               nan        0.507


In [51]:
accuracy = classify.accuracy(c_expo, test_set)
print (accuracy)

0.47


# Try Logistic Regression

In [112]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [None]:
merged = []
labels = []
for pair in docs:
    words = pair[0]
    label = pair[1]
    labels.append(label)
    merged.append(" ".join(w for w in words))
    
text_test = merged[0:400]
y_test = labels[0:400]
text_train = merged[400:]
y_train = labels[400:]

In [None]:
#text_train

In [None]:
vect = CountVectorizer(min_df=5, ngram_range=(2, 2))
X_train = vect.fit(text_train).transform(text_train)
X_test = vect.transform(text_test)

In [None]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)

In [None]:
lr = grid.best_estimator_
lr.fit(X_train, y_train)
lr.predict(X_test)
print("Score: {:.2f}".format(lr.score(X_test, y_test)))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

predictions = lr.predict(X_test)
confusion_matrix(y_test,predictions)

In [113]:
from sklearn.naive_bayes import MultinomialNB

In [114]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

vec = TfidfVectorizer()
X_vectorized = vec.fit_transform(text_train)
Y_train = np.array(y_train)
clf = MultinomialNB()
clf.fit(X_vectorized, Y_train)


MultinomialNB()

In [116]:
test_vec = vec.transform(text_test)
print(clf.score(test_vec, np.array(y_test)))

0.8225


In [118]:
from pathlib import Path
print(str(Path.home()))

/Users/stefan


In [119]:
import os
os.getcwd()

'/Users/stefan/Desktop/ml-stefanroata'