In [61]:
import nltk
from nltk.corpus import movie_reviews, stopwords
from nltk.stem import WordNetLemmatizer

import numpy as np
import random
import sklearn
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [9]:
documents = []
for category in movie_reviews.categories():
    for file_id in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(file_id)), category))

In [10]:
print(documents[1])
random.shuffle(documents)

(['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', 'review', 'damn', 'that', 'y2k', 'bug', '.', 'it', "'", 's', 'got', 'a', 'head', 'start', 'in', 'this', 'movie', 'starring', 'jamie', 'lee', 'curtis', 'and', 'another', 'baldwin', 'brother', '(', 'william', 'this', 'time', ')', 'in', 'a', 'story', 'regarding', 'a', 'crew', 'of', 'a', 'tugboat', 'that', 'comes', 'across', 'a', 'deserted', 'russian', 'tech', 'ship', 'that', 'has', 'a', 'strangeness', 'to', 'it', 'when', 'they', 'kick', 'the', 'power', 'back', 'on', '.', 'little', 'do', 'they', 'know', 'the', 'power', 'within', '.', '.', '.', 'going', 'for', 'the', 'gore', 'and', 'bringing', 'on', 'a', 'few', 'action', 'sequences', 'here', 'and', 'there', ',', 'virus', 'still', 'feels', 'very', 'empty', ',', 'like', 'a', 'movie', 'going', 'for', 'all', 'flash', 'and', 'no', 'substance', '.', 'we', 'don', "'", 't', 'know', 'why', 'the', 'crew', 'was', 'really', 'out', 'in', 'the', 'middle', 'of', 'nowhere', ',', 'we', 'don', "'", 't'

In [11]:
all_words = []
for word in movie_reviews.words():
    all_words.append(word.lower())
    
print(all_words[5:15])

['go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive']


In [12]:
all_words = nltk.FreqDist(all_words)
all_words.most_common(10)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822)]

In [13]:
all_words['bastard']

46

In [25]:
stopwords_en = set(stopwords.words('english'))
print(stopwords_en)

lemmatizer = WordNetLemmatizer()

{'hadn', 'why', 'we', "shan't", 'wouldn', 'didn', 'these', 'above', 'she', 'all', 'other', 'do', "hadn't", 'him', 'because', 'for', 'won', 'same', 'ma', "doesn't", 'them', 'then', 'did', "hasn't", 'her', 'hers', 'whom', "that'll", 'no', 'too', 'about', 'wasn', "needn't", "couldn't", 'i', 'who', 'should', 'aren', 'further', 'where', 'couldn', "you'd", 'just', "aren't", 'it', 'was', 's', 't', "mightn't", 'again', 'from', "you'll", 'they', 'and', 'are', 'mustn', 'very', 'had', "don't", "isn't", 'a', 'his', 'herself', 'on', "won't", "you're", "should've", 'o', 'been', 'were', 'being', 'while', 'my', 'than', 'doesn', 'to', 'more', 'below', 'some', 'your', 'don', "didn't", 'has', 'shouldn', 'its', 'an', 'doing', 'shan', 'hasn', 'during', 'nor', 'in', "mustn't", "it's", 'before', 'weren', 'isn', 'or', 'over', 'both', 'those', 'once', 'up', 'only', 'as', 'so', 'm', 'what', 'into', "shouldn't", 'our', "wasn't", 'needn', 'yourself', 'this', 're', 'he', 'is', 'if', 'y', 'can', "haven't", 'down', 

In [56]:
def filter_doc(doc):
    words = doc[0]
    filtered_doc = []
    
    for word in words:
        lemma = lemmatizer.lemmatize(word.lower())
        if lemma not in stopwords_en and lemma.isalpha():
            filtered_doc.append(lemma)
    
    return ' '.join(filtered_doc), doc[1]

In [57]:
corpus_X = []
labels_y = []

for doc in documents:
    doc, label = filter_doc(doc)
    corpus_X.append(doc)
    labels_y.append(label)
    
assert len(corpus_X) == len(labels_y)

In [66]:
y = []
for label in labels_y:
    if label == 'neg':
        y.append(0)
    else:
        y.append(1)

X_raw = np.array(corpus_X)
y = np.array(y)

In [72]:
tfidf = TfidfVectorizer(max_features=2000)
X = tfidf.fit_transform(X_raw).toarray()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
model = GaussianNB()
preds = model.fit(X_train, y_train).predict(X_test)

accuracy = sklearn.metrics.accuracy_score(y_test, preds)
print(f'Accuracy {accuracy*100}%')

Accuracy 69.5%
