In [10]:
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

nltk.download('punkt')
nltk.download('wordnet')

wordnet_lemmatizer = WordNetLemmatizer()
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

positive_reviews = BeautifulSoup(open('positive.review').read())
positive_reviews = positive_reviews.findAll('review_text')
negative_reviews = BeautifulSoup(open('negative.review').read())
negative_reviews = negative_reviews.findAll('review_text')

np.random.shuffle(positive_reviews)
np.random.shuffle(negative_reviews)

def my_tokenizer(s):
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stopwords]

    return tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
positive_tokenized = []
negative_tokenized = []

word_index_map = {}
current_index = 0

for review in positive_reviews:
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [0]:
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map)+1)
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x /= x.sum()
    x[-1] = label
    return x

In [0]:
N = len(positive_tokenized) + len(negative_tokenized)
data = np.zeros((N, len(word_index_map) + 1))
i = 0

for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i, :] = xy
    i += 1

for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i, :] = xy
    i += 1

np.random.shuffle(data)
X = data[:, :-1]
y = data[:, -1]

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = LogisticRegression()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.7075


In [19]:
threshold = 0.7
for word, index in word_index_map.items():
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)

sound 0.90642942908951
price 2.452030044125941
n't -1.944548445655888
you 1.0324737007749956
month -0.7208810814779492
easy 1.5072267155063
highly 0.8854056001373386
quality 1.3069117164531037
love 0.9970779509403283
wa -1.453654853996005
lot 0.7687760063914272
excellent 1.2199409203604985
cable 0.7616221958507047
doe -1.052638833829289
speaker 0.9674517342095352
little 0.9330135607088245
time -0.7502836529004621
perfect 0.8844408223918653
support -0.9280758811498445
return -1.019719412797275
waste -0.829310467762352
poor -0.7290065852059215


In [21]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.775
