In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
def load_data():
    reviewsFile = open('../data/reviews.txt','r')
    reviews = list(map(lambda x:x[:-1],reviewsFile.readlines()))
    reviewsFile.close()

    labelsFile = open('../data/labels.txt','r')
    labels = list(map(lambda x:x[:-1],labelsFile.readlines()))
    labelsFile.close()
    
    return reviews,labels

In [3]:
reviews,labels = load_data()

In [4]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("\w+\'?\w+|\w+")

In [5]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [6]:
from spacy.lang.en.stop_words import STOP_WORDS

In [7]:
exceptionStopWords = {
    'again',
    'against',
    'ain',
    'almost',
    'among',
    'amongst',
    'amount',
    'anyhow',
    'anyway',
    'aren',
    "aren't",
    'below',
    'bottom',
    'but',
    'cannot',
    'couldn',
    "couldn't",
    'didn',
    "didn't",
    'doesn',
    "doesn't",
    'don',
    "don't",
    'done',
    'down',
    'except',
    'few',
    'hadn',
    "hadn't",
    'hasn',
    "hasn't",
    'haven',
    "haven't",
    'however',
    'isn',
    "isn't",
    'least',
    'mightn',
    "mightn't",
    'move',
    'much',
    'must',
    'mustn',
    "mustn't",
    'needn',
    "needn't",
    'neither',
    'never',
    'nevertheless',
    'no',
    'nobody',
    'none',
    'noone',
    'nor',
    'not',
    'nothing',
    'should',
    "should've",
    'shouldn',
    "shouldn't",
    'too',
    'top',
    'up',
    'very'
    'wasn',
    "wasn't",
    'well',
    'weren',
    "weren't",
    'won',
    "won't",
    'wouldn',
    "wouldn't",
}

In [8]:
stop_words = set(stop_words).union(STOP_WORDS)

In [9]:
final_stop_words = stop_words-exceptionStopWords

In [10]:
import spacy
nlp = spacy.load("en",disable=['parser', 'tagger', 'ner'])

In [11]:
def make_token(review):
    return tokenizer.tokenize(str(review))

In [12]:
def remove_stopwords(review):
    return [token for token in review if token not in final_stop_words]

In [13]:
def lemmatization(review):
    lemma_result = []
    
    for words in review:
        doc = nlp(words)
        for token in doc:
            lemma_result.append(token.lemma_)
    return lemma_result

In [14]:
def pipeline(review):
    review = make_token(review)
    review = remove_stopwords(review)
    return lemmatization(review)

In [15]:
%%time
reviews = list(map(lambda review: pipeline(review),reviews))

CPU times: user 39 s, sys: 88.2 ms, total: 39.1 s
Wall time: 39.1 s


In [16]:
reviews[:2]

[['bromwell',
  'high',
  'cartoon',
  'comedy',
  'run',
  'time',
  'program',
  'school',
  'life',
  'teacher',
  'year',
  'teach',
  'profession',
  'lead',
  'believe',
  'bromwell',
  'high',
  'satire',
  'much',
  'close',
  'reality',
  'teacher',
  'scramble',
  'survive',
  'financially',
  'insightful',
  'student',
  'right',
  'pathetic',
  'teacher',
  'pomp',
  'pettiness',
  'situation',
  'remind',
  'school',
  'know',
  'student',
  'see',
  'episode',
  'student',
  'repeatedly',
  'try',
  'burn',
  'down',
  'school',
  'immediately',
  'recall',
  'high',
  'classic',
  'line',
  'inspector',
  'sack',
  'teacher',
  'student',
  'welcome',
  'bromwell',
  'high',
  'expect',
  'adult',
  'age',
  'think',
  'bromwell',
  'high',
  'far',
  'fetch',
  'pity',
  'isn'],
 ['story',
  'man',
  'unnatural',
  'feeling',
  'pig',
  'start',
  'open',
  'scene',
  'terrific',
  'example',
  'absurd',
  'comedy',
  'formal',
  'orchestra',
  'audience',
  'turn',
  '

In [17]:
from gensim.models import Word2Vec

In [18]:
embedding_dimension = 100

In [19]:
model = Word2Vec(reviews,size=embedding_dimension, window=3, min_count=3, workers=4)

In [20]:
model.sg

0

In [21]:
word_vectors = model.wv

In [22]:
del model

In [23]:
len(word_vectors.vocab)

28165

In [24]:
word_vectors.similar_by_word(word="good", topn=5)

[('decent', 0.7319575548171997),
 ('alright', 0.6960811018943787),
 ('okay', 0.6700797080993652),
 ('darn', 0.6603134870529175),
 ('great', 0.6344092488288879)]

In [25]:
word_vectors.similar_by_word(word="bad", topn=5)

[('horrible', 0.7185400724411011),
 ('lame', 0.6830708384513855),
 ('terrible', 0.6805494427680969),
 ('suck', 0.672660768032074),
 ('lousy', 0.6538549661636353)]

In [26]:
word_vectors.most_similar(positive="bad",topn=4)

[('horrible', 0.7185400724411011),
 ('lame', 0.6830708384513855),
 ('terrible', 0.6805494427680969),
 ('suck', 0.672660768032074)]

In [27]:
word_vectors.similarity("good","bad")

0.58493686

In [28]:
word_vectors.similarity("good","be")

0.2628381

In [29]:
word_vectors.similar_by_word(word="school", topn=5)

[('college', 0.765923261642456),
 ('schooler', 0.7632982134819031),
 ('class', 0.7375858426094055),
 ('student', 0.7067117094993591),
 ('teacher', 0.6980537176132202)]

In [30]:
word_vectors.similar_by_word(word="comedy", topn=5)

[('farce', 0.6577758193016052),
 ('satire', 0.6503036022186279),
 ('slapstick', 0.6476813554763794),
 ('humor', 0.6358456611633301),
 ('parody', 0.6339165568351746)]

In [31]:
word_vectors.similar_by_word(word="action", topn=5)

[('suspense', 0.608113169670105),
 ('gory', 0.5757965445518494),
 ('thrill', 0.5684385299682617),
 ('fantasy', 0.5647292137145996),
 ('overlong', 0.5647001266479492)]

In [32]:
word_vectors.similar_by_word(word="sad", topn=5)

[('depress', 0.7832858562469482),
 ('cry', 0.7202650308609009),
 ('happy', 0.7075549960136414),
 ('heartwarming', 0.7021917700767517),
 ('anyways', 0.6821246147155762)]

In [33]:
word_vectors.most_similar(negative=["bad"],positive=["decent"],topn=5)

[('fine', 0.41307878494262695),
 ('solid', 0.4069543182849884),
 ('splendid', 0.37130245566368103),
 ('tremendous', 0.36157888174057007),
 ('outstanding', 0.35796302556991577)]