In [1]:
import pandas as pd
from tqdm import tqdm
import regex as re
import nltk
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from nltk import sent_tokenize, word_tokenize

In [2]:
df = pd.DataFrame.from_csv("../data/labeledTrainData.tsv", sep='\t')

  """Entry point for launching an IPython kernel.


In [3]:
len(df)

25000

In [4]:
words = ['It', 'has', 'many', 'permissions']
allowed = {'JJ', 'RB'}

tags = nltk.pos_tag(words)
print(tags)
a = list(filter(lambda x: x[1] in allowed, tags))
fwords, ftags = zip(*a)
print(fwords)
print(ftags)

[('It', 'PRP'), ('has', 'VBZ'), ('many', 'JJ'), ('permissions', 'NNS')]
('many',)
('JJ',)


In [5]:
tags = [tag for word, tag in nltk.pos_tag(words)]
tags

list(filter(lambda x: x in allowed, tags))



['JJ']

In [6]:
all_grams = []
allowed_tags = {'JJ', 'JJ NN', 'JJ NNS', 'JJ NN NN', 'RB JJ NN', 'JJ TO VB', 'VB JJ NN'}
banned_words = {'', 'br'}

def filter_banned_words(words, banned_words):
    filtered_words = list(filter(lambda x: x not in banned_words, words))
    
    return filtered_words

def merge_n_grams(ngrams, ntags):
    merged_n_grams = []
    merged_n_tags = []
    
    for ngram, ntag in zip(ngrams, ntags):
        merged_n_grams.append(' '.join(ngram))
        merged_n_tags.append(' '.join(ntag))
        
    return merged_n_grams, merged_n_tags

def filter_by_tags(ngrams, ntags, allowed_tags):
    pos = zip(ngrams, ntags)
    
    filtered_pos = list(filter(lambda x: len(x[0]) > 1 and x[1] in allowed_tags, pos))
    
    # filtered tags and words
    filtered_words = [word for word, tag in filtered_pos]
    filtered_tags = [tag for word, tag in filtered_pos]
    
    return filtered_words, filtered_tags

def extend_all_grams(review, all_grams):
    sentences = sent_tokenize(review)
    
    # pos tagging and filtering
    pos_tags = []
    for sentence in sentences:
        split_words = [re.sub(r"\p{P}+", "", x).lower() for x in sentence.split()]
        words = filter_banned_words(split_words, banned_words)
        pos_tags.extend(nltk.pos_tag(words))
    
    # split words and tags
    words = [word.lower() for word, tag in pos_tags]
    tags = [tag for word, tag in pos_tags]
    
    # creation of unigrams, bigrams and trigrams
    unigrams, utags = filter_by_tags(words, tags, allowed_tags)
    bigrams = zip(words, words[1:])
    bitags = zip(tags, tags[1:])
    trigrams = zip(words, words[1:], words[2:])
    tritags = zip(tags, tags[1:], tags[2:])
    
    mbigrams, mbitags = merge_n_grams(bigrams, bitags)
    fbigrams, fbitags = filter_by_tags(mbigrams, mbitags, allowed_tags)
    mtrigrams, mtritags = merge_n_grams(trigrams, tritags)
    ftrigrams, ftritags = filter_by_tags(mtrigrams, mtritags, allowed_tags)
    
    #all_grams.extend(list(set(unigrams)))
    all_grams.extend(list(set(fbigrams)))
    all_grams.extend(list(set(ftrigrams)))

for i in tqdm(range(len(df))):
    extend_all_grams(df.iloc[i]['review'], all_grams)

100%|██████████| 25000/25000 [06:57<00:00, 59.90it/s]


In [7]:
import string
import regex as re

from nltk import sent_tokenize, word_tokenize

text = "'I wonder how many miles I've fallen by this time?' she said aloud. Are you alive? Hey!"
split_test = [re.sub(r"\p{P}+", "", x).lower() for x in text.split()]
split_test

['i',
 'wonder',
 'how',
 'many',
 'miles',
 'ive',
 'fallen',
 'by',
 'this',
 'time',
 'she',
 'said',
 'aloud',
 'are',
 'you',
 'alive',
 'hey']

In [8]:
from collections import Counter

counts = Counter(all_grams)

In [9]:
counts.most_common(1000)

[('special effects', 929),
 ('i dont', 926),
 ('first time', 637),
 ('<br ><br', 583),
 ('new york', 548),
 ('good movie', 522),
 ('low budget', 516),
 ('same time', 509),
 ('main character', 444),
 ('only thing', 439),
 ('high school', 418),
 ('real life', 415),
 ('main characters', 386),
 ('many people', 382),
 ('long time', 372),
 ('great movie', 347),
 ('whole movie', 345),
 ('bad movie', 329),
 ('many times', 325),
 ('whole thing', 323),
 ('good job', 308),
 ('good film', 302),
 ('other hand', 291),
 ('great film', 276),
 ('good thing', 275),
 ('little bit', 274),
 ('i guess', 267),
 ('other films', 256),
 ('bad guys', 253),
 ('few years', 251),
 ('only reason', 250),
 ('i didnt', 244),
 ('other movies', 223),
 ('first movie', 219),
 ('entire movie', 217),
 ('first film', 214),
 ('bad guy', 214),
 ('great job', 212),
 ('young woman', 210),
 ('first place', 198),
 ('true story', 197),
 ('many years', 195),
 ('whole film', 192),
 ('entire film', 190),
 ('big fan', 186),
 ('other peo

In [10]:
DIRECTORY = '/mnt/c/Users/gvs/ubuntu/neural-review-summarization/output/'
FILE_NAME = 'important_phrases.csv'

In [11]:
df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
df = df.rename(columns={'index':'Phrase', 0:'Count'})

In [12]:
df.to_csv(DIRECTORY + FILE_NAME, encoding='utf-8')

In [13]:
df = pd.DataFrame.from_csv(DIRECTORY + FILE_NAME)

  """Entry point for launching an IPython kernel.


In [14]:
import numpy as np

def load_glove_model(glove_file):
    print("Loading Glove Model")
    with open(glove_file,'r') as f:
        model = {}
        for line in tqdm(f):
            split_line = line.split()
            word = split_line[0]
            embedding = np.array([float(val) for val in split_line[1:]])
            model[word] = embedding
            
    print("Done.",len(model)," words loaded!")
    return model

In [16]:
GLOVE_DIR = '/home/gvs/neural-review-summarization/model/embeddings/'
GLOVE_FILE = 'glove.6B.300d.txt'
glove = load_glove_model(GLOVE_DIR + GLOVE_FILE)

653it [00:00, 6528.03it/s]

Loading Glove Model


400000it [00:45, 8853.95it/s]

Done. 400000  words loaded!





In [17]:
len(glove['hi'])

300

In [35]:
from sklearn.preprocessing import normalize

def vectorize_phrase(string_phrase, emb_size=300):
    emb_sum = None
    keys = string_phrase.split()
    
    all_in_glove = True
    
    for key in keys:
        if key not in glove:
            all_in_glove = False
            break
    
    if all_in_glove:
        emb_sum = np.zeros(emb_size)
        
        for key in keys:
            vec = glove[key]
            emb_sum += vec

        emb_sum /= len(keys)
        emb_sum = normalize(emb_sum[:,np.newaxis], axis=0).ravel()
    
        """
        norm = np.linalg.norm(emb_sum)
        if norm!=0:
            emb_sum /= np.linalg.norm(norm)
        """
    
    return emb_sum

    
meanings = []
meaning_labels = []
emb_size = len(glove['hi'])
for i in tqdm(range(len(df))):
    string_phrase =  df.iloc[i]['Phrase']
    
    vectorized_phrase = vectorize_phrase(string_phrase, emb_size=emb_size)
    
    if vectorize_phrase is not None:
        meaning_labels.append(string_phrase)
        meanings.append(vectorized_phrase)

print(meanings[0])

100%|██████████| 245713/245713 [01:49<00:00, 2246.64it/s]

[ -1.41665713e-02  -7.72734012e-02   2.08478552e-02  -2.92156377e-02
   3.13777861e-02  -7.66548265e-03  -2.57958371e-02  -8.56748522e-02
  -4.59723725e-04  -3.16541334e-02   1.14547144e-01  -4.10542522e-02
  -4.73876648e-02  -8.49996330e-02  -6.21448990e-02   6.50622976e-02
   7.10261970e-02   7.06106909e-03  -4.06147440e-03   3.53796401e-02
   2.66505332e-02  -1.69215273e-03   3.65235107e-02   6.41104232e-02
  -4.35034098e-02  -4.24444034e-02  -4.94069602e-03  -3.94677949e-02
   9.26794820e-03  -7.39765253e-02  -4.31319366e-02  -1.64628297e-02
  -2.91585930e-02   4.91432348e-03  -2.75782927e-01  -5.85727020e-02
   6.51206866e-02  -3.88107389e-02   5.97618164e-02  -1.74869464e-02
   6.98728754e-02  -7.44516416e-02   5.74230849e-02   5.98402260e-02
  -4.56928441e-02  -2.55781866e-02  -2.88045646e-03   1.01600996e-02
  -1.65850465e-02  -3.19518251e-02   1.78721703e-02  -7.99570384e-02
  -6.32693750e-02   5.78059485e-02   3.32475687e-02   3.02719864e-04
  -7.55660611e-02   3.84772339e-02




In [19]:
(meanings[1]==meanings[2]).all()

False

In [20]:
a = np.array([0,1])
b = np.array([2,3])
(a + b) / 2

array([ 1.,  2.])

In [21]:
import numpy as np
from sklearn.metrics.pairwise import cosine_distances
from scipy import spatial

print(df.iloc[0])
print(df.iloc[5])

print(1 - spatial.distance.cosine(meanings[0], meanings[5]))

Phrase    veteran richard
Count                   1
Name: 0, dtype: object
Phrase    tara reid
Count             2
Name: 5, dtype: object
0.0693620457


In [22]:
from sklearn import cluster

k = 600
kmeans = cluster.KMeans(n_clusters=k, max_iter=300, n_init=5, n_jobs=2, verbose=1)
kmeans.fit_predict(meanings)

Initialization complete
Initialization complete
start iteration
done sorting
end inner loop
start iteration
done sorting
end inner loop
Iteration 0, inertia 116942.207936
start iteration
done sorting
Iteration 0, inertia 116987.271362
start iteration
done sorting
end inner loop
end inner loop
Iteration 1, inertia 109017.011056
start iteration
done sorting
Iteration 1, inertia 109721.616202
start iteration
done sorting
end inner loop
end inner loop
Iteration 2, inertia 106662.027581
start iteration
done sorting
Iteration 2, inertia 107095.909826
start iteration
done sorting
end inner loop
end inner loop
Iteration 3, inertia 105663.805125
start iteration
done sorting
Iteration 3, inertia 105848.278931
start iteration
done sorting
end inner loop
Iteration 4, inertia 105133.370102
start iteration
done sorting
end inner loop
Iteration 4, inertia 105221.304249
start iteration
done sorting
end inner loop
Iteration 5, inertia 104800.680239
start iteration
done sorting
end inner loop
end inner 

start iteration
done sorting
end inner loop
Iteration 51, inertia 103686.815041
start iteration
done sorting
end inner loop
Iteration 51, inertia 103552.420723
start iteration
done sorting
end inner loop
Iteration 52, inertia 103685.971166
start iteration
done sorting
end inner loop
Iteration 52, inertia 103550.019394
start iteration
done sorting
end inner loop
Iteration 53, inertia 103684.965967
start iteration
done sorting
end inner loop
Iteration 53, inertia 103546.858812
start iteration
done sorting
end inner loop
Iteration 54, inertia 103684.155149
start iteration
done sorting
end inner loop
Iteration 54, inertia 103543.241142
start iteration
done sorting
end inner loop
Iteration 55, inertia 103683.475789
start iteration
done sorting
end inner loop
Iteration 55, inertia 103540.234648
start iteration
done sorting
end inner loop
Iteration 56, inertia 103682.906025
start iteration
done sorting
end inner loop
Iteration 56, inertia 103538.060317
start iteration
done sorting
end inner l

Iteration 102, inertia 103507.101463
start iteration
done sorting
end inner loop
Iteration 102, inertia 103640.094349
start iteration
done sorting
end inner loop
Iteration 103, inertia 103507.088273
start iteration
done sorting
end inner loop
Iteration 103, inertia 103637.370941
start iteration
done sorting
end inner loop
Iteration 104, inertia 103507.085064
start iteration
done sorting
end inner loop
Iteration 104, inertia 103635.426972
start iteration
done sorting
end inner loop
Iteration 105, inertia 103507.085064
center shift 0.000000e+00 within tolerance 2.764228e-07
Iteration 105, inertia 103634.40332
start iteration
done sorting
end inner loop
Iteration 106, inertia 103633.75
start iteration
done sorting
end inner loop
Iteration 107, inertia 103633.171167
start iteration
done sorting
end inner loop
Iteration 108, inertia 103632.683902
start iteration
done sorting
end inner loop
Iteration 109, inertia 103632.148821
start iteration
done sorting
end inner loop
Iteration 110, inerti

start iteration
done sorting
Iteration 43, inertia 103491.153361
start iteration
done sorting
end inner loop
end inner loop
Iteration 44, inertia 103486.374371
Iteration 15, inertia 103778.037363
start iteration
start iteration
done sorting
done sorting
end inner loop
end inner loop
Iteration 45, inertia 103482.525352
start iteration
done sorting
Iteration 16, inertia 103757.247687
start iteration
done sorting
end inner loop
end inner loop
Iteration 46, inertia 103479.758049
start iteration
done sorting
end inner loop
Iteration 17, inertia 103735.128557
start iteration
done sorting
end inner loop
Iteration 47, inertia 103476.769126
start iteration
done sorting
end inner loop
Iteration 18, inertia 103713.732456
start iteration
done sorting
end inner loop
Iteration 48, inertia 103473.306987
start iteration
done sorting
end inner loop
Iteration 19, inertia 103696.557049
start iteration
done sorting
end inner loop
Iteration 49, inertia 103468.171793
start iteration
done sorting
end inner l

start iteration
done sorting
end inner loop
Iteration 65, inertia 103530.580288
start iteration
done sorting
end inner loop
Iteration 96, inertia 103404.992473
start iteration
done sorting
end inner loop
Iteration 66, inertia 103530.244674
start iteration
done sorting
end inner loop
Iteration 97, inertia 103404.922696
start iteration
done sorting
end inner loop
Iteration 67, inertia 103529.847791
start iteration
done sorting
end inner loop
Iteration 98, inertia 103404.861855
start iteration
done sorting
end inner loop
Iteration 68, inertia 103529.373682
start iteration
done sorting
end inner loop
Iteration 99, inertia 103404.815846
start iteration
done sorting
end inner loop
Iteration 69, inertia 103528.891735
start iteration
done sorting
end inner loop
Iteration 100, inertia 103404.766598
start iteration
done sorting
end inner loop
Iteration 70, inertia 103528.46486
start iteration
done sorting
end inner loop
Iteration 101, inertia 103404.740205
start iteration
done sorting
end inner 

end inner loop
Iteration 33, inertia 103575.03758
start iteration
done sorting
end inner loop
Iteration 34, inertia 103571.195836
start iteration
done sorting
end inner loop
Iteration 35, inertia 103568.413428
start iteration
done sorting
end inner loop
Iteration 36, inertia 103565.787324
start iteration
done sorting
end inner loop
Iteration 37, inertia 103563.728561
start iteration
done sorting
end inner loop
Iteration 38, inertia 103561.894198
start iteration
done sorting
end inner loop
Iteration 39, inertia 103560.050727
start iteration
done sorting
end inner loop
Iteration 40, inertia 103558.297229
start iteration
done sorting
end inner loop
Iteration 41, inertia 103556.782158
start iteration
done sorting
end inner loop
Iteration 42, inertia 103555.417739
start iteration
done sorting
end inner loop
Iteration 43, inertia 103553.740619
start iteration
done sorting
end inner loop
Iteration 44, inertia 103551.894034
start iteration
done sorting
end inner loop
Iteration 45, inertia 1035

array([ 40, 143, 577, ..., 487, 495, 201], dtype=int32)

In [23]:
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
print(len(meaning_labels))

206578


In [25]:
import numpy as np

for i in range(1000):
    print(meaning_labels[i], labels[i])

veteran richard 40
homegrown texas 143
severe disability 577
fake death 88
tara reid 194
awful medium 514
trite cast 167
new experience 187
affable gas 335
unnecessary plot point 233
unconvincing heist movie 413
hg wells 149
main reasons 542
very brutal labor 423
frequent moments 590
real clinkers 269
good time 142
paul revere 334
earnest job 230
awful animatronics 140
attentive viewer 480
enchanted isle 517
influential entertainers 281
laughable story 505
light escapist entertainment 533
innocent man steve 559
hopeless role 108
so enter sharkboy 459
convincing horror movie 524
potentially exciting conflagration 90
dark boring 355
medical research facility 382
incorrect closet 262
annoying lilt 211
big shootouts 531
real joy 269
somewhat weird story 505
standard technologies 154
old vaudeville 41
bolivian official 19
repetitive entertainment 533
first flies 124
same vicinity 122
own brand 361
several tunes 544
so hidden beneath 569
corinne isnt 536
absolute abc 218
russian immigrant ja

In [26]:
cluster_counts = Counter(labels)

cluster_examples = {}

for i in range(len(labels)):
    if labels[i] not in cluster_examples:
        cluster_examples[labels[i]] = []
    
    cluster_examples[labels[i]].append(meaning_labels[i])

In [98]:
print(cluster_counts.most_common(10))

SHOW = 30

for pair in cluster_counts.most_common(600):
    cluster_label, _ = pair
    
    print("\nPrinting examples for cluster {}:".format(cluster_label))
    for i in range(SHOW):
        print(cluster_examples[cluster_label][i])

[(142, 1645), (547, 1451), (124, 1149), (187, 1143), (77, 1129), (94, 1120), (269, 1116), (308, 1112), (446, 1084), (115, 1068)]

Printing examples for cluster 142:
good time
good tight
good guys
really good ghost
good augury
good arab
good lawyer
fairly good reason
good question
good guest
really good episode
good motives
good thing hamburg
good picture
good hedy
terrific while
good publicity spin
good appliances
good roll
good cameo
good advice wait
solid turns
good heroine
good flair
perfectly good story
good image
good candidate
good luck
good superhero
surprisingly good job

Printing examples for cluster 547:
able to award
little to mask
give birth
ordinary to be
long to possess
similar to mask
little to discuss
good to promote
abusive to show
close to goren
adjacent to adolf
similar to hal
bound to be
much to cassavetes
meant to make
much to give
stellar to begin
worthy to tie
ryan to be
sure to set
own to tack
comfortable to drive
possible to make
much to create
close to unwatch

stereotyped roles
meaningless role
straight roles
little role
meager role
boring role
diverse roles
manipulative role malone
dual roles
essentially primary role

Printing examples for cluster 214:
dutch henry robs
anne francis walter
henry thomas
edward hopper
louis jeffries
thomas riedelsheimer
edward munks
retired lawman samuel
ardent lloyd
rev smith
vivian charles
superman james
alfred molina joan
herbert coward
frederic downs
bertie wooster master
sir hector macdonald
sir leonard
veteran henry
albert gran bill
herbert marshall clifton
terrible henry jaglom
uncredited james
edward morbius walter
admiral charlton
edward end
francis capra
teddy roosevelt
joseph cotton baron
o henry

Printing examples for cluster 278:
nice mix
nice example
nice kid
neat touches
nice things
nice improvement
nice chat
nice play
nice breakfast
nice life
nice turn
nice book
nice philosophy
nice sight
nice reinforcement
nice pace
nice starter
nice boat jennifer
nice messages
nice spin
nice watch
nice charac

slipshod script
ridiculous screenplay
wholly pointless script
sometimes melodramatic script
wretched writing
melodramatic script
twisty script
laborious screenplay
romantic script team
altered screenplay
disconnected script
observed script
similar scripts
tough script
own script
daffy script
broad screenplay
tight script
cleverly dour script
imaginative script
understated script
actual screenplay

Printing examples for cluster 476:
satisfying viewing
enjoyable thing
enjoyable crime
lavish escapist fare
entertaining account
enjoyable end
boisterously entertaining action
entertaining lets
therefore entertaining gi
entertaining surplus
enjoyable cheesiness
entertaining productions
terribly entertaining work
entertaining heels
very enjoyable thriller
entertaining dare
incredibly entertaining view
enjoyable interlude
very enjoyable evening
enjoyable way
entertaining jam session
entertaining emphasis
enjoyable christmastime
enjoyable girlfriend
very entertaining ride
entertaining night
enter

endless hype
frequent requests
regular viewing
occasional glances
endless recounting
frequent disagreements
occasional pouty
occasional friends
meandering succession
occasional boob
occasional bouts

Printing examples for cluster 270:
complex piece
penns piece
s piece
sleazy piece
joyful piece
simon piece
genuine piece
refreshing piece
deft piece
pathetic piece
completely braindead piece
delightful piece
unfinished note
sharp piece
reprehensible piece
outstanding pieces
moving pieces
whole piece
pretty straightforward piece
depressing piece
symphonic piece
delirious piece
now inspired piece
unsatisfying piece
upper piece
such piece
braindead piece
photographed piece
confusing piece
fluffy piece

Printing examples for cluster 442:
kiddy flick eh
offensive chick flick
absolutely putrid slasher
other slasher flick
good slasher flick
albert pyun flick
interesting flicks
first slasher
average slasher flick
teen exploitation flick
sadly routine slasher
slasher fare
many slasher
scary creatur

birdie fan club
dedicated fan base
huge travolta fan
keen movie fan
sinatra fan
huge snl fan
scifi movie fan
big carrey fan
inflexible nitpicking fan
hard lynch fan
much fan
true combs fan
good fan
huge grisham fan

Printing examples for cluster 246:
criminal law
criminal figure
greedy criminal
past criminal
criminal career
criminal oddball couple
gossip corruption
criminal treatment
criminal stakeouts
repressed criminal mindset
enigmatic criminal
disgusting criminal
criminal plots
complex corruption
criminal whos image
lowkey crime
extraordinary crime
unforgettable crime
portray computer crime
small time criminal
criminal activities
criminal intent fan
forensic investigation
criminal mastermind
heinous crimes
same crime
juvenile smear
criminal fascination
criminal psychology
criminal record

Printing examples for cluster 267:
rocky relationship
respective relationships
wonderful relationships
friendly rivalry
incestuous relationship
tenuous connection
unconvincing group relationship
p

explosive hitler
nazi witch master
nazi history treasure
pro hitler rally
nazi hate
nazi storm
former nazis
nazi concentration
nazi career
hitler youth
nazi history underwater
future fascists
wwii enemies
nazi yelling
nazi time
famous hitler
pro hitler
nazi juggernaut
nazi website
fascist rally
crazed wwii

Printing examples for cluster 204:
plus size lady
native lady
lady ada
lady pooja gandhi
fortunately phantom lady
wicked lady skelton
great lady
lady won
lady deadlock
new lady
lady mona barrie
lady friend
effective lady
lady death herself
desperate lady
famous lady
old crazy lady
strong lady
young lady jewel
creepy lady
remarkable lady
leading lady
uncanny lady
lady mona
first lady
scary lunch lady
first lady julia
sophisticated lady
prime lady
lady caroline

Printing examples for cluster 345:
heavy resemblance
heavy losses
heavy storyline
heavy cross
heavy typical
heavy storm i
heavy day
heavy narcotics
heavy criticism
heavy self
torrential rainfall
heavy drugs
heavy dramas
heavy 

yes william h
william r
terrific william forsythe

Printing examples for cluster 462:
once controversial adaptation
realistic duh adaptation
gory adaptation
ok adaptation
proper sequel
faithful adaptation
masterful adaptation
novel adaptation
horrid adaptation
usher adaptations
adapted courtesy
screen adaptation
true adaptation
cinematic adaptation music
accurate adaptation
flat adaptation
wilfully bizarre adaptation
appalling adaptation
fabulous adaptation
delightful miniseries
other adaptation
horrendous adaptation
fine adaptation
noteworthy adaptations
loyal adaptation
essential film adaptation
austen adaption
filmed adaptation
clarke adaptation
watchable adaptation

Printing examples for cluster 436:
aged norman
same age group
new age type
middle age set
new age hokum
early age mj
edge space age
technological age
vincent age
nephews age
last ages
quiet age
rather young age
early age anne
child age
appropriate age
middle age golddigger
golden age science
new age
minimum age
new age 

IndexError: list index out of range

In [28]:
DIRECTORY = '/mnt/c/Users/gvs/ubuntu/neural-review-summarization/output/'
KMEANS_MODEL_FILE = "k_means_model.pkl"

In [32]:
import pickle

with open(DIRECTORY + KMEANS_MODEL_FILE, 'wb') as model_file:
    pickle.dump(kmeans, model_file, protocol=pickle.HIGHEST_PROTOCOL)

In [87]:
test_df = pd.DataFrame.from_csv("../data/test.tsv", sep='\t', encoding='latin-1')

  """Entry point for launching an IPython kernel.


In [88]:
test_all_grams = []
for i in tqdm(range(len(test_df))):
    extend_all_grams(test_df.iloc[i]['review'], test_all_grams)

100%|██████████| 31/31 [00:00<00:00, 36.19it/s]


In [92]:
test_counts = Counter(test_all_grams)
print(test_counts.most_common(20))

[('bad movie', 6), ('special effects', 4), ('i dont', 4), ('whoopi goldberg', 4), ('dinosaur detective', 3), ('futuristic scenes', 2), ('entire length', 2), ('jurassic park', 2), ('weird character', 2), ('dinosaurhuman detective', 2), ('bad comedian', 2), ('few films', 2), ('expensive directtovideo', 2), ('lame cop comedy', 2), ('awful lines', 2), ('lame cop', 2), ('bad impersonator', 2), ('huge ripoff', 2), ('buddy cop comedy', 2), ('really bad movie', 2)]


In [93]:
with open(DIRECTORY + KMEANS_MODEL_FILE, 'rb') as model_file:
    kmeans = pickle.load(model_file)

In [107]:
manual_phrases = ['fundamental problems', 'unrelenting tedium', 'most horrible blockbuster']

vectorized_phrases = []
test_labels = []

for phrase in test_all_grams:
    vectorized_phrase = vectorize_phrase(phrase)
    if vectorized_phrase is not None:
        vectorized_phrases.append(vectorized_phrase)
        test_labels.append(phrase)

In [108]:
test_predictions = kmeans.predict(vectorized_phrases)
print(test_predictions)

[343 426 563 210 389 328 210 210 205 425 505 497 163 319 329  48 228  69
 217 233 199 539  16 580 100 417 163 231  17 555   7 531 522 144 170 206
 108 131  45  12 569 229   7 161  58 142 280 342 100  89 302 166  74 322
 446 166 100 531 175 557 366 105 118  70  12 147 555 217 425  38 410 118
 531  12 190 140  77 317 216 332 329 411 100 146 556 374 233 187 216 247
  60 491  58 474 410 389 123 425 100 294 205  65 483 265  85 413 104 225
 250 239 562 496 459  23 397 272 506 564 265 515 214 558 316 557 556 100
 224 497 200 313 199 393 224 240 594 224 556 100 200 553 187 243 122  60
 268 487 479 359 122 471 107 147 423  87 332 141  29 285 307 250 199 556
 272  14 555   2 254  13 255  39 517   5 122 111  95 444 433 427 291 547
 199 244 224  12  77 555  29 492 446 479   9 580 216 410 342 325  13 495
 307 479  30 413  60 580 226 100 226 250 500 427  14 547  77 265 578 593
 373 189 124 555 555 373 593 124 343 426 563 210 389 328 210 210 205 425
 505 497 163 319 329  48 228 100  69 217 233 199 53

In [112]:
Counter(test_predictions).most_common(30)

[(100, 12),
 (555, 11),
 (210, 9),
 (497, 9),
 (580, 7),
 (479, 7),
 (199, 6),
 (250, 6),
 (495, 6),
 (142, 5),
 (425, 5),
 (12, 4),
 (556, 4),
 (77, 4),
 (101, 4),
 (118, 4),
 (122, 4),
 (163, 4),
 (217, 4),
 (224, 4),
 (265, 4),
 (410, 4),
 (433, 4),
 (313, 4),
 (16, 3),
 (531, 3),
 (23, 3),
 (539, 3),
 (29, 3),
 (547, 3)]

In [113]:
good_clusters = {100, 210, 228, 265, 313, 328, 389, 417, 426, 563, 142, 12}
bad_clusters = {199, 329, 425, 497, 517, 555, 580, 479, 250, 495}
filtered_labels = []

for i in range(len(test_predictions)):
    test_prediction = test_predictions[i]
    
    if test_prediction in good_clusters:
        filtered_labels.append(test_labels[i])

test_labels_count = Counter(filtered_labels)
print(test_labels_count)

Counter({'bad movie': 6, 'special effects': 4, 'bad films': 2, 'lame cop comedy': 2, 'bad buddy': 2, 'bad comedian': 2, 'weird character': 2, 'few films': 2, 'really bad movie': 2, 'awful lines': 2, 'bad impersonator': 2, 'futuristic scenes': 2, 'good idea': 2, 'lame cop': 2, 'bad guys': 2, 'amazing movie': 1, 'horrible movie': 1, 'spielberg look trite': 1, 'good jokes': 1, 'good reasons': 1, 'good category': 1, 'spielberg tv': 1, 'jurassic park tv': 1, 'spielberg tv show': 1, 'bad dinosaur': 1, 'particularly bad episode': 1, 'porn scenes': 1, 'spielberg look': 1, 'bad taste': 1, 'cleverness comedy': 1, 'gags lame': 1, 'acclaimed tv': 1})


In [60]:
data_df = pd.DataFrame.from_csv("../data/labeledTrainData.tsv", sep='\t', header=0)

  """Entry point for launching an IPython kernel.


In [83]:
ids = data_df.index.values
print(ids[:10])

split_ids = []
for id in ids:
    split_id = str(id).split('_')
    split_ids.append(split_id[0])
    
id_counts = Counter(split_ids)

print(id_counts.most_common(1000))

['5814_8' '2381_9' '7759_3' '3630_4' '9495_8' '8196_8' '7166_2' '10633_1'
 '319_1' '8713_10']
[('2952', 2), ('3624', 2), ('3848', 2), ('4739', 2), ('12247', 2), ('5328', 2), ('3385', 2), ('8746', 2), ('7776', 2), ('6126', 2), ('12384', 2), ('2267', 2), ('178', 2), ('4250', 2), ('2353', 2), ('5462', 2), ('477', 2), ('2687', 2), ('242', 2), ('2795', 2), ('10482', 2), ('10536', 2), ('2515', 2), ('12344', 2), ('6067', 2), ('6867', 2), ('12219', 2), ('6279', 2), ('8046', 2), ('2790', 2), ('3148', 2), ('6125', 2), ('1501', 2), ('1535', 2), ('2986', 2), ('529', 2), ('3517', 2), ('54', 2), ('162', 2), ('8059', 2), ('7011', 2), ('12472', 2), ('794', 2), ('6077', 2), ('1126', 2), ('7125', 2), ('8944', 2), ('11232', 2), ('5804', 2), ('309', 2), ('8605', 2), ('1433', 2), ('9893', 2), ('7443', 2), ('10591', 2), ('11530', 2), ('9907', 2), ('10974', 2), ('6950', 2), ('7110', 2), ('9515', 2), ('4196', 2), ('10669', 2), ('1457', 2), ('9672', 2), ('5599', 2), ('5021', 2), ('12322', 2), ('5093', 2), ('19

In [247]:
print("Top terms per cluster:")
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
print(kmeans.cluster_centers_.)
for i in range(k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % meaning_labels[ind]),

SyntaxError: invalid syntax (<ipython-input-247-445f186ab7e3>, line 3)