In [1]:
import pandas as pd
from tqdm import tqdm
import regex as re
import nltk
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from nltk import sent_tokenize, word_tokenize

In [2]:
df = pd.DataFrame.from_csv("../data/labeledTrainData.tsv", sep='\t')

  """Entry point for launching an IPython kernel.


In [3]:
len(df)

25000

In [4]:
words = ['It', 'has', 'many', 'permissions']
allowed = {'JJ', 'RB'}

tags = nltk.pos_tag(words)
print(tags)
a = list(filter(lambda x: x[1] in allowed, tags))
fwords, ftags = zip(*a)
print(fwords)
print(ftags)

[('It', 'PRP'), ('has', 'VBZ'), ('many', 'JJ'), ('permissions', 'NNS')]
('many',)
('JJ',)


In [5]:
tags = [tag for word, tag in nltk.pos_tag(words)]
tags

list(filter(lambda x: x in allowed, tags))



['JJ']

In [6]:
all_grams = []
allowed_tags = {'JJ', 'JJ NN', 'JJ NNS', 'JJ NN NN', 'RB JJ NN', 'JJ TO VB'}
banned_words = {'', 'br'}

def filter_banned_words(words, banned_words):
    filtered_words = list(filter(lambda x: x not in banned_words, words))
    
    return filtered_words

def merge_n_grams(ngrams, ntags):
    merged_n_grams = []
    merged_n_tags = []
    
    for ngram, ntag in zip(ngrams, ntags):
        merged_n_grams.append(' '.join(ngram))
        merged_n_tags.append(' '.join(ntag))
        
    return merged_n_grams, merged_n_tags

def filter_by_tags(ngrams, ntags, allowed_tags):
    pos = zip(ngrams, ntags)
    
    filtered_pos = list(filter(lambda x: len(x[0]) > 1 and x[1] in allowed_tags, pos))
    
    # filtered tags and words
    filtered_words = [word for word, tag in filtered_pos]
    filtered_tags = [tag for word, tag in filtered_pos]
    
    return filtered_words, filtered_tags

def extend_all_grams(review, all_grams):
    sentences = sent_tokenize(review)
    
    # pos tagging and filtering
    pos_tags = []
    for sentence in sentences:
        split_words = [re.sub(r"\p{P}+", "", x).lower() for x in sentence.split()]
        words = filter_banned_words(split_words, banned_words)
        pos_tags.extend(nltk.pos_tag(words))
    
    # split words and tags
    words = [word.lower() for word, tag in pos_tags]
    tags = [tag for word, tag in pos_tags]
    
    # creation of unigrams, bigrams and trigrams
    unigrams, utags = filter_by_tags(words, tags, allowed_tags)
    bigrams = zip(words, words[1:])
    bitags = zip(tags, tags[1:])
    trigrams = zip(words, words[1:], words[2:])
    tritags = zip(tags, tags[1:], tags[2:])
    
    mbigrams, mbitags = merge_n_grams(bigrams, bitags)
    fbigrams, fbitags = filter_by_tags(mbigrams, mbitags, allowed_tags)
    mtrigrams, mtritags = merge_n_grams(trigrams, tritags)
    ftrigrams, ftritags = filter_by_tags(mtrigrams, mtritags, allowed_tags)
    
    #all_grams.extend(list(set(unigrams)))
    all_grams.extend(list(set(fbigrams)))
    all_grams.extend(list(set(ftrigrams)))

for i in tqdm(range(len(df))):
    extend_all_grams(df.iloc[i]['review'], all_grams)

100%|██████████| 25000/25000 [06:57<00:00, 59.90it/s]


In [7]:
import string
import regex as re

from nltk import sent_tokenize, word_tokenize

text = "'I wonder how many miles I've fallen by this time?' she said aloud. Are you alive? Hey!"
split_test = [re.sub(r"\p{P}+", "", x).lower() for x in text.split()]
split_test

['i',
 'wonder',
 'how',
 'many',
 'miles',
 'ive',
 'fallen',
 'by',
 'this',
 'time',
 'she',
 'said',
 'aloud',
 'are',
 'you',
 'alive',
 'hey']

In [8]:
from collections import Counter

counts = Counter(all_grams)

In [9]:
counts.most_common(1000)

[('special effects', 929),
 ('i dont', 926),
 ('first time', 637),
 ('<br ><br', 583),
 ('new york', 548),
 ('good movie', 522),
 ('low budget', 516),
 ('same time', 509),
 ('main character', 444),
 ('only thing', 439),
 ('high school', 418),
 ('real life', 415),
 ('main characters', 386),
 ('many people', 382),
 ('long time', 372),
 ('great movie', 347),
 ('whole movie', 345),
 ('bad movie', 329),
 ('many times', 325),
 ('whole thing', 323),
 ('good job', 308),
 ('good film', 302),
 ('other hand', 291),
 ('great film', 276),
 ('good thing', 275),
 ('little bit', 274),
 ('i guess', 267),
 ('other films', 256),
 ('bad guys', 253),
 ('few years', 251),
 ('only reason', 250),
 ('i didnt', 244),
 ('other movies', 223),
 ('first movie', 219),
 ('entire movie', 217),
 ('first film', 214),
 ('bad guy', 214),
 ('great job', 212),
 ('young woman', 210),
 ('first place', 198),
 ('true story', 197),
 ('many years', 195),
 ('whole film', 192),
 ('entire film', 190),
 ('big fan', 186),
 ('other peo

In [10]:
DIRECTORY = '/mnt/c/Users/gvs/ubuntu/neural-review-summarization/output/'
FILE_NAME = 'important_phrases.csv'

In [11]:
df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
df = df.rename(columns={'index':'Phrase', 0:'Count'})

In [12]:
df.to_csv(DIRECTORY + FILE_NAME, encoding='utf-8')

In [13]:
df = pd.DataFrame.from_csv(DIRECTORY + FILE_NAME)

  """Entry point for launching an IPython kernel.


In [14]:
import numpy as np

def load_glove_model(glove_file):
    print("Loading Glove Model")
    with open(glove_file,'r') as f:
        model = {}
        for line in tqdm(f):
            split_line = line.split()
            word = split_line[0]
            embedding = np.array([float(val) for val in split_line[1:]])
            model[word] = embedding
            
    print("Done.",len(model)," words loaded!")
    return model

In [16]:
GLOVE_DIR = '/home/gvs/neural-review-summarization/model/embeddings/'
GLOVE_FILE = 'glove.6B.300d.txt'
glove = load_glove_model(GLOVE_DIR + GLOVE_FILE)

653it [00:00, 6528.03it/s]

Loading Glove Model


400000it [00:45, 8853.95it/s]

Done. 400000  words loaded!





In [17]:
len(glove['hi'])

300

In [18]:
from sklearn.preprocessing import normalize

meanings = []
meaning_labels = []
for i in tqdm(range(len(df))):
    emb_sum = np.zeros(len(glove['hi']))
    
    keys = df.iloc[i]['Phrase'].split()
    
    all_in_glove = True
    
    for key in keys:
        if key not in glove:
            all_in_glove = False
            break
    
    if not all_in_glove:
        continue
    
    
    for key in keys:
        vec = glove[key]
        
        emb_sum += vec
    
    emb_sum /= len(keys)
    emb_sum = normalize(emb_sum[:,np.newaxis], axis=0).ravel()
    
    """
    norm = np.linalg.norm(emb_sum)
    if norm!=0:
        emb_sum /= np.linalg.norm(norm)
    """

    meaning_labels.append(' '.join(keys))
    meanings.append(emb_sum)

print(meanings[0])

100%|██████████| 245713/245713 [01:05<00:00, 3760.46it/s]

[ -1.41665713e-02  -7.72734012e-02   2.08478552e-02  -2.92156377e-02
   3.13777861e-02  -7.66548265e-03  -2.57958371e-02  -8.56748522e-02
  -4.59723725e-04  -3.16541334e-02   1.14547144e-01  -4.10542522e-02
  -4.73876648e-02  -8.49996330e-02  -6.21448990e-02   6.50622976e-02
   7.10261970e-02   7.06106909e-03  -4.06147440e-03   3.53796401e-02
   2.66505332e-02  -1.69215273e-03   3.65235107e-02   6.41104232e-02
  -4.35034098e-02  -4.24444034e-02  -4.94069602e-03  -3.94677949e-02
   9.26794820e-03  -7.39765253e-02  -4.31319366e-02  -1.64628297e-02
  -2.91585930e-02   4.91432348e-03  -2.75782927e-01  -5.85727020e-02
   6.51206866e-02  -3.88107389e-02   5.97618164e-02  -1.74869464e-02
   6.98728754e-02  -7.44516416e-02   5.74230849e-02   5.98402260e-02
  -4.56928441e-02  -2.55781866e-02  -2.88045646e-03   1.01600996e-02
  -1.65850465e-02  -3.19518251e-02   1.78721703e-02  -7.99570384e-02
  -6.32693750e-02   5.78059485e-02   3.32475687e-02   3.02719864e-04
  -7.55660611e-02   3.84772339e-02




In [19]:
(meanings[1]==meanings[2]).all()

False

In [20]:
a = np.array([0,1])
b = np.array([2,3])
(a + b) / 2

array([ 1.,  2.])

In [21]:
import numpy as np
from sklearn.metrics.pairwise import cosine_distances
from scipy import spatial

print(df.iloc[0])
print(df.iloc[5])

print(1 - spatial.distance.cosine(meanings[0], meanings[5]))

Phrase    veteran richard
Count                   1
Name: 0, dtype: object
Phrase    tara reid
Count             2
Name: 5, dtype: object
0.0693620457


In [22]:
from sklearn import cluster

k = 600
kmeans = cluster.KMeans(n_clusters=k, max_iter=300, n_init=5, n_jobs=2, verbose=1)
kmeans.fit_predict(meanings)

Initialization complete
Initialization complete
start iteration
done sorting
end inner loop
start iteration
done sorting
end inner loop
Iteration 0, inertia 116942.207936
start iteration
done sorting
Iteration 0, inertia 116987.271362
start iteration
done sorting
end inner loop
end inner loop
Iteration 1, inertia 109017.011056
start iteration
done sorting
Iteration 1, inertia 109721.616202
start iteration
done sorting
end inner loop
end inner loop
Iteration 2, inertia 106662.027581
start iteration
done sorting
Iteration 2, inertia 107095.909826
start iteration
done sorting
end inner loop
end inner loop
Iteration 3, inertia 105663.805125
start iteration
done sorting
Iteration 3, inertia 105848.278931
start iteration
done sorting
end inner loop
Iteration 4, inertia 105133.370102
start iteration
done sorting
end inner loop
Iteration 4, inertia 105221.304249
start iteration
done sorting
end inner loop
Iteration 5, inertia 104800.680239
start iteration
done sorting
end inner loop
end inner 

start iteration
done sorting
end inner loop
Iteration 51, inertia 103686.815041
start iteration
done sorting
end inner loop
Iteration 51, inertia 103552.420723
start iteration
done sorting
end inner loop
Iteration 52, inertia 103685.971166
start iteration
done sorting
end inner loop
Iteration 52, inertia 103550.019394
start iteration
done sorting
end inner loop
Iteration 53, inertia 103684.965967
start iteration
done sorting
end inner loop
Iteration 53, inertia 103546.858812
start iteration
done sorting
end inner loop
Iteration 54, inertia 103684.155149
start iteration
done sorting
end inner loop
Iteration 54, inertia 103543.241142
start iteration
done sorting
end inner loop
Iteration 55, inertia 103683.475789
start iteration
done sorting
end inner loop
Iteration 55, inertia 103540.234648
start iteration
done sorting
end inner loop
Iteration 56, inertia 103682.906025
start iteration
done sorting
end inner loop
Iteration 56, inertia 103538.060317
start iteration
done sorting
end inner l

Iteration 102, inertia 103507.101463
start iteration
done sorting
end inner loop
Iteration 102, inertia 103640.094349
start iteration
done sorting
end inner loop
Iteration 103, inertia 103507.088273
start iteration
done sorting
end inner loop
Iteration 103, inertia 103637.370941
start iteration
done sorting
end inner loop
Iteration 104, inertia 103507.085064
start iteration
done sorting
end inner loop
Iteration 104, inertia 103635.426972
start iteration
done sorting
end inner loop
Iteration 105, inertia 103507.085064
center shift 0.000000e+00 within tolerance 2.764228e-07
Iteration 105, inertia 103634.40332
start iteration
done sorting
end inner loop
Iteration 106, inertia 103633.75
start iteration
done sorting
end inner loop
Iteration 107, inertia 103633.171167
start iteration
done sorting
end inner loop
Iteration 108, inertia 103632.683902
start iteration
done sorting
end inner loop
Iteration 109, inertia 103632.148821
start iteration
done sorting
end inner loop
Iteration 110, inerti

start iteration
done sorting
Iteration 43, inertia 103491.153361
start iteration
done sorting
end inner loop
end inner loop
Iteration 44, inertia 103486.374371
Iteration 15, inertia 103778.037363
start iteration
start iteration
done sorting
done sorting
end inner loop
end inner loop
Iteration 45, inertia 103482.525352
start iteration
done sorting
Iteration 16, inertia 103757.247687
start iteration
done sorting
end inner loop
end inner loop
Iteration 46, inertia 103479.758049
start iteration
done sorting
end inner loop
Iteration 17, inertia 103735.128557
start iteration
done sorting
end inner loop
Iteration 47, inertia 103476.769126
start iteration
done sorting
end inner loop
Iteration 18, inertia 103713.732456
start iteration
done sorting
end inner loop
Iteration 48, inertia 103473.306987
start iteration
done sorting
end inner loop
Iteration 19, inertia 103696.557049
start iteration
done sorting
end inner loop
Iteration 49, inertia 103468.171793
start iteration
done sorting
end inner l

start iteration
done sorting
end inner loop
Iteration 65, inertia 103530.580288
start iteration
done sorting
end inner loop
Iteration 96, inertia 103404.992473
start iteration
done sorting
end inner loop
Iteration 66, inertia 103530.244674
start iteration
done sorting
end inner loop
Iteration 97, inertia 103404.922696
start iteration
done sorting
end inner loop
Iteration 67, inertia 103529.847791
start iteration
done sorting
end inner loop
Iteration 98, inertia 103404.861855
start iteration
done sorting
end inner loop
Iteration 68, inertia 103529.373682
start iteration
done sorting
end inner loop
Iteration 99, inertia 103404.815846
start iteration
done sorting
end inner loop
Iteration 69, inertia 103528.891735
start iteration
done sorting
end inner loop
Iteration 100, inertia 103404.766598
start iteration
done sorting
end inner loop
Iteration 70, inertia 103528.46486
start iteration
done sorting
end inner loop
Iteration 101, inertia 103404.740205
start iteration
done sorting
end inner 

end inner loop
Iteration 33, inertia 103575.03758
start iteration
done sorting
end inner loop
Iteration 34, inertia 103571.195836
start iteration
done sorting
end inner loop
Iteration 35, inertia 103568.413428
start iteration
done sorting
end inner loop
Iteration 36, inertia 103565.787324
start iteration
done sorting
end inner loop
Iteration 37, inertia 103563.728561
start iteration
done sorting
end inner loop
Iteration 38, inertia 103561.894198
start iteration
done sorting
end inner loop
Iteration 39, inertia 103560.050727
start iteration
done sorting
end inner loop
Iteration 40, inertia 103558.297229
start iteration
done sorting
end inner loop
Iteration 41, inertia 103556.782158
start iteration
done sorting
end inner loop
Iteration 42, inertia 103555.417739
start iteration
done sorting
end inner loop
Iteration 43, inertia 103553.740619
start iteration
done sorting
end inner loop
Iteration 44, inertia 103551.894034
start iteration
done sorting
end inner loop
Iteration 45, inertia 1035

array([ 40, 143, 577, ..., 487, 495, 201], dtype=int32)

In [23]:
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
print(len(meaning_labels))

206578


In [25]:
import numpy as np

for i in range(1000):
    print(meaning_labels[i], labels[i])

veteran richard 40
homegrown texas 143
severe disability 577
fake death 88
tara reid 194
awful medium 514
trite cast 167
new experience 187
affable gas 335
unnecessary plot point 233
unconvincing heist movie 413
hg wells 149
main reasons 542
very brutal labor 423
frequent moments 590
real clinkers 269
good time 142
paul revere 334
earnest job 230
awful animatronics 140
attentive viewer 480
enchanted isle 517
influential entertainers 281
laughable story 505
light escapist entertainment 533
innocent man steve 559
hopeless role 108
so enter sharkboy 459
convincing horror movie 524
potentially exciting conflagration 90
dark boring 355
medical research facility 382
incorrect closet 262
annoying lilt 211
big shootouts 531
real joy 269
somewhat weird story 505
standard technologies 154
old vaudeville 41
bolivian official 19
repetitive entertainment 533
first flies 124
same vicinity 122
own brand 361
several tunes 544
so hidden beneath 569
corinne isnt 536
absolute abc 218
russian immigrant ja

In [26]:
cluster_counts = Counter(labels)

cluster_examples = {}

for i in range(len(labels)):
    if labels[i] not in cluster_examples:
        cluster_examples[labels[i]] = []
    
    cluster_examples[labels[i]].append(meaning_labels[i])

In [27]:
print(cluster_counts.most_common(10))

SHOW = 30

for pair in cluster_counts.most_common(500):
    cluster_label, _ = pair
    
    print("\nPrinting examples for cluster {}:".format(cluster_label))
    for i in range(SHOW):
        print(cluster_examples[cluster_label][i])

[(142, 1645), (547, 1451), (124, 1149), (187, 1143), (77, 1129), (94, 1120), (269, 1116), (308, 1112), (446, 1084), (115, 1068)]

Printing examples for cluster 142:
good time
good tight
good guys
really good ghost
good augury
good arab
good lawyer
fairly good reason
good question
good guest
really good episode
good motives
good thing hamburg
good picture
good hedy
terrific while
good publicity spin
good appliances
good roll
good cameo
good advice wait
solid turns
good heroine
good flair
perfectly good story
good image
good candidate
good luck
good superhero
surprisingly good job

Printing examples for cluster 547:
able to award
little to mask
give birth
ordinary to be
long to possess
similar to mask
little to discuss
good to promote
abusive to show
close to goren
adjacent to adolf
similar to hal
bound to be
much to cassavetes
meant to make
much to give
stellar to begin
worthy to tie
ryan to be
sure to set
own to tack
comfortable to drive
possible to make
much to create
close to unwatch

so im gonna
disagree im
so ridiculous im
im sol
little im
indeed special im
steep im
glad im

Printing examples for cluster 356:
extremely boring film
very prolific film
very fun movie
very lazy movie
very deep film
very slow movie
genuinely beautiful film
enjoyable family film
very erotic original
very difficult film
very gritty movie
very funny movie
very good cinema
very serious movie
really interesting film
only decent film
very few film
nice charming film
rather gloomy film
technically challenging film
historically significant movie
pretty ok film
good film choice
very real film
lovely fun film
very goof film
very confusing movie
very solid film
very puzzling screenplay
very experienced film

Printing examples for cluster 501:
extra checks
considerable amounts
obscene amount
extravagant pay
albeit resources
disproportionate number
extra dimensions
vast degree
spent money
extensive amount
equal worth
extra trailers
maximum dignity
considerable peril
considerable amount
practical ou

non mormons
religious zealot

Printing examples for cluster 217:
eerie people
famous people
necessary people
dysfunctional people
fetish people
whenever people
freaky people
slant people
unstable people
clean people
interesting people
sincere people
snobbish people
ordinary people
elderly people
biting people
smalltown people
korean people
infected ones
believe people
several people
understand people
unlucky people
viennese people
heterosexual people
want people
much people
genius people
maltese people
different people

Printing examples for cluster 259:
psychological study
emotional waters
psychological show
emotional acceptance
emotional gamut
emotional truth
emotional eyes
emotional impact etc
emotional shadings
emotional confrontation
emotional strength
emotional portrayal
emotional clever
emotional thought
emotional level
psychological harassment i
emotional moment
emotional infidelity
emotional viewing
emotional places
psychological battleships
psychological abuse
emotional deter

arrow killings
pesky murder
decent murder mystery
actual rape
manhattan murder
bizarre conspiracy
small child murder
like murder
imaginative murders

Printing examples for cluster 467:
barty felix
angelic max
erich von
neanderthal stephan
alfred kralik
herman shumlin
erich von stroheim
stefan sauk
eva hermann
romy schneider
delicate shadowing max
petra von kant
fw murnau
strangely uwe boll
olan luise rainer
brother karl erik
leopold loeb
uwe boll film
carl ellsworth
max cady
dieter der
von stroheim walsh
kamal hans
felix aylmer
ueli steiger cinematographer
max von
reworked bernard hermann
ole ceasar
max thieriot
carl reiner

Printing examples for cluster 485:
male sex addict
gorgeous sex
sex appeal
goodhearted sex bomb
like funky sex
visceral sex
true life sex
underage sex
later sex
extended sex
glamour celebrity sex
much sex appeal
terrible sex comedy
demented sex
heterosexual sex
incomprehensible sex
big sex scene
only sex scene
clown sex
sex vulgarity
sleazy sex scene
fake sex
nonse

same basic
basic meaning
basic credibility
basic monster
basic relativity
basic sort
basic training brigade
basic continuation
very basic fantasy
basic ait
basic sweetness
basic i
basic blasé nick

Printing examples for cluster 258:
amazing documentary saw
typical nature documentary
genuinely disturbing documentary
equally stunning documentary
documentary czech dream
safari documentaries
simple documentary
insightful documentary
actually documentary kasi
boring history documentary
narrative documentary
documentary video
photographed documentaries
not documentary formate
very emotional documentary
documentary czech
documentary tour
very engaging documentary
documentary set
called documentary
documentary hits
documentary section
documentary watch
uk documentary
awkward documentary
documentary quality
actual documentary
biographical documentaries
documentary laird
documentary fog

Printing examples for cluster 530:
plus side toni
eating side
shadow side
lovable sides
so talented side
west

technical marvels
technical department
technological breakthroughs
technical credits
purely technical point
technical element
technical gadgets
technical advisor
technical flourishes
technical devices
technical problems
technological supremacy
technical work
technical merit
technical mistakes
technological achievement
technical medium
technical review
technical aspects
technical virtuosity

Printing examples for cluster 185:
actual zombie
zombie honeymoon
overall zombie
nubile zombie
rob zombies
huge zombie film
scandanavian zombie flick
cannibalistic zombie
zombie doom
zombie scene
incompetent zombie film
apocalyptic zombie film
nubile zombie sonja
unintelligent zombie movie
violent zombie
recommend zombie nation
derivative zombie
weird zombie hunter
same zombies
animated zombie
little zombie
underwater zombie attack
zombie films
third zombie
mega zombie
whole zombie thing
homicidal zombie
whole zombie
recent zombie plague
severed zombie

Printing examples for cluster 529:
criminal pa

In [247]:
print("Top terms per cluster:")
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
print(kmeans.cluster_centers_.)
for i in range(k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % meaning_labels[ind]),

SyntaxError: invalid syntax (<ipython-input-247-445f186ab7e3>, line 3)