In [1]:
import gensim
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim import corpora, models, similarities

In [2]:
corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
           [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
           [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
           [(0, 1.0), (4, 2.0), (7, 1.0)],
           [(3, 1.0), (5, 1.0), (6, 1.0)],
           [(9, 1.0)],
           [(9, 1.0), (10, 1.0)],
           [(9, 1.0), (10, 1.0), (11, 1.0)],
           [(8, 1.0), (10, 1.0), (11, 1.0)]]

tfidf = models.TfidfModel(corpus)
vec = [(0, 1), (4, 1)]
tfidf[vec]

[(0, 0.8075244024440723), (4, 0.5898341626740045)]

### Strings to Vectors

In [9]:
documents = ["Human machine interface for lab abc computer applications",
              "A survey of user opinion of computer system response time",
              "The EPS user interface management system",
              "System and human system engineering testing of EPS",
              "Relation of user perceived response time to error measurement",
              "The generation of random binary unordered trees",
              "The intersection graph of paths in trees",
              "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]

In [27]:
# remove stopwords
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
        for document in documents]

# remove word that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1]
        for text in texts]

dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)

new_doc = 'Human computer interaction with human human human'
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

{u'minors': 11, u'graph': 10, u'system': 6, u'trees': 9, u'eps': 8, u'computer': 1, u'survey': 5, u'user': 7, u'human': 2, u'time': 4, u'interface': 0, u'response': 3}
[(1, 1), (2, 4)]


In [32]:
# BagWords
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('test.mm', corpus) # save to file for later use
corpus

[[(0, 1), (1, 1), (2, 1)],
 [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(0, 1), (6, 1), (7, 1), (8, 1)],
 [(2, 1), (6, 2), (8, 1)],
 [(3, 1), (4, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(5, 1), (10, 1), (11, 1)]]

### Transformation

In [37]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(1, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.44424552527467476), (6, 0.3244870206138555), (7, 0.3244870206138555)]
[(0, 0.5710059809418182), (6, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(2, 0.49182558987264147), (6, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (4, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(5, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


#### LSI

In [39]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
corpus_lsi = lsi[corpus_tfidf]
lsi.print_topics(2)

[(0,
  u'0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1,
  u'-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"response" + -0.320*"time" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]

In [40]:
for doc in corpus_lsi:
    print(doc)
# lsi.save('file_to_save.lsi')
# lsi.load('file_to_load.lsi')

[(0, 0.066007833960902804), (1, -0.52007033063618502)]
[(0, 0.19667592859142299), (1, -0.76095631677000519)]
[(0, 0.089926399724463077), (1, -0.72418606267525143)]
[(0, 0.075858476521780557), (1, -0.63205515860034334)]
[(0, 0.10150299184979941), (1, -0.57373084830029586)]
[(0, 0.70321089393783165), (1, 0.16115180214025668)]
[(0, 0.87747876731198393), (1, 0.16758906864659256)]
[(0, 0.90986246868185872), (1, 0.14086553628718854)]
[(0, 0.6165825350569285), (1, -0.053929075663894835)]


### Similiarity Queries

In [42]:
doc = 'Human computer interaction'
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
print(vec_lsi) # vec_lsi order by similiarity to vec_bow

[(0, 0.079104751174447582), (1, -0.57328352430794027)]


# Beatles test
TODO: Remove stopwords

In [17]:
import os
import re
import string

songs = []
folder = './../data/beatles/'

filenames = filter(lambda it: it.endswith('.txt'), os.listdir(folder))

for name in filenames:
    with open(folder + name, 'r') as f:
        songs.append(re.sub(r'\[.*\]', '', f.read().replace('\n',' ').strip()))

In [23]:
texts = [[word for word in song.translate(None, string.punctuation).lower().split()]
        for song in songs]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

[(0,
  u'0.234*"love" + 0.165*"i" + 0.139*"im" + 0.138*"you" + 0.135*"ill" + 0.129*"me" + 0.123*"she" + 0.119*"yeah" + 0.118*"her" + 0.117*"be"'),
 (1,
  u'-0.599*"love" + 0.232*"back" + 0.225*"john" + 0.163*"brian" + 0.119*"get" + -0.115*"ill" + -0.114*"need" + -0.111*"true" + 0.110*"he" + 0.110*"paul"')]

In [35]:
number_of_topics = 5
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=number_of_topics)
lsi.print_topics(number_of_topics)

[(0,
  u'0.234*"love" + 0.165*"i" + 0.139*"im" + 0.139*"you" + 0.135*"ill" + 0.129*"me" + 0.123*"she" + 0.119*"yeah" + 0.118*"her" + 0.117*"be"'),
 (1,
  u'-0.605*"love" + 0.233*"back" + 0.219*"john" + 0.152*"brian" + 0.122*"get" + -0.115*"true" + -0.114*"need" + -0.112*"ill" + 0.108*"paul" + 0.108*"he"'),
 (2,
  u'-0.355*"she" + -0.307*"her" + 0.269*"john" + -0.206*"shes" + 0.193*"brian" + 0.151*"needed" + -0.125*"does" + 0.117*"someone" + -0.117*"girl" + -0.109*"down"'),
 (3,
  u'-0.241*"john" + -0.227*"love" + -0.214*"she" + -0.198*"her" + -0.193*"blackbird" + -0.184*"brian" + 0.154*"baby" + 0.152*"want" + -0.145*"moment" + 0.143*"cry"'),
 (4,
  u'0.342*"blackbird" + -0.316*"yeah" + -0.254*"love" + 0.247*"fly" + 0.240*"moment" + -0.217*"john" + 0.207*"waiting" + 0.196*"arise" + -0.181*"brian" + 0.136*"only"')]

In [47]:
number_of_topics = 5
lda = models.ldamodel.LdaModel(corpus_tfidf, id2word=dictionary,
                               num_topics=number_of_topics,
                               passes=20)


for t, top_words in lda.print_topics(num_topics=number_of_topics, num_words=5):
    print "Topic", t, ":", top_words

Topic 0 : 0.002*bird + 0.002*bye + 0.002*chains + 0.002*la + 0.002*birthday
Topic 1 : 0.002*nah + 0.002*beep + 0.002*coo + 0.002*dr + 0.002*robert
Topic 2 : 0.005*love + 0.004*i + 0.004*you + 0.004*im + 0.004*me
Topic 3 : 0.002*harp + 0.002*dizzy + 0.002*mystery + 0.002*ussr + 0.002*roll
Topic 4 : 0.002*commonwealth + 0.002*da + 0.002*tripper + 0.002*evrything + 0.002*bop


In [61]:
# Получение темы для конкретного документа

for i in range(20):
    print(filenames[i].replace('.txt', ''), lda[corpus[i]])

('1822!', [(2, 0.7134855435191424), (3, 0.265731891494497)])
('A Day In The Life', [(2, 0.9139578570388539), (3, 0.083207853305271903)])
("A Hard Day's Night", [(2, 0.9967464099738873)])
('A Little Rhyme', [(2, 0.99079081909550637)])
('A Shot Of Rhythm And Blues', [(1, 0.089610437386146657), (2, 0.90847717317027243)])
('A Taste Of Honey', [(2, 0.99291513980013879)])
('Across The Universe', [(1, 0.35699346864690573), (2, 0.64003536173678643)])
('Act Naturally', [(2, 0.91549908443542261), (3, 0.081635333131889726)])
("Ain't She Sweet", [(0, 0.060636108469957144), (2, 0.93598685491024902)])
("All I've Got To Do", [(2, 0.99499850721885164)])
('All My Loving', [(2, 0.99466479547493669)])
('All Things Must Pass', [(2, 0.99469455442273791)])
('All Together Now', [(1, 0.15317303864486695), (2, 0.84415120268542987)])
('All You Need Is Love', [(2, 0.99648810922580078)])
('And I Love Her', [(2, 0.9923777251634488)])
('And Your Bird Can Sing', [(2, 0.99254278562079168)])
('Anna, Go To Him', [(2, 0