In [1]:
import gensim
import logging
import os
import re
import string
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim import corpora, models, similarities

In [2]:
corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
           [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
           [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
           [(0, 1.0), (4, 2.0), (7, 1.0)],
           [(3, 1.0), (5, 1.0), (6, 1.0)],
           [(9, 1.0)],
           [(9, 1.0), (10, 1.0)],
           [(9, 1.0), (10, 1.0), (11, 1.0)],
           [(8, 1.0), (10, 1.0), (11, 1.0)]]

tfidf = models.TfidfModel(corpus)
vec = [(0, 1), (4, 1)]
tfidf[vec]

[(0, 0.8075244024440723), (4, 0.5898341626740045)]

### Strings to Vectors

In [9]:
documents = ["Human machine interface for lab abc computer applications",
              "A survey of user opinion of computer system response time",
              "The EPS user interface management system",
              "System and human system engineering testing of EPS",
              "Relation of user perceived response time to error measurement",
              "The generation of random binary unordered trees",
              "The intersection graph of paths in trees",
              "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]

In [27]:
# remove stopwords
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
        for document in documents]

# remove word that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1]
        for text in texts]

dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)

new_doc = 'Human computer interaction with human human human'
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

{u'minors': 11, u'graph': 10, u'system': 6, u'trees': 9, u'eps': 8, u'computer': 1, u'survey': 5, u'user': 7, u'human': 2, u'time': 4, u'interface': 0, u'response': 3}
[(1, 1), (2, 4)]


In [32]:
# BagWords
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('test.mm', corpus) # save to file for later use
corpus

[[(0, 1), (1, 1), (2, 1)],
 [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(0, 1), (6, 1), (7, 1), (8, 1)],
 [(2, 1), (6, 2), (8, 1)],
 [(3, 1), (4, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(5, 1), (10, 1), (11, 1)]]

### Transformation

In [37]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(1, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.44424552527467476), (6, 0.3244870206138555), (7, 0.3244870206138555)]
[(0, 0.5710059809418182), (6, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(2, 0.49182558987264147), (6, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (4, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(5, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


#### LSI

In [39]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
corpus_lsi = lsi[corpus_tfidf]
lsi.print_topics(2)

[(0,
  u'0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1,
  u'-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"response" + -0.320*"time" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]

In [40]:
for doc in corpus_lsi:
    print(doc)
# lsi.save('file_to_save.lsi')
# lsi.load('file_to_load.lsi')

[(0, 0.066007833960902804), (1, -0.52007033063618502)]
[(0, 0.19667592859142299), (1, -0.76095631677000519)]
[(0, 0.089926399724463077), (1, -0.72418606267525143)]
[(0, 0.075858476521780557), (1, -0.63205515860034334)]
[(0, 0.10150299184979941), (1, -0.57373084830029586)]
[(0, 0.70321089393783165), (1, 0.16115180214025668)]
[(0, 0.87747876731198393), (1, 0.16758906864659256)]
[(0, 0.90986246868185872), (1, 0.14086553628718854)]
[(0, 0.6165825350569285), (1, -0.053929075663894835)]


### Similiarity Queries

In [42]:
doc = 'Human computer interaction'
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
print(vec_lsi) # vec_lsi order by similiarity to vec_bow

[(0, 0.079104751174447582), (1, -0.57328352430794027)]


# Beatles test
TODO: Remove stopwords

In [3]:
folder = './../data/beatles/'
num_topics = 5
import random

def read_data(folder):
    texts, names = [], []
    filenames = filter(lambda it: it.endswith('.txt'), os.listdir(folder))
    for fname in filenames:
        names.append(fname.replace('.txt', ''))
        with open(folder + fname, 'r') as f:
            texts.append(re.sub(r'\[.*\]', '', f.read().replace('\n',' ').strip()))
    return names, texts

def prepare_data(records):
    return [[word for word in record.translate(None, string.punctuation).lower().split()]
        for record in records]

def create_lsi(dictionary, corpus, num_topics=5):
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    return lsi_model

def cread_lda(dictionary, corpus, num_topics=5, passes=20):
    number_of_topics = 5
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lda_model = models.ldamodel.LdaModel(corpus_tfidf, id2word=dictionary,
                                   num_topics=number_of_topics,
                                   passes=passes)
    return lda_model

In [4]:
song_names, raw_texts = read_data(folder)
texts = prepare_data(raw_texts)
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [6]:
lsi_model = create_lsi(dictionary, corpus)
for t, topic in lsi_model.print_topics(num_topics):
    print('Topic {0} : {1}'.format(t, topic))

Topic 0 : 0.234*"love" + 0.165*"i" + 0.139*"im" + 0.138*"you" + 0.135*"ill" + 0.129*"me" + 0.123*"she" + 0.119*"yeah" + 0.117*"her" + 0.117*"be"
Topic 1 : -0.609*"love" + 0.231*"back" + 0.215*"john" + 0.154*"brian" + 0.120*"get" + -0.115*"need" + -0.113*"true" + 0.110*"he" + -0.109*"ill" + 0.108*"paul"
Topic 2 : 0.328*"she" + -0.294*"john" + 0.290*"her" + -0.211*"brian" + 0.198*"shes" + -0.138*"needed" + 0.120*"does" + 0.120*"down" + -0.113*"love" + -0.113*"paul"
Topic 3 : 0.241*"she" + 0.224*"blackbird" + 0.222*"her" + 0.185*"john" + 0.176*"love" + 0.165*"moment" + 0.158*"fly" + -0.151*"baby" + 0.150*"girl" + -0.144*"want"
Topic 4 : 0.320*"blackbird" + -0.307*"yeah" + -0.263*"love" + -0.234*"john" + 0.231*"fly" + 0.230*"moment" + -0.203*"brian" + 0.198*"waiting" + 0.183*"arise" + 0.127*"only"


In [8]:
random.seed(42)

lda_model = cread_lda(dictionary, corpus, num_topics)
for t, top_words in lda_model.print_topics(num_topics=num_topics, num_words=5):
    print("Topic", t, ":", top_words)

('Topic', 0, ':', u'0.002*i + 0.002*yeah + 0.002*long + 0.002*ah + 0.002*be')
('Topic', 1, ':', u'0.002*christmas + 0.002*goodbye + 0.002*aw + 0.002*harp + 0.002*honey')
('Topic', 2, ':', u'0.003*i + 0.002*what + 0.002*girl + 0.002*her + 0.002*get')
('Topic', 3, ':', u'0.006*love + 0.003*dont + 0.003*you + 0.003*oh + 0.003*i')
('Topic', 4, ':', u'0.003*baby + 0.003*cry + 0.002*no + 0.002*gonna + 0.002*im')


In [9]:
# Получение темы для конкретного документа
for i in range(20):
    print(song_names[i], lda_model[corpus[i]])

('1822!', [(1, 0.52211132048712006), (3, 0.45661616145185091)])
('A Day In The Life', [(0, 0.93234032464485683), (3, 0.064800838710947758)])
("A Hard Day's Night", [(2, 0.59639866382676765), (3, 0.401130343938853)])
('A Little Rhyme', [(2, 0.84725434711504188), (3, 0.14571822990448782)])
('A Shot Of Rhythm And Blues', [(2, 0.66083510545051505), (3, 0.33722872941850446)])
('A Taste Of Honey', [(0, 0.99277053712726293)])
('Across The Universe', [(2, 0.030486131292410516), (4, 0.96649820511462992)])
('Act Naturally', [(3, 0.47476828978075786), (4, 0.52233039100101319)])
("Ain't She Sweet", [(2, 0.71058409092586106), (3, 0.28601270684145963)])
("All I've Got To Do", [(2, 0.24092343960557047), (3, 0.75527546340440466)])
('All My Loving', [(3, 0.46507086697440275), (4, 0.53085947506280862)])
('All Things Must Pass', [(3, 0.20633866267931289), (4, 0.78962031495826934)])
('All Together Now', [(0, 0.99638830713218329)])
('All You Need Is Love', [(3, 0.9964525033033127)])
('And I Love Her', [(3,