In [1]:
import gensim
import logging
import os
import re
import string
import itertools
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim.parsing.preprocessing import STOPWORDS
import numpy as np

from gensim import corpora, models, similarities



In [15]:
corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
           [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
           [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
           [(0, 1.0), (4, 2.0), (7, 1.0)],
           [(3, 1.0), (5, 1.0), (6, 1.0)],
           [(9, 1.0)],
           [(9, 1.0), (10, 1.0)],
           [(9, 1.0), (10, 1.0), (11, 1.0)],
           [(8, 1.0), (10, 1.0), (11, 1.0)]]

tfidf = models.TfidfModel(corpus)
vec = [(0, 1), (4, 1)]
tfidf[vec]

2016-12-03 23:29:32,484 : INFO : collecting document frequencies
2016-12-03 23:29:32,484 : INFO : PROGRESS: processing document #0
2016-12-03 23:29:32,485 : INFO : calculating IDF weights for 9 documents and 11 features (28 matrix non-zeros)


[(0, 0.8075244024440723), (4, 0.5898341626740045)]

### Strings to Vectors

In [16]:
documents = ["Human machine interface for lab abc computer applications",
              "A survey of user opinion of computer system response time",
              "The EPS user interface management system",
              "System and human system engineering testing of EPS",
              "Relation of user perceived response time to error measurement",
              "The generation of random binary unordered trees",
              "The intersection graph of paths in trees",
              "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]

In [17]:
# remove stopwords
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
        for document in documents]

# remove word that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1]
        for text in texts]

dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)

new_doc = 'Human computer interaction with human human human'
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

2016-12-03 23:29:36,206 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2016-12-03 23:29:36,207 : INFO : built Dictionary(12 unique tokens: ['minors', 'trees', 'graph', 'response', 'survey']...) from 9 documents (total 29 corpus positions)


{'minors': 11, 'trees': 9, 'graph': 10, 'response': 7, 'survey': 3, 'user': 6, 'computer': 2, 'human': 0, 'time': 4, 'system': 5, 'interface': 1, 'eps': 8}
[(0, 4), (2, 1)]


In [18]:
# BagWords
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('test.mm', corpus) # save to file for later use
corpus

2016-12-03 23:29:38,735 : INFO : storing corpus in Matrix Market format to test.mm
2016-12-03 23:29:38,735 : INFO : saving sparse matrix to test.mm
2016-12-03 23:29:38,736 : INFO : PROGRESS: saving document #0
2016-12-03 23:29:38,736 : INFO : saved 9x12 matrix, density=25.926% (28/108)
2016-12-03 23:29:38,737 : INFO : saving MmCorpus index to test.mm.index


[[(0, 1), (1, 1), (2, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(1, 1), (5, 1), (6, 1), (8, 1)],
 [(0, 1), (5, 2), (8, 1)],
 [(4, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(3, 1), (10, 1), (11, 1)]]

### Transformation

In [37]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(1, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.44424552527467476), (6, 0.3244870206138555), (7, 0.3244870206138555)]
[(0, 0.5710059809418182), (6, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(2, 0.49182558987264147), (6, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (4, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(5, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


#### LSI

In [39]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
corpus_lsi = lsi[corpus_tfidf]
lsi.print_topics(2)

[(0,
  u'0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1,
  u'-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"response" + -0.320*"time" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]

In [40]:
for doc in corpus_lsi:
    print(doc)
# lsi.save('file_to_save.lsi')
# lsi.load('file_to_load.lsi')

[(0, 0.066007833960902804), (1, -0.52007033063618502)]
[(0, 0.19667592859142299), (1, -0.76095631677000519)]
[(0, 0.089926399724463077), (1, -0.72418606267525143)]
[(0, 0.075858476521780557), (1, -0.63205515860034334)]
[(0, 0.10150299184979941), (1, -0.57373084830029586)]
[(0, 0.70321089393783165), (1, 0.16115180214025668)]
[(0, 0.87747876731198393), (1, 0.16758906864659256)]
[(0, 0.90986246868185872), (1, 0.14086553628718854)]
[(0, 0.6165825350569285), (1, -0.053929075663894835)]


### Similiarity Queries

In [42]:
doc = 'Human computer interaction'
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
print(vec_lsi) # vec_lsi order by similiarity to vec_bow

[(0, 0.079104751174447582), (1, -0.57328352430794027)]


# Beatles test

In [4]:
folder = './../data/beatles/'
num_topics = 5
import random

translator = str.maketrans({key: None for key in string.punctuation})

def read_data(folder):
    texts, names = [], []
    filenames = filter(lambda it: it.endswith('.txt'), os.listdir(folder))
    for fname in filenames:
        names.append(fname.replace('.txt', ''))
        with open(folder + fname, 'r') as f:
            texts.append(re.sub(r'\[.*\]', '', f.read().replace('\n',' ').strip()))
    return names, texts

def prepare_data(records):
    return [[word for word in record.translate(translator).lower().split() if word not in STOPWORDS]
        for record in records]

def create_lsi(dictionary, corpus, num_topics=5):
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    return lsi_model

def cread_lda(dictionary, corpus, num_topics=5, passes=20, chunksize=2000):
    number_of_topics = 5
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lda_model = models.ldamodel.LdaModel(corpus_tfidf, id2word=dictionary,
                                   num_topics=number_of_topics,
                                   passes=passes,
                                   chunksize=chunksize)
    return lda_model

def get_top_words(lda_model):
    # http://radimrehurek.com/topic_modeling_tutorial/2%20-%20Topic%20Modeling.html
    top_words = [[word for word, __ in lda_model.show_topic(topic, topn=50)] for topic in range(lda_model.num_topics)]
    replacements = []
    for words in top_words:
        yield words

In [5]:
song_names, raw_texts = read_data(folder)
texts = prepare_data(raw_texts)
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

2016-12-03 23:28:14,893 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2016-12-03 23:28:14,906 : INFO : built Dictionary(2141 unique tokens: ['magical', 'believe', 'pop', 'bride', 'called']...) from 186 documents (total 14083 corpus positions)


In [6]:
lsi_model = create_lsi(dictionary, corpus)
for t, topic in lsi_model.print_topics(num_topics):
    print('Topic {0} : {1}'.format(t, topic))

2016-12-03 23:28:17,102 : INFO : collecting document frequencies
2016-12-03 23:28:17,102 : INFO : PROGRESS: processing document #0
2016-12-03 23:28:17,104 : INFO : calculating IDF weights for 186 documents and 2140 features (6169 matrix non-zeros)
2016-12-03 23:28:17,106 : INFO : using serial LSI version on this node
2016-12-03 23:28:17,107 : INFO : updating model with new documents
2016-12-03 23:28:17,117 : INFO : preparing a new chunk of documents
2016-12-03 23:28:17,143 : INFO : using 100 extra samples and 2 power iterations
2016-12-03 23:28:17,143 : INFO : 1st phase: constructing (2141, 105) action matrix
2016-12-03 23:28:17,166 : INFO : orthonormalizing (2141, 105) action matrix
2016-12-03 23:28:17,482 : INFO : 2nd phase: running dense svd on (105, 186) matrix
2016-12-03 23:28:17,604 : INFO : computing the final decomposition
2016-12-03 23:28:17,604 : INFO : keeping 5 factors (discarding 87.926% of energy spectrum)
2016-12-03 23:28:17,605 : INFO : processed documents up to #186
20

Topic 0 : -0.402*"love" + -0.213*"ill" + -0.213*"yeah" + -0.207*"im" + -0.164*"dont" + -0.156*"baby" + -0.150*"know" + -0.144*"girl" + -0.143*"want" + -0.140*"oh"
Topic 1 : -0.707*"love" + 0.168*"yeah" + 0.164*"baby" + 0.161*"john" + -0.133*"need" + -0.132*"whoa" + 0.119*"im" + -0.116*"true" + 0.116*"got" + 0.106*"brian"
Topic 2 : 0.525*"john" + 0.391*"brian" + 0.220*"paul" + 0.184*"love" + 0.174*"yeah" + 0.151*"beatles" + -0.149*"dont" + 0.129*"ha" + 0.115*"play" + -0.108*"let"
Topic 3 : 0.463*"yeah" + -0.345*"girl" + -0.228*"blackbird" + -0.197*"moment" + -0.161*"fly" + -0.154*"john" + -0.140*"waiting" + 0.138*"ill" + 0.137*"gotta" + -0.130*"arise"
Topic 4 : 0.401*"baby" + -0.249*"blackbird" + -0.193*"yeah" + -0.182*"fly" + -0.172*"waiting" + -0.165*"ill" + 0.156*"girl" + -0.151*"night" + -0.150*"moment" + 0.149*"man"


In [7]:
random.seed(42)

lda_model = cread_lda(dictionary, corpus, num_topics, )
for t, top_words in lda_model.print_topics(num_topics=num_topics, num_words=5):
    print("Topic", t, ":", top_words)

2016-12-03 23:28:21,566 : INFO : collecting document frequencies
2016-12-03 23:28:21,566 : INFO : PROGRESS: processing document #0
2016-12-03 23:28:21,568 : INFO : calculating IDF weights for 186 documents and 2140 features (6169 matrix non-zeros)
2016-12-03 23:28:21,570 : INFO : using symmetric alpha at 0.2
2016-12-03 23:28:21,571 : INFO : using symmetric eta at 0.2
2016-12-03 23:28:21,571 : INFO : using serial LDA version on this node
2016-12-03 23:28:21,668 : INFO : running online LDA training, 5 topics, 20 passes over the supplied corpus of 186 documents, updating model once every 186 documents, evaluating perplexity every 186 documents, iterating 50x with a convergence threshold of 0.001000
2016-12-03 23:28:22,096 : INFO : -32.485 per-word bound, 6011414105.4 perplexity estimate based on a held-out corpus of 186 documents with 761 words
2016-12-03 23:28:22,097 : INFO : PROGRESS: pass 0, at document #186/186
2016-12-03 23:28:22,249 : INFO : topic #0 (0.200): 0.004*"john" + 0.003*"l

Topic 0 : 0.004*"john" + 0.003*"love" + 0.003*"brian" + 0.002*"time" + 0.002*"mm"
Topic 1 : 0.006*"love" + 0.004*"want" + 0.003*"yeah" + 0.003*"oh" + 0.003*"ill"
Topic 2 : 0.003*"let" + 0.003*"girl" + 0.002*"baby" + 0.002*"ooh" + 0.002*"better"
Topic 3 : 0.004*"long" + 0.003*"love" + 0.003*"yeah" + 0.003*"im" + 0.003*"know"
Topic 4 : 0.003*"needed" + 0.002*"im" + 0.002*"sun" + 0.002*"dont" + 0.002*"forget"


In [10]:
# Получение темы для конкретного документа
for i in range(20):
    print(song_names[i], lda_model[corpus[i]])

1822! [(0, 0.95238184057863251), (1, 0.011806327241384274), (2, 0.011806038336300608), (3, 0.01205437286871662), (4, 0.011951420974965923)]
A Day In The Life [(0, 0.99166360790163754)]
A Hard Day's Night [(0, 0.87334633608717138), (3, 0.12036482836703136)]
A Little Rhyme [(0, 0.11937755548646665), (3, 0.86702457084998119)]
A Shot Of Rhythm And Blues [(3, 0.99359494701706841)]
A Taste Of Honey [(3, 0.98196429248794947)]
Across The Universe [(3, 0.99397226067866862)]
Act Naturally [(3, 0.99075636526347888)]
Ain't She Sweet [(1, 0.99111079096012977)]
All I've Got To Do [(3, 0.98369702398346603)]
All My Loving [(1, 0.033794861305270964), (2, 0.84789377810861766), (3, 0.11284831260718597)]
All Things Must Pass [(3, 0.98691002643177894)]
All Together Now [(3, 0.98722457253916729)]
All You Need Is Love [(1, 0.99156018291903891)]
And I Love Her [(3, 0.97920966667464626)]
And Your Bird Can Sing [(4, 0.98014722887354611)]
Anna, Go To Him [(1, 0.96877728216327685)]
Another Girl [(1, 0.98868445147

### Other test. Nightwish

In [11]:
folder = './../data/nightwish/'
song_names, raw_texts = read_data(folder)
texts = prepare_data(raw_texts)
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

2016-12-03 23:29:01,602 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2016-12-03 23:29:01,614 : INFO : built Dictionary(1991 unique tokens: ['moonlit', 'believe', 'bride', 'called', 'eternal']...) from 65 documents (total 5430 corpus positions)


In [12]:
lsi_model = create_lsi(dictionary, corpus)
for t, topic in lsi_model.print_topics(num_topics):
    print('Topic {0} : {1}'.format(t, topic))

2016-12-03 23:29:03,909 : INFO : collecting document frequencies
2016-12-03 23:29:03,910 : INFO : PROGRESS: processing document #0
2016-12-03 23:29:03,911 : INFO : calculating IDF weights for 65 documents and 1990 features (3833 matrix non-zeros)
2016-12-03 23:29:03,913 : INFO : using serial LSI version on this node
2016-12-03 23:29:03,914 : INFO : updating model with new documents
2016-12-03 23:29:03,919 : INFO : preparing a new chunk of documents
2016-12-03 23:29:03,921 : INFO : using 100 extra samples and 2 power iterations
2016-12-03 23:29:03,921 : INFO : 1st phase: constructing (1991, 105) action matrix
2016-12-03 23:29:03,923 : INFO : orthonormalizing (1991, 105) action matrix
2016-12-03 23:29:03,974 : INFO : 2nd phase: running dense svd on (105, 65) matrix
2016-12-03 23:29:03,986 : INFO : computing the final decomposition
2016-12-03 23:29:03,986 : INFO : keeping 5 factors (discarding 87.310% of energy spectrum)
2016-12-03 23:29:03,993 : INFO : processed documents up to #65
2016-

Topic 0 : 0.196*"wish" + 0.132*"love" + 0.125*"heart" + 0.121*"world" + 0.110*"time" + 0.108*"come" + 0.107*"oh" + 0.103*"night" + 0.100*"long" + 0.094*"eyes"
Topic 1 : -0.381*"wish" + -0.238*"angel" + -0.167*"miss" + 0.134*"drown" + -0.131*"im" + -0.124*"getting" + -0.124*"colder" + -0.112*"right" + 0.108*"ago" + -0.107*"wings"
Topic 2 : 0.181*"walks" + 0.172*"forever" + 0.171*"heaven" + 0.160*"heart" + -0.147*"right" + 0.144*"walk" + 0.125*"meadows" + -0.125*"crestfallen" + 0.117*"little" + 0.109*"wish"
Topic 3 : 0.219*"heaven" + 0.173*"meadows" + -0.148*"horizon" + -0.130*"world" + 0.127*"cradle" + -0.126*"away" + -0.124*"end" + -0.121*"far" + -0.111*"ago" + -0.109*"long"
Topic 4 : 0.176*"right" + 0.162*"night" + 0.159*"things" + -0.156*"wheres" + 0.153*"fight" + 0.118*"land" + 0.116*"horizon" + 0.115*"lost" + 0.109*"crestfallen" + -0.107*"dont"


In [13]:
random.seed(42)

lda_model = cread_lda(dictionary, corpus, num_topics)
for t, top_words in lda_model.print_topics(num_topics=num_topics, num_words=5):
    print("Topic", t, ":", top_words)

2016-12-03 23:29:06,720 : INFO : collecting document frequencies
2016-12-03 23:29:06,721 : INFO : PROGRESS: processing document #0
2016-12-03 23:29:06,722 : INFO : calculating IDF weights for 65 documents and 1990 features (3833 matrix non-zeros)
2016-12-03 23:29:06,724 : INFO : using symmetric alpha at 0.2
2016-12-03 23:29:06,724 : INFO : using symmetric eta at 0.2
2016-12-03 23:29:06,725 : INFO : using serial LDA version on this node
2016-12-03 23:29:06,774 : INFO : running online LDA training, 5 topics, 20 passes over the supplied corpus of 65 documents, updating model once every 65 documents, evaluating perplexity every 65 documents, iterating 50x with a convergence threshold of 0.001000
2016-12-03 23:29:07,042 : INFO : -50.356 per-word bound, 1441357516219480.0 perplexity estimate based on a held-out corpus of 65 documents with 398 words
2016-12-03 23:29:07,042 : INFO : PROGRESS: pass 0, at document #65/65
2016-12-03 23:29:07,109 : INFO : topic #0 (0.200): 0.002*"angel" + 0.002*"w

Topic 0 : 0.002*"angel" + 0.002*"wish" + 0.001*"nighttime" + 0.001*"queen" + 0.001*"1001"
Topic 1 : 0.002*"slow" + 0.002*"meadows" + 0.002*"heaven" + 0.002*"im" + 0.002*"like"
Topic 2 : 0.002*"little" + 0.002*"shall" + 0.002*"walks" + 0.002*"leave" + 0.001*"memories"
Topic 3 : 0.002*"bye" + 0.002*"world" + 0.002*"end" + 0.002*"long" + 0.002*"phantom"
Topic 4 : 0.002*"wheres" + 0.002*"lost" + 0.002*"killing" + 0.002*"complete" + 0.001*"things"


In [14]:
for topic, words in enumerate(get_top_words(lda_model)):
    print('Toplic {0} : {1}\n'.format(topic, ' '.join(words[:10])))

Toplic 0 : angel wish nighttime queen 1001 philosopher sun nights master beautiful

Toplic 1 : slow meadows heaven im like wish praise blue sky miss

Toplic 2 : little shall walks leave memories forever heart love eva resting

Toplic 3 : bye world end long phantom opera rain drown come wish

Toplic 4 : wheres lost killing complete things got night instead elvenpath dont

