In [1]:
import nltk
import pymorphy2
import os
import codecs
import cPickle
import numpy as np
import time
import gensim

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [2]:
%%javascript
// Fetch and run an external helper script; presumably it fills the "toc" div
// rendered below with a table of contents (confirm against the script itself).
// NOTE(review): the script is downloaded from a third-party host on every run —
// consider vendoring a pinned copy.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

# Base initializations

In [3]:
# Folders and files
__DATA_FOLDER = os.path.join('data', 'Computerra_txt') # Folder with raw computerra corpus
__PICKLE_FOLDER = os.path.join('data', 'pickle') # Root folder for pickled artifacts
__CLEAR_CORPUS_FOLDER = os.path.join(__PICKLE_FOLDER, 'computerra') # Folder with serialized clear corpus
__MODELS_FOLDER = 'models' # Folder with trained models
__MODEL_FILE = os.path.join(__MODELS_FOLDER, 'd2v_computerra.model')

# Normalize options
__NORMALIZE_CORPUS = 1 # truthy - always re-run normalization, falsy - reuse the pickled corpus if present

# Doc2vec params
__LEARN_DOC2VEC = 1 # truthy - always retrain, falsy - load the saved model if present
__SPACE_SIZE = 500 # passed to Doc2Vec as `size`
__THREADS = 2 # passed to Doc2Vec as `workers`
__WINDOW = 5 # passed to Doc2Vec as `window`
LabeledSentence = gensim.models.doc2vec.LabeledSentence

# Random Seed
__RND_SEED = 1
# The seed is handed to Doc2Vec later on, but the corpus shuffling and the
# random document pick in the demo use numpy's global generator, so it has to
# be seeded too for a re-run to repeat the same sequence.
np.random.seed(__RND_SEED)

# Tokenization options
morph = pymorphy2.MorphAnalyzer()
word_pattern = u'(?u)\w+'
tokenizer = RegexpTokenizer(word_pattern)
# NOTE(review): the English punkt model is loaded although the corpus is read
# as cp1251 and lemmatized with pymorphy2 - confirm this is intended.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Make dirs (makedirs also creates missing parent folders)
for folder in (__PICKLE_FOLDER, __CLEAR_CORPUS_FOLDER, __MODELS_FOLDER):
    if not os.path.exists(folder):
        os.makedirs(folder)

# Functions and classes

In [None]:
def normalize_text(text):
    '''
    Function lemmatizes text content sentence by sentence

    The text is split into sentences and every sentence into word tokens.
    Tokens made of digits only are dropped, the remaining ones are replaced
    with the normal form of their first morphological parse.

    Args:
    text - string with unnormalized content

    Returns:
    normalized_text - string with normalized content; lemmas are separated
    by single spaces, sentences by '. '
    '''
    lemmatized_sentences = []

    for raw_sentence in sentence_tokenizer.tokenize(text.strip()):
        lemmas = [morph.parse(word)[0].normal_form
                  for word in tokenizer.tokenize(raw_sentence.strip())
                  if not word.isdigit()]
        # a sentence without any kept token leaves no trace in the output
        if lemmas:
            lemmatized_sentences.append(' '.join(lemmas).strip())

    return '. '.join(lemmatized_sentences).strip()


def load_computerra_corpus(filepath):
    '''
    Function loads computerra document corpus

    Walks `filepath` recursively and reads every `.txt` file as cp1251 text.
    The file content is split on runs of 75 '=' characters; the first two
    parts and the last one are dropped, the rest is joined with '. '.
    Documents whose content holds fewer than 10 spaces are skipped.
    Folders and files are visited in sorted order, so the result does not
    depend on the listing order of the file system.

    Args:
    filepath - path to corpus

    Returns:
    titles - list of paths of the loaded documents (used as their titles)
    docs - list of documents content, docs[i] belongs to titles[i]
    '''
    separator = '=' * 75
    titles = []
    docs = []

    for root, directories, filenames in os.walk(filepath):
        # sorting in place makes os.walk descend into subfolders deterministically
        directories.sort()
        for filename in sorted(filenames):
            if os.path.splitext(filename)[1] != '.txt':
                continue

            file_to_read = os.path.join(root, filename)
            with codecs.open(file_to_read,
                             mode='rb',
                             encoding='cp1251') as fin:
                parts = fin.read().split(separator)[2:-1]
            content = '. '.join(parts).strip()

            if content.count(' ') >= 10:
                titles.append(file_to_read)
                docs.append(content)
    return titles, docs


class LabeledLineSentence(object):
    '''
    Iterable corpus wrapper that feeds labeled documents to doc2vec

    Args:
    doc_list - list of documents, each a string of whitespace-separated tokens
    labels_list - list of labels, labels_list[i] is the label of doc_list[i]
    '''
    def __init__(self, doc_list, labels_list):
        self.doc_list = doc_list
        self.labels_list = labels_list

    def __iter_old__(self):
        '''Yields the documents in their stored order (not used by iteration)'''
        position = 0
        for text in self.doc_list:
            yield LabeledSentence(text.split(), [self.labels_list[position]])
            position += 1

    def __iter__(self):
        '''Yields every document once, in an order drawn anew from np.random on each pass'''
        total = len(self.doc_list)
        # drawing `total` indices without replacement gives a random permutation
        for position in np.random.choice(total, size=total, replace=False):
            yield LabeledSentence(self.doc_list[position].split(),
                                  [self.labels_list[position]])

# Load and normalize corpus

In [None]:
%%time

if __NORMALIZE_CORPUS or not os.path.exists(os.path.join(__CLEAR_CORPUS_FOLDER, 'docs_clear.p')):
    print 'Running corpus normalization'
    
    # Tokenization options
    morph = pymorphy2.MorphAnalyzer()
    word_pattern = u'(?u)\w+'
    tokenizer = RegexpTokenizer(word_pattern)
    sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Load original content
    titles, docs = load_computerra_corpus(__DATA_FOLDER)

    # Clear it
    docs_clear = map(normalize_text, docs)
    
    # Dump
    with open(os.path.join(__CLEAR_CORPUS_FOLDER, 'docs_clear.p'), 'wb') as fout:
        cPickle.dump(docs_clear, fout)
    with open(os.path.join(__CLEAR_CORPUS_FOLDER, 'docs.p'), 'wb') as fout:
        cPickle.dump(docs, fout)
    with open(os.path.join(__CLEAR_CORPUS_FOLDER, 'titles.p'), 'wb') as fout:
        cPickle.dump(titles, fout)
else:
    print 'Loading prenomalized corpus'
    
    # Load
    with open(os.path.join(__CLEAR_CORPUS_FOLDER, 'docs_clear.p'), 'rb') as fin:
        docs_clear = cPickle.load(fin)
    with open(os.path.join(__CLEAR_CORPUS_FOLDER, 'docs.p'), 'rb') as fin:
        docs = cPickle.load(fin)
    with open(os.path.join(__CLEAR_CORPUS_FOLDER, 'titles.p'), 'rb') as fin:
        titles = cPickle.load(fin)

Running corpus normalization


# Run doc2vec

In [None]:
%%time

if __LEARN_DOC2VEC or not os.path.exists(__MODEL_FILE):

    print 'Learning Doc2Vec model..'
    
    # remove dots in document
    docs_clear_nodots = []
    for doc in docs_clear:
        docs_clear_nodots.append(doc.replace('.', ''))
    
    # preparing input
    sentences = LabeledLineSentence(docs_clear_nodots, titles)
    
    # setting model params
    model = gensim.models.Doc2Vec(dm=1, size=__SPACE_SIZE, window=__WINDOW, min_count=3,
                                  workers=__THREADS, negative=10, sample=1e-5, hs=0, seed=__RND_SEED)
    
    model.build_vocab(sentences)
    
    # learining 
    alpha, min_alpha, passes = (0.025, 0.001, 30)
    alpha_delta = (alpha - min_alpha) / passes
    
    for epoch in range(passes):
        start_time = time.time()
        print 'alpha = %f' % (alpha)
        model.alpha = alpha
        model.min_alpha = alpha # fix the learning rate, 
        model.train(sentences)
        print 'epoch %d/%d done in %f seconds' % (epoch+1, passes, time.time()-start_time)

        alpha -= alpha_delta
        
        
    # Dump
    model.save(__MODEL_FILE)
    
else:
    
    print 'Loading Doc2Vec model..'
    
    model = gensim.models.Doc2Vec.load(__MODEL_FILE)

# DEMO

## Similarity with documents from corpus

In [None]:
# Pick random document from corpus
idx_title = np.random.randint(model.docvecs.count)
doc_id = titles[idx_title]

# Ask for the 3 nearest documents by their document vectors;
# every result is a (title, similarity) pair
sims = model.docvecs.most_similar(doc_id, topn=3)
print(u'Query (%s):\n «%s...»\n' % (doc_id, docs[idx_title][:1000]))
print(u'=' * 30)
# the header has no arguments, so it must not contain a format placeholder
print(u'TOP-3 similar documents:\n')

# Iterating over the results instead of fixed indices also copes with
# fewer than three neighbours being returned
for rank, (similar_title, similarity) in enumerate(sims, 1):
    idx = titles.index(similar_title)  # position of the neighbour in the corpus lists
    print(u'top-%d - %s, Similarity=%f:\n «%s...»\n' % (rank,
                                                       similar_title,
                                                       similarity,
                                                       docs[idx][:1000]))

## Similarity with arbitrary document

In [None]:
# Document to query with, located relative to the configured corpus folder
file_to_read = os.path.join(__DATA_FOLDER, '1999', '285', '2345', 'index.txt')

# Loading raw file content; the parts are cut out the same way as in
# load_computerra_corpus (split on 75 '=' characters, keep parts [2:-1])
with codecs.open(file_to_read,
                 mode='rb',
                 encoding='cp1251') as fin:
    raw_content = fin.read().split('=' * 75)[2:-1]
raw_content = '. '.join(raw_content).strip()

# Same preprocessing as for the training documents: normalize, then drop dots
content = normalize_text(raw_content).replace('.', '')
inferred_doc = model.infer_vector(content.split())

# Ask for the 3 nearest documents; every result is a (title, similarity) pair
sims = model.docvecs.most_similar([inferred_doc], topn=3)
print(u'Query:\n «%s»\n' % (raw_content[:1000]))
print(u'=' * 30)
# the header has no arguments, so it must not contain a format placeholder
print(u'TOP-3 similar documents:\n')

# Iterating over the results instead of fixed indices also copes with
# fewer than three neighbours being returned
for rank, (similar_title, similarity) in enumerate(sims, 1):
    idx = titles.index(similar_title)  # position of the neighbour in the corpus lists
    print(u'top-%d - %s, Similarity=%f:\n «%s»\n' % (rank,
                                                    similar_title,
                                                    similarity,
                                                    docs[idx][:1000]))