# Topic2Vec algorithm implemented with gensim, following the second hint given by Gordon Mohr.  
We fit a single LDA model (on lemmatized tokens) and then train a separate topic2vec model on each window of each partition into which the entire dataset is split. This is done to compare the topic representations obtained from different subsamples. 
All the steps are parallelized.
(https://groups.google.com/forum/#!topic/gensim/BVu5-pD6910)

1. Vectorization of the docs (already tokenized and lemmatized) using CountVectorizer (with or without tf-idf) 
2. Latent Dirichlet Allocation 
3. Topic2Vec on each window obtained from each partition of the entire dataset (20 Newsgroups)   

It saves:
* a file with all the parameters used for CountVectorizer and LDA, the LDA vocabulary of the most likely words of each topic, the number of epochs used to train topic2vec and the seed used to randomize the dataset partition
* the topic2vec model for each window of each partition

In [None]:
import numpy as np; import pandas as pd; import matplotlib.pyplot as plt
%matplotlib inline
import codecs 
from glob import glob
import os
import pickle
import copy
import pyorient
import ast

In [None]:
from __future__ import print_function
from time import time
import string
import re
# random
from random import shuffle, seed

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
from gensim import corpora, models, similarities

# INPUT PARAMETERS

In [None]:
# Number of independent random partitions (shuffles) of the dataset.
n_partition = 1
# Number of windows each partition is split into; one topic2vec model is
# trained and saved per window.
n_window_t2v = 2
# Base seed for the partition shuffle. The training loop increments it before
# every partition, so the first partition is shuffled with this value + 1.
random_seed_partition = 54

# Number of training passes made over each window.
n_epoch_t2v = 10

In [None]:
# Keyword arguments forwarded to sklearn's CountVectorizer.
# The documents are already lemmatized, space-separated strings, so the
# analyzer only has to split on whitespace (``unicode`` is Python 2 only).
# NOTE(review): per the scikit-learn docs a callable analyzer receives the raw
# document and bypasses preprocessing/n-gram extraction, so 'strip_accents' and
# 'ngram_range' presumably have no effect here - confirm for the installed version.
CountVectorizer_param = {
    'encoding' : 'utf-8',
    'analyzer' : unicode.split, 
    'strip_accents' : 'unicode',
    'ngram_range' : (1,1), 
    # document-frequency cut-offs: an int is an absolute count, a float a proportion
    'min_df' : 2,
    'max_df' : 0.95
    }

In [None]:
# Keyword arguments forwarded to sklearn's LatentDirichletAllocation.
# 'n_topics' is also embedded in the name of the results directory.
# NOTE(review): 'learning_offset' is documented by scikit-learn as an
# online-learning parameter; with learning_method='batch' it is presumably
# unused - confirm for the installed version.
lda_param = {
    'n_topics':8, 
    'max_iter':10, 
    'learning_method':'batch', 
    'learning_offset':50.,
    'evaluate_every':0, 
    'n_jobs':-1, 
    # fixed seed so that repeated runs fit the same topic model
    'random_state':10
    }

In [None]:
# Size of the lemmatized corpus; it selects the input pickle
# ('lemmatized_text_n_docs<n_docs>.pkl') and is part of the results directory name.
n_docs = 11314
# Results directory, relative to the current working directory (note the
# leading '/': it is appended to os.getcwd() later on).
# NOTE(review): 'win5' is hard-coded; it presumably mirrors the Doc2Vec
# window=5 used at training time - keep the two in sync.
dir_name = '/results/20NG_lemmatiz_win5_n_topics' + str (lda_param['n_topics']) + '_n_doc' + str(n_docs) + '_n_win' + str(n_window_t2v)

In [None]:
# Number of highest-weighted words printed and saved for each LDA topic.
n_top_words = 30

# 1. LOADING LEMMATIZED/TOKENIZED TEXTS

In [None]:
cwd =  os.getcwd() # current working directory (nothing is printed here)
# dir_name starts with '/', so plain concatenation yields <cwd>/results/...
results_dir_path = cwd + dir_name

# Create the results directory (and any missing parents) on first use.
if not os.path.exists(results_dir_path):
    os.makedirs(results_dir_path)

In [None]:
# Load the lemmatized corpus written by the preprocessing step.
# NOTE: unpickling can execute arbitrary code - only load files you produced yourself.
lemmatized_path = cwd + '/results/lemmatized_text_n_docs' + str(n_docs) + '.pkl'
# Pickles are binary data: open in 'rb', and let the context manager close the
# file even if loading fails.
with open(lemmatized_path, 'rb') as pkl_file:
    tokenized_docs = pickle.load(pkl_file)

# Every entry is indexed as (text, category); entries whose text is empty are
# dropped once, so that texts and categories stay aligned.
non_empty_docs = [x for x in tokenized_docs['tokenized_docs'] if len(x[0]) > 0]
tokenized_text = [unicode(x[0]) for x in non_empty_docs]
cat_docs = [x[1] for x in non_empty_docs]

# 2. LDA to find the topic most-associated with each word

## 2.1 From Strings to Vectors

In [None]:
t0 = time()
# Build the vectorizer straight from the parameter dict, so that the settings
# used here and the settings saved to disk cannot drift apart.
tf_vectorizer = CountVectorizer(**CountVectorizer_param)
# fit_transform learns the vocabulary and builds the document-term matrix in a
# single pass over the corpus (fit followed by transform analyses it twice).
tf_docs = tf_vectorizer.fit_transform(tokenized_text)
print("fit vectorizer with lemmatization done in %0.3fs." % (time() - t0))

### WITH TFIDF (activate/deactivate the following cell to perform/skip TFIDF)

In [None]:
# Vocabulary size after the min_df / max_df pruning (one entry per feature).
n_features = len(tf_vectorizer.vocabulary_)

## 2.2 LDA implementation

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic of ``model``.

    model          -- fitted topic model exposing ``components_`` (one row of
                      word weights per topic)
    feature_names  -- sequence mapping a column index to its word
    n_top_words    -- how many words to show per topic

    For each topic a ``Topic #<idx>:`` header is printed, followed by the
    words in decreasing weight order on one line; a blank line ends the output.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it and keep the leading n_top_words
        best_columns = weights.argsort()[::-1][:n_top_words]
        print("Topic #%d:" % topic_idx)
        print(" ".join(feature_names[col] for col in best_columns))
    print()

In [None]:
def top_words_dict(model, feature_names, n_top_words):
    """Return the highest-weighted words of every topic as a dictionary.

    model          -- fitted topic model exposing ``components_`` (one row of
                      word weights per topic)
    feature_names  -- sequence mapping a column index to its word
    n_top_words    -- how many words to keep per topic

    Keys have the form ``'Topic_#<idx>:'``; each value is a single string of
    space-separated words in decreasing weight order.
    """
    return {
        'Topic_#' + str(topic_idx) + ':': " ".join(
            # argsort is ascending: reverse it and keep the leading n_top_words
            feature_names[col] for col in weights.argsort()[::-1][:n_top_words])
        for topic_idx, weights in enumerate(model.components_)
    }

In [None]:
# Report the number of documents actually fitted: empty documents are dropped
# when the corpus is loaded, so it can be smaller than the configured n_docs.
print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (tf_docs.shape[0], n_features))
# The model is configured entirely from lda_param, the same dict that is saved
# alongside the results.
lda = LatentDirichletAllocation(**lda_param)
t0 = time()
lda.fit(tf_docs)
print("done in %0.3fs." % (time() - t0))

In [None]:
# Show the n_top_words highest-weighted words of every LDA topic.
print("\nTopics in LDA model:")
# Column index -> word lookup; reused by the following cells.
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(model=lda,
                feature_names=tf_feature_names,
                n_top_words=n_top_words)

In [None]:
# Same information as printed above, kept as a dict so it can be saved with
# the run parameters.
top_words_per_topic = top_words_dict(model=lda,
                                     feature_names=tf_feature_names,
                                     n_top_words=n_top_words)

In [None]:
# Topic-word weight matrix of the fitted LDA model: one row per topic, one
# column per vocabulary word (presumably shape (n_topics, n_features)).
per_topic_distr_LDA = lda.components_
# Bare expression: shown as the cell output when run in a notebook.
per_topic_distr_LDA.shape
#per_topic_distr_LDA.sum(axis=1)

In [None]:
# Store the analyzer by name rather than as a callable (presumably because the
# callable itself cannot be pickled). getattr makes this idempotent: once the
# entry is a plain string (which has no __name__), re-running the cell leaves
# it untouched instead of raising AttributeError.
analyzer = CountVectorizer_param['analyzer']
CountVectorizer_param['analyzer'] = getattr(analyzer, '__name__', analyzer)

# Everything needed to reproduce the run. random_seed_partition is the value
# held when this cell runs; the training loop increments it before each
# partition.
run_parameters = {'lda_param': lda_param,
                  'CountVectorizer_param': CountVectorizer_param,
                  'top_words_per_topic': top_words_per_topic,
                  'random_seed_partition': random_seed_partition,
                  'n_epoch_t2v': n_epoch_t2v}

# Pickles are binary data: write in 'wb', and let the context manager close
# the file even if dumping fails.
with open(results_dir_path + '/CountVect_LDA_param.pkl', 'wb') as output:
    pickle.dump(run_parameters, output)

# 3. TOPIC2VEC

In [None]:
# For every vocabulary word (column), the index of the topic (row) in which it
# has the largest weight.
most_p_topic = per_topic_distr_LDA.argmax(axis=0)

In [None]:
# Pair every vocabulary word with the index of its dominant topic ...
word_and_topic = zip(tf_feature_names, most_p_topic)

# ... and turn the pairs into a word -> 'topic_<idx>' lookup table.
word2topic_dict = dict(
    (term, 'topic_' + np.array_str(topic_id))
    for term, topic_id in word_and_topic
)

## 3.1 Tokenization

In [None]:
def map_doc_to_topic(tokenized_text, prefix, doc_id_number, word2topic_dict):
    """Build the tag list of one document.

    tokenized_text   -- iterable with the words of the document
    prefix           -- string placed in front of the document id
    doc_id_number    -- identifier of the document (converted with ``str``)
    word2topic_dict  -- mapping word -> topic tag

    Returns a list whose first element is the document tag
    ``'<prefix>_<doc_id_number>'``, followed by the topic tag of every word
    that is in ``word2topic_dict`` (in document order, repeats kept). Words
    missing from the mapping are skipped.
    """
    doc_tag = prefix + '_' + str(doc_id_number)
    # Test membership on the dict itself: on Python 2 ``.keys()`` builds a list,
    # which turns every lookup into a linear scan of the whole vocabulary.
    topic_tags = [word2topic_dict[word]
                  for word in tokenized_text
                  if word in word2topic_dict]
    return [doc_tag] + topic_tags

In [None]:
class LabeledLineSentence_training(object):
    """Iterable of gensim LabeledSentence objects built from a tokenized corpus.

    word2topic_dict  -- mapping word -> topic tag, used to tag every document
    tokenized_docs   -- sequence of documents, each a whitespace-separated string
    cat_docs         -- sequence of category labels, parallel to tokenized_docs

    Each document gets the tags returned by ``map_doc_to_topic``: a document
    tag built from its category and position, plus the topic tag of each known
    word.
    """

    def __init__(self, word2topic_dict, tokenized_docs, cat_docs):
        self.labels_list = word2topic_dict
        self.tokenized_docs = tokenized_docs
        self.cat_docs = cat_docs
        # Filled lazily by to_array(); None means "not built yet".
        self.sentences = None

    def __iter__(self):
        """Yield one LabeledSentence per document, in corpus order."""
        for idx, (doc, cat) in enumerate(zip(self.tokenized_docs, self.cat_docs)):
            words_doc = doc.split()
            # Use the mapping given to the constructor, not a module-level global.
            tags_doc = map_doc_to_topic(words_doc, cat, idx, self.labels_list)
            yield models.doc2vec.LabeledSentence(words=words_doc, tags=tags_doc)

    def to_array(self):
        """Return all sentences as a list, building it on the first call only."""
        # The previous test ("'self.sentences' not in locals()") was always
        # true, so the list was rebuilt on every call.
        if self.sentences is None:
            self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Shuffle the cached sentence list in place and return it."""
        # to_array() guarantees that the list exists even if it was never
        # called explicitly beforehand.
        shuffle(self.to_array())
        return self.sentences

In [None]:
def LabeledLineSentence_to_array(tokenized_doc, cat_doc, idx_doc):
    """Turn one document into a gensim LabeledSentence.

    tokenized_doc  -- the document as a whitespace-separated string
    cat_doc        -- category label, used as prefix of the document tag
    idx_doc        -- position of the document, used in the document tag

    The words are the whitespace-split tokens; the tags come from
    ``map_doc_to_topic`` using the module-level ``word2topic_dict``.
    """
    tokens = tokenized_doc.split()
    return models.doc2vec.LabeledSentence(
        words=tokens,
        tags=map_doc_to_topic(tokens, cat_doc, idx_doc, word2topic_dict),
    )

In [None]:
def partition(lst, n_window, random_seed):
    """Randomly split ``lst`` into ``n_window`` contiguous, near-equal windows.

    lst          -- list of items to split; it is NOT modified
    n_window     -- number of windows to produce
    random_seed  -- seed for the shuffle; the module-level random generator is
                    re-seeded with it, as before

    Returns a list of ``n_window`` lists that together contain every item of
    ``lst`` exactly once. Raises ZeroDivisionError when ``n_window`` is 0.
    """
    seed(random_seed)
    division = len(lst) / float(n_window)
    # Shuffle a copy: shuffling the caller's list in place made every later
    # call start from an already-shuffled order, so the result depended on the
    # call history and not only on the seed.
    shuffled = list(lst)
    shuffle(shuffled)
    # Window i covers [bounds[i], bounds[i + 1]); rounding spreads the remainder.
    bounds = [int(round(division * i)) for i in range(n_window + 1)]
    return [shuffled[bounds[i]:bounds[i + 1]] for i in range(n_window)]

## 3.2 Training

### Preparation of docs to obtain the input object suitable for the doc2vec, UNPARALLELIZED

### Preparation of docs to obtain the input object suitable for the doc2vec, PARALLELIZED

In [None]:
from joblib import Parallel, delayed
import multiprocessing
t0 = time()
# One worker process per available core.
num_cores = multiprocessing.cpu_count()

# One deferred call per (text, category) pair; idx is the position of the
# document in the corpus and ends up in its document tag.
labelling_jobs = (
    delayed(LabeledLineSentence_to_array)(text, category, idx)
    for idx, (text, category) in enumerate(zip(tokenized_text, cat_docs))
)
all_docs = Parallel(n_jobs=num_cores)(labelling_jobs)
print("Labelled sentences done in %0.3fs." % (time() - t0))

In [None]:
# MULTIPLE PARTITIONS
# Partition the corpus and train one Topic2Vec model on each window of each partition
alpha_step = 0.002    # amount the learning rate is lowered after every epoch
alpha_floor = 0.0001  # lower bound: without it the rate reaches zero or below
                      # once n_epoch_t2v is 13 or more (0.025 - 13 * 0.002 < 0)
t0 = time()
for i_partition in range(n_partition):
    print ('partition: ' + str(i_partition))
    # The seed is incremented BEFORE use: the first partition is shuffled with
    # the configured seed + 1, and that value is what appears in the file names.
    random_seed_partition += 1
    partitioned_docs = partition(all_docs, n_window_t2v, random_seed_partition)

    for i_window in range(n_window_t2v):
        print ('window: ' + str(i_window))
        current_partition = partitioned_docs[i_window]
        # alpha == min_alpha: the rate is constant within one train() call and
        # is lowered manually between epochs below.
        model = models.Doc2Vec(size=100, window=5, min_count=1, dm=1, dbow_words=1,
                              workers=20, alpha=0.025, min_alpha=0.025) # use fixed learning rate
        model.build_vocab(current_partition)
        for epoch in range(n_epoch_t2v):
            print ('epoch: ' + str(epoch))
            # present the documents in a different order at every epoch
            shuffle(current_partition)
            model.train(current_partition)
            # decrease the learning rate, but never let it drop to zero or below
            model.alpha = max(model.alpha - alpha_step, alpha_floor)
            model.min_alpha = model.alpha # fix the learning rate, no decay
        fname = results_dir_path + '/t2v_20NG_partSEED' + str(random_seed_partition) + '_win' + str(i_window) + '.model'
        model.save(fname)
print("done in %0.3fs." % (time() - t0))