# TOPIC2VEC algorithm by using gensim and according to the second hint given by Gordon Mohr.  
We fit a single LDA model (on tokens that are not lemmatized) and then train a separate topic2vec model on each window of each partition into which the entire dataset is split. This is done to compare the topic representations obtained from different subsamples.
(https://groups.google.com/forum/#!topic/gensim/BVu5-pD6910)


1. Vectorization of docs by using CountVectorizer (with or without tfidf) with no lemmatization
2. Latent Dirichlet Allocation 
3. Topic2Vec on each window obtained from each partition of the entire dataset (20 Newsgroups)

It saves:
* the topic2vec model for each window and each partition

In [None]:
import numpy as np; import pandas as pd; import matplotlib.pyplot as plt
%matplotlib inline
import codecs 
from glob import glob
import os
import pickle
import copy
import pyorient
import ast

In [None]:
from __future__ import print_function
from time import time
import string
import re
# random
from random import shuffle, seed

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
from gensim import corpora, models, similarities

In [None]:
# Number of highest-weighted words shown per topic when the LDA topics are printed.
n_top_words = 20

## 1. IMPORTING DOCS FROM 20 NEWSGROUPS DATASET

In [None]:
from sklearn.datasets import fetch_20newsgroups
# Newsgroup names to load; passed as `categories=` to fetch_20newsgroups
# when the training split is fetched.
categories = ['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## INPUT PARAMETERS

In [None]:
# Number of random partitions of the corpus that are drawn.
n_partition = 2
# Number of windows each partition is split into; one Topic2Vec model is
# trained and saved per window.
n_window_t2v = 2
# Base random seed; it is incremented by one before each partition is drawn.
random_seed_partition = 33
# Number of topics requested from the LDA model.
n_topics_LDA = 8

In [None]:
# Load the training split of the selected newsgroups with headers, footers and
# quoted replies removed; shuffling uses a fixed random_state.
newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories, shuffle=True, random_state=42)
# Category name of every document, one entry per element of
# newsgroups_train.target.  Built as a real list: on Python 3 `map` returns a
# one-shot iterator that would be empty after its first traversal.
cat_docs = [newsgroups_train.target_names[label] for label in newsgroups_train.target]

#### TOTAL NUMBER OF DOCS

In [None]:
# Corpus size, taken as the length of the dataset's `filenames` array.
n_docs = newsgroups_train.filenames.shape[0]
n_docs

In [None]:
cwd =  os.getcwd() # current working directory (nothing is printed here)
# Output folder; its name encodes the topic count, corpus size and window count.
results_dir_path = cwd + '/results/20NG_n_topics' + str (n_topics_LDA) +'_n_doc' + str(n_docs) + '_n_win' + str(n_window_t2v)

# Create the output folder (and missing parents) if it does not exist yet.
if not os.path.exists(results_dir_path):
    os.makedirs(results_dir_path)

# 2. LDA to find the topic most-associated with each word

## 2.1 From Strings to Vectors

### WITHOUT Lemmatization

In [None]:
t0 = time()
# Bag-of-words vocabulary: unigrams made of two or more ASCII letters, with
# English stop words removed.  min_df=50 / max_df=0.95 drop terms that occur in
# fewer than 50 documents or in more than 95% of them.
tf_vectorizer = CountVectorizer(encoding='utf-8', analyzer='word', stop_words='english',
                                ngram_range = (1,1), max_df=0.95, min_df = 50, token_pattern = '[a-zA-Z]{2,}').fit(newsgroups_train.data)
# Document-term count matrix of the training documents.
tf_docs = tf_vectorizer.transform(newsgroups_train.data)
print("fit vectorizer without lemmatization done in %0.3fs." % (time() - t0))

In [None]:
# Terms the fitted vectorizer discarded (scikit-learn's `stop_words_`
# attribute); tokenizer() later uses this set to filter words.
my_stop_words = tf_vectorizer.stop_words_

### WITH TFIDF (run/skip the following cell to apply/not apply TF-IDF)

In [None]:
# Re-weight the raw counts with TF-IDF.
# NOTE: tf_docs is overwritten here, so every later use of tf_docs (including
# the LDA fit) sees TF-IDF weights rather than raw counts.
tfidf_vectorizer = TfidfTransformer(sublinear_tf=False, use_idf = True).fit(tf_docs)
tf_docs = tfidf_vectorizer.transform(tf_docs)

In [None]:
# Vocabulary size of the fitted count vectorizer.
n_features = len(tf_vectorizer.get_feature_names())

## 2.2 LDA implementation

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic of a fitted model.

    For each row of ``model.components_`` a ``Topic #<idx>:`` header is
    printed, followed by one line with the ``n_top_words`` entries of
    ``feature_names`` whose weights in that row are largest, in descending
    order of weight.  One blank line is printed after the last topic.
    """
    for topic_number, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_words = [feature_names[position] for position in ranked]
        print("Topic #%d:" % topic_number)
        print(" ".join(top_words))
    print()

In [None]:
# NOTE: the message says "tf features", but tf_docs holds TF-IDF weights when
# the TF-IDF cell above has been run.
print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_docs, n_features))
# Batch-mode LDA with n_topics_LDA topics and a fixed random_state;
# n_jobs=-1 asks scikit-learn to use all available cores.
lda = LatentDirichletAllocation(n_topics=n_topics_LDA, max_iter=10, 
                                learning_method='batch', learning_offset=50.,
                                evaluate_every=1, n_jobs=-1, random_state=1)
t0 = time()
lda.fit(tf_docs)
print("done in %0.3fs." % (time() - t0))

In [None]:
print("\nTopics in LDA model:")
# Vocabulary words, indexed like the columns of the document-term matrix.
tf_feature_names = tf_vectorizer.get_feature_names()
# Show the n_top_words highest-weighted words of each LDA topic.
print_top_words(lda, tf_feature_names, n_top_words)

In [None]:
# Topic-word weight matrix of the fitted LDA (one row per topic, as iterated
# by print_top_words); the shape is displayed as the cell output.
per_topic_distr_LDA = lda.components_
per_topic_distr_LDA.shape
#per_topic_distr_LDA.sum(axis=1)

# 3. TOPIC2VEC

In [None]:
# For every vocabulary column, the index of the topic row with the largest weight.
most_p_topic = np.argmax(per_topic_distr_LDA, axis=0)

In [None]:
# Pair each vocabulary word with the index of its highest-weighted topic.
word_and_topic = zip(tf_feature_names, most_p_topic)

# Map word -> tag string of the form 'topic_<index>'.
word2topic_dict = {word : 'topic_' + np.array_str(topic) for word, topic in word_and_topic}

## 3.1 Tokenization

In [None]:
def tokenizer(document, stop_words=None):
    """Split a raw document into lower-case tokens with stop words removed.

    Punctuation characters (``string.punctuation``) are deleted, the text is
    split on whitespace and lower-cased, stop words are dropped, and only
    tokens containing at least two consecutive ASCII letters are kept.

    Parameters
    ----------
    document : str
        Raw text of one document.
    stop_words : collection of str, optional
        Words to discard.  When omitted, a word is discarded if it appears in
        either ``ENGLISH_STOP_WORDS`` or the module-level ``my_stop_words``.

    Returns
    -------
    list of str
        The surviving tokens, in document order.
    """
    if stop_words is None:
        # The former test `word not in (ENGLISH_STOP_WORDS and my_stop_words)`
        # only ever consulted my_stop_words, because `a and b` evaluates to
        # `b` whenever `a` is truthy.  Both collections are checked here.
        stop_collections = (ENGLISH_STOP_WORDS, my_stop_words)
    else:
        stop_collections = (stop_words,)

    text = "".join([ch for ch in document if ch not in string.punctuation])
    lowered = [word.lower() for word in text.split()]
    kept = [word for word in lowered
            if not any(word in stops for stops in stop_collections)]
    # Keep only tokens holding a run of two or more letters (drops numbers,
    # single characters, ...).
    return [word for word in kept if re.search('[a-zA-Z]{2,}', word)]

In [None]:
def map_doc_to_topic(tokenized_text, prefix, doc_id_number, word2topic_dict):
    """Build the tag list of one document.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens of the document.
    prefix : str
        Text placed before the document number in the document tag.
    doc_id_number : int
        Number of the document; appended to ``prefix`` after an underscore.
    word2topic_dict : dict
        Maps a word to its topic tag.

    Returns
    -------
    list of str
        ``'<prefix>_<doc_id_number>'`` followed by the topic tag of every
        token present in ``word2topic_dict``, in token order (repeated tokens
        give repeated tags).
    """
    doc_tag = prefix + '_' + str(doc_id_number)
    # Membership is tested on the dict itself: `in word2topic_dict.keys()`
    # builds a list and scans it linearly for every word on Python 2.
    topic_tags = [word2topic_dict[word] for word in tokenized_text
                  if word in word2topic_dict]
    return [doc_tag] + topic_tags

In [None]:
class LabeledLineSentence_training(object):
    """Iterable of gensim labeled sentences built from raw documents.

    Every document is tokenized with ``tokenizer`` and tagged, through
    ``map_doc_to_topic``, with ``<category>_<index>`` followed by the topic tag
    of each token found in the word-to-topic mapping.

    Parameters
    ----------
    word2topic_dict : dict
        Maps a word to its topic tag.
    docs : sequence of str
        Raw documents.
    cat_docs : sequence of str
        Category name of each document, in the same order as ``docs``.
    """

    def __init__(self, word2topic_dict, docs, cat_docs):
        self.labels_list = word2topic_dict
        self.docs = docs
        self.cat_docs = cat_docs
        # Filled lazily by to_array().
        self.sentences = None

    def _labeled_sentences(self):
        """Yield one labeled sentence per (document, category) pair."""
        for idx, (doc, category) in enumerate(zip(self.docs, self.cat_docs)):
            words_doc = tokenizer(doc)
            # Use the mapping given to the constructor, not a module global.
            tags_doc = map_doc_to_topic(words_doc, category, idx, self.labels_list)
            # NOTE(review): newer gensim releases call this class
            # TaggedDocument -- confirm against the installed version.
            yield models.doc2vec.LabeledSentence(words=words_doc, tags=tags_doc)

    def __iter__(self):
        """Stream the labeled sentences without storing them."""
        return self._labeled_sentences()

    def to_array(self):
        """Return all labeled sentences as a list, building it on first use.

        The former guard ``'self.sentences' not in locals()`` was always true,
        so the list was rebuilt on every call; it is now cached.
        """
        if self.sentences is None:
            self.sentences = list(self._labeled_sentences())
        return self.sentences

    def sentences_perm(self):
        """Shuffle the cached sentence list in place and return it.

        The list is built first if to_array() has not been called yet.
        """
        sentences = self.to_array()
        shuffle(sentences)
        return sentences

## 3.2 Training

In [None]:
def partition(lst, n_window, random_seed):
    """Randomly split ``lst`` into ``n_window`` windows of near-equal size.

    Parameters
    ----------
    lst : sequence
        Items to distribute; it is left unmodified.
    n_window : int
        Number of windows to produce.
    random_seed : hashable
        Seed given to ``random.seed`` before shuffling.  This reseeds the
        module-level random generator.

    Returns
    -------
    list of list
        ``n_window`` lists that together hold every item of ``lst`` exactly
        once, in shuffled order.

    Raises
    ------
    ZeroDivisionError
        If ``n_window`` is 0.
    """
    seed(random_seed)
    # Shuffle a copy: shuffling `lst` itself reordered the caller's list, so
    # the windows obtained for a given seed depended on every earlier call.
    shuffled = list(lst)
    shuffle(shuffled)
    window_size = len(shuffled) / float(n_window)
    # Consecutive windows share a boundary, so no item is lost or duplicated.
    bounds = [int(round(window_size * i)) for i in range(n_window + 1)]
    return [shuffled[start:stop] for start, stop in zip(bounds, bounds[1:])]

### Preparation of docs to obtain the input object suitable for the doc2vec, UNPARALLELIZED

In [None]:
# Tokenize and tag every training document, keeping the result in memory as a list.
it = LabeledLineSentence_training(word2topic_dict,newsgroups_train.data,cat_docs)
all_docs = it.to_array()

In [None]:
# MULTIPLE PARTITIONS
# Draw n_partition random partitions of the corpus and train one Topic2Vec
# (Doc2Vec) model on each window of each partition.

# Learning-rate schedule: linear decay from start_alpha to end_alpha over
# n_epochs passes.  The former fixed step of 0.002 over 20 epochs pushed alpha
# below zero from the 14th pass onward.
n_epochs = 20
start_alpha = 0.025
end_alpha = 0.001
alpha_step = (start_alpha - end_alpha) / (n_epochs - 1)

t0 = time()
for i_partition in range(n_partition):
    # A new seed for every partition; it is also part of the saved file name.
    random_seed_partition += 1
    partitioned_docs = partition(all_docs, n_window_t2v, random_seed_partition)

    for i_window in range(n_window_t2v):
        current_partition = partitioned_docs[i_window]
        # NOTE(review): `size=` and train() without explicit counts match an
        # older gensim API -- confirm against the installed version.
        model = models.Doc2Vec(size=100, window=10, min_count=1, dm=1, dbow_words=1,
                              workers=20, alpha=start_alpha, min_alpha=start_alpha)

        model.build_vocab(current_partition)
        for epoch in range(n_epochs):
            shuffle(current_partition)
            # Constant rate within a pass (alpha == min_alpha), lowered
            # between passes; it never goes below end_alpha.
            model.alpha = start_alpha - epoch * alpha_step
            model.min_alpha = model.alpha
            model.train(current_partition)
        fname = results_dir_path + '/t2v_20NG_partSEED' + str(random_seed_partition) + '_win' + str(i_window) + '.model'
        model.save(fname)
print("done in %0.3fs." % (time() - t0))