# TOPIC2VEC algorithm implemented with gensim, following the second hint given by Gordon Mohr
(https://groups.google.com/forum/#!topic/gensim/BVu5-pD6910)

Imports

In [None]:
import numpy as np; import pandas as pd; import matplotlib.pyplot as plt
%matplotlib inline
import codecs 
from glob import glob
import os
import pickle
import copy
import pyorient
import ast

In [None]:
from __future__ import print_function
from time import time
import string
import re

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
from gensim import corpora, models, similarities

In [None]:
# Minimum text length (in characters) a document must exceed to be kept
cutoff_txtlen = 200

# LDA settings: number of topics to fit and number of top words printed per topic
n_topics, n_top_words = 10, 20

## 1. IMPORTING DOCS FROM BIP DB

### Time interval

In [None]:
# First and last day of the sampled interval
start, stop = '2015/11/01', '2015/11/30'
# Number of documents drawn at random for every day of the interval
n_doc_per_day = 10
# Seed of the Python random generator that produces the per-day database seeds
SEED = 123

In [None]:
# One timestamp per calendar day between start and stop (both included)
drange = pd.date_range(start, stop, freq='D')

### Load the data from the PostgreSQL database

In [None]:
import getpass
import os

import psycopg2
import psycopg2.extras

# Connect to an existing database.
# The password is deliberately NOT stored in the notebook: it is taken from the
# BIP_DB_PASSWORD environment variable or, when that is unset, asked for
# interactively. Keyword arguments are used instead of a hand-written DSN
# string so that special characters in the password need no escaping.
db_password = (os.environ.get('BIP_DB_PASSWORD')
               or getpass.getpass('Password for the bip database: '))
conn = psycopg2.connect(dbname='bip', user='cgnal', host='151.80.103.221',
                        password=db_password)

In [None]:
import sys  
#reload(sys)  
#sys.setdefaultencoding('utf8')

We decided to organize the documents in a single table with the following fields:
* **domain** [domain as detected from the classification algorithm]
* **topic** [topic as detected from the classification algorithm]
* **sourceDomain** [domain associated with the source of the document, if any]
* **sourceTopic** [topic associated with the source of the document, if any]
* **sourceType** [kind of source of the document: RSSfeed, twitter, etc...]
* **sourceName** [name of the source of the document]
* **author** [author of the document]
* **publishDay** [publication date of the document]
* **publishDate** [publication date in milliseconds of the document]
* **title** [title of the document]
* **ID** [ID of the document]
* **pk** [numeric ID of the document]
* **link** [link to the webpage where the original document has been found]
* **sourceTags** [tags associated with the document, if any]
* **text** [text of the document encoded with the utf-8 format]

In [None]:
import pandas.io.sql as pdsql
def random_textsPG(publishday, n_doc, conn, seed):
    """ Draw a reproducible random sample of the documents published in a day

    Counts the rows of ``inputdocument`` for the requested day, seeds the
    PostgreSQL random generator and reads ``n_doc`` rows picked with
    ``order by random()``.

    Parameters
    ----------
    publishday : string
        day of publication of the selected documents, written the way it is
        stored in the ``publishday`` column
    n_doc : int
        number of documents to draw
    conn : DB-API connection
        open connection to the database holding ``inputdocument``
    seed : number
        value handed to the PostgreSQL ``setseed`` function

    Return
    ------
    the_frame : pandas.DataFrame
        the sampled rows, with the ``text`` column decoded from utf-8

    Raises
    ------
    SystemExit
        if the day holds fewer than ``n_doc`` documents
    """
    # All values are passed as query parameters instead of being pasted into
    # the SQL text, and the cursor is released even when a statement fails.
    cur = conn.cursor()
    try:
        # Count the number of records in the selected day
        cur.execute("select count(*) from inputdocument where publishday = %s",
                    (publishday,))
        record_number = cur.fetchone()[0]
        if record_number < n_doc:
            sys.exit('Not enough document in the day: %s' % publishday)
        # Seed the generator on this connection so that the random ordering
        # of the query below is repeatable
        cur.execute("SELECT setseed(%s)", (seed,))
    finally:
        cur.close()

    the_frame = pdsql.read_sql(
        "select * from inputdocument where publishday = %s "
        "order by random() limit %s",
        conn, params=(publishday, n_doc))
    # Only byte strings need decoding; text that is already unicode is kept
    the_frame['text'] = the_frame['text'].apply(
        lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

    return the_frame

In [None]:
import random

# Seed Python's generator so the per-day database seeds are reproducible
random.seed(SEED)
# One DataFrame of sampled documents per day of the interval.
# The database seed is drawn with uniform(-1, 1): randrange(-1, 1) could only
# ever return -1 or 0, so the seed took just two values over the whole run.
tot = [random_textsPG(day.strftime("%Y%m%d"), n_doc_per_day, conn,
                      random.uniform(-1, 1))
       for day in drange]
    

In [None]:
# Stack the per-day samples into one frame and renumber the rows 0..N-1
df = pd.concat(tot)
df.index = np.arange(len(df))

### Normalization of the data (check for duplicates or empty texts)

#### Check for empty text

In [None]:
# Keep only the documents whose text is longer than cutoff_txtlen characters
long_enough = df.text.map(len) > cutoff_txtlen
df_norm = df[long_enough]
# Number of documents discarded by the length filter
num_empty_doc = len(df) - len(df_norm)
num_empty_doc

#### Check for duplicates

In [None]:
# NB: drop_duplicates is called without inplace, so it returns a new
# DataFrame and df_norm itself is left untouched
df_norm2 = df_norm.drop_duplicates(subset=['text'])
# Number of documents removed because their text was already present
num_doc_notUnique = len(df_norm) - len(df_norm2)
num_doc_notUnique
n_samples = len(df_norm2)

#### TOTAL NUMBER OF DOC

In [None]:
# Documents left after the length and duplicate filters
n_docs = len(df_norm2)
n_docs

# 2. LDA to find the topic most-associated with each word

## 2.1 From Strings to Vectors

### WITH Lemmatization

### WITHOUT Lemmatization

In [None]:
t0 = time()
# Word counts over unigrams made of at least two ASCII letters, English stop
# words removed, terms found in fewer than two documents ignored
tf_vectorizer = CountVectorizer(
    encoding='utf-8',
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 1),
    min_df=2,
    token_pattern='[a-zA-Z]{2,}',
)
tf_vectorizer.fit(df_norm2.text)
elapsed = time() - t0
print("fit vectorizer without lemmatization done in %0.3fs." % elapsed)

### Vectorization

In [None]:
# Size of the vocabulary learned by the vectorizer (one feature per term)
n_features = len(tf_vectorizer.vocabulary_)

In [None]:
# Show the first remaining document. Positional access is required here:
# df_norm2 keeps the row labels of df, so label 0 no longer exists when the
# first document was dropped by the length or duplicate filters.
df_norm2.text.iloc[0]

In [None]:
# Document-term count matrix of the filtered corpus, built with the fitted vectorizer
tf_docs = tf_vectorizer.transform(df_norm2.text)

### WITH TFIDF

# Re-weight the raw counts with tf-idf (plain term frequency, idf enabled)
tfidf_vectorizer = TfidfTransformer(use_idf=True, sublinear_tf=False)
tfidf_vectorizer.fit(tf_docs)
tfidf_docs = tfidf_vectorizer.transform(tf_docs)

## 2.2 LDA implementation

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic of a fitted model.

    Parameters
    ----------
    model : object
        anything exposing ``components_``, an iterable with one array of
        per-word weights for each topic
    feature_names : sequence of strings
        word associated with each position of a weight array
    n_top_words : int
        how many words to print per topic, heaviest first
    """
    for number, weights in enumerate(model.components_):
        # Positions sorted by decreasing weight, cut to the requested count
        best_first = weights.argsort()[::-1][:n_top_words]
        print("Topic #%d:" % number)
        print(" ".join(feature_names[position] for position in best_first))
    # Single blank line after the last topic
    print()

In [None]:
print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
# Online variational LDA with a fixed random state for repeatable topics
lda_options = dict(
    n_topics=n_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
lda = LatentDirichletAllocation(**lda_options)
t0 = time()
lda.fit(tf_docs)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)

In [None]:
# Vocabulary terms in feature order; reused later to map words to topics
tf_feature_names = tf_vectorizer.get_feature_names()
print("\nTopics in LDA model:")
print_top_words(model=lda, feature_names=tf_feature_names, n_top_words=n_top_words)

In [None]:
# Per-topic word weights of the fitted LDA model (one row per topic)
per_topic_distr_LDA = lda.components_
# Display the matrix dimensions
per_topic_distr_LDA.shape

# 3. TOPIC2VEC

In [None]:
# For every word (column), index of the topic (row) giving it the largest weight
most_p_topic = per_topic_distr_LDA.argmax(axis=0)

In [None]:
# Pair every vocabulary word with the index of its most associated topic
word_and_topic = zip(tf_feature_names, most_p_topic)

# word -> 'topic_<index>' label, later used as a document tag
word2topic_dict = dict(
    (word, 'topic_' + np.array_str(topic)) for word, topic in word_and_topic
)

## 3.1 Tokenization

In [None]:
def tokenizer(document):
    """Split a document into lower-case tokens ready for training.

    The text is stripped of the characters in ``string.punctuation``, split
    on whitespace and lower-cased; English stop words and tokens that do not
    contain at least two consecutive ASCII letters are then discarded.

    Parameters
    ----------
    document : string
        raw text of one document

    Return
    ------
    tokenized_text : list of strings
        the surviving tokens, in their order of appearance
    """
    # Remove punctuation characters one by one
    stripped = "".join(ch for ch in document if ch not in string.punctuation)
    lowered = (piece.lower() for piece in stripped.split())
    # Drop stop words first, then anything without two adjacent letters
    return [token for token in lowered
            if token not in ENGLISH_STOP_WORDS
            and re.search('[a-zA-Z]{2,}', token)]

In [None]:
def map_doc_to_topic(tokenized_text, doc_id_number, word2topic_dict):
    """Build the list of tags attached to one document.

    Parameters
    ----------
    tokenized_text : iterable of strings
        tokens of the document
    doc_id_number : int
        number used to build the ``paragraph_<number>`` tag
    word2topic_dict : dict
        maps a word to the label of its topic

    Return
    ------
    doc_to_topic_list : list of strings
        ``'paragraph_<doc_id_number>'`` followed by the topic label of every
        token found in ``word2topic_dict``, one entry per occurrence and in
        token order; unknown tokens are skipped
    """
    doc_to_topic_list = ['paragraph_' + str(doc_id_number)]
    # Test membership on the dict itself: a hash lookup, where .keys() builds
    # and scans a full list for every token on Python 2
    doc_to_topic_list.extend(word2topic_dict[word]
                             for word in tokenized_text
                             if word in word2topic_dict)
    return doc_to_topic_list

In [None]:
class LabeledLineSentence(object):
    """Stream documents as gensim labelled sentences.

    Every document is tokenized and tagged with its ``paragraph_<position>``
    label plus the topic label of each of its words known to the mapping.

    Parameters
    ----------
    docs_list : iterable of strings
        raw documents; the position in the iteration is the document number
    word2topic_dict : dict
        maps a word to the label of its topic
    """

    def __init__(self, docs_list, word2topic_dict):
        self.labels_list = word2topic_dict
        self.docs_list = docs_list
        # Materialized sentences, built on the first sentences_perm() call
        self._sentences = None

    def __iter__(self):
        for idx, doc in enumerate(self.docs_list):
            words_doc = tokenizer(doc)
            # Use the mapping given to the constructor, not a module global
            tags_doc = map_doc_to_topic(words_doc, idx, self.labels_list)
            yield models.doc2vec.LabeledSentence(words=words_doc,
                                                 tags=tags_doc)

    def sentences_perm(self):
        """Return all the labelled sentences as a list, in a new random order.

        The sentences are built once and cached; each call shuffles the same
        list in place with the ``random`` module and returns it.
        """
        import random

        if self._sentences is None:
            self._sentences = list(self)
        random.shuffle(self._sentences)
        return self._sentences

## 3.2 Training

In [None]:
# Labelled-sentence stream over the filtered documents, tagged through word2topic_dict
it = LabeledLineSentence(df_norm2.text, word2topic_dict)

In [None]:
# alpha and min_alpha start equal, so the learning rate is constant within
# one training pass; it is lowered by hand between passes below
doc2vec_options = dict(
    size=100,
    window=8,
    min_count=2,
    dm=1,
    dbow_words=1,
    workers=50,
    alpha=0.025,
    min_alpha=0.025,
)
model = models.Doc2Vec(**doc2vec_options)
model.build_vocab(it)
for epoch in range(10):
    # Train on a freshly shuffled copy of the corpus
    model.train(it.sentences_perm())
    # Decrease the learning rate, then pin min_alpha to it (no decay)
    model.alpha = model.alpha - 0.002
    model.min_alpha = model.alpha

In [None]:
# Save the model in the current working directory; the file name records the
# number of documents and the number of topics
fname = '%s/topic2vec_ndoc%sn_topic%s.model' % (os.getcwd(), n_docs, n_topics)
model.save(fname)

In [None]:
# Tags known to the trained model (paragraph_* and topic_* labels were the tags fed in)
paragraphs_tag = model.docvecs.doctags
paragraphs_tag

In [None]:
# presumably the array of learned vectors, one per tag - confirm against the gensim version in use
paragraphs_vector = model.docvecs.doctag_syn0

In [None]:
# Tags closest to the document tagged 'paragraph_49'
model.docvecs.most_similar(positive = 'paragraph_49')

In [None]:
# Similarity between two groups of topic tags: {topic_0, topic_2} vs {topic_3, topic_4}
model.docvecs.n_similarity(['topic_0', 'topic_2'], ['topic_3', 'topic_4'])

In [None]:
# Similarity between the vectors of the tags topic_0 and topic_2
model.docvecs.similarity('topic_0', 'topic_2')