# Code to check the semantics associated with the topic vectors in each partition of the entire dataset:
1. document classification: each doc is assigned to the topic whose vector is closest to the doc vector (highest cosine similarity) among all topic vectors
2. for each topic, it plots a histogram, over the real categories, of the number of documents assigned to that topic
3. for each partition (window) used, it plots a histogram of the number of docs per category


It saves:
* for each topic and each window: a histogram of the number of docs per category assigned to that topic
* for each window: a histogram of the number of docs per category (all windows are saved together in one figure)

In [None]:
import os
from time import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
%pylab inline

In [None]:
from gensim import corpora, models, similarities
import re

In [None]:
# Names of the 20 Newsgroups categories present in the dataset.
# They are used as the column labels of the per-topic count tables below.
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

In [None]:
# Alphabetically ordered copy of the category names (the original list is left untouched).
cat_sorted = list(categories)
cat_sorted.sort()
# Number of categories; sorting does not change the length.
n_cats = len(cat_sorted)

## INPUT PARAMETERS

In [None]:
# Parameters identifying which saved topic2vec model(s) to load.
# They are only used further down to build the results directory and model file names
# and to size the count tables.

# Number of topics: one 'topic_<i>' tag per topic, also part of the results directory name.
n_topics = 8
# Number of documents, as encoded in the results directory name.
n_docs = 11314
# Number of windows (partitions): one model file is loaded per window.
n_window_t2v = 2
# Seed of the partition, as encoded in the model file names.
random_seed_partition = 55

In [None]:
# Current working directory; the results directory is resolved relative to it.
cwd = os.getcwd()
# Directory that holds the saved models and receives the generated figures.
# Its name encodes the number of topics, documents and windows.
results_dir_path = ''.join([
    cwd,
    '/results/DBOW/20NG_lemmatiz_DBOW_win5_n_topics', str(n_topics),
    '_n_doc', str(n_docs),
    '_n_win', str(n_window_t2v),
])

In [None]:
# Tags of the topic vectors: 'topic_0', 'topic_1', ..., one per topic.
# range() is used instead of the Python-2-only xrange() so the cell runs on
# both Python 2 and Python 3.
topic_list = ['topic_' + str(i) for i in range(n_topics)]

In [None]:
#%matplotlib qt
# Tags matching this pattern name a topic vector, not a document.
reg_topic = r'topic_\d+'
# Leading run of letters, dots, underscores and hyphens of a document tag.
# The category name is this match minus its last character (see below).
reg_doc = '([a-zA-Z._-]+)'

# Histogram of the number of docs per category: one figure, one subplot per window.
# squeeze=False always returns a 2-D array of axes; taking its single row gives a
# 1-D array that can be indexed by window even when n_window_t2v == 1.
fig2, axes2 = plt.subplots(nrows=1, ncols=n_window_t2v, squeeze=False)
axes2 = axes2[0]
fig2.set_size_inches(10, 6)
fig2.suptitle('#Docs per Category', size = 25)

for i_window in range(n_window_t2v):

    # Load the model trained on this window (partition) of the dataset.
    fname = results_dir_path + '/t2v_20NG_partSEED' + str(random_seed_partition) + '_win' + str(i_window) + '.model'
    model = models.Doc2Vec.load(fname)

    # df: rows = topics, columns = categories; number of docs of each category
    #     assigned to each topic.
    # df_count: rows = categories, single column; number of docs per category
    #     in this window.
    df = pd.DataFrame(np.zeros((n_topics, n_cats), dtype=int), index=topic_list, columns=cat_sorted)
    df_count = pd.DataFrame(np.zeros((n_cats, 1), dtype=int), index=cat_sorted)

    for i_doc in model.docvecs.doctags:
        # Topic vectors share the tag space with the documents: skip them so
        # that only real documents are classified and counted.
        if re.match(reg_topic, i_doc):
            continue

        # Assign the document to the topic with the highest similarity
        # (first one wins on ties).
        current_topic = max(topic_list,
                            key=lambda i_topic: model.docvecs.similarity(i_doc, i_topic))

        # Category of the document: the matched tag prefix without its last
        # character (presumably a separator before the doc index -- the tag
        # format is not visible here).
        cat = re.search(reg_doc, i_doc)
        current_doc = cat.group(1)[:-1]

        df.loc[current_topic, current_doc] += 1
        # Counted once per document, inside the document-only branch.
        # Previously this ran for topic tags as well, re-counting the category
        # of the previously seen document.
        df_count.loc[current_doc] += 1

    for i_topic in topic_list:
        # Plot of the number of docs per category assigned to this topic
        fig1 = plt.figure()
        ax = plt.gca()
        fig1.set_size_inches(10, 10)

        fig1.suptitle(i_topic + ': topic semantics', size = 25)
        df.loc[i_topic].plot(kind='bar', ax=ax)

        # keep the frame around the plot visible
        ax.set_frame_on(True)

        # one vertical label per category
        ax.set_xticklabels(cat_sorted, minor=False, rotation='vertical')

        # hide the major tick marks on all four sides (tick labels are kept)
        ax.tick_params(axis='both', which='major',
                       bottom=False, top=False, left=False, right=False)

        fig1.savefig(results_dir_path +'/' + i_topic + '_semantics_partSEED' + str(random_seed_partition) + '_win'+ str(i_window) +'.png')

    # Plot of the number of docs per category in this partition
    ax2 = axes2[i_window]
    df_count.plot(kind='bar', ax = ax2, legend = False)

    ax2.set_title('Window: '+str(i_window))
    # keep the frame around the plot visible
    ax2.set_frame_on(True)

    # hide the major tick marks on all four sides (tick labels are kept)
    ax2.tick_params(axis='both', which='major',
                    bottom=False, top=False, left=False, right=False)

fig2.savefig(results_dir_path +'/n_docs_per_category.png', dpi = 200)