# Topic Model Analysis

Todo: 1. Industry: CUSIP to NAICS mapping 2. Topic evolution

In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

from compute_topic import *
from manage_path import *
from topic_model_analysis import *

import plotly
plotly.offline.init_notebook_mode(connected=True) 
import plotly.graph_objs as go

## Topic Evolution
1. Dc_v1: 1 document for buy and sell, per dealer, per day.
2. Dc_v2: 1 document for buy and 1 document for sell, per dealer, per day.
3. Dc_v3: Either Dc_v1 or Dc_v2 without the 2 (4) documents representing the Source_seller and Source_buyer.
4. Tc_v1: 1 document for (buyer,seller,year)

In [None]:
# Load the Dc_v1 / 250-topic result table; the first CSV column (the
# document labels) becomes the index.
result_directory = get_result_directory()
topic = pd.read_csv(
    result_directory / 'Dc_v1_250topics.csv',
    index_col=0,
)

In [None]:
def get_document_item(document,position):
    """Return one comma-separated field of a document label.

    ``document`` is converted to ``str`` and split on ``','``; the field at
    index ``position`` is returned as-is (no whitespace stripping).  An
    out-of-range ``position`` raises ``IndexError``.
    """
    fields = str(document).split(',')
    return fields[position]

In [None]:
# Element-wise wrapper so get_document_item can be applied to a whole
# index/array in one call.
get_document_item_vectorize = np.vectorize(get_document_item)

In [None]:
# Store field 0 of every document label (the current index) as 'dealer'.
# NOTE(review): presumably labels look like "<dealer>,<date>" — confirm
# against the code that builds the corpus.
topic['dealer'] = get_document_item_vectorize(topic.index,0)

In [None]:
# Replace the label index with field 1 of each label parsed as a datetime.
# The original labels are discarded here, so anything that still needs them
# (e.g. extracting the dealer field) has to run before this line.
topic.index = pd.to_datetime(get_document_item_vectorize(topic.index,1))

In [None]:
# Preview the first rows of the topic table.
topic.head()

In [None]:
# For each dealer, count the non-null entries in every other column.
count_matrix = topic.groupby(["dealer"]).count()

In [None]:
# Draw the per-dealer counts as a heatmap (one heatmap row per row of
# count_matrix) and render it inline.
# NOTE: the name `data` is rebound to the TRACE DataFrame later in this
# notebook, so this list is only valid until that cell runs.
data = [go.Heatmap( z=count_matrix.values.tolist(), colorscale='Viridis')]
plotly.offline.iplot(data, filename='pandas-heatmap')

In [None]:
# Summary statistics of the topic table.
topic.describe()

In [None]:
# Pairwise correlation between the numeric columns of the topic table.
# `topic` also carries the string 'dealer' column; pandas >= 2.0 raises on
# non-numeric columns instead of silently dropping them, so the numeric
# (and boolean) columns are selected explicitly before correlating.
topic.select_dtypes(include=['number', 'bool']).corr()

## Topic Terms Distribution
Below is a demo of the topic-term distribution. Since we already have the Document X Topics matrix, we also want a Topic X Terms matrix.

In [None]:
# Load a saved model via the project helper; the arguments are presumably
# the corpus name and the number of topics.
# NOTE(review): the topic table analysed earlier in this notebook comes from
# Dc_v1, while this loads Tc_v1 — confirm the mismatch is intended.
model = load_model('Tc_v1',250)

In [None]:
# Wrap the model's topic matrix in a DataFrame.
# NOTE(review): presumably one row per topic and one column per vocabulary
# term — confirm against the model's get_topics() documentation.
topic_terms_distribution = pd.DataFrame(model.get_topics())

In [None]:
# Preview the first rows of the topic-term table.
topic_terms_distribution.head()

## Industry Analysis

In [None]:
# Load the TRACE 2014 data set from the project's pickle directory.
# NOTE(security): unpickling can execute arbitrary code; only load pickle
# files that come from a trusted source.
data_path = get_pickle_directory() / 'TRACE2014_jinming.pkl'
data = pd.read_pickle(data_path)

In [None]:
# List the columns available in the TRACE data.
data.columns

In [None]:
# Report how many rows the TRACE data contains.
print("We have {} rows of data".format(len(data)))

In [None]:
def fix_NAICS_Code(NAICS_Code):
    """Restore the trailing zero that FISD omits from NAICS codes ending in 0.

    A code whose string form has exactly five characters is treated as a
    six-digit code that lost its final ``0`` and is returned as a
    six-character string with the zero appended.  Every other value is
    returned unchanged.

    Args:
        NAICS_Code: The code as read from the data.  Usually a string;
            integer codes are accepted as well.

    Returns:
        The repaired six-character string for five-character codes,
        otherwise the input value itself.
    """
    code_text = str(NAICS_Code)
    if len(code_text) != 5:
        return NAICS_Code
    if not isinstance(NAICS_Code, str) and not code_text.isdigit():
        # A non-string whose text merely happens to be five characters long
        # (e.g. the float 221.0) is not a truncated code; leave it alone.
        return NAICS_Code
    # Append to the string form.  Appending to the raw value raised
    # TypeError whenever the code was not already a str.
    return code_text + '0'
# Apply fix_NAICS_Code element-wise to the whole NAICS_CODE column and write
# the result back in place.
fix_NAICS_Code_vectorize = np.vectorize(fix_NAICS_Code)
data['NAICS_CODE'] = fix_NAICS_Code_vectorize(data['NAICS_CODE'].values)

In [None]:
# Keep only the bond identifier and the industry-classification columns.
# The copy detaches the result from the full table.
data = data.loc[:, [
    'BOND_SYM_ID',
    'INDUSTRY_GROUP',
    'INDUSTRY_CODE',
    'PARENT_ID',
    'NAICS_CODE',
    'SIC_CODE',
]].copy()

In [None]:
# Pie chart of the number of rows per INDUSTRY_CODE value.
data['INDUSTRY_CODE'].value_counts().plot.pie()

In [None]:
# Get the NAICS_code lookup table
NAICS_code_path = get_dataset_directory() / 'NAICS_Code_Lookup.csv'
NAICS_code = pd.read_csv(NAICS_code_path,dtype={'Code':str,'Description':str})

In [None]:
# Group by BOND_SYM_ID and NAICS_CODE of 
NAICS_CODE_count = data['NAICS_CODE'].value_counts()
# Series to DataFrame
NAICS_CODE_count = NAICS_CODE_count.to_frame(name='count')

In [None]:
# Attach the row counts to the lookup table (codes without a match on either
# side drop out of the default inner join), most frequent code first.
NAICS_code_total = NAICS_code.merge(NAICS_CODE_count, left_on='Code', right_index=True)
NAICS_code_total = NAICS_code_total.sort_values(by="count", ascending=False)
# 'percentage' is stored as a fraction of all rows (0-1), not scaled by 100.
NAICS_code_total['percentage'] = NAICS_code_total['count'] / data.shape[0]

In [None]:
# Preview the most frequent NAICS codes.
NAICS_code_total.head()

In [None]:
# Fraction of all rows whose NAICS code is found in the lookup table: codes
# missing from the lookup drop out of the (default inner) merge, so a value
# below 1 means some rows have an unmatched code.
(NAICS_code.merge(NAICS_CODE_count,left_on='Code',right_index=True).sort_values(by="count",ascending=False)['count']/data.shape[0]).sum()

In [None]:
# Number of distinct bonds per NAICS code; show the five largest.
data.groupby(by=['NAICS_CODE'])['BOND_SYM_ID'].nunique().sort_values(ascending=False).head()

In [None]:
# NOTE(review): this builds a GroupBy object without aggregating it, so the
# cell only displays the object's repr — looks like leftover exploration.
data.groupby(by=['BOND_SYM_ID'])

In [None]:
# One row per bond: keep the first occurrence of each BOND_SYM_ID.
Bond_X_Industry = data.drop_duplicates(['BOND_SYM_ID'])

In [None]:
# Preview the bond-to-industry table ordered by bond identifier.
Bond_X_Industry.sort_values(by=['BOND_SYM_ID']).head()

## Convergence Test

In [None]:
import re
import matplotlib.pyplot as plt
def convergence_likelyhood(model_name,num_topics):
    """Plot the per-word log likelihood recorded in a model training log.

    Reads ``../LDAModel/{model_name}_{num_topics}topics.log.txt``, takes the
    first "<value> per-word ... <value> perplexity" match on each line and
    saves a line plot of the per-word values to
    ``{model_name}_{num_topics}topics.pdf`` in the current working directory.

    Args:
        model_name: Prefix used in both the log and the output file name.
        num_topics: Topic count used in both the log and the output file name.

    Returns:
        None.  The plot is written to disk and the figure is closed.

    Raises:
        OSError: If the log file cannot be opened (e.g. FileNotFoundError).
    """
    # Raw string: "\d" in a normal string literal is an invalid escape.
    pattern = re.compile(r"(-*\d+\.\d+) per-word .* (\d+\.\d+) perplexity")
    log_path = '../LDAModel/{}_{}topics.log.txt'.format(model_name,num_topics)
    # The context manager closes the log file once every line has been read.
    with open(log_path) as log_file:
        found = (pattern.search(line) for line in log_file)
        liklihood = [float(m.group(1)) for m in found if m is not None]
    # The x axis places consecutive log entries 10 iterations apart.
    # NOTE(review): presumably matches the evaluation interval used during
    # training — confirm.
    iterations = list(range(0,len(liklihood)*10,10))
    try:
        plt.plot(iterations,liklihood,c="black")
        plt.ylabel("log liklihood")
        plt.xlabel("iteration")
        plt.title("Topic Model Convergence")
        plt.grid()
        plt.savefig("{}_{}topics.pdf".format(model_name,num_topics))
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close()

In [None]:
#convergence_likelyhood("matrix_1",250)