# Topic Modeling Data Preparation

reference:
* [Beginners Guide to Topic Modeling in Python](https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/)


* data format?

In [13]:
import pandas as pd
import numpy as np
import os

from pathlib import Path

from sklearn import preprocessing

#import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
#import tempfile
#TEMP_FOLDER = tempfile.gettempdir()
#print('Folder "{}" will be used to save temporary dictionary and corpus.'.format(TEMP_FOLDER))

import gensim

from manage_path import *

#### To load entire dataset

#### To load only field of interest

In [None]:
def read_data(file_name):
    """Load a TRACE trade CSV and enrich it with Mergent FISD issue/issuer fields.

    Steps performed:
      1. Read only the columns of interest from ``file_name`` (resolved against
         the directory returned by ``get_dataset_directory()``).
      2. Keep only rows whose ``TRC_ST`` equals ``'T'``.
      3. Add ``document_date``, ``document_buy`` and ``document_sell`` string
         columns.
      4. Left-merge ``Mergent_FISD_Bonds_Issues.csv`` on CUSIP, then
         ``Mergent_FISD_Bonds_Issuers.csv`` on ``ISSUER_ID``.

    Args:
        file_name: Name of the trade CSV inside the dataset directory.

    Returns:
        pandas.DataFrame with the trade columns, the three ``document_*``
        columns and the merged issue/issuer columns.
    """
    # Prepare data file path
    dataset_directory = get_dataset_directory()
    file_path = dataset_directory / file_name

    # Only get the field we want
    field_of_interest_datetime = [
        'BOND_SYM_ID', 'CUSIP_ID', 'SCRTY_TYPE_CD', 'ENTRD_VOL_QT', 'RPTD_PR',
        'RPT_SIDE_CD', 'TRD_EXCTN_DT_D', 'EXCTN_TM_D', 'TRD_RPT_DT',
        'TRD_RPT_TM', 'Report_Dealer_Index', 'Contra_Party_Index', 'TRC_ST',
    ]

    data_dtype = {
        'BOND_SYM_ID': str, 'CUSIP_ID': str, 'SCRTY_TYPE_CD': str,
        'ENTRD_VOL_QT': np.float64, 'RPTD_PR': np.float32,
        'RPT_SIDE_CD': str, 'Report_Dealer_Index': str,
        'Contra_Party_Index': str, 'TRC_ST': str,
    }

    # Each key becomes one combined datetime column built from the listed
    # date + time source columns.
    parse_dates = {
        'TRD_RPT_DTTM': ['TRD_RPT_DT', 'TRD_RPT_TM'],
        'TRD_EXCTN_DTTM': ['TRD_EXCTN_DT_D', 'EXCTN_TM_D'],
    }

    # NOTE(review): `infer_datetime_format` and dict-valued `parse_dates` are
    # flagged as deprecated in recent pandas releases -- confirm against the
    # pandas version this notebook is pinned to before upgrading.
    data = pd.read_csv(
        file_path,
        usecols=field_of_interest_datetime,
        dtype=data_dtype,
        parse_dates=parse_dates,
        infer_datetime_format=True,
        converters={'TRD_RPT_TM': lambda x: pd.to_datetime(x, format='%H%M%S')},
    )

    # Data Cleaning => keep only Trade Status that is T.
    # .copy() makes the filtered frame independent of the raw read, so the
    # column assignments below do not write into a slice (SettingWithCopy).
    data = data[data['TRC_ST'] == 'T'].copy()

    # document_date: the date part of TRD_EXCTN_DTTM rendered as a string.
    data['document_date'] = data['TRD_EXCTN_DTTM'].dt.date.apply(str)

    # document_buy / document_sell: "<dealer index>,<document_date>,<B|S>".
    # Built column-wise instead of with DataFrame.apply(axis=1): same strings
    # (str() is still applied to every dealer index), far less per-row
    # overhead, and it also works when the filter leaves zero rows.
    date_part = ',' + data['document_date']
    data['document_buy'] = data['Report_Dealer_Index'].map(str) + date_part + ',B'
    # The sell-side document is keyed on Contra_Party_Index.
    data['document_sell'] = data['Contra_Party_Index'].map(str) + date_part + ',S'

    # Get bond_issues
    bond_issues_path = dataset_directory / 'Mergent_FISD_Bonds_Issues.csv'
    bond_issues_fields = ['ISSUER_ID', 'COMPLETE_CUSIP']
    bond_issues_dtype = {'ISSUER_ID': str, 'COMPLETE_CUSIP': str}
    bond_issues = pd.read_csv(bond_issues_path, usecols=bond_issues_fields,
                              dtype=bond_issues_dtype)

    # Get bond_issuers
    bond_issuer_path = dataset_directory / 'Mergent_FISD_Bonds_Issuers.csv'
    bond_issuer_fields = ['ISSUER_ID', 'AGENT_ID', 'CUSIP_NAME', 'INDUSTRY_GROUP',
                          'INDUSTRY_CODE', 'PARENT_ID', 'NAICS_CODE', 'SIC_CODE']
    bond_issuer_dtype = {'ISSUER_ID': str, 'AGENT_ID': str, 'CUSIP_NAME': str,
                         'INDUSTRY_GROUP': str, 'INDUSTRY_CODE': str,
                         'PARENT_ID': str, 'NAICS_CODE': str, 'SIC_CODE': str}
    bond_issuer = pd.read_csv(bond_issuer_path, usecols=bond_issuer_fields,
                              encoding='cp1252', dtype=bond_issuer_dtype)

    #bond_ratings_path = dataset_directory / 'Mergent_FISD_Bonds_Ratings.csv'
    #bond_ratings = pd.read_csv(bond_ratings_path)

    # Merge data with bond issues using complete cusip
    data = data.merge(bond_issues, left_on='CUSIP_ID', right_on='COMPLETE_CUSIP', how='left')
    # Then, merge data with bond issuers using ISSUER_ID
    data = data.merge(bond_issuer, on='ISSUER_ID', how='left')
    return data

In [None]:
# Build the enriched trade table from the raw CSV through read_data.
data = read_data('TRACE2014_jinming_5000.csv')

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [16]:
def load_data(file_name="TRACE2014_jinming"):
    """Read a pickled table from the project's pickle directory.

    Args:
        file_name: Name of the pickle file inside the directory returned by
            ``get_Pickle_directory()``.

    Returns:
        Whatever object ``pd.read_pickle`` yields for that file.
    """
    print("loading data {}...".format(file_name))

    # Resolve the full location of the pickle before reading it.
    source = get_Pickle_directory() / file_name
    print("Getting data from{}...".format(source))

    # NOTE: unpickling executes arbitrary code; only load trusted files.
    frame = pd.read_pickle(source)
    print("Data getting success!")
    return frame
def data_groupby(file_name='TRACE2014_jinming_5000.pkl'):
    """Load a pickled trade table and group it per document/bond for each side.

    Args:
        file_name: Pickle file name handed to ``load_data``. Defaults to the
            file name that used to be hard-coded here, so existing
            zero-argument calls behave as before.

    Returns:
        Tuple ``(data_gb_sell, data_gb_buy)``: the table grouped by
        ``['document_sell', 'BOND_SYM_ID']`` and by
        ``['document_buy', 'BOND_SYM_ID']`` respectively.
    """
    data = load_data(file_name=file_name)
    data_gb_sell = data.groupby(by=['document_sell', 'BOND_SYM_ID'])
    data_gb_buy = data.groupby(by=['document_buy', 'BOND_SYM_ID'])
    return (data_gb_sell, data_gb_buy)

In [20]:
# Load the pickled table through load_data; this rebinds the global `data`.
data = load_data(file_name='TRACE2014_jinming.pkl')

loading data TRACE2014_jinming.pkl...
Getting data fromC:\Users\raymo\UMD\Research\FINRA_TRACE\Data\Pickle\TRACE2014_jinming.pkl...
Data getting success!


## Prepare Document
In our model, each document represents the activity of one dealer on one day, which we call Dealer_Day activity. The tokens in each document are bonds, identified by BOND_SYM_ID.

In [None]:
# THIS IS TOO SLOW
# Add new column document to concatenate Report_Dealer_Index and TRD_RPT_DTTM
#data['document'] = data.apply(lambda x: str(x['Report_Dealer_Index'])+ ',' +str(x['TRD_RPT_DTTM'])[:10] ,axis=1)

In [None]:
# This is how to get date of datetime series
#str(data['TRD_RPT_DTTM'].dt.date[0])

### Transform BOND_SYM_ID to BOND_SYM_ID_transformed with sklearn labelEncoder
Because both Gensim and SKlearn require each token to be represented by an integer ID, we have to encode the current BOND_SYM_ID values as integer IDs.<br>
Not sure if this is needed for Gensim.

In [None]:
# Validate if the transform was correct
#data['BOND_SYM_ID_inversed_transformed'] = inverse_transform

In [None]:
# make BOND_SYM_ID to dict ---- No need anymore

#BOND_SYM_ID_count_dict = data['BOND_SYM_ID'].value_counts().to_dict()
#keys = BOND_SYM_ID_count_dict.keys()
#np.array(list(keys))

#### Group data by ['document','BOND_SYM_ID'] so that each group represents one token inside one document

In [None]:
# Group trades by (document, bond): each group is one token inside one document.
# NOTE(review): no visible cell creates a 'document' column -- read_data only adds
# 'document_buy' / 'document_sell', and the line that built 'document' is commented
# out. Confirm the loaded table really contains it, otherwise this raises KeyError.
data_gb = data.groupby(by=['document','BOND_SYM_ID'])

In [None]:
# Count matrix: one row per document, one column per BOND_SYM_ID, each cell the
# number of grouped rows (0 where a bond never appears in a document).
# Columns are ordered by sorting the column index.
matrix_1 = (
    data_gb['BOND_SYM_ID']
    .size()
    .unstack(fill_value=0)
    .sort_index(axis=1)
)

matrix_1_shape = matrix_1.shape
n_documents, n_tokens = matrix_1_shape
print('We have {} documents and {} tokens'.format(n_documents, n_tokens))

In [None]:
# Persist the count matrix as a pickle.
# NOTE(review): this path is relative to the current working directory, while
# load_data resolves its folder through get_Pickle_directory() -- confirm both
# point at the same place.
root_folder = Path('../Data/Pickle/')
file_name = 'Matrix_1'
file_path = root_folder.joinpath(file_name)
matrix_1.to_pickle(file_path)

### Matrix 1 => Count of  BOND_SYM_ID in a document

#### Use SKlearn Label Encoder to transform tokens into integer

In [None]:
# Encode every column label of matrix_1 (the tokens) as an integer id, then
# map the ids back so the round trip can be inspected.
le = preprocessing.LabelEncoder()
transform = le.fit_transform(matrix_1.columns)
inverse_transform = le.inverse_transform(transform)

#### Create id2word to map each label-encoded ID to its word, then create the corpus with gensim Dense2Corpus, which takes a NumPy array

In [None]:
# Lookup from encoded token id to the original token label.
id2word = {token_id: token for token_id, token in zip(transform, inverse_transform)}

# documents_columns=False because matrix_1 holds one document per row.
matrix1_corpus = gensim.matutils.Dense2Corpus(matrix_1.values, documents_columns=False)

# Run LDA in Gensim
lda = gensim.models.ldamulticore.LdaMulticore(
    corpus=matrix1_corpus,
    id2word=id2word,
    workers=3,
    num_topics=500,
    chunksize=10000,
    passes=1,
)

In [None]:
# Save matrix_1 dictionary to file
#np.save('matrix_1.npy', matrix_1_dict['count']) 

### Matrix 2 => Sum of ENTRD_VOL_QT of a BOND_SYM_ID in document

In [None]:
# Sum ENTRD_VOL_QT within each (document, BOND_SYM_ID) group; the double
# brackets keep the result a one-column DataFrame rather than a Series.
matrix_2 = data_gb[['ENTRD_VOL_QT']].sum()
# Preview the first rows.
matrix_2.head()

#### Transform Matrix 2 data 
https://scikit-learn.org/stable/modules/preprocessing.html

In [None]:
# Pivot the innermost index level (BOND_SYM_ID) into columns, filling
# document/bond pairs that have no group with 0.
matrix_2 = matrix_2.unstack(fill_value=0)
# Preview the first rows.
matrix_2.head()

### Matrix 3 => Product of ENTRD_VOL_QT and RPTD_PR of a BOND_SYM_ID in document
Create a new column QT_X_PR as the product of ENTRD_VOL_QT and RPTD_PR.<br>
Then group by document and BOND_SYM_ID and sum QT_X_PR to get the total amount of the bond in that document.

In [None]:
# Per-row product of entered volume and reported price, stored as a new column.
data['QT_X_PR'] = data['ENTRD_VOL_QT'] * data['RPTD_PR']

In [None]:
# Sum QT_X_PR within each (document, BOND_SYM_ID) group.
# NOTE(review): data_gb was created before the QT_X_PR column was added to
# `data`; this presumably works because the groupby still references the same
# DataFrame object -- confirm, or rebuild data_gb after adding the column.
matrix_3 = data_gb[['QT_X_PR']].sum()
# Preview the first rows.
matrix_3.head()

In [None]:
# Summary statistics of the QT_X_PR column.
data['QT_X_PR'].describe()

In [None]:
# Pivot the innermost index level (BOND_SYM_ID) into columns, filling
# document/bond pairs that have no group with 0.
matrix_3 = matrix_3.unstack(fill_value=0)
# Preview the first rows.
matrix_3.head()

## Prepare Corpus for Gensim

### Output for SKlearn

## Data Validation

In [None]:
# Report the overall size of the table as (rows, columns).
shape = data.shape
print('We have {} rows {} columns'.format(*shape))

In [None]:
# Count rows that are exact duplicates of an earlier row and express that as a
# share of the table. The row count is taken from `data` right here instead of
# from a `shape` variable set in another cell, which may be stale if `data`
# was reloaded in between.
n_rows = data.shape[0]
n_duplicated = data.duplicated().sum()
# An empty table has no duplicates; avoid computing 0 / 0.
percentage = n_duplicated / n_rows * 100 if n_rows else 0.0
print('{} rows that are entirely the same in the data set which is {:.2f}%'.format(n_duplicated, percentage))

In [None]:
# Column subsets to probe: for each one, count the rows that repeat an earlier
# row on exactly those columns (the first occurrence is not counted).
test_duplication = [
    ['BOND_SYM_ID'],
    ['CUSIP_ID'],
    ['BOND_SYM_ID', 'CUSIP_ID'],
    ['Report_Dealer_Index', 'TRD_EXCTN_DTTM'],
    ['Report_Dealer_Index', 'TRD_EXCTN_DTTM', 'BOND_SYM_ID'],
]

print('Number of duplcations based on grouping keys:')
for test in test_duplication:
    n_repeats = data.duplicated(subset=test, keep='first').sum()
    print('{} : {}'.format(test, n_repeats))

In [None]:
# Show every row that shares dealer, execution timestamp and bond with at
# least one other row (keep=False marks all members of each repeated group),
# ordered by bond symbol.
dealer_time_bond = ['Report_Dealer_Index', 'TRD_EXCTN_DTTM', 'BOND_SYM_ID']
is_repeated = data.duplicated(keep=False, subset=dealer_time_bond)
data.loc[is_repeated].sort_values(by=['BOND_SYM_ID'])