# Topic Modeling Data Preparation

Reference:
* [Beginners Guide to Topic Modeling in Python](https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/)

In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')

from pathlib import Path

In [2]:
# Input workbook location (the path is relative, so it resolves against the
# current working directory).
root_folder = Path('../Data/')
file_name = 'TRACE2014_jinming_5000.csv.xlsx'
file_path = root_folder / file_name

# Columns to load from the workbook; every column name is listed exactly once.
field_of_interest = [
    'BOND_SYM_ID',
    'CUSIP_ID',
    'SCRTY_TYPE_CD',
    'ENTRD_VOL_QT',
    'RPTD_PR',
    'RPT_SIDE_CD',
    'TRD_EXCTN_DT',
    'TRD_RPT_DT',
    'Report_Dealer_Index',
    'Contra_Party_Index',
]

# No dtype mapping is passed, so pandas infers the dtype of each column.
data = pd.read_excel(file_path, usecols=field_of_interest)

In [3]:
# Preview the first three rows of the loaded frame.
data.iloc[:3]

Unnamed: 0,BOND_SYM_ID,CUSIP_ID,SCRTY_TYPE_CD,ENTRD_VOL_QT,RPTD_PR,TRD_EXCTN_DT,TRD_RPT_DT,RPT_SIDE_CD,Report_Dealer_Index,Contra_Party_Index
0,TWIX3666829,982526AB1,CORP,25000,102.992,20140716,20140716,B,0,83
1,WAMU3885939,92936PAB6,CORP,214,95.934579,20141223,20141223,S,83,99999
2,WAMU3885939,92936PAB6,CORP,100,92.5,20141205,20141205,S,83,99999


In [4]:
# Group the rows by (reporting dealer, execution date).
# NOTE(review): presumably each such pair plays the role of a "document" in
# the topic model - confirm.
document = data.groupby(
    ['Report_Dealer_Index', 'TRD_EXCTN_DT'],
)

In [5]:
# Group the rows by bond identifier (symbol together with CUSIP).
# NOTE(review): presumably each bond plays the role of a "token" in the
# topic model - confirm.
token = data.groupby(
    ['BOND_SYM_ID', 'CUSIP_ID'],
)

In [6]:
# Group the rows by (reporting dealer, execution date, bond symbol); the
# aggregations in the following cells are all computed on this grouping.
data_gb = data.groupby(
    ['Report_Dealer_Index', 'TRD_EXCTN_DT', 'BOND_SYM_ID'],
)

In [7]:
# Number of rows in each (dealer, execution date, bond) group.
matrix_1 = data_gb.size()
# Show the first five groups.
matrix_1.iloc[:5]

Report_Dealer_Index  TRD_EXCTN_DT  BOND_SYM_ID
0                    20140102      AA.HO          1
                                   BCS.KBJ        1
                                   MT3824014      1
                                   PBR3674838     1
                                   PBR4006643     1
dtype: int64

In [8]:
# Sum of ENTRD_VOL_QT over the rows of each (dealer, execution date, bond) group.
matrix_2 = data_gb['ENTRD_VOL_QT'].sum()
# Show the first five groups.
matrix_2.iloc[:5]

Report_Dealer_Index  TRD_EXCTN_DT  BOND_SYM_ID
0                    20140102      AA.HO            7000
                                   BCS.KBJ        150000
                                   MT3824014       10000
                                   PBR3674838      20000
                                   PBR4006643      30000
Name: ENTRD_VOL_QT, dtype: int64

In [9]:
# Restrict each (dealer, execution date, bond) group to the volume and price
# columns. Note that matrix_3 is still a grouped selection, not an aggregate.
matrix_3 = data_gb[['ENTRD_VOL_QT', 'RPTD_PR']]
# NOTE(review): .prod() multiplies the values *within each column* across the
# rows of a group; it does not compute volume x price per row. Confirm that
# this is the intended statistic.
matrix_3.prod().iloc[:5]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ENTRD_VOL_QT,RPTD_PR
Report_Dealer_Index,TRD_EXCTN_DT,BOND_SYM_ID,Unnamed: 3_level_1,Unnamed: 4_level_1
0,20140102,AA.HO,7000.0,92.533
0,20140102,BCS.KBJ,150000.0,106.44
0,20140102,MT3824014,10000.0,106.75
0,20140102,PBR3674838,20000.0,112.8769
0,20140102,PBR4006643,30000.0,89.16


### Validation

In [10]:
# (rows, columns) of the loaded frame, unpacked straight into the message.
shape = data.shape
print('We have {} rows {} columns'.format(*shape))

We have 5000 rows 10 columns


In [11]:
# duplicated() flags every row that repeats an earlier row in all columns, so
# the sum counts the repeats (first occurrences are not counted).
n_full_duplicates = data.duplicated().sum()
print('{} rows that are entirely the same in the data set'.format(n_full_duplicates))

414 rows that are entirely the same in the data set


In [12]:
print('Number of duplcations based on grouping keys:')

# Column subsets to check: for each one, count the rows whose values in those
# columns repeat an earlier row.
test_duplication = [
    ['BOND_SYM_ID'],
    ['CUSIP_ID'],
    ['BOND_SYM_ID', 'CUSIP_ID'],
    ['Report_Dealer_Index', 'TRD_EXCTN_DT'],
    ['Report_Dealer_Index', 'TRD_EXCTN_DT', 'BOND_SYM_ID'],
]
for key_columns in test_duplication:
    n_repeats = data.duplicated(subset=key_columns).sum()
    print('{} : {}'.format(key_columns, n_repeats))

Number of duplcations based on grouping keys:
['BOND_SYM_ID'] : 4087
['CUSIP_ID'] : 4095
['BOND_SYM_ID', 'CUSIP_ID'] : 4087
['Report_Dealer_Index', 'TRD_EXCTN_DT'] : 4154
['Report_Dealer_Index', 'TRD_EXCTN_DT', 'BOND_SYM_ID'] : 1104


In [13]:
# Show every row recorded for one bond symbol.
data[data['BOND_SYM_ID'] == 'TWC3675093']

Unnamed: 0,BOND_SYM_ID,CUSIP_ID,SCRTY_TYPE_CD,ENTRD_VOL_QT,RPTD_PR,TRD_EXCTN_DT,TRD_RPT_DT,RPT_SIDE_CD,Report_Dealer_Index,Contra_Party_Index
3659,TWC3675093,88732JAS7,CORP,25000,125.45,20140219,20140219,B,0,110
4999,TWC3675093,88732JAS7,CORP,50000,126.169,20140625,20140625,B,0,47
