In [1]:
# Import Library
import pandas as pd
import numpy as np
import os
from pathlib import Path
from sklearn import preprocessing
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim.test.utils import datapath
# Import local files
import get_data
from manage_path import get_current_directory,create_directory

In [2]:
def data_groupby():
    """Load the data set and group it by document and bond symbol.

    Returns:
        The result of calling ``groupby`` on the object returned by
        ``get_data.load_data()``, keyed on ``'document'`` and
        ``'BOND_SYM_ID'`` (presumably a pandas DataFrameGroupBy -- the
        loader lives in another module).
    """
    grouping_keys = ['document', 'BOND_SYM_ID']
    return get_data.load_data().groupby(by=grouping_keys)

def compute_matrix1():
    """Build matrix_1: the number of rows per (document, BOND_SYM_ID) pair.

    Returns:
        A wide table with one row per ``document`` and one column per
        ``BOND_SYM_ID`` (columns sorted); pairs that never occur are 0.
    """
    # Load/group first so the loader's own messages precede ours.
    grouped = data_groupby()
    print("computing matrix_1 ......")
    pair_counts = grouped['BOND_SYM_ID'].size()
    # Pivot the inner index level (BOND_SYM_ID) into columns.
    wide_counts = pair_counts.unstack(fill_value=0)
    matrix_1 = wide_counts.sort_index(axis=1)
    print("computing matrix_1 done!")
    return matrix_1

def compute_matrix2():
    """Build matrix_2: summed ``ENTRD_VOL_QT`` per (document, BOND_SYM_ID).

    Returns:
        A wide table with one row per ``document`` and one column per
        ``BOND_SYM_ID`` (columns sorted); pairs that never occur are 0.
    """
    # Load/group first so the loader's own messages precede ours.
    grouped = data_groupby()
    print("computing matrix_2 ......")
    volume_totals = grouped['ENTRD_VOL_QT'].sum()
    # Pivot the inner index level (BOND_SYM_ID) into columns.
    wide_volumes = volume_totals.unstack(fill_value=0)
    matrix_2 = wide_volumes.sort_index(axis=1)
    print("computing matrix_2 done!")
    return matrix_2

def compute_matrix3():
    """Build matrix_3: summed ``ENTRD_VOL_QT * RPTF_PR`` per (document, BOND_SYM_ID).

    The per-row product ("cap") has to be computed on the ungrouped data:
    a GroupBy object does not support column assignment, so the data is
    loaded directly here instead of going through ``data_groupby()``.

    Returns:
        A wide table with one row per ``document`` and one column per
        ``BOND_SYM_ID`` (columns sorted); pairs that never occur are 0.
    """
    data = get_data.load_data()
    print("computing matrix_3 ......")
    # assign() returns a new frame, so the loaded data is left untouched.
    # NOTE(review): 'cap' is volume x reported price as written in the
    # original expression -- confirm the price units match the intent.
    data_with_cap = data.assign(cap=data['ENTRD_VOL_QT'] * data['RPTF_PR'])
    data_gb = data_with_cap.groupby(by=['document','BOND_SYM_ID'])
    matrix_3 = data_gb['cap'].sum().unstack(fill_value=0)
    matrix_3 = matrix_3.sort_index(axis=1)
    print("computing matrix_3 done!")
    return matrix_3

def compute_corpus(matrix):
    """Wrap a document-by-term table as a gensim streaming corpus.

    Args:
        matrix: Object exposing ``.values`` (e.g. a pandas DataFrame) whose
            rows are documents; ``documents_columns=False`` tells gensim
            that documents are NOT stored column-wise.

    Returns:
        A ``gensim.matutils.Dense2Corpus`` over ``matrix.values``.
    """
    dense_values = matrix.values
    return gensim.matutils.Dense2Corpus(dense_values, documents_columns=False)

def save_corpus(corpus,file_name):
    """Serialize ``corpus`` to ``<cwd>/../Data/Corpus/<file_name>.mm``.

    Args:
        corpus: Corpus accepted by ``gensim.corpora.MmCorpus.serialize``.
        file_name: Base name of the output file, without the ``.mm`` suffix.

    Raises:
        OSError: If the target directory cannot be created.
    """
    corpus_save_path = Path(os.getcwd()).parent / "./Data/Corpus/"
    already_present = corpus_save_path.is_dir()
    try:
        # parents/exist_ok: a missing "Data" folder or a directory left by a
        # previous run is not an error.
        corpus_save_path.mkdir(parents=True, exist_ok=True)
    except OSError:
        print ("Creation of the directory %s failed" % corpus_save_path)
        # Nothing can be written without the directory, so surface the
        # real cause here instead of failing later inside serialize().
        raise
    else:
        if not already_present:
            print ("Successfully created the directory %s " % corpus_save_path)
    file_name = corpus_save_path / "{}.mm".format(file_name)
    gensim.corpora.MmCorpus.serialize(str(file_name), corpus)
    
def load_corpus(file_name):
    """Open the Matrix Market corpus ``<cwd>/../Data/Corpus/<file_name>.mm``.

    Args:
        file_name: Base name of the corpus file, without the ``.mm`` suffix.

    Returns:
        The ``gensim.corpora.MmCorpus`` opened on that file.
    """
    print("loading corpus...")
    corpus_dir = Path(os.getcwd()).parent / "./Data/Corpus/"
    mm_file = str(corpus_dir / "{}.mm".format(file_name))
    loaded = gensim.corpora.MmCorpus(mm_file)
    print("corpus successfully loaded!!")
    print(loaded)
    return loaded

def compute_id2word(matrix,matrix_name,save=True):
    """Build the integer-id -> column-label mapping for ``matrix``.

    The ids are those assigned by a ``LabelEncoder`` fitted on
    ``matrix.columns``.

    Args:
        matrix: Object exposing ``.columns`` (e.g. a pandas DataFrame).
        matrix_name: Base name of the output file (``<matrix_name>.npy``);
            only used when ``save`` is true.
        save: When true, write the mapping under
            ``<current dir>/../Data/id2word/`` and return ``None``.
            When false, return the mapping without touching the disk.

    Returns:
        The ``dict`` mapping when ``save`` is false, otherwise ``None``.
    """
    le = preprocessing.LabelEncoder()
    le.fit(matrix.columns)
    transform = le.transform(matrix.columns)
    inverse_transform = le.inverse_transform(transform)
    id2word = dict(zip(transform, inverse_transform))
    if not save:
        return id2word
    print("saving id2word ...")
    current_path = get_current_directory()
    id2word_save_path = current_path.parent / "./Data/id2word/"
    create_directory(id2word_save_path)
    # The file is named after matrix_name so load_id2word(<same name>) can
    # find it again.
    file_name = id2word_save_path / "{}.npy".format(matrix_name)
    # save the id2word using numpy (stored as a pickled 0-d object array)
    np.save(file_name, id2word)
    print("id2word saved!!")

def load_id2word(id2word_name):
    """Load the id -> label mapping stored as ``<id2word_name>.npy``.

    The file is looked up under ``<current dir>/../Data/id2word/``.

    Args:
        id2word_name: Base name of the file, without the ``.npy`` suffix.

    Returns:
        The object stored in the file, unwrapped with ``.item()`` (a
        ``dict`` when the file was written by ``np.save`` on a dict).
    """
    print("loading id2word ...")
    current_path = get_current_directory()
    id2word_save_path = current_path.parent / "./Data/id2word/"
    id2word_save_path = id2word_save_path / "{}.npy".format(id2word_name)
    # load the id2word using numpy.
    # A dict is stored as a pickled object array, so allow_pickle=True is
    # required. SECURITY: unpickling can execute arbitrary code -- only load
    # files produced by this project.
    id2word = np.load(str(id2word_save_path), allow_pickle=True).item()
    print("id2word loaded!!")
    return id2word

def compute_topic(corpus_name,corpus,num_topics,id2word,workers=3,chunksize=10000,passes=20,iterations=50):
    """Train an LdaMulticore model and save it to disk.

    The model is written to
    ``<current dir>/../LDAModel/<corpus_name>_<num_topics>topics/<corpus_name>_<num_topics>topics``.

    Args:
        corpus_name: Label used to build the model (and directory) name.
        corpus: Corpus passed to ``LdaMulticore``.
        num_topics: Number of topics to fit.
        id2word: Mapping from term id to term, passed to ``LdaMulticore``.
        workers: Passed through to ``LdaMulticore``.
        chunksize: Passed through to ``LdaMulticore``.
        passes: Passed through to ``LdaMulticore``.
        iterations: Passed through to ``LdaMulticore``.
    """
    print("LdaMulticore Start!!")
    lda = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,id2word=id2word,workers=workers, num_topics=num_topics, chunksize=chunksize, passes=passes,iterations=iterations)
    print("LdaMulticore Done!!")

    model_name = "{}_{}topics".format(corpus_name,num_topics)
    print("Saving Model as "+model_name)

    current_path = get_current_directory()
    save_path = current_path.parent / ("./LDAModel/{}/".format(model_name))
    # create directory
    create_directory(save_path)

    # Use the project path directly. gensim.test.utils.datapath() resolves
    # names against gensim's bundled test-data folder and only left this
    # path unchanged because it happened to be absolute.
    save_path = str(save_path / model_name)

    lda.save(save_path)
    print("Model successfully save at " + save_path)

def main():
    """Prompt for a corpus name, topic count and worker count, then train.

    The short spellings ``matrix1`` / ``matrix2`` / ``matrix3`` are accepted
    as aliases for the on-disk names ``matrix_1`` / ``matrix_2`` /
    ``matrix_3``. Any other name is used as typed, so the corpus and
    id2word that get loaded always correspond to what the user asked for.
    """
    corpus_name = str(input("Please enter corpus_name: "))
    num_topics = int(input("Please enter num_topics: "))
    workers = int(input("Please enter number of workers: "))

    # Map the short spelling onto the name the files were saved under.
    aliases = {'matrix1': 'matrix_1', 'matrix2': 'matrix_2', 'matrix3': 'matrix_3'}
    stored_name = aliases.get(corpus_name, corpus_name)
    corpus = load_corpus(stored_name)
    id2word = load_id2word(stored_name)

    # The model keeps the name exactly as entered by the user.
    compute_topic(corpus_name,corpus,num_topics,id2word,workers=workers)

In [None]:
#matrix1 = compute_matrix1()
#compute_id2word(matrix=matrix1,matrix_name="matrix_1")

In [None]:
# Build matrix_1 in this cell so it runs on a fresh kernel, then save its
# id2word mapping. compute_id2word takes the name via `matrix_name`.
matrix1 = compute_matrix1()
compute_id2word(matrix=matrix1, matrix_name="matrix_1")

In [None]:
# Read back the id2word mapping stored under the name "matrix_1".
id2word = load_id2word("matrix_1")

In [None]:
# Open the serialized corpus stored under the name "matrix_1".
corpus = load_corpus("matrix_1")

In [None]:
# Resolve the LDAModel folder that sits next to the current directory.
current_path = get_current_directory()
save_path = current_path.parent / ("./LDAModel/")

In [None]:
# Display the resolved path as a sanity check before creating it.
save_path

In [None]:
# Create the LDAModel folder via the project helper.
create_directory(save_path)

In [3]:
# Build the summed-ENTRD_VOL_QT matrix (loads the full data set first).
matrix_2 = compute_matrix2()

loading data TRACE2014_jinming...
Getting data fromC:\Users\raymo\UMD\Research\FINRA_TRACE\Data\Pickle\TRACE2014_jinming...
Data getting success!
computing matrix_2 ......
computing matrix_2 done!


In [4]:
# Display matrix_2: one row per document, one column per BOND_SYM_ID.
matrix_2

BOND_SYM_ID,A.GC,A.GE,A.GF,A3900782,A4020252,A4172095,A4172097,AA.AA,AA.GI,AA.GL,...,ZION4066869,ZLIOF3944284,ZMH.AA,ZMH.AB,ZMH.AC,ZMH.AD,ZQK4029107,ZQK4062844,ZUAN.GA,ZUAN.GB
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"0,2014-01-02",0.0,0.0,0.0,420000.0,0.0,0.0,0.0,640000.0,0.0,0.0,...,10000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"0,2014-01-03",0.0,0.0,2070000.0,0.0,0.0,0.0,0.0,259000.0,0.0,0.0,...,25000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"0,2014-01-06",0.0,0.0,2500000.0,0.0,0.0,0.0,0.0,282000.0,0.0,0.0,...,325000.0,0.0,0.0,0.0,2080000.0,0.0,0.0,500000.0,0.0,0.0
"0,2014-01-07",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2824000.0,0.0,0.0,...,375000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"0,2014-01-08",0.0,0.0,0.0,0.0,432000.0,0.0,0.0,757000.0,0.0,0.0,...,103000.0,0.0,0.0,0.0,60000.0,0.0,0.0,0.0,0.0,0.0
"0,2014-01-09",35000.0,0.0,0.0,0.0,0.0,0.0,0.0,2779000.0,20000.0,0.0,...,65000.0,0.0,0.0,0.0,600000.0,0.0,5000.0,0.0,0.0,0.0
"0,2014-01-10",0.0,0.0,0.0,0.0,0.0,0.0,0.0,639000.0,76000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"0,2014-01-13",85000.0,150000.0,0.0,1000000.0,10000.0,0.0,0.0,760000.0,0.0,5000.0,...,0.0,0.0,0.0,0.0,4000.0,0.0,500000.0,5000.0,0.0,0.0
"0,2014-01-14",0.0,2350000.0,0.0,2000000.0,0.0,0.0,0.0,271000.0,20000.0,25000.0,...,50000.0,0.0,0.0,0.0,0.0,0.0,1025000.0,0.0,0.0,0.0
"0,2014-01-15",50000.0,0.0,0.0,0.0,0.0,0.0,0.0,3797000.0,0.0,0.0,...,25000.0,0.0,1325000.0,0.0,0.0,11000000.0,0.0,0.0,0.0,0.0
