# Demo of LDAPathwayPrediction modeling pipeline

## Load all required modules

In [1]:
from Model import Model
from Validation import Validation
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('mode.chained_assignment', None)
import numpy as np

## Load data files 

In [2]:
# Read the `df_pkl` attribute of a fresh Model instance, then split a copy
# of it into the train/test pieces used by the later cells.
data_df = Model().df_pkl
split = Model().get_train_and_test(data_df.copy())
train, test, train_df, test_df = split

## Build the dictionary and corpus from the training set and fit the LDA model

In [None]:
# Derive the dictionary/corpus pair from the training split, then build the
# LDA model from it with a pinned random_state.
dictionary, corpus = Model().get_dict_corpus(train)
model = Model().MyLDA(
    corpus,
    dictionary,
    num_topics=100,
    random_state=250,
    passes=100,
)

## Get similarity between topics

In [None]:
# NOTE(review): topics=150 here, while the model is built with
# num_topics=100 — confirm that the two values are meant to differ.
distance_list = Validation().get_topic_similarity(
    topics=150,
    model=model,
    distance='jaccard',
)

In [None]:
# The higher the value, the more dissimilar the topics are.
mean_distance = np.mean(distance_list)
std_distance = np.std(distance_list)
print("Mean distance is {} and standard deviation is {}".format(mean_distance, std_distance))

## Test the model on the test set

In [None]:
# Take the model exposed by a fresh Validation instance.
validation = Validation()
Model_load = validation.model


In [None]:
# Run the model exposed by Validation against the test frame.
# Alternative way to obtain test_df, kept for reference:
#   test_df = Validation().convert_list_to_df(test)
Model_load = Validation().model
output = Validation().solution_for_one_df(test_df, dictionary, Model_load)

In [None]:
# Report the testing results held in `output`.
reporter = Validation()
reporter.print_testing_results(output)

## Run n-fold cross-validation to check the test metrics

In [3]:
# Run the n-fold validation with 10 folds; `output` is consumed by the
# reporting and plotting cells that follow.
n_folds = 10
output = Validation().nfold_cv(n=n_folds)

3281
test pathway:Pantothenate and CoA biosynthesis
Tp,Fp,Fn,Tn:(14, 104, 19, 3144); true length:33
3281
test pathway:Degradation of aromatic compounds
Tp,Fp,Fn,Tn:(91, 605, 26, 2559); true length:117
3281
test pathway:Ether lipid metabolism
Tp,Fp,Fn,Tn:(10, 113, 14, 3144); true length:24
3281
test pathway:Microbial metabolism in diverse environments
Tp,Fp,Fn,Tn:(679, 912, 59, 1645); true length:738
3281
test pathway:Polycyclic aromatic hydrocarbon degradation
Tp,Fp,Fn,Tn:(5, 220, 17, 3039); true length:22
3281
test pathway:Naphthalene degradation
Tp,Fp,Fn,Tn:(2, 55, 7, 3217); true length:9
3281
test pathway:Xylene degradation
Tp,Fp,Fn,Tn:(14, 417, 8, 2842); true length:22
3281
test pathway:Pyruvate metabolism
Tp,Fp,Fn,Tn:(49, 707, 27, 2512); true length:76
3281
test pathway:2-Oxocarboxylic acid metabolism
Tp,Fp,Fn,Tn:(50, 264, 5, 2997); true length:55
3281
test pathway:Penicillin and cephalosporin biosynthesis
Tp,Fp,Fn,Tn:(14, 182, 0, 3085); true length:14
3281
test pathway:Biosynthes

3281
test pathway:Carotenoid biosynthesis
Tp,Fp,Fn,Tn:(3, 29, 40, 3209); true length:43
3281
test pathway:Atrazine degradation
Tp,Fp,Fn,Tn:(3, 46, 8, 3224); true length:11
3281
test pathway:Biosynthesis of antibiotics
Tp,Fp,Fn,Tn:(469, 762, 83, 1981); true length:552
3281
test pathway:Pentose and glucuronate interconversions
Tp,Fp,Fn,Tn:(29, 372, 43, 2837); true length:72
3281
test pathway:Fluorobenzoate degradation
Tp,Fp,Fn,Tn:(10, 205, 4, 3062); true length:14
3281
test pathway:Degradation of aromatic compounds
Tp,Fp,Fn,Tn:(109, 424, 8, 2740); true length:117
3281
test pathway:Tetracycline biosynthesis
Tp,Fp,Fn,Tn:(9, 153, 0, 3119); true length:9
3281
test pathway:Prodigiosin biosynthesis
Tp,Fp,Fn,Tn:(5, 175, 0, 3101); true length:5
3281
test pathway:Purine metabolism
Tp,Fp,Fn,Tn:(52, 930, 67, 2291); true length:119
3281
test pathway:Dioxin degradation
Tp,Fp,Fn,Tn:(12, 105, 1, 3163); true length:13
3281
test pathway:Biosynthesis of 12-, 14- and 16-membered macrolides
Tp,Fp,Fn,Tn:(17,

In [4]:
# Report the summary statistics for the n-fold validation `output`.
reporter = Validation()
reporter.print_testing_results(output)

Recall mean: 0.5986208878377992 and median 0.5877789277048289
Precision mean: 0.08097292968202147 and median 0.07830746658099694
Accuracy mean: 0.895586522768524 and median 0.8966981666349267
F1 mean: 0.1287170619321387 and median 0.12764024871924481
specificity mean: 0.8947529229853055 and median 0.8995067391884839


In [None]:
# Plot the distribution of output[0], labelled as the recall values.
plotter = Validation()
plotter.plot_a_distribution(
    output[0],
    xlabel='Recall',
    xlim_tuple=(0, 1.1),
    ylabel='Density',
    title='Recall values',
)
