In [None]:
#######################################################################################
# Author: Srijan Verma                                                              #
# School of Pharmacy                                                                #
# Sirimulla Research Group [http://www.sirimullaresearchgroup.com/]                 #
# The University of Texas at El Paso, TX, USA                                       #
# Last modified: 19/12/2019                                                         #
# Copyright (c) 2019 Srijan Verma and Sirimulla Research Group, under MIT license   #
#######################################################################################

In [1]:
from glob import glob
import pandas as pd
import os
from tqdm import tqdm_notebook

In [4]:
# Build one `generate_fp_v2.py` invocation per (CSV file, fingerprint type) pair
# and save them, one per line, to ../shell_scripts/shell_fp.sh.
# glob() returns matches in arbitrary order; sorting keeps the generated script
# identical from one run to the next.
csv_files = sorted(glob('../dataset/cyt_homo_data/filtered/relation_D/*.csv'))
finger_prints = ['morgan_fp', 'maccs_fp', 'avalon_fp', 'rdk_fp',
                 'topological_fp', 'ecfp2_fp', 'ecfp6_fp']
commands = []


for _file in tqdm_notebook(csv_files):
    for _fin in finger_prints:
        # Arguments: fingerprint name, input CSV path, then the numpy_files
        # directory (presumably where the script writes its output - confirm
        # against generate_fp_v2.py).
        commands.append('python3 generate_fp_v2.py ' + _fin +
                        ' ' + _file + ' ../dataset/cyt_homo_data/numpy_files'
                        )

with open('../shell_scripts/shell_fp.sh', 'w') as f:
    for comm in commands:
        f.write(comm + '\n')

HBox(children=(IntProgress(value=0, max=47), HTML(value='')))




In [5]:
# Load the filtered relation_D table for Cytochrome_P450_26A1.
# NOTE(review): in the visible notebook `df` is not read again before it is
# reassigned further down - this looks like leftover exploration; confirm
# before relying on it.
df = pd.read_csv('../dataset/cyt_homo_data/filtered/relation_D/Cytochrome_P450_26A1.csv')

In [71]:
# Load the fingerprint statistics table; the first CSV column becomes the index.
# The cell below reads its 'Size' and 'file_name' columns.
df_stats_fp = pd.read_csv('../dataset/cyt_homo_data/fp_stats/fp_stats_223_files.csv',index_col=0)

In [72]:
# Split the rows of df_stats_fp by their 'Size' value:
#   drop_index - positional row numbers whose Size is 28 or less
#   arr_names  - 'file_name' of every other row
# Both columns are pulled out as plain lists first so that rows are addressed
# strictly by position. The frame is loaded with index_col=0, so indexing the
# Series with the loop counter would look up index *labels*, which only works
# when the labels happen to be 0..n-1.
sizes = df_stats_fp['Size'].tolist()
file_names = df_stats_fp['file_name'].tolist()

arr_names = []
drop_index = []
for i in tqdm_notebook(range(len(df_stats_fp))):
    if sizes[i] <= 28:
        drop_index.append(i)
    else:
        arr_names.append(file_names[i])

HBox(children=(IntProgress(value=0, max=223), HTML(value='')))




In [3]:
# Names of the candidate models; each one is passed as the first argument of a
# generated `model_dev_v6.py` command further down.
model_list = [
    'RandomForestClassifier',
    'DecisionTreeClassifier',
    'AdaBoostClassifier',
    'MLPClassifier',
    'GradientBoostingClassifier',
    'LinearDiscriminantAnalysis',
    'QuadraticDiscriminantAnalysis',
    'LogisticRegression',
    'KNeighborsClassifier',
    'MultinomialNB',
    'XGBClassifier',
    'DummyClassifier',
    'GaussianNB',
    'SVC',
    'BaggingClassifier',
    'BernoulliNB',
    'ComplementNB',
    'ExtraTreesClassifier',
    'GaussianProcessClassifier',
    'HistGradientBoostingClassifier',
    'ExtraTreeClassifier',
    'LinearSVC',
    'NearestCentroid',
    'OneVsOneClassifier',
    'OneVsRestClassifier',
    'OutputCodeClassifier',
    'PassiveAggressiveClassifier',
    'Perceptron',
    'RidgeClassifier',
    'SGDClassifier',
    'BayesianGaussianMixture',
    'GaussianMixture',
]

In [5]:
# Save the model names as a single-column table ('Model_Name'); the default
# integer index is written out as well.
# NOTE(review): the destination is an absolute path inside one user's home
# directory, so this cell will fail on any other machine - consider a
# project-relative location.
df = pd.DataFrame({'Model_Name': model_list})
df.to_csv(
    '/Users/pankajverma/Desktop/SOP/DixitSirProject/thesis/model_config/model_config.csv'
)

In [3]:
# Sanity check: how many model names are in model_list.
len(model_list)

32

## Example invocation

`python model_dev_v6.py AllModels ../../dataset/cyt_homo_data/filtered/morgan_fp-Cytochrome_P450_26A1.npy ../../dataset/cyt_homo_data/filtered 0.15 0.15 default save_model=no`

In [23]:

# Build one `model_dev_v6.py` invocation per (stats CSV, model name) pair.
# A later cell cuts `commands` into fixed-size slices by position, so the file
# listing is sorted: glob() returns matches in arbitrary order, and an unsorted
# listing could put different commands in each slice from one run to the next.
commands = []

stats_files = sorted(glob('../../suman_sir_protein_subcell_project/dataset/pdsp_pharos_combined/np_individual_stats/*csv'))

for _file in tqdm_notebook(stats_files):
    # Dataset name = CSV file name without its extension. Computed once per
    # file (it does not depend on the model) and via splitext rather than a
    # fixed four-character slice.
    output_file = os.path.splitext(os.path.split(_file)[1])[0]
    for _fin in model_list:
        commands.append('python3 model_dev_v6.py ' + _fin +
                        ' ../dataset/numpy_files/' + output_file + '.npy'
                        + ' ../results 0.15 0.15 default save_model=no'
                        )

# with open('../../suman_sir_protein_subcell_project/shell_scripts/shell-model_dev_v6_1.sh', 'w') as f:
#     for comm in commands:
#         f.write(comm + '\n')

HBox(children=(IntProgress(value=0, max=240), HTML(value='')))




In [18]:
# Show the first generated command to check its formatting.
# NOTE(review): the saved output for this cell contains '../results/results_1',
# whereas the generating cell above emits '../results' - the output appears to
# come from an earlier version of that cell; re-run to refresh it.
commands[0]

'python3 model_dev_v6.py RandomForestClassifier ../dataset/numpy_files/morgan_fp-membrane_protein.npy ../results/results_1 0.15 0.15 default save_model=no'

In [27]:
# Write the fourth block of 1920 commands (positions 1920*3 up to, but not
# including, 1920*4) to its own shell script, one command per line.
with open('../../suman_sir_protein_subcell_project/shell_scripts/shell-model_dev_v6_4.sh', 'w') as shard_script:
    shard_script.writelines(
        line + '\n' for line in commands[(1920*3):(1920*4)]
    )

In [2]:
# Load the table of important enzymes; the first CSV column becomes the index.
# Its 'Output_Target' column is read in the next cell.
df = pd.read_csv('../results/two_paper_trans_results/default_models/imp_enzymes_all.csv',index_col=0)

In [3]:
# Target identifiers as a plain Python list, in table order.
imp_enz = df['Output_Target'].tolist()

In [4]:
# Display the target identifiers that drive the command generation below.
imp_enz

['1A1_ind',
 '1A1_inh',
 '1A1_sub',
 '1A2_ind',
 '1A2_inh',
 '1A2_sub',
 '2C19_ind',
 '2C19_inh',
 '2C19_sub',
 '2C8_ind',
 '2C8_inh',
 '2C8_sub',
 '2C9_ind',
 '2C9_inh',
 '2C9_sub',
 '2D6_inh',
 '2D6_sub',
 '2E1_ind',
 '2E1_inh',
 '2E1_sub',
 '3A4_ind',
 '3A4_inh',
 '3A4_sub',
 '3A5_ind',
 '3A5_inh',
 '3A5_sub',
 'GST_ind',
 'GST_inh',
 'GST_sub',
 'MET_ind',
 'MET_inh',
 'MET_sub',
 'NAT_ind',
 'NAT_inh',
 'NAT_sub',
 'SUL_ind',
 'SUL_inh',
 'SUL_sub',
 'UGT_ind',
 'UGT_inh',
 'UGT_sub']

In [6]:
# Queue one `generate_fp_v7.py` call for every target in imp_enz and each of
# its three split files, suffixed '_tr', '_va' and '_te' (presumably
# train / validation / test - confirm against the splitting step).
commands = []

for _file in tqdm_notebook(imp_enz):
    # Path shared by the three split CSVs of this target, up to the split tag.
    csv_prefix = '../dataset/csv_files/splitted_csv_files/' + _file + '_'
    commands.extend(
        'python3 generate_fp_v7.py rdkDes_lecfp6_laval_hashap '
        + csv_prefix + val + '.csv '
        + '../dataset/combined_fp_numpy_files'
        for val in ('tr', 'va', 'te')
    )

HBox(children=(IntProgress(value=0, max=41), HTML(value='')))




In [7]:
# Show the first generated command to check its formatting.
commands[0]

'python3 generate_fp_v7.py rdkDes_lecfp6_laval_hashap ../dataset/csv_files/splitted_csv_files/1A1_ind_tr.csv ../dataset/combined_fp_numpy_files'

In [8]:
# Sanity check: expect len(imp_enz) * 3 commands (one per split file).
len(commands)

123

In [10]:
# Save the queued commands as a shell script, one command per line.
with open('../shell_scripts/shell-gen_fp_comb.sh', 'w') as script:
    script.writelines(line + '\n' for line in commands)

In [11]:
# Model identifiers of the form '<Estimator>_default'. The estimator names are
# listed once and the shared '_default' suffix is appended to each of them.
models = [
    estimator + '_default'
    for estimator in (
        'ExtraTreeClassifier',
        'AdaBoostClassifier',
        'MLPClassifier',
        'MultinomialNB',
        'LogisticRegression',
        'RandomForestClassifier',
        'RidgeClassifier',
        'BaggingClassifier',
        'SGDClassifier',
        'LinearSVC',
        'NearestCentroid',
        'Perceptron',
        'XGBClassifier',
        'OneVsRestClassifier',
        'PassiveAggressiveClassifier',
        'LinearDiscriminantAnalysis',
        'ComplementNB',
        'OutputCodeClassifier',
        'HistGradientBoostingClassifier',
        'DecisionTreeClassifier',
        'QuadraticDiscriminantAnalysis',
        'OneVsOneClassifier',
        'GaussianNB',
        'GaussianMixture',
        'BernoulliNB',
        'ExtraTreesClassifier',
        'BayesianGaussianMixture',
        'DummyClassifier',
        'NuSVC',
    )
]

In [20]:
# Build one `model_dev_v7.py` invocation per (target, model) pair. Each command
# names the model, the target's '_tr', '_va' and '_te' .npy files, and a fixed
# tail of options.
commands = []


for _file in tqdm_notebook(imp_enz):
    # The three .npy files of a target share this path up to the split tag.
    npy_prefix = ('../dataset/combined_fp_numpy_files/'
                  'rdkDes_lecfp6_laval_hashap-' + _file)
    for model_key in models:
        # Entries look like '<Estimator>_default'. Remove that suffix by name
        # instead of chopping a fixed eight characters, so an entry without
        # the suffix is passed through intact rather than truncated.
        if model_key.endswith('_default'):
            model_name = model_key[:-len('_default')]
        else:
            model_name = model_key
        commands.append('python3 model_dev_v7.py ' + model_name
                        + ' ' + npy_prefix + '_tr.npy '
                        + npy_prefix + '_va.npy '
                        + npy_prefix + '_te.npy '
                        + '../results_com_fp 0.15 0.15 default save_model=yes /scratch/06633/verma/seqcomhol/two_paper_trans/saved_models')

HBox(children=(IntProgress(value=0, max=41), HTML(value='')))




In [21]:
# Spot-check one command from the middle of the list.
commands[100]

'python3 model_dev_v7.py OneVsRestClassifier ../dataset/combined_fp_numpy_files/rdkDes_lecfp6_laval_hashap-1A2_ind_tr.npy ../dataset/combined_fp_numpy_files/rdkDes_lecfp6_laval_hashap-1A2_ind_va.npy ../dataset/combined_fp_numpy_files/rdkDes_lecfp6_laval_hashap-1A2_ind_te.npy ../results_com_fp 0.15 0.15 default save_model=yes /scratch/06633/verma/seqcomhol/two_paper_trans/saved_models'

In [16]:
# Sanity check: expect len(imp_enz) * len(models) commands.
len(commands)

1189

In [17]:
# Number of model identifiers used per target.
len(models)

29

In [22]:
# Save the queued model-training commands as a shell script, one per line.
with open('../shell_scripts/shell-mod_dev_comb.sh', 'w') as script:
    script.writelines(line + '\n' for line in commands)