## Setting up the Colab environment

In [1]:
# Mount Google Drive into the Colab VM so files stored on Drive are reachable
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Switch the working directory to the project folder on Drive so that the
# relative paths used in later cells (e.g. "config/", "reports/re/...") resolve.
%cd "/content/drive/My Drive/Colab Notebooks/w266_final/project_re"

/content/drive/.shortcut-targets-by-id/1I3W7Z7rz_YfsjBjX7z4zTzGbr-iKTrmD/w266_final/project_re


In [3]:
%reload_ext autoreload
%autoreload
import pickle, os, json
import torch
from pathlib import Path, PureWindowsPath, PurePosixPath
from util.tools import load_config
import pandas as pd
from sklearn.metrics import classification_report

In [4]:
# Pick the compute device: use the GPU when torch can see one, otherwise the CPU.
# (To force CPU execution while debugging, assign the CPU device unconditionally.)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
# Read the project configuration from the "config/" directory and show which
# top-level sections the resulting object exposes.
config_folder = "config/"
config = load_config(config_folder)
vars(config)

{'hyperparams': <util.tools.config at 0x7fc6a45cf710>,
 'modelconfig': <util.tools.config at 0x7fc6a45cf860>,
 'programsettings': <util.tools.config at 0x7fc6a45cf898>}

In [12]:
# Build one summary table (df_results) with a row per experiment run found in
# the pickled "multi_model_*" report files under the configured reports folder.
# Also collect (dev_labels, dev_preds) pairs in all_data_source for the runs
# that stored them, so classification reports can be produced later.
reports_folder_name = config.programsettings.REPORTS_DIR
data_folder = Path(reports_folder_name)
list_results_files = list(data_folder.glob('**/multi_model_*.pkl'))

# results[0] is only parsed as a JSON config dump when it is longer than this
# many characters; shorter values are treated as "no config recorded".
# NOTE(review): presumably older runs stored a short placeholder here — confirm.
MIN_CONFIG_JSON_LENGTH = 50

all_results = []
all_data_source = []

for results_file in list_results_files:
    print(results_file)
    # SECURITY: pickle.load can execute arbitrary code. Only load report files
    # that were produced by this project.
    with open(results_file, "rb") as f:
        lst_results = pickle.load(f)

    for results in lst_results:
        # Defaults used when the run did not record its configuration.
        model_name = ""
        lr = 0
        train_batch = 0
        max_seq = ""
        class_weights = '[]'
        kernel_1 = kernel_2 = kernel_3 = 0

        if len(results[0]) > MIN_CONFIG_JSON_LENGTH:
            results_json = json.loads(results[0])
            hyperparams = results_json["hyperparams"]

            # Mandatory settings: a missing key here is a malformed report and
            # should fail loudly.
            max_seq = hyperparams["MAX_SEQ_LENGTH"]
            lr = hyperparams["LEARNING_RATE"]
            train_batch = hyperparams["TRAIN_BATCH_SIZE"]
            model_name = results_json["programsettings"]["MODEL_NAME"]

            # Optional settings: fall back to the defaults key by key. A missing
            # kernel size must not reset class_weights, and each kernel is
            # resolved independently of the others.
            class_weights = hyperparams.get("LOSS_FN_CLASS_WEIGHTS", '[]')
            modelconfig = results_json.get("modelconfig") or {}
            kernel_1 = modelconfig.get("KERNEL_1", 0)
            kernel_2 = modelconfig.get("KERNEL_2", 0)
            kernel_3 = modelconfig.get("KERNEL_3", 0)

        all_results.append([
            results_file, model_name, lr, train_batch, max_seq, class_weights,
            kernel_1, kernel_2, kernel_3,
            results[1], results[2], results[3], results[4], results[5], results[6],
        ])

        # Keep dev_labels and dev_preds so classification reports can be run for
        # all of them together. Both results[7] and results[8] are read, so the
        # entry must have at least nine elements.
        if len(results) > 8:
            all_data_source.append([results[7], results[8]])

columns = [
    'file_name', 'model_name', 'lr', 'train_batch_size', 'train_max_seq',
    'class_weights', 'Kernel_1', 'Kernel_2', 'Kernel_3',
    'train_loss', 'dev_loss', 'train_mcc', 'train_f1_score', 'dev_mcc', 'dev_f1_score',
]

df_results = pd.DataFrame(all_results, columns=columns)
# Widen columns so the long report file names are not truncated in the display.
pd.set_option('display.max_colwidth', 100)
df_results

reports/re/multi_model_experiment_results_2020-11-15 15_43_01_942702.pkl
reports/re/multi_model_experiment_results_2020-11-15 18_33_46_015615.pkl
reports/re/multi_model_experiment_results_2020-11-16 10_28_19_020854.pkl
reports/re/multi_model_experiment_results_2020-11-20 23_15_57_501227.pkl
reports/re/multi_model_experiment_results_2020-11-21 08_38_53_855089.pkl
reports/re/multi_model_experiment_results_2020-11-21 10_03_19_501249.pkl
reports/re/multi_model_experiment_results_5epochs.pkl
reports/re/multi_model_experiment_results_256_5epochs.pkl
reports/re/multi_model_experiment_results_CNN_wider.pkl
reports/re/multi_model_experiment_results_CNN.pkl
reports/re/multi_model_experiment_results_256_5epochs_binary.pkl
reports/re/multi_model_experiment_results_256_5epochs_weighted.pkl
reports/re/multi_model_experiment_results_2020-11-22 10_44_11_756583.pkl
reports/re/multi_model_experiment_results_5epochs_256_2fc_weighted.pkl
reports/re/multi_model_experiment_results_2020-11-25 04_29_30_655214

Unnamed: 0,file_name,model_name,lr,train_batch_size,train_max_seq,class_weights,Kernel_1,Kernel_2,Kernel_3,train_loss,dev_loss,train_mcc,train_f1_score,dev_mcc,dev_f1_score
0,reports/re/multi_model_experiment_results_2020-11-15 15_43_01_942702.pkl,,0.0,0,,[],0,0,0,2.294667,2.288867,0.004175,0.11,-0.048113,0.02
1,reports/re/multi_model_experiment_results_2020-11-15 15_43_01_942702.pkl,,0.0,0,,[],0,0,0,2.233493,2.232523,-0.031125,0.05,-0.058987,0.52
2,reports/re/multi_model_experiment_results_2020-11-15 18_33_46_015615.pkl,,0.0,0,,[],0,0,0,2.041215,2.28121,-0.003173,0.070633,-0.014653,0.040475
3,reports/re/multi_model_experiment_results_2020-11-15 18_33_46_015615.pkl,,0.0,0,,[],0,0,0,2.055447,2.839655,0.024098,0.133937,0.011763,0.061364
4,reports/re/multi_model_experiment_results_2020-11-16 10_28_19_020854.pkl,,0.0,0,,[],0,0,0,1.491616,2.126475,0.43107,0.405018,0.181686,0.16086
5,reports/re/multi_model_experiment_results_2020-11-16 10_28_19_020854.pkl,,0.0,0,,[],0,0,0,1.858239,2.370378,0.256408,0.173122,0.085593,0.056627
6,reports/re/multi_model_experiment_results_2020-11-20 23_15_57_501227.pkl,BioBERT_fc,1e-05,12,128.0,[],0,0,0,2.293831,2.218083,-0.074696,0.042834,0.030233,0.088604
7,reports/re/multi_model_experiment_results_2020-11-21 08_38_53_855089.pkl,BioBERT_fc,1e-05,12,128.0,[],0,0,0,2.3154,2.25572,0.012113,0.053352,-0.130368,0.053024
8,reports/re/multi_model_experiment_results_2020-11-21 08_38_53_855089.pkl,BioBERT_fc,8e-06,12,128.0,[],0,0,0,2.221329,2.343081,-0.057621,0.045866,-0.024152,0.023556
9,reports/re/multi_model_experiment_results_2020-11-21 08_38_53_855089.pkl,BioBERT_fc,1.3e-05,12,128.0,[],0,0,0,2.230306,2.363903,0.035066,0.063515,-0.020312,0.043442


### Generate classification reports for every run that saved dev labels and predictions

In [10]:
# Number of (dev_labels, dev_preds) pairs collected, i.e. how many runs can be
# given a classification report.
len(all_data_source)

25

In [8]:
# Print a per-class classification report for each stored (dev_labels, dev_preds)
# pair, preceded by a summary row from all_results.
# NOTE(review): all_data_source only receives an entry for runs that saved
# labels/predictions, whereas all_results has one row per run. all_results[i]
# may therefore describe a different run than all_data_source[i] — confirm
# before relying on the printed header.
for i, (dev_labels, dev_preds) in enumerate(all_data_source):
    header_row = all_results[i]
    print("\n \n \n", header_row)
    print(classification_report(dev_labels, dev_preds))

[array([2, 6, 2, 6, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6, 2, 6, 6, 2, 0, 2, 6, 6,
       6, 2, 2, 6, 2, 6, 6, 2, 2, 6, 3, 3, 6, 6, 6, 6, 3, 2, 6, 2, 6, 6,
       2, 6, 6, 2, 2, 6, 6, 2, 2, 6, 6, 6, 2, 2, 6, 1, 2, 6, 2, 2, 6, 6,
       6, 2, 6, 6, 6, 6, 6, 6, 2, 6, 2, 1, 6, 6, 6, 6, 6, 2, 6, 2, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 2, 6, 2]), array([2, 6, 2, 6, 2, 6, 6, 2, 2, 2, 2, 6, 2, 6, 2, 2, 6, 6, 6, 0, 6, 6,
       6, 2, 1, 2, 6, 2, 6, 6, 6, 6, 2, 6, 6, 6, 6, 6, 2, 2, 2, 2, 2, 6,
       2, 6, 2, 6, 6, 6, 6, 6, 2, 2, 2, 2, 2, 2, 6, 6, 2, 6, 6, 2, 6, 1,
       6, 6, 2, 6, 2, 6, 2, 6, 6, 2, 2, 2, 2, 2, 2, 2, 6, 0, 6, 2, 2, 6,
       6, 2, 6, 2, 2, 0, 6, 6, 2, 2, 6])]

 
 
 [PosixPath('reports/re/multi_model_experiment_results_2020-11-15 15_43_01_942702.pkl'), '', 0, 0, '', '[]', 0, 0, 0, 2.294666634665595, 2.288866784837511, 0.004174647419871997, 0.11, -0.04811280200229936, 0.02]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
     

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.80      0.99      0.89      1034
           1       0.97      0.97      0.97      1145
           2       0.97      0.99      0.98      1382
           3       0.99      0.98      0.98      1269
           4       0.97      0.91      0.94       136
           5       0.97      0.95      0.96       954
           6       0.97      0.97      0.97       933
           7       0.00      0.00      0.00       250
           8       1.00      1.00      1.00      8201

    accuracy                           0.97     15304
   macro avg       0.85      0.86      0.85     15304
weighted avg       0.96      0.97      0.96     15304

[array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0])]

 
 
 [PosixPath('reports/re/multi_model_experiment_results_2020-11-21 10_03_19_501249.pkl'), 'BioBERT_fc', 1.25e-05, 24, '256', '[]', 0, 0, 0, 2.2709851264953613, 2.421996593475342, 0.046052891840766996, 0.10986781884614702, -0.0

In [13]:
# Load the pickled results of one specific experiment file for closer inspection.
# Note: this rebinds lst_results, which the aggregation loop also uses.
with Path("reports/re/multi_model_experiment_results_2020-11-25 17_34_01_694684.pkl").open("rb") as results_fh:
    lst_results = pickle.load(results_fh)

In [15]:
# Number of fields stored for the first entry of the loaded results list.
len(lst_results[0])

11

In [16]:
# Classification report for the first entry of the loaded results.
# NOTE(review): presumably index 9 holds the true labels and index 10 the
# predictions for an additional data split — confirm against the code that
# writes these pickles.
first_entry = lst_results[0]
report_text = classification_report(first_entry[9], first_entry[10])
print(report_text)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      4229
           1       0.99      0.99      0.99      4780
           2       0.99      0.99      0.99      5399
           3       0.99      0.99      0.99      5215
           4       0.98      0.98      0.98       509
           5       0.99      0.99      0.99      3689
           6       0.99      0.99      0.99      3522
           7       1.00      1.00      1.00      3668
           8       1.00      1.00      1.00     32754

    accuracy                           1.00     63765
   macro avg       0.99      0.99      0.99     63765
weighted avg       1.00      1.00      1.00     63765

