In [33]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
import os
import re
from glob import glob
from collections import Counter

from pprint import pprint
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from sklearn.metrics import classification_report

from src.load_data import load_data

In [3]:
def plot_figures(sets, title=None, bins=50):
    plt.figure(figsize=(5*len(sets), 3))
    if title:
        plt.suptitle(title, fontsize=16, y=1.08)
    if len(sets) > 1:
        plt.subplot(131)
        plt.hist(sets[0], bins=bins)
        plt.title('Training Set')
        plt.subplot(132)
        plt.hist(sets[1], bins=bins)
        plt.title('Validation Set')
        plt.subplot(133)
        plt.hist(sets[2], bins=bins)
        plt.title('Test Set')
    else:
        plt.hist(sets[0], bins=bins)
        plt.title('Total Set')
    plt.show()

### Load Data


In [4]:
train_data, valid_data, test_data, metadata = load_data()

#### Metadata

In [5]:
print('Total number of articles:', metadata['article_nums']['train'] + 
                                   metadata['article_nums']['valid'] +
                                   metadata['article_nums']['test'])
print('Article Distribution:', metadata['article_nums'])

Total number of articles: 315
Article Distribution: {'train': 251, 'valid': 32, 'test': 32}


In [6]:
print('Total number of sentences:', metadata['sentence_nums']['train'] + 
                                   metadata['sentence_nums']['valid'] +
                                   metadata['sentence_nums']['test'])
print('Sentence Distribution:', metadata['sentence_nums'])

Total number of sentences: 4422
Sentence Distribution: {'train': 3582, 'valid': 399, 'test': 441}


# Result Analysis

In [7]:
def read_results(file_path):

    validation = {
        '0': {}, 
        '1': {},
        'micro': {},
        'macro': {},
        'weighted': {},
    }
    test = {
        '0': {}, 
        '1': {},
        'micro': {},
        'macro': {},
        'weighted': {},
    }

    with open(file_path, 'r') as f:
        for i, line in enumerate(f.readlines()):
            line = line.strip().split()

            if line:

                if i == 5:
                    validation['0'] = {
                        'precision': float(line[1]),
                        'recall': float(line[2]),
                        'f1': float(line[3]),
                        'support': int(line[4])
                    }

                elif i == 6:
                    validation['1'] = {
                        'precision': float(line[1]),
                        'recall': float(line[2]),
                        'f1': float(line[3]),
                        'support': int(line[4])
                    }

                elif i == 8:
                    validation['micro'] = {
                        'precision': float(line[2]),
                        'recall': float(line[3]),
                        'f1': float(line[4]),
                        'support': int(line[5])
                    }

                elif i == 9:
                    validation['macro'] = {
                        'precision': float(line[2]),
                        'recall': float(line[3]),
                        'f1': float(line[4]),
                        'support': int(line[5])
                    }

                elif i == 10:
                    validation['weighted'] = {
                        'precision': float(line[2]),
                        'recall': float(line[3]),
                        'f1': float(line[4]),
                        'support': int(line[5])
                    }

                elif i == 20:
                    test['0'] = {
                        'precision': float(line[1]),
                        'recall': float(line[2]),
                        'f1': float(line[3]),
                        'support': int(line[4])
                    }

                elif i == 21:
                    test['1'] = {
                        'precision': float(line[1]),
                        'recall': float(line[2]),
                        'f1': float(line[3]),
                        'support': int(line[4])
                    }

                elif i == 23:
                    test['micro'] = {
                        'precision': float(line[2]),
                        'recall': float(line[3]),
                        'f1': float(line[4]),
                        'support': int(line[5])
                    }

                elif i == 24:
                    test['macro'] = {
                        'precision': float(line[2]),
                        'recall': float(line[3]),
                        'f1': float(line[4]),
                        'support': int(line[5])
                    }

                elif i == 25:
                    test['weighted'] = {
                        'precision': float(line[2]),
                        'recall': float(line[3]),
                        'f1': float(line[4]),
                        'support': int(line[5])
                    }

                elif i == 12:
                    validation['macro']['f1'] = float(line[1])
                    
                elif i == 13:
                    validation['1']['f1'] = float(line[1])

                elif i == 27:
                    test['macro']['f1'] = float(line[1])
                                                     
                elif i == 28:
                    test['1']['f1'] = float(line[1])
                    
    return validation, test

#### Read Model Performances File

In [25]:
model_folder = '/home/aorus/workspaces/simge/Master_Thesis/Model/Optimized_RQ2_bert_many_to_one_model/'

In [26]:
runs = []
with open('{}/model_performances.csv'.format(model_folder), 'r') as f:
    for line in f.readlines():
        line = line.strip()
        if line:        
            model_name, val_f1_macro, test_f1_macro = line.split(',')
            specs = model_name.split('_')
            
            if len(specs) > 3:
            
                metadata = {
                    'research_question': specs[1],
                    'transformer': '_'.join([specs[i] for i in range(2, specs.index('model'))]),
                    'model_no': specs[specs.index('model')+1],
                    'optimizer': specs[specs.index('lr')-1],
                    'learning_rate': specs[specs.index('lr')+1],
                    'lrreduction': True if 'lrreduction' in specs else False,
                    'loss': '_'.join([specs[specs.index('loss')+i] for i in range(1,3)]),
                }

                if 'ww' in specs:
                    metadata['window_width'] = specs[specs.index('ww')+1]
                    
                if 'epochs' in specs:
                    metadata['epochs'] = specs[specs.index('epochs')+1]
                    
                runs.append({
                    'model_name': model_name,
                    'metadata': metadata,
                    'val_f1_macro': val_f1_macro,
                    'test_f1_macro': test_f1_macro
                })

In [27]:
runs[0]

{'model_name': 'Optimized_RQ2_bert_many_to_one_model_0_ww_1_adam_lr_0.0001_epochs_1_loss_binary_crossentropy',
 'metadata': {'research_question': 'RQ2',
  'transformer': 'bert_many_to_one',
  'model_no': '0',
  'optimizer': 'adam',
  'learning_rate': '0.0001',
  'lrreduction': False,
  'loss': 'binary_crossentropy',
  'window_width': '1',
  'epochs': '1'},
 'val_f1_macro': '0.7333240914026941',
 'test_f1_macro': '0.6436164827376734'}

In [11]:
runs2 = [r for r in runs
#          if r['metadata']['epochs'] == '2'
#         and r['metadata']['model_no'] == '2'
#          if r['metadata']['optimizer'] == 'adam'
#          if r['metadata']['learning_rate'] != '0.001']
        if r['metadata']['model_no'] == '2']

for run in runs2:
#     print(run['metadata']['learning_rate'])
#     print(run['metadata']['optimizer'])
#     print(run['metadata']['model_no'], end='  ')
    print(run['model_name'])
    print(run['val_f1_macro'], run['test_f1_macro'])
    print(run)
    print()

Optimized_RQ2_elmo_many_to_one_model_2_ww_1_adam_lr_0.01_lrreduction_loss_binary_crossentropy_onehot_softmax
0.8149575119330466 0.724206684962146
{'model_name': 'Optimized_RQ2_elmo_many_to_one_model_2_ww_1_adam_lr_0.01_lrreduction_loss_binary_crossentropy_onehot_softmax', 'metadata': {'research_question': 'RQ2', 'transformer': 'elmo_many_to_one', 'model_no': '2', 'optimizer': 'adam', 'learning_rate': '0.01', 'lrreduction': True, 'loss': 'binary_crossentropy', 'window_width': '1'}, 'val_f1_macro': '0.8149575119330466', 'test_f1_macro': '0.724206684962146'}

Optimized_RQ2_elmo_many_to_one_model_2_ww_1_rmsprop_lr_0.01_lrreduction_loss_binary_crossentropy_onehot_softmax
0.81886278515492 0.719098228663446
{'model_name': 'Optimized_RQ2_elmo_many_to_one_model_2_ww_1_rmsprop_lr_0.01_lrreduction_loss_binary_crossentropy_onehot_softmax', 'metadata': {'research_question': 'RQ2', 'transformer': 'elmo_many_to_one', 'model_no': '2', 'optimizer': 'rmsprop', 'learning_rate': '0.01', 'lrreduction': Tru

#### Read Model Results Files

In [51]:
model_folder = '/home/aorus/workspaces/simge/Master_Thesis/Model/Optimized_RQ2_bert_model/'
    
results = []
for folder_path in glob(model_folder + 'Optimized*'):
    
    results_file = os.path.join(folder_path, 'model_results_file.txt')
    
    if os.path.isfile(results_file):
    
        model_name = os.path.basename(folder_path)
        specs = model_name.split('_')

        if 'adamax' in model_name:
            continue

        metadata = {
            'research_question': specs[1],
            'transformer': '_'.join([specs[i] for i in range(2, specs.index('model'))]),
            'model_no': specs[specs.index('model')+1],
            'optimizer': specs[specs.index('lr')-1],
            'learning_rate': specs[specs.index('lr')+1],
            'lrreduction': True if 'lrreduction' in specs else False,
            'loss': '_'.join([specs[specs.index('loss')+i] for i in range(1,3)]),
        }

        if 'ww' in specs:
            metadata['window_width'] = specs[specs.index('ww')+1]

        if 'epochs' in specs:
            metadata['epochs'] = specs[specs.index('epochs')+1]

        if 'tunedlayers' in specs:
            metadata['tunedlayers'] = specs[specs.index('tunedlayers')+1]

        validation, test = read_results(results_file)

        results.append({
            'model_name': model_name,
            'metadata': metadata,
            'validation': validation,
            'test': test
        })

In [52]:
results[0]['metadata']['model_no']

'0'

In [53]:
print(results[0]['metadata']['research_question'], results[0]['metadata']['transformer'])
print(max([r['validation']['macro']['f1'] for r in results]))
print(max([r['test']['macro']['f1'] for r in results]))
print(max([r['test']['1']['f1'] for r in results]))

RQ2 bert
0.8028083028083028
0.7536684782608696
0.6568265682656828


In [54]:
print(results[0]['metadata']['research_question'], results[0]['metadata']['transformer'], 2)
print(max([r['validation']['macro']['f1'] for r in results]))
print(max([r['test']['macro']['f1'] for r in results]))
print(max([r['test']['1']['f1'] for r in results]))

RQ2 bert 2
0.8028083028083028
0.7536684782608696
0.6568265682656828


In [55]:
print(results[0]['metadata']['research_question'], results[0]['metadata']['transformer'])
print(max([r['validation']['macro']['f1'] for r in results]))
print(max([r['test']['macro']['f1'] for r in results]))
print(max([r['test']['1']['f1'] for r in results]))

RQ2 bert
0.8028083028083028
0.7536684782608696
0.6568265682656828


In [56]:
res2 = [r for r in sorted(results, key=lambda r:r['metadata']['model_no'])]
#          if r['metadata']['epochs'] == '2'
        
#          if r['metadata']['optimizer'] == 'adam'
#           if r['metadata']['window_width'] == '3']
#         and r['metadata']['model_no'] == '11']
#         if r['metadata']['model_no'] == '9']
#        and r['metadata']['optimizer'] == 'adam']

for run in res2:
    print(run['model_name'])
    print(run['test']['macro']['f1'])
    print(run['test']['1']['f1'])
    print(run['validation']['macro']['f1'])
    print()

Optimized_RQ2_bert_model_0_maxlen_58_rmsprop_lr_2e-05_epochs_1_loss_binary_crossentropy_sigmoid
0.6030131123038249
0.35365853658536583
0.6354627872654439

Optimized_RQ2_bert_model_0_maxlen_58_adam_lr_0.001_epochs_1_loss_binary_crossentropy_sigmoid
0.6928969359331476
0.5
0.7139687307981157

Optimized_RQ2_bert_model_0_maxlen_58_rmsprop_lr_0.0001_epochs_1_loss_binary_crossentropy_sigmoid
0.5933265582655827
0.3194444444444445
0.6937043394257226

Optimized_RQ2_bert_model_0_maxlen_58_adam_lr_2e-05_epochs_1_loss_binary_crossentropy_sigmoid
0.5946106705118962
0.32894736842105265
0.6312250815512868

Optimized_RQ2_bert_model_0_maxlen_58_rmsprop_lr_0.001_epochs_1_loss_binary_crossentropy_sigmoid
0.7085261070720423
0.5294117647058824
0.7398956975228161

Optimized_RQ2_bert_model_0_maxlen_58_adam_lr_0.0001_epochs_1_loss_binary_crossentropy_sigmoid
0.6565219211925799
0.44311377245508987
0.7285714285714286

Optimized_RQ2_bert_model_10_maxlen_58_adam_lr_0.001_epochs_1_loss_binary_crossentropy_sigmoid
0

In [49]:
res2 = [r for r in results
#          if r['metadata']['epochs'] == '2'
#         and r['metadata']['model_no'] == '8'
#          if r['metadata']['optimizer'] == 'adam'
          if r['metadata']['window_width'] == '1'
        and r['metadata']['learning_rate'] == '0.001']

for run in res2:
#     print(run['model_name'])
    print(run['test']['macro']['f1'])
    print(run['test']['1']['f1'])
    print(run['validation']['macro']['f1'])

KeyError: 'window_width'

for res in results:
    print(res['model_name'])
    print(res['test']['1']['f1'], res['test']['macro']['f1'])