# 1. Initialisation

## 1.1. File System

In [None]:
from google.colab import drive
drive.mount("/content/drive")

! chmod -R 777 'drive/My Drive/Otago/F. The Thesis/Source Code'

% cd 'drive/My Drive/Otago/F. The Thesis/Source Code'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
chmod: cannot access 'drive/My Drive/Otago/F. The Thesis/Source Code': No such file or directory
[Errno 2] No such file or directory: 'drive/My Drive/Otago/F. The Thesis/Source Code'
/content/drive/My Drive/Otago/F. The Thesis/Source Code


## 1.2. Importing Libraries

In [None]:
!pip install 'h5py==2.10.0' --force-reinstall

Collecting h5py==2.10.0
  Using cached h5py-2.10.0-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
Collecting numpy>=1.7
  Using cached numpy-1.21.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
Collecting six
  Using cached six-1.16.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: six, numpy, h5py
  Attempting uninstall: six
    Found existing installation: six 1.16.0
    Uninstalling six-1.16.0:
      Successfully uninstalled six-1.16.0
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.2
    Uninstalling numpy-1.21.2:
      Successfully uninstalled numpy-1.21.2
  Attempting uninstall: h5py
    Found existing installation: h5py 2.10.0
    Uninstalling h5py-2.10.0:
      Successfully uninstalled h5py-2.10.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lucid 0.3.10 requires umap-learn, which is not ins

In [None]:
import pandas as pd
import pandas_profiling
from pandas_profiling import ProfileReport

import time
import re
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
import nltk
import csv
import ast
import pickle
import itertools
import random
from scipy.spatial import distance

In [None]:
%tensorflow_version 1.x
import tensorflow as tf
from keras import Model
from keras.backend.tensorflow_backend import set_session
import sklearn.metrics
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K
from keras.losses import mse

In [None]:
from keras.models import load_model

In [None]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

In [None]:
# Fetch the NLTK resources into the local nltk_data folder.
# The value displayed by the cell is the return value of the last call.
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Put ./Libraries first on the module search path so the local module can be imported.
# NOTE(review): `preprocess` presumably lives in ./Libraries; it is relative to the
# working directory set by the %cd in section 1.1.
sys.path.insert(0, './Libraries')
import preprocess

## 1.3. GPU Initialisation

In [None]:
# TF1 session configuration:
#   allow_growth          -> grow GPU memory usage on demand
#   log_device_placement  -> log the device each operation is placed on
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(allow_growth=True),
    log_device_placement=True,
)
sess = tf.Session(config=config)

# Make this session the default one used by Keras.
set_session(sess)

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device



## 1.4. Global Variables

In [None]:
# Prefix shared by the artefact file names used below (model, tokenizers,
# input dataset, dictionaries and the class-vector export).
model_name = 'RIVEC300_SMALL_'
# Maximum number of whitespace-separated tokens kept per document before tokenising.
max_word = 3500

## 1.5. Model Reloading

In [None]:
def custom_loss_func(first_pos, second_pos):
    """Build a Keras-style loss closure that also depends on two extra tensors.

    Args:
        first_pos: tensor captured by the closure; the position of its
            largest-magnitude value along the last axis is used for gathering.
        second_pos: second captured tensor, handled the same way.

    Returns:
        A function ``custom_loss(y_true, y_pred)`` whose ``__name__`` is set to
        ``'custom_loss'``.

    NOTE(review): nothing visible in this notebook calls this factory, and the
    ``load_model`` call below passes no ``custom_objects`` -- confirm whether
    the saved model needs it.
    """
    def custom_loss(y_true, y_pred):
        # Index of the largest absolute value along the last axis of each tensor.
        first_idx = K.argmax(K.abs(first_pos), axis = -1)
        second_idx = K.argmax(K.abs(second_pos), axis = -1)
        
        # Gather along axis 1 with those indices.
        # NOTE(review): assumes the tensors have at least two axes -- TODO confirm shapes.
        first_max = tf.gather(first_pos, first_idx, axis = 1)
        second_max = tf.gather(second_pos, second_idx, axis = 1)

        # Mean of the MSE between the gathered values, scaled by 0.1.
        first_second = K.mean(mse(first_max, second_max)) * 0.1
        # Binary cross-entropy between targets and predictions.
        first_first = K.binary_crossentropy(y_true, y_pred)

        # NOTE(review): the result is the MSE *between* the two terms above,
        # not their sum -- confirm this is intended.
        return mse(first_second, first_first)
    custom_loss.__name__ = 'custom_loss'
    return custom_loss

In [None]:
# Path of the saved Keras model (.h5) for the configured model_name prefix.
modelInputFile = f'./Model/{model_name}SEmHus_Word_Embedding_Model.h5'
# NOTE(review): loaded without custom_objects; if the model was compiled with
# custom_loss_func above, it would have to be passed here -- confirm.
model = load_model(modelInputFile)

## 1.6. Tokenizers Reloading

In [None]:
def _load_pickled(suffix):
    """Unpickle ``./Tokenizer/<model_name><suffix>.pickle`` and return the object."""
    with open(f'./Tokenizer/{model_name}{suffix}.pickle', 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): pickle.load can execute arbitrary code -- only load files you created.
# Text tokenizer followed by one fitted label encoder per output.
texts_tok = _load_pickled('text')
labActor_tok = _load_pickled('actor')
labAgency_tok = _load_pickled('agency')
labSector_tok = _load_pickled('sector')
labPlace_tok = _load_pickled('place')
labYear_tok = _load_pickled('year')
labMonth_tok = _load_pickled('month')
labReason_tok = _load_pickled('reason')
labGoal_tok = _load_pickled('goal')

# 2. Data Preparation

In [None]:
def clean_alpha(text):
    """Return ``text`` reduced to its purely alphabetic tokens.

    The input is converted with ``str()``, split on whitespace, and only tokens
    for which ``str.isalpha()`` is true are kept, joined by single spaces.
    """
    alphabetic_tokens = filter(str.isalpha, str(text).split())
    return ' '.join(alphabetic_tokens)

## 2.1. Input and Output

In [None]:
# Test split of the enriched, normalised dataset for the configured model_name prefix.
normInputFile = f'./Inputs/{model_name}SEmHus_Enriched_Normalised_Dataset_Test.csv'
# The columns read below are 'text' plus the label columns (source_name, source_type, theme, ...).
normDF = pd.read_csv(normInputFile)

In [None]:
# Exclude rows whose label cell is exactly the single-item list shown; a row
# that lists the value next to other labels is kept.
is_world_only = normDF['country_name'] == "['World']"
is_other_only = normDF['disaster_type'] == "['Other']"
is_media_only = normDF['source_type'] == "['Media']"
normDF = normDF.loc[~(is_world_only | is_other_only | is_media_only)]

In [None]:
# Raw document texts, one per row of the filtered test set.
docs = normDF['text'].tolist()


def _parse_labels(column):
    """Parse a column of stringified Python literals (e.g. "['World']") into objects.

    ast.literal_eval only accepts literals, so a malformed or malicious cell
    cannot execute code the way eval() would.
    """
    return [ast.literal_eval(v) for v in normDF[column].tolist()]


# One parsed label list per document, aligned row-for-row with normDF.
labActor = _parse_labels('source_name')
labAgency = _parse_labels('source_type')
labSector = _parse_labels('theme')
labPlace = _parse_labels('country_name')
labYear = _parse_labels('year_created')
labMonth = _parse_labels('month_created')
labReason = _parse_labels('disaster_type')
labGoal = _parse_labels('development_goal')

In [None]:
# Lower-case every document and keep at most max_word whitespace-separated tokens.
truncated_docs = []
for document in docs:
    tokens = str(document).lower().split()
    truncated_docs.append(' '.join(tokens[:max_word]))
docs = truncated_docs

In [None]:
# Sequence length the model expects: second dimension of the first layer's
# output shape (input_1 is (None, 100) in the summary printed below).
padding_len = model.layers[0].get_output_at(0).get_shape().as_list()[1]

## 2.2. Import Dictionaries

In [None]:
# Paths of the two dictionary CSVs read by the next cell (two columns are used: key, value).
list_encoded_text = f'./Dictionary/{model_name}list_encoded_text.csv'
word_embed_dict = f'./Dictionary/{model_name}word_embed_dict.csv'

In [None]:
# First column -> second column, both kept as strings.
# newline='' is how the csv module expects its files to be opened.
with open(list_encoded_text, newline='') as csv_file:
    reader = csv.reader(csv_file)
    dictText = {row[0]: row[1] for row in reader}

# First column -> vector. The second column holds a stringified Python list;
# ast.literal_eval parses it without executing arbitrary code (unlike eval).
with open(word_embed_dict, newline='') as csv_file:
    reader = csv.reader(csv_file)
    dictEmbed = {row[0]: np.array(ast.literal_eval(row[1])) for row in reader}

Reversed Dictionaries

In [None]:
# Documents -> integer sequences, padded/truncated at the end to padding_len.
encoded_text = pad_sequences(
    texts_tok.texts_to_sequences(docs),
    maxlen=padding_len,
    padding='post',
    truncating='post',
)

# Encode each label set with its own fitted encoder, in the model's output order.
(encoded_actor, encoded_agency, encoded_sector, encoded_place,
 encoded_year, encoded_month, encoded_reason, encoded_goal) = (
    encoder.transform(labels)
    for encoder, labels in (
        (labActor_tok, labActor),
        (labAgency_tok, labAgency),
        (labSector_tok, labSector),
        (labPlace_tok, labPlace),
        (labYear_tok, labYear),
        (labMonth_tok, labMonth),
        (labReason_tok, labReason),
        (labGoal_tok, labGoal),
    )
)

# Constant target of 1 per document for the ninth output (outStat).
encoded_stat = np.array([[1]] * len(encoded_text))

In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
inpWeight (Embedding)           (None, 100, 300)     3000000     input_1[0][0]                    
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 30000)        0           inpWeight[0][0]                  
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 300)          9000300     flatten_1[0][0]                  
____________________________________________________________________________________________

# 3. Class Embedding

## 3.1. Semantic Similarity Setup

In [None]:
# Number of nearest classes returned by nearest_class().
num_word = 5

In [None]:
def _output_layer_embedding(layer_name, encoder):
    """Return (kernel, transposed kernel, {class label: row of the transposed kernel}).

    The kernel is the first weight array of the named layer; row ``i`` of its
    transpose is paired with ``encoder.classes_[i]``.
    """
    kernel = model.get_layer(layer_name).get_weights()[0]
    per_class = np.asarray(np.transpose(kernel))
    lookup = {label: per_class[row] for row, label in enumerate(encoder.classes_)}
    return kernel, per_class, lookup


agency_weight, agency_transpose, agency_embed = _output_layer_embedding('outAgency', labAgency_tok)
actor_weight, actor_transpose, actor_embed = _output_layer_embedding('outActor', labActor_tok)
goal_weight, goal_transpose, goal_embed = _output_layer_embedding('outGoal', labGoal_tok)
sector_weight, sector_transpose, sector_embed = _output_layer_embedding('outSector', labSector_tok)
place_weight, place_transpose, place_embed = _output_layer_embedding('outPlace', labPlace_tok)
year_weight, year_transpose, year_embed = _output_layer_embedding('outYear', labYear_tok)
month_weight, month_transpose, month_embed = _output_layer_embedding('outMonth', labMonth_tok)
reason_weight, reason_transpose, reason_embed = _output_layer_embedding('outReason', labReason_tok)

In [None]:
def nearest_class(first, second, third):
    """Rank the classes of one group by cosine distance to a class of another group.

    Args:
        first: group whose classes are ranked; one of 'agency', 'actor', 'goal',
            'sector', 'place', 'year', 'month', 'reason'.
        second: group that ``third`` belongs to (same choices as ``first``).
        third: class label in ``second`` whose vector is the query.

    Returns:
        Up to ``num_word`` ``(distance, label)`` tuples in ascending order, where
        distance is the cosine distance rounded to 4 decimals.

    Raises:
        ValueError: if ``first`` or ``second`` is not a known group. The old chain
            of independent ``if`` statements left a local unbound in that case
            and failed later with an UnboundLocalError.
        KeyError: if ``third`` is not a class of ``second``.
    """
    embeddings = {
        'agency': agency_embed,
        'actor': actor_embed,
        'goal': goal_embed,
        'sector': sector_embed,
        'place': place_embed,
        'year': year_embed,
        'month': month_embed,
        'reason': reason_embed,
    }
    for group in (first, second):
        if group not in embeddings:
            raise ValueError(f'unknown class group {group!r}; expected one of {sorted(embeddings)}')

    query_vec = embeddings[second][third]
    # Tuples sort by distance first, then by label for ties.
    ranked = sorted(
        (round(distance.cosine(query_vec, vec), 4), label)
        for label, vec in embeddings[first].items()
    )
    return ranked[:num_word]

## 3.2. Semantic Similarity Measure

In [None]:
# var1: group to rank; var2: group the random query class is drawn from.
var1, var2 = 'reason', 'place'

# A lookup replaces eight independent `if` lines: an unknown var2 now raises
# KeyError here instead of silently reusing a stale `rand_class` left over
# from an earlier run of the notebook.
group_encoders = {
    'agency': labAgency_tok,
    'actor': labActor_tok,
    'goal': labGoal_tok,
    'sector': labSector_tok,
    'place': labPlace_tok,
    'year': labYear_tok,
    'month': labMonth_tok,
    'reason': labReason_tok,
}
candidate_classes = group_encoders[var2].classes_
rand_class = candidate_classes[random.randrange(len(candidate_classes))]

# Uncomment to pin the query class instead of drawing it at random.
# rand_class = 'Zambia'

print(f'Nearest {var1} to this {var2} [{rand_class}] is:\n')

ents = nearest_class(var1, var2, rand_class)

# "label (distance); " for every neighbour, nearest first.
nearest_text = ''.join(f'{label} ({dist}); ' for dist, label in ents)

nearest_text

Nearest reason to this place [Honduras] is:



'Mud Slide (0.7782); Drought (0.8335); Tropical Cyclone (0.8576); Extratropical Cyclone (0.8894); Epidemic (0.9011); '

## 3.3. Vector Space Export

In [None]:
cols = ['Class', 'Entity', 'Vector']

# (value written to the 'Class' column, embedding lookup), in export order.
embedding_groups = [
    ('Agency', agency_embed),
    ('Actor', actor_embed),
    ('SDG', goal_embed),
    ('Sector', sector_embed),
    ('Place', place_embed),
    ('Year', year_embed),
    ('Month', month_embed),
    ('Reason', reason_embed),
]

# One row per class: group name, class label, vector as a plain list.
rows = [
    [group_name, entity, list(vector)]
    for group_name, lookup in embedding_groups
    for entity, vector in lookup.items()
]

class_df = pd.DataFrame(data=rows, columns=cols)
class_df.to_csv(f'./Outputs/{model_name}Humanitarian_Class_Vector_Space.csv')

## 3.4. Entity Distance Export

In [None]:
# Plain list of [Class, Entity, Vector] rows, iterated by nearest_label().
class_data = class_df.values.tolist()

In [None]:
def nearest_label(group, entity, top_n=10):
    """Rank exported class vectors by cosine distance to one entity's vector.

    Args:
        group: value of the 'Class' column to search in, or '*' for every group.
        entity: value of the 'Entity' column whose vector is the query. If the
            same entity name occurs in several groups, the first row is used.
        top_n: maximum number of neighbours to return (default 10, the value
            that used to be hard-coded).

    Returns:
        Up to ``top_n`` ``(distance, entity, class)`` tuples in ascending order,
        with the cosine distance rounded to 4 decimals. The query entity itself
        is included whenever its own row passes the group filter.

    Raises:
        IndexError: if ``entity`` does not occur in ``class_df['Entity']``
            (same exception type as before, now with an explicit message).
    """
    matches = class_df.loc[class_df['Entity'] == entity, 'Vector']
    if matches.empty:
        raise IndexError(f'entity {entity!r} not found in class_df')
    query_vec = matches.iloc[0]

    ranked = sorted(
        (round(distance.cosine(query_vec, vec), 4), ent, cls)
        for cls, ent, vec in class_data
        if group == '*' or group == cls
    )
    return ranked[:top_n]

In [None]:
# Draw a random row of the exported class table.
rand_num = random.randint(0, len(class_df)-1)

rand_class = class_df.iloc[rand_num]['Entity']
near_class = class_df.iloc[rand_num]['Class']

# The random entity is overridden by a fixed one here; remove this line to use the random draw.
rand_class = 'Cambodia'

# Group to search in; '*' searches every group.
# near_group = '*'
near_group = 'Sector'

print(f'Nearest {near_group} to [{rand_class}] is:\n')
nearest_label(near_group, rand_class)

Nearest Sector to [Cambodia] is:



[(0.9026, 'Water Sanitation Hygiene', 'Sector'),
 (0.9112, 'HIV/Aids', 'Sector'),
 (0.9301, 'Food and Nutrition', 'Sector'),
 (0.9317, 'Disaster Management', 'Sector'),
 (0.9377, 'Climate Change and Environment', 'Sector'),
 (0.9409, 'Agriculture', 'Sector'),
 (0.9528, 'Gender', 'Sector'),
 (0.9645, 'Mine Action', 'Sector'),
 (0.9657, 'Humanitarian Financing', 'Sector'),
 (0.9679, 'Contributions', 'Sector')]

# 4. Model Evaluation

In [None]:
# Order of the values returned by model.evaluate() in the next cell.
print(model.metrics_names)

['loss', 'outActor_loss', 'outAgency_loss', 'outSector_loss', 'outPlace_loss', 'outYear_loss', 'outMonth_loss', 'outReason_loss', 'outGoal_loss', 'outStat_loss', 'outActor_categorical_accuracy', 'outAgency_categorical_accuracy', 'outSector_categorical_accuracy', 'outPlace_categorical_accuracy', 'outYear_categorical_accuracy', 'outMonth_categorical_accuracy', 'outReason_categorical_accuracy', 'outGoal_categorical_accuracy', 'outStat_categorical_accuracy']


In [None]:
# Unpack in the order of model.metrics_names: total loss, nine per-output
# losses (discarded), then nine per-output categorical accuracies.
loss, _, _, _, _, _, _, _, _, _, accActor, accAgency, accSector, accPlace, accYear, accMonth, accReason, accGoal, accStat = model.evaluate(
    [encoded_text],
    [encoded_actor, encoded_agency, encoded_sector, encoded_place,
     encoded_year, encoded_month, encoded_reason, encoded_goal, encoded_stat], verbose=0)

In [None]:
print('Overall Loss is: %f' % (loss))

# Per-output categorical accuracy as a percentage (accStat is not reported).
accuracy_by_output = (
    ('Actor', accActor),
    ('Agency', accAgency),
    ('Sector', accSector),
    ('Place', accPlace),
    ('Year', accYear),
    ('Month', accMonth),
    ('Reason', accReason),
    ('SDG', accGoal),
)
for output_name, accuracy in accuracy_by_output:
    print('%s Accuracy: %f' % (output_name, accuracy * 100))

Overall Loss is: 0.643880
Actor Accuracy: 70.984846
Agency Accuracy: 92.348486
Sector Accuracy: 56.818181
Place Accuracy: 86.609846
Year Accuracy: 81.287879
Month Accuracy: 68.806821
Reason Accuracy: 71.268940
SDG Accuracy: 73.295456


## 4.2. Categorical Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# The model returns nine arrays; the first eight are label probabilities and
# the last one (stat) is kept as returned.
*label_probabilities, stat_pred = model.predict(encoded_text, verbose=0)

# Binarise every label output at a 0.5 threshold.
(actor_pred, agency_pred, sector_pred, place_pred,
 year_pred, month_pred, reason_pred, goal_pred) = (
    np.where(probabilities >= 0.5, 1, 0) for probabilities in label_probabilities
)

In [None]:
# labActor holds one parsed label list per document, so `c in labActor`
# compared a class name with whole lists and never matched. Collect the
# individual names first, then keep the classes that occur in the test set.
actors_present = set()
for labels in labActor:
    actors_present.update([labels] if isinstance(labels, str) else labels)

actor_label = [c for c in labActor_tok.classes_ if c in actors_present]

In [None]:
# len(set(encoded_actor))

In [None]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    """Mean per-sample Jaccard overlap between true and predicted label sets.

    Args:
        y_true: 2-D indicator array (or nested list), one row per sample.
        y_pred: predictions in the same layout as ``y_true``.
        normalize: unused; kept so existing calls keep working.
        sample_weight: unused; kept so existing calls keep working.

    Returns:
        The mean over samples of |true AND pred| / |true OR pred|, where a
        sample with no true and no predicted labels scores 1. With zero samples
        the result is ``nan`` (mean of an empty list).
    """
    # Accept plain nested lists as well as arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    scores = []
    for i in range(y_true.shape[0]):
        true_labels = set(np.where(y_true[i])[0])
        pred_labels = set(np.where(y_pred[i])[0])
        if not true_labels and not pred_labels:
            # Nothing to predict and nothing predicted counts as a perfect match.
            scores.append(1)
        else:
            overlap = len(true_labels & pred_labels)
            scores.append(overlap / float(len(true_labels | pred_labels)))
    return np.mean(scores)

### 4.2.1. Actor Evaluation

In [None]:
# Per-class precision / recall / F1 / support for the actor output, as a dict.
actor_report = classification_report(encoded_actor, actor_pred, target_names=labActor_tok.classes_, digits=6, zero_division=False, output_dict=True)

# One row per report entry (classes plus the averaged rows), read by key
# rather than by the order of the inner dict's values.
report = [
    [name, scores['precision'], scores['recall'], scores['f1-score'], scores['support']]
    for name, scores in actor_report.items()
]

exportDF = pd.DataFrame(report, columns = ['Class', 'Precision', 'Recall', 'F1', 'Support'])
# Written to its own file: the per-class loop further down writes
# './Results/Actor_Score_Matrix.xlsx' too, which used to overwrite this report.
exportDF.to_excel('./Results/Actor_Classification_Report.xlsx')

In [None]:
# Multi-label summary metrics for the actor output.
actor_metrics = {
    'Hamming score': hamming_score(encoded_actor, actor_pred),
    'Hamming loss': sklearn.metrics.hamming_loss(encoded_actor, actor_pred),
    'Subset accuracy': sklearn.metrics.accuracy_score(encoded_actor, actor_pred, normalize=True, sample_weight=None),
}
for metric_name, metric_value in actor_metrics.items():
    print(f'{metric_name}: {metric_value}')

Hamming score: 0.6691130050505051
Hamming loss: 0.0004671265022652668
Subset accuracy: 0.6640151515151516


In [None]:
tot_score = []

for sub in labActor_tok.classes_:

    # Select the documents whose parsed actor list contains exactly this class.
    # `str.contains(sub)` interpreted the name as a regular expression and
    # matched substrings, so names containing characters such as '(' or names
    # embedded in longer names selected the wrong rows.
    subDF = normDF.loc[[sub in labels for labels in labActor]]

    cnt_report = len(subDF)
    if cnt_report <= 0:
        continue

    # Same preprocessing as the full test set: lower-case, truncate, tokenise, pad.
    inpText = subDF['text'].tolist()
    inpText = [' '.join(str(d).lower().split()[:max_word]) for d in inpText]
    inpText = texts_tok.texts_to_sequences(inpText)
    inpText = pad_sequences(inpText, maxlen=padding_len, padding='post', truncating='post')

    # Every selected document is scored against this single class only.
    outActor = labActor_tok.transform([[sub]] * cnt_report)

    preActor, preAgency, preSector, prePlace, preYear, preMonth, preReason, preGoal, preStat = model.predict(inpText, verbose=0)

    # NOTE(review): the cut-off is the highest probability of the FIRST document
    # only and is applied to every row; the other outputs use a fixed 0.5.
    # Confirm whether a per-row maximum was intended.
    preActor = np.where(preActor>=np.sort(preActor[0])[-1], 1, 0)

    acc_report = sklearn.metrics.accuracy_score(outActor, preActor, normalize=True)
    prc_report = sklearn.metrics.precision_score(outActor, preActor, average='micro')
    rcl_report = sklearn.metrics.recall_score(outActor, preActor,  average='micro')
    fsc_report = sklearn.metrics.f1_score(outActor, preActor,  average='micro')
    avg_report = (acc_report + prc_report + rcl_report + fsc_report) / 4

    # labels=[0, 1] keeps the matrix 2x2 even if one of the values is absent,
    # so the four-way unpack cannot fail.
    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(outActor.flatten(), preActor.flatten(), labels=[0, 1]).ravel()

    tot_score.append([sub, tn, fp, fn, tp, acc_report, prc_report, rcl_report, fsc_report, avg_report, cnt_report, outActor.argmax(axis=1).tolist(), preActor.argmax(axis=1).tolist()])

actor_score = pd.DataFrame(tot_score, columns = ['Actor', 'TN', 'FP', 'FN', 'TP', 'Accuracy', 'Precision', 'Recall', 'F1', 'Score', 'Support', 'True_Y', 'Pred_Y'])
actor_score.to_excel('./Results/Actor_Score_Matrix.xlsx')

  return func(self, *args, **kwargs)


### 4.2.2. Agency Evaluation

In [None]:
# Per-class precision / recall / F1 / support for the agency output, as a dict.
agency_report = classification_report(
    encoded_agency,
    agency_pred,
    target_names=labAgency_tok.classes_,
    digits=6,
    zero_division=False,
    output_dict=True,
)
print(agency_report)

{'Academic and Research Institution': {'precision': 0.9761904761904762, 'recall': 0.6612903225806451, 'f1-score': 0.7884615384615384, 'support': 62}, 'Government': {'precision': 0.9510268562401264, 'recall': 0.8905325443786982, 'f1-score': 0.9197860962566845, 'support': 1352}, 'International Organization': {'precision': 0.9264870931537598, 'recall': 0.938601478112564, 'f1-score': 0.9325049421067495, 'support': 1759}, 'Media': {'precision': 0.14516129032258066, 'recall': 0.6428571428571429, 'f1-score': 0.2368421052631579, 'support': 14}, 'Non-governmental Organization': {'precision': 0.9573333333333334, 'recall': 0.9270497094899935, 'f1-score': 0.9419481797310594, 'support': 1549}, 'Other': {'precision': 0.9565217391304348, 'recall': 0.7096774193548387, 'f1-score': 0.8148148148148149, 'support': 31}, 'Red Cross/Red Crescent Movement': {'precision': 0.9799196787148594, 'recall': 0.8888888888888888, 'f1-score': 0.9321872015281757, 'support': 549}, 'micro avg': {'precision': 0.937753721244

In [None]:
# One row per report entry (classes plus the averaged rows), read by key
# rather than by the order of the inner dict's values.
report = [
    [name, scores['precision'], scores['recall'], scores['f1-score'], scores['support']]
    for name, scores in agency_report.items()
]

exportDF = pd.DataFrame(report, columns = ['Class', 'Precision', 'Recall', 'F1', 'Support'])
# Written to its own file: the per-class loop further down writes
# './Results/Agency_Score_Matrix.xlsx' too, which used to overwrite this report.
exportDF.to_excel('./Results/Agency_Classification_Report.xlsx')

In [None]:
# Multi-label summary metrics for the agency output.
agency_metrics = {
    'Hamming score': hamming_score(encoded_agency, agency_pred),
    'Hamming loss': sklearn.metrics.hamming_loss(encoded_agency, agency_pred),
    'Subset accuracy': sklearn.metrics.accuracy_score(encoded_agency, agency_pred, normalize=True, sample_weight=None),
}
for metric_name, metric_value in agency_metrics.items():
    print(f'{metric_name}: {metric_value}')

Hamming score: 0.915719696969697
Hamming loss: 0.021293290043290045
Subset accuracy: 0.912689393939394


In [None]:
tot_score = []

for sub in labAgency_tok.classes_:

    # Select the documents whose parsed agency list contains exactly this class.
    # `str.contains(sub)` interpreted the name as a regular expression and
    # matched substrings, which can select the wrong rows.
    subDF = normDF.loc[[sub in labels for labels in labAgency]]

    cnt_report = len(subDF)
    if cnt_report <= 0:
        continue

    # Same preprocessing as the full test set: lower-case, truncate, tokenise, pad.
    inpText = subDF['text'].tolist()
    inpText = [' '.join(str(d).lower().split()[:max_word]) for d in inpText]
    inpText = texts_tok.texts_to_sequences(inpText)
    inpText = pad_sequences(inpText, maxlen=padding_len, padding='post', truncating='post')

    # Every selected document is scored against this single class only.
    outAgency = labAgency_tok.transform([[sub]] * cnt_report)

    preActor, preAgency, preSector, prePlace, preYear, preMonth, preReason, preGoal, preStat = model.predict(inpText, verbose=0)

    preAgency = np.where(preAgency>=0.5, 1, 0)

    acc_report = sklearn.metrics.accuracy_score(outAgency, preAgency, normalize=True)
    prc_report = sklearn.metrics.precision_score(outAgency, preAgency, average='micro')
    rcl_report = sklearn.metrics.recall_score(outAgency, preAgency, average='micro')
    fsc_report = sklearn.metrics.f1_score(outAgency, preAgency, average='micro')
    # Unweighted mean of the four scores; printed below, not exported.
    avg_report = (acc_report + prc_report + rcl_report + fsc_report) / 4

    tot_score.append([sub, acc_report, prc_report, rcl_report, fsc_report, cnt_report])

    print(sub, '\t', cnt_report, '\t', avg_report)

agency_score = pd.DataFrame(tot_score, columns = ['Agency', 'Accuracy', 'Precision', 'Recall', 'F1', 'Support'])
agency_score.to_excel('./Results/Agency_Score_Matrix.xlsx')

Academic and Research Institution 	 62 	 0.6874096891353588
Government 	 1352 	 0.8983390698664028
International Organization 	 1759 	 0.9438690210877856
Media 	 14 	 0.6428571428571429
Non-governmental Organization 	 1549 	 0.9336614927721272
Other 	 31 	 0.7184999118632116
Red Cross/Red Crescent Movement 	 549 	 0.8994846814847788


### 4.2.3. Sector Evaluation

In [None]:
# Per-class precision / recall / F1 / support for the sector output, as a dict.
sector_report = classification_report(
    encoded_sector,
    sector_pred,
    target_names=labSector_tok.classes_,
    digits=6,
    zero_division=False,
    output_dict=True,
)
print(sector_report)

{'Agriculture': {'precision': 0.9065420560747663, 'recall': 0.30842607313195547, 'f1-score': 0.46026097271648875, 'support': 629}, 'Climate Change and Environment': {'precision': 1.0, 'recall': 0.09302325581395349, 'f1-score': 0.1702127659574468, 'support': 43}, 'Contributions': {'precision': 0.911062906724512, 'recall': 0.42769857433808556, 'f1-score': 0.5821205821205822, 'support': 982}, 'Coordination': {'precision': 0.8209606986899564, 'recall': 0.29237947122861585, 'f1-score': 0.43119266055045874, 'support': 643}, 'Disaster Management': {'precision': 0.8620689655172413, 'recall': 0.3768844221105528, 'f1-score': 0.5244755244755245, 'support': 199}, 'Education': {'precision': 0.9213483146067416, 'recall': 0.2303370786516854, 'f1-score': 0.36853932584269666, 'support': 356}, 'Food and Nutrition': {'precision': 0.9275053304904051, 'recall': 0.27848911651728553, 'f1-score': 0.42836041358936483, 'support': 1562}, 'Gender': {'precision': 1.0, 'recall': 0.08333333333333333, 'f1-score': 0.1

In [None]:
# One row per report entry (classes plus the averaged rows), read by key
# rather than by the order of the inner dict's values.
report = [
    [name, scores['precision'], scores['recall'], scores['f1-score'], scores['support']]
    for name, scores in sector_report.items()
]

exportDF = pd.DataFrame(report, columns = ['Class', 'Precision', 'Recall', 'F1', 'Support'])
# Written to its own file: the per-class loop further down writes
# './Results/Sector_Score_Matrix.xlsx' too, which used to overwrite this report.
exportDF.to_excel('./Results/Sector_Classification_Report.xlsx')

In [None]:
# Multi-label summary metrics for the sector output.
sector_metrics = {
    'Hamming score': hamming_score(encoded_sector, sector_pred),
    'Hamming loss': sklearn.metrics.hamming_loss(encoded_sector, sector_pred),
    'Subset accuracy': sklearn.metrics.accuracy_score(encoded_sector, sector_pred, normalize=True, sample_weight=None),
}
for metric_name, metric_value in sector_metrics.items():
    print(f'{metric_name}: {metric_value}')

Hamming score: 0.4897220035856399
Hamming loss: 0.07511961722488038
Subset accuracy: 0.36363636363636365


In [None]:
tot_score = []

for sub in labSector_tok.classes_:

    # Select the documents whose parsed sector list contains exactly this class.
    # `str.contains(sub)` interpreted the name as a regular expression and
    # matched substrings, which can select the wrong rows.
    subDF = normDF.loc[[sub in labels for labels in labSector]]

    cnt_report = len(subDF)
    if cnt_report <= 0:
        continue

    # Same preprocessing as the full test set: lower-case, truncate, tokenise, pad.
    inpText = subDF['text'].tolist()
    inpText = [' '.join(str(d).lower().split()[:max_word]) for d in inpText]
    inpText = texts_tok.texts_to_sequences(inpText)
    inpText = pad_sequences(inpText, maxlen=padding_len, padding='post', truncating='post')

    # Every selected document is scored against this single class only.
    outSector = labSector_tok.transform([[sub]] * cnt_report)

    preActor, preAgency, preSector, prePlace, preYear, preMonth, preReason, preGoal, preStat = model.predict(inpText, verbose=0)

    preSector = np.where(preSector>=0.5, 1, 0)

    acc_report = sklearn.metrics.accuracy_score(outSector, preSector, normalize=True)
    prc_report = sklearn.metrics.precision_score(outSector, preSector, average='micro')
    rcl_report = sklearn.metrics.recall_score(outSector, preSector, average='micro')
    fsc_report = sklearn.metrics.f1_score(outSector, preSector, average='micro')
    # Unweighted mean of the four scores; computed but not exported.
    avg_report = (acc_report + prc_report + rcl_report + fsc_report) / 4

    tot_score.append([sub, acc_report, prc_report, rcl_report, fsc_report, cnt_report])

sector_score = pd.DataFrame(tot_score, columns = ['Sector', 'Accuracy', 'Precision', 'Recall', 'F1', 'Support'])
sector_score.to_excel('./Results/Sector_Score_Matrix.xlsx')

### 4.2.4. Place Evaluation

In [None]:
# Per-class precision / recall / F1 / support for the place output, as a dict.
place_report = classification_report(
    encoded_place,
    place_pred,
    target_names=labPlace_tok.classes_,
    digits=6,
    zero_division=False,
    output_dict=True,
)
print(place_report)

{'Afghanistan': {'precision': 0.898876404494382, 'recall': 0.7619047619047619, 'f1-score': 0.8247422680412371, 'support': 105}, 'Albania': {'precision': 1.0, 'recall': 0.6, 'f1-score': 0.7499999999999999, 'support': 5}, 'Algeria': {'precision': 0.7727272727272727, 'recall': 0.8095238095238095, 'f1-score': 0.7906976744186046, 'support': 21}, 'American Samoa': {'precision': 0.75, 'recall': 0.9, 'f1-score': 0.8181818181818182, 'support': 10}, 'Angola': {'precision': 0.7857142857142857, 'recall': 0.7333333333333333, 'f1-score': 0.7586206896551724, 'support': 30}, 'Anguilla': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}, 'Antigua and Barbuda': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}, 'Argentina': {'precision': 1.0, 'recall': 0.6, 'f1-score': 0.7499999999999999, 'support': 5}, 'Armenia': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1}, 'Aruba (The Netherlands)': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}, 'Au

In [None]:
# One row per report entry (classes plus the averaged rows), read by key
# rather than by the order of the inner dict's values.
report = [
    [name, scores['precision'], scores['recall'], scores['f1-score'], scores['support']]
    for name, scores in place_report.items()
]

exportDF = pd.DataFrame(report, columns = ['Class', 'Precision', 'Recall', 'F1', 'Support'])
# Written to its own file: the per-class loop further down writes
# './Results/Place_Score_Matrix.xlsx' too, which used to overwrite this report.
exportDF.to_excel('./Results/Place_Classification_Report.xlsx')

In [None]:
# Multi-label summary metrics for the place output.
place_metrics = {
    'Hamming score': hamming_score(encoded_place, place_pred),
    'Hamming loss': sklearn.metrics.hamming_loss(encoded_place, place_pred),
    'Subset accuracy': sklearn.metrics.accuracy_score(encoded_place, place_pred, normalize=True, sample_weight=None),
}
for metric_name, metric_value in place_metrics.items():
    print(f'{metric_name}: {metric_value}')

Hamming score: 0.8426136363636364
Hamming loss: 0.0011727496641289745
Subset accuracy: 0.8426136363636364


In [None]:
tot_score = []

for sub in labPlace_tok.classes_:

    # Select the documents whose parsed place list contains exactly this class.
    # `str.contains(sub)` interpreted the name as a regular expression and
    # matched substrings: 'Aruba (The Netherlands)' became a pattern with a
    # capture group, and a name that is part of a longer class name also
    # selected the longer one's documents.
    subDF = normDF.loc[[sub in labels for labels in labPlace]]

    cnt_report = len(subDF)
    if cnt_report <= 0:
        continue

    # Same preprocessing as the full test set: lower-case, truncate, tokenise, pad.
    inpText = subDF['text'].tolist()
    inpText = [' '.join(str(d).lower().split()[:max_word]) for d in inpText]
    inpText = texts_tok.texts_to_sequences(inpText)
    inpText = pad_sequences(inpText, maxlen=padding_len, padding='post', truncating='post')

    # Every selected document is scored against this single class only.
    outPlace = labPlace_tok.transform([[sub]] * cnt_report)

    preActor, preAgency, preSector, prePlace, preYear, preMonth, preReason, preGoal, preStat = model.predict(inpText, verbose=0)

    prePlace = np.where(prePlace>=0.5, 1, 0)

    acc_report = sklearn.metrics.accuracy_score(outPlace, prePlace, normalize=True)
    prc_report = sklearn.metrics.precision_score(outPlace, prePlace, average='micro')
    rcl_report = sklearn.metrics.recall_score(outPlace, prePlace, average='micro')
    fsc_report = sklearn.metrics.f1_score(outPlace, prePlace, average='micro')
    # Unweighted mean of the four scores; computed but not exported.
    avg_report = (acc_report + prc_report + rcl_report + fsc_report) / 4

    tot_score.append([sub, acc_report, prc_report, rcl_report, fsc_report, cnt_report])

place_score = pd.DataFrame(tot_score, columns = ['Place', 'Accuracy', 'Precision', 'Recall', 'F1', 'Support'])
place_score.to_excel('./Results/Place_Score_Matrix.xlsx')

  return func(self, *args, **kwargs)
  _warn_prf(average, modifier, msg_start, len(result))


### 4.2.5. Year Evaluation

In [None]:
# Per-class precision / recall / F1 / support for the year output, as a dict.
year_report = classification_report(
    encoded_year,
    year_pred,
    target_names=labYear_tok.classes_,
    digits=6,
    zero_division=False,
    output_dict=True,
)
print(year_report)

{'1995': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}, '1996': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}, '1997': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}, '1998': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 2}, '1999': {'precision': 0.9285714285714286, 'recall': 0.7647058823529411, 'f1-score': 0.8387096774193549, 'support': 17}, '2000': {'precision': 0.72, 'recall': 0.6666666666666666, 'f1-score': 0.6923076923076923, 'support': 27}, '2001': {'precision': 0.9423076923076923, 'recall': 0.7313432835820896, 'f1-score': 0.8235294117647058, 'support': 67}, '2002': {'precision': 0.9047619047619048, 'recall': 0.7916666666666666, 'f1-score': 0.8444444444444444, 'support': 24}, '2003': {'precision': 0.8953488372093024, 'recall': 0.7777777777777778, 'f1-score': 0.8324324324324325, 'support': 99}, '2004': {'precision': 0.875, 'recall': 0.7962085308056872, 'f1-score': 0.8337468982630273, 'support': 211}, '2005': {

In [None]:
# Flatten the classification report into one row per class / average entry.
# Metrics are looked up by key rather than by the order of `v.values()`, and
# scalar entries (such as 'accuracy' in single-label reports) are skipped
# because they have no precision/recall/F1/support breakdown.
report = [
    [k, v['precision'], v['recall'], v['f1-score'], v['support']]
    for k, v in year_report.items()
    if isinstance(v, dict)
]

exportDF = pd.DataFrame(report, columns = ['Class', 'Precision', 'Recall', 'F1', 'Support'])

# Written to its own workbook: the per-class loop later in this section exports
# to './Results/Year_Score_Matrix.xlsx', which used to overwrite this table.
exportDF.to_excel('./Results/Year_Report_Matrix.xlsx')

In [None]:
# Summary metrics for the year head, computed first and printed afterwards.
year_hamming_score = hamming_score(encoded_year, year_pred)
year_hamming_loss = sklearn.metrics.hamming_loss(encoded_year, year_pred)
year_subset_accuracy = sklearn.metrics.accuracy_score(
    encoded_year, year_pred, normalize=True, sample_weight=None
)

print('Hamming score: {0}'.format(year_hamming_score))
print('Hamming loss: {0}'.format(year_hamming_loss))
print('Subset accuracy: {0}'.format(year_subset_accuracy))

Hamming score: 0.7839015151515152
Hamming loss: 0.015762741046831955
Subset accuracy: 0.7839015151515152


In [None]:
tot_score = []

# Label cells are stringified Python lists; parse them once so that each class
# can be matched by exact membership. The previous `str.contains(sub)` treated
# the class name as a regular expression and matched substrings.
year_labels = normDF['year_created'].map(ast.literal_eval)

for sub in labYear_tok.classes_:

    # Rows whose label list contains exactly this class.
    has_label = year_labels.map(lambda labels: sub in labels).astype(bool)
    subDF = normDF.loc[has_label]

    cnt_report = len(subDF)
    if cnt_report <= 0:
        continue

    # Lowercase, keep the first `max_word` tokens, tokenise and post-pad.
    inpText = subDF['text'].tolist()
    inpText = [' '.join(str(d).lower().split()[:max_word]) for d in inpText]
    inpText = texts_tok.texts_to_sequences(inpText)
    inpText = pad_sequences(inpText, maxlen=padding_len, padding='post', truncating='post')

    # Expected output: only the class under evaluation, for every selected row.
    outYear = labYear_tok.transform([[sub]] * cnt_report)

    # The model returns one array per output head; only the year head is scored here.
    preActor, preAgency, preSector, prePlace, preYear, preMonth, preReason, preGoal, preStat = model.predict(inpText, verbose=0)

    preYear = np.where(preYear >= 0.5, 1, 0)

    # zero_division=0 yields the same scores as the default but without the warning.
    acc_report = sklearn.metrics.accuracy_score(outYear, preYear, normalize=True)
    prc_report = sklearn.metrics.precision_score(outYear, preYear, average='micro', zero_division=0)
    rcl_report = sklearn.metrics.recall_score(outYear, preYear, average='micro', zero_division=0)
    fsc_report = sklearn.metrics.f1_score(outYear, preYear, average='micro', zero_division=0)

    tot_score.append([sub, acc_report, prc_report, rcl_report, fsc_report, cnt_report])

year_score = pd.DataFrame(tot_score, columns = ['Year', 'Accuracy', 'Precision', 'Recall', 'F1', 'Support'])
year_score.to_excel('./Results/Year_Score_Matrix.xlsx')

  _warn_prf(average, modifier, msg_start, len(result))


### 2.4.6. Month Evaluation

In [None]:
# Per-class precision / recall / F1 / support for the month head, as a nested dict.
month_report = classification_report(
    encoded_month,
    month_pred,
    target_names=labMonth_tok.classes_,
    digits=6,
    zero_division=False,
    output_dict=True,
)
print(month_report)

{'April': {'precision': 0.7448979591836735, 'recall': 0.6441176470588236, 'f1-score': 0.6908517350157729, 'support': 340}, 'August': {'precision': 0.7931034482758621, 'recall': 0.6680497925311203, 'f1-score': 0.7252252252252254, 'support': 482}, 'December': {'precision': 0.7514285714285714, 'recall': 0.6159250585480094, 'f1-score': 0.6769626769626771, 'support': 427}, 'February': {'precision': 0.7539936102236422, 'recall': 0.5841584158415841, 'f1-score': 0.6582984658298465, 'support': 404}, 'January': {'precision': 0.8024691358024691, 'recall': 0.6782608695652174, 'f1-score': 0.7351555136663525, 'support': 575}, 'July': {'precision': 0.759075907590759, 'recall': 0.6149732620320856, 'f1-score': 0.6794682422451994, 'support': 374}, 'June': {'precision': 0.8295964125560538, 'recall': 0.5623100303951368, 'f1-score': 0.6702898550724639, 'support': 329}, 'March': {'precision': 0.7355623100303952, 'recall': 0.6019900497512438, 'f1-score': 0.6621067031463749, 'support': 402}, 'May': {'precisio

In [None]:
# Flatten the classification report into one row per class / average entry.
# Metrics are looked up by key rather than by the order of `v.values()`, and
# scalar entries (such as 'accuracy' in single-label reports) are skipped
# because they have no precision/recall/F1/support breakdown.
report = [
    [k, v['precision'], v['recall'], v['f1-score'], v['support']]
    for k, v in month_report.items()
    if isinstance(v, dict)
]

exportDF = pd.DataFrame(report, columns = ['Class', 'Precision', 'Recall', 'F1', 'Support'])

# Written to its own workbook: the per-class loop later in this section exports
# to './Results/Month_Score_Matrix.xlsx', which used to overwrite this table.
exportDF.to_excel('./Results/Month_Report_Matrix.xlsx')

In [None]:
# Summary metrics for the month head, computed first and printed afterwards.
month_hamming_score = hamming_score(encoded_month, month_pred)
month_hamming_loss = sklearn.metrics.hamming_loss(encoded_month, month_pred)
month_subset_accuracy = sklearn.metrics.accuracy_score(
    encoded_month, month_pred, normalize=True, sample_weight=None
)

print('Hamming score: {0}'.format(month_hamming_score))
print('Hamming loss: {0}'.format(month_hamming_loss))
print('Subset accuracy: {0}'.format(month_subset_accuracy))

Hamming score: 0.6414772727272727
Hamming loss: 0.04655934343434343
Subset accuracy: 0.6414772727272727


In [None]:
tot_score = []

# Label cells are stringified Python lists; parse them once so that each class
# can be matched by exact membership. The previous `str.contains(sub)` treated
# the class name as a regular expression and matched substrings.
month_labels = normDF['month_created'].map(ast.literal_eval)

for sub in labMonth_tok.classes_:

    # Rows whose label list contains exactly this class.
    has_label = month_labels.map(lambda labels: sub in labels).astype(bool)
    subDF = normDF.loc[has_label]

    cnt_report = len(subDF)
    if cnt_report <= 0:
        continue

    # Lowercase, keep the first `max_word` tokens, tokenise and post-pad.
    inpText = subDF['text'].tolist()
    inpText = [' '.join(str(d).lower().split()[:max_word]) for d in inpText]
    inpText = texts_tok.texts_to_sequences(inpText)
    inpText = pad_sequences(inpText, maxlen=padding_len, padding='post', truncating='post')

    # Expected output: only the class under evaluation, for every selected row.
    outMonth = labMonth_tok.transform([[sub]] * cnt_report)

    # The model returns one array per output head; only the month head is scored here.
    preActor, preAgency, preSector, prePlace, preYear, preMonth, preReason, preGoal, preStat = model.predict(inpText, verbose=0)

    preMonth = np.where(preMonth >= 0.5, 1, 0)

    # zero_division=0 yields the same scores as the default but without the warning.
    acc_report = sklearn.metrics.accuracy_score(outMonth, preMonth, normalize=True)
    prc_report = sklearn.metrics.precision_score(outMonth, preMonth, average='micro', zero_division=0)
    rcl_report = sklearn.metrics.recall_score(outMonth, preMonth, average='micro', zero_division=0)
    fsc_report = sklearn.metrics.f1_score(outMonth, preMonth, average='micro', zero_division=0)

    tot_score.append([sub, acc_report, prc_report, rcl_report, fsc_report, cnt_report])

month_score = pd.DataFrame(tot_score, columns = ['Month', 'Accuracy', 'Precision', 'Recall', 'F1', 'Support'])
month_score.to_excel('./Results/Month_Score_Matrix.xlsx')

### 2.4.7. Reason Evaluation

In [None]:
# Per-class precision / recall / F1 / support for the reason head, as a nested dict.
reason_report = classification_report(
    encoded_reason,
    reason_pred,
    target_names=labReason_tok.classes_,
    digits=6,
    zero_division=False,
    output_dict=True,
)
print(reason_report)

{'Cold Wave': {'precision': 0.918918918918919, 'recall': 0.7083333333333334, 'f1-score': 0.8000000000000002, 'support': 48}, 'Drought': {'precision': 0.9593810444874274, 'recall': 0.5714285714285714, 'f1-score': 0.716245487364621, 'support': 868}, 'Earthquake': {'precision': 0.9768888888888889, 'recall': 0.7616077616077616, 'f1-score': 0.8559190031152648, 'support': 1443}, 'Epidemic': {'precision': 0.9852125693160814, 'recall': 0.8766447368421053, 'f1-score': 0.927763272410792, 'support': 608}, 'Extratropical Cyclone': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}, 'Fire': {'precision': 1.0, 'recall': 0.3333333333333333, 'f1-score': 0.5, 'support': 3}, 'Flash Flood': {'precision': 0.9821428571428571, 'recall': 0.07333333333333333, 'f1-score': 0.13647642679900743, 'support': 750}, 'Flood': {'precision': 0.9580908032596042, 'recall': 0.4824150058616647, 'f1-score': 0.6417153996101365, 'support': 1706}, 'Heat Wave': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'su

In [None]:
# Flatten the classification report into one row per class / average entry.
# Metrics are looked up by key rather than by the order of `v.values()`, and
# scalar entries (such as 'accuracy' in single-label reports) are skipped
# because they have no precision/recall/F1/support breakdown.
report = [
    [k, v['precision'], v['recall'], v['f1-score'], v['support']]
    for k, v in reason_report.items()
    if isinstance(v, dict)
]

exportDF = pd.DataFrame(report, columns = ['Class', 'Precision', 'Recall', 'F1', 'Support'])

# Written to its own workbook: the per-class loop later in this section exports
# to './Results/Reason_Score_Matrix.xlsx', which used to overwrite this table.
exportDF.to_excel('./Results/Reason_Report_Matrix.xlsx')

In [None]:
# Summary metrics for the reason head, computed first and printed afterwards.
reason_hamming_score = hamming_score(encoded_reason, reason_pred)
reason_hamming_loss = sklearn.metrics.hamming_loss(encoded_reason, reason_pred)
reason_subset_accuracy = sklearn.metrics.accuracy_score(
    encoded_reason, reason_pred, normalize=True, sample_weight=None
)

print('Hamming score: {0}'.format(reason_hamming_score))
print('Hamming loss: {0}'.format(reason_hamming_loss))
print('Subset accuracy: {0}'.format(reason_subset_accuracy))

Hamming score: 0.6574273989898989
Hamming loss: 0.04551767676767677
Subset accuracy: 0.5234848484848484


In [None]:
tot_score = []

# Label cells are stringified Python lists; parse them once so that each class
# can be matched by exact membership. The previous `str.contains(sub)` was a
# regular-expression substring match, so the class 'Flood' also selected rows
# that were only labelled 'Flash Flood'.
reason_labels = normDF['disaster_type'].map(ast.literal_eval)

for sub in labReason_tok.classes_:

    # Rows whose label list contains exactly this class.
    has_label = reason_labels.map(lambda labels: sub in labels).astype(bool)
    subDF = normDF.loc[has_label]

    cnt_report = len(subDF)
    if cnt_report <= 0:
        continue

    # Lowercase, keep the first `max_word` tokens, tokenise and post-pad.
    inpText = subDF['text'].tolist()
    inpText = [' '.join(str(d).lower().split()[:max_word]) for d in inpText]
    inpText = texts_tok.texts_to_sequences(inpText)
    inpText = pad_sequences(inpText, maxlen=padding_len, padding='post', truncating='post')

    # Expected output: only the class under evaluation, for every selected row.
    outReason = labReason_tok.transform([[sub]] * cnt_report)

    # The model returns one array per output head; only the reason head is scored here.
    preActor, preAgency, preSector, prePlace, preYear, preMonth, preReason, preGoal, preStat = model.predict(inpText, verbose=0)

    preReason = np.where(preReason >= 0.5, 1, 0)

    # zero_division=0 yields the same scores as the default but without the warning.
    acc_report = sklearn.metrics.accuracy_score(outReason, preReason, normalize=True)
    prc_report = sklearn.metrics.precision_score(outReason, preReason, average='micro', zero_division=0)
    rcl_report = sklearn.metrics.recall_score(outReason, preReason, average='micro', zero_division=0)
    fsc_report = sklearn.metrics.f1_score(outReason, preReason, average='micro', zero_division=0)

    tot_score.append([sub, acc_report, prc_report, rcl_report, fsc_report, cnt_report])

reason_score = pd.DataFrame(tot_score, columns = ['Reason', 'Accuracy', 'Precision', 'Recall', 'F1', 'Support'])
reason_score.to_excel('./Results/Reason_Score_Matrix.xlsx')

### 2.4.8. SDG Evaluation

In [None]:
# Per-class precision / recall / F1 / support for the SDG head, as a nested dict.
goal_report = classification_report(
    encoded_goal,
    goal_pred,
    target_names=labGoal_tok.classes_,
    digits=6,
    zero_division=False,
    output_dict=True,
)
print(goal_report)

{'Affordable and Clean Energy': {'precision': 1.0, 'recall': 0.6666666666666666, 'f1-score': 0.8, 'support': 3}, 'Clean Water and Sanitation': {'precision': 0.8538011695906432, 'recall': 0.5816733067729084, 'f1-score': 0.6919431279620853, 'support': 251}, 'Climate Action': {'precision': 0.7568807339449541, 'recall': 0.6066176470588235, 'f1-score': 0.673469387755102, 'support': 272}, 'Decent Work and Economic Growth': {'precision': 0.7575757575757576, 'recall': 0.5813953488372093, 'f1-score': 0.6578947368421053, 'support': 43}, 'Gender Equality': {'precision': 1.0, 'recall': 0.2, 'f1-score': 0.33333333333333337, 'support': 5}, 'Good Health and Well-being': {'precision': 0.7573415765069552, 'recall': 0.7401812688821753, 'f1-score': 0.7486631016042782, 'support': 662}, 'Industry Innovation and Infrastructure': {'precision': 0.7105966162065895, 'recall': 0.6779949022939677, 'f1-score': 0.6939130434782609, 'support': 1177}, 'Life Below Water': {'precision': 0.7007299270072993, 'recall': 0.5

In [None]:
# Flatten the classification report into one row per class / average entry.
# Metrics are looked up by key rather than by the order of `v.values()`, and
# scalar entries (such as 'accuracy' in single-label reports) are skipped
# because they have no precision/recall/F1/support breakdown.
report = [
    [k, v['precision'], v['recall'], v['f1-score'], v['support']]
    for k, v in goal_report.items()
    if isinstance(v, dict)
]

exportDF = pd.DataFrame(report, columns = ['Class', 'Precision', 'Recall', 'F1', 'Support'])

# Written to its own workbook: the per-class loop later in this section exports
# to './Results/Goal_Score_Matrix.xlsx', which used to overwrite this table.
exportDF.to_excel('./Results/Goal_Report_Matrix.xlsx')

In [None]:
# Summary metrics for the SDG head, computed first and printed afterwards.
goal_hamming_score = hamming_score(encoded_goal, goal_pred)
goal_hamming_loss = sklearn.metrics.hamming_loss(encoded_goal, goal_pred)
goal_subset_accuracy = sklearn.metrics.accuracy_score(
    encoded_goal, goal_pred, normalize=True, sample_weight=None
)

print('Hamming score: {0}'.format(goal_hamming_score))
print('Hamming loss: {0}'.format(goal_hamming_loss))
print('Subset accuracy: {0}'.format(goal_subset_accuracy))

Hamming score: 0.6948863636363637
Hamming loss: 0.03179450757575757
Subset accuracy: 0.6948863636363637


In [None]:
tot_score = []

# Label cells are stringified Python lists; parse them once so that each class
# can be matched by exact membership. The previous `str.contains(sub)` treated
# the class name as a regular expression and matched substrings.
goal_labels = normDF['development_goal'].map(ast.literal_eval)

for sub in labGoal_tok.classes_:

    # Rows whose label list contains exactly this class.
    has_label = goal_labels.map(lambda labels: sub in labels).astype(bool)
    subDF = normDF.loc[has_label]

    cnt_report = len(subDF)
    if cnt_report <= 0:
        continue

    # Lowercase, keep the first `max_word` tokens, tokenise and post-pad.
    inpText = subDF['text'].tolist()
    inpText = [' '.join(str(d).lower().split()[:max_word]) for d in inpText]
    inpText = texts_tok.texts_to_sequences(inpText)
    inpText = pad_sequences(inpText, maxlen=padding_len, padding='post', truncating='post')

    # Expected output: only the class under evaluation, for every selected row.
    outGoal = labGoal_tok.transform([[sub]] * cnt_report)

    # The model returns one array per output head; only the SDG head is scored here.
    preActor, preAgency, preSector, prePlace, preYear, preMonth, preReason, preGoal, preStat = model.predict(inpText, verbose=0)

    preGoal = np.where(preGoal >= 0.5, 1, 0)

    # zero_division=0 yields the same scores as the default but without the warning.
    acc_report = sklearn.metrics.accuracy_score(outGoal, preGoal, normalize=True)
    prc_report = sklearn.metrics.precision_score(outGoal, preGoal, average='micro', zero_division=0)
    rcl_report = sklearn.metrics.recall_score(outGoal, preGoal, average='micro', zero_division=0)
    fsc_report = sklearn.metrics.f1_score(outGoal, preGoal, average='micro', zero_division=0)

    tot_score.append([sub, acc_report, prc_report, rcl_report, fsc_report, cnt_report])

goal_score = pd.DataFrame(tot_score, columns = ['Goal', 'Accuracy', 'Precision', 'Recall', 'F1', 'Support'])
goal_score.to_excel('./Results/Goal_Score_Matrix.xlsx')

# 3. Results Testing

## 3.1. Internal Test (ReliefWeb)

In [None]:
# Pull the text column and the label columns out of normDF as plain Python lists.
# NOTE(review): randMonth is read from 'date_created' while the other cells in
# this notebook use 'month_created' for months — confirm this is intended.
(docs, randActor, randAgency, randSector,
 randPlace, randMonth, randReason, randGoal) = (
    normDF[column].tolist()
    for column in (
        'common_words',
        'source_name',
        'source_type',
        'theme',
        'country_name',
        'date_created',
        'disaster_type',
        'development_goal',
    )
)

In [None]:
print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
print('This is a data sub-set of 20 instances randomly picked from the main set used to train this model\n')
print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')

# Draw the 20 random rows announced in the banner.
randDF = normDF.sample(n = 20)

# One entry per classification head:
# (display name, label column in normDF, fitted label binarizer).
# The order matches the order in which the model outputs are collected below.
heads = [
    ('Actor', 'source_name', labActor_tok),
    ('Agency', 'source_type', labAgency_tok),
    ('Sector', 'theme', labSector_tok),
    ('Place', 'country_name', labPlace_tok),
    ('Year', 'year_created', labYear_tok),
    ('Month', 'month_created', labMonth_tok),
    ('Reason', 'disaster_type', labReason_tok),
    ('SDG', 'development_goal', labGoal_tok),
]

for idx, row in enumerate(randDF.itertuples()):

    actText = str(row.text)

    # Post-padding/truncation, consistent with the evaluation cells of section 2.4
    # (the Keras defaults pad and truncate at the front of the sequence instead).
    preText = texts_tok.texts_to_sequences([actText])
    preText = pad_sequences(preText, maxlen=padding_len, padding='post', truncating='post')

    outActor, outAgency, outSector, outPlace, outYear, outMonth, outReason, outGoal, outStat = model.predict(preText, verbose=0)
    outputs = [outActor, outAgency, outSector, outPlace, outYear, outMonth, outReason, outGoal]

    print(f'The text used in sample [{idx}] is: {actText[:80]} ......')
    print(f'Overall confidence in this prediction is: {round(outStat[0][0]*100, 2)}%\n')

    for (name, column, lab_tok), out in zip(heads, outputs):

        # Label cells are stringified Python lists; literal_eval parses them
        # without executing arbitrary code the way eval() would.
        actual = ast.literal_eval(getattr(row, column))

        # Keep as many top-scoring classes as there are distinct actual labels,
        # but at least one: an index of -0 would pick the lowest score as the
        # threshold and therefore select every class.
        top_k = max(1, len(set(actual)))
        ranked = np.sort(out[0])
        predicted = lab_tok.inverse_transform(np.where(out >= ranked[-top_k], 1, 0))

        print(f'Actual {name} for this disaster is: {actual}')
        print(f'Predicted {name} for this disaster is: {predicted}')
        print(f'Confidence in this prediction is: {round(ranked[-1] * 100, 2)}%\n')

    print('------------------------------------------------------------------------------------------------\n')


++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

This is a data sub-set of 20 instances randomly picked from the main set used to train this model

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

The text used in sample [0] is: funds sent national council churches philippines nccp amount sent us 50000 date  ......
Overall confidence in this prediction is: 99.04%

Actual Actor for this disaster is: ['Action by Churches Together International']
Predicted Actor for this disaster is: [('Government of the Philippines',)]
Confidence in this prediction is: 72.77%

Actual Agency for this disaster is: ['Non-governmental Organization']
Predicted Agency for this disaster is: [('Non-governmental Organization',)]
Confidence in this prediction is: 57.17%

Actual Sector for this disaster is: ['Shelter and Non-Food Items', 'Food and Nutrition', 'Water Sanitation Hygiene', 'Contributions']
Predicted S

## 3.2. External Test (Twitter)

In [None]:
# CSV with the tweets for the external test; the test loop reads its `tweet` column.
tweetDT = './Inputs/SEmHuS_Random_Humanitarian_Tweets.csv'
tweetDF = pd.read_csv(tweetDT)

In [None]:
print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
print('This is a data sub-set of 20 humanitarian tweets randomly picked from Twitter to test the model\n')
print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')

for idx, row in enumerate(tweetDF.itertuples()):

    # Lowercase and clean the raw tweet.
    actText = preprocess.clean_text(str(row.tweet).lower())

    # Repeat the cleaned tweet and keep the first `padding_len` words.
    words = ((actText + ' ') * padding_len).split()
    preText = ' '.join(words[:padding_len])

    preText = np.array(texts_tok.texts_to_sequences([preText]))
    preText = pad_sequences(preText, maxlen=padding_len, padding='post', truncating='post')

    outActor, outAgency, outSector, outPlace, outYear, outMonth, outReason, outGoal, outStat = model.predict(preText, verbose=0)

    # Highest score of each head: reported as the confidence and used as the
    # selection threshold for the predicted label(s).
    topActor = np.sort(outActor[0])[-1]
    topAgency = np.sort(outAgency[0])[-1]
    topSector = np.sort(outSector[0])[-1]
    topPlace = np.sort(outPlace[0])[-1]
    topYear = np.sort(outYear[0])[-1]
    topMonth = np.sort(outMonth[0])[-1]
    topReason = np.sort(outReason[0])[-1]
    topGoal = np.sort(outGoal[0])[-1]

    preActor = labActor_tok.inverse_transform(np.where(outActor >= topActor, 1, 0))
    preAgency = labAgency_tok.inverse_transform(np.where(outAgency >= topAgency, 1, 0))
    preSector = labSector_tok.inverse_transform(np.where(outSector >= topSector, 1, 0))
    prePlace = labPlace_tok.inverse_transform(np.where(outPlace >= topPlace, 1, 0))
    preYear = labYear_tok.inverse_transform(np.where(outYear >= topYear, 1, 0))
    preMonth = labMonth_tok.inverse_transform(np.where(outMonth >= topMonth, 1, 0))
    preReason = labReason_tok.inverse_transform(np.where(outReason >= topReason, 1, 0))
    preGoal = labGoal_tok.inverse_transform(np.where(outGoal >= topGoal, 1, 0))

    print(f'The text used in this sample is: {actText} ......')
    print(f'Overall confidence in this prediction is: {round(outStat[0][0]*100, 2)}%\n')

    print(f'Predicted Actor for this question is:\t{preActor} | Confidence: {round(topActor * 100, 2)}%')
    print(f'Predicted Agency for this question is:\t{preAgency} | Confidence: {round(topAgency * 100, 2)}%')
    print(f'Predicted Sector for this question is:\t{preSector} | Confidence: {round(topSector * 100, 2)}%')
    print(f'Predicted Place for this question is:\t{prePlace} | Confidence: {round(topPlace * 100, 2)}%')
    print(f'Predicted Year for this question is:\t{preYear} | Confidence: {round(topYear * 100, 2)}%')
    print(f'Predicted Month for this question is:\t{preMonth} | Confidence: {round(topMonth * 100, 2)}%')
    print(f'Predicted Reason for this question is:\t{preReason} | Confidence: {round(topReason * 100, 2)}%')
    print(f'Predicted SDG for this question is:\t{preGoal} | Confidence: {round(topGoal * 100, 2)}%')
    print('------------------------------------------------------------------------------------------------\n')

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

This is a data sub-set of 20 humanitarian tweets randomly picked from Twitter to test the model

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

The text used in this sample is: improving refugee camp infrastructure responding growing humanitarian crisis students show skills knowledge work ethic build better lebanon . 🇱🇧 httpsanera . orgstorieson-the-job-training-students-communities-lebanon uniceflebanon education jobs relief lebanonprotests ......
Overall confidence in this prediction is: 98.71%

Predicted Actor for this question is:	[("SOS Children's Villages International",)] | Confidence: 64.99%
Predicted Agency for this question is:	[('Non-governmental Organization',)] | Confidence: 91.83%
Predicted Sector for this question is:	[('Education',)] | Confidence: 64.59%
Predicted Place for this question is:	[('Lebanon',)] | Confidence:

## 3.3. Competency Questions

In [None]:
# CSV with the competency questions; the test loop reads its `question` column.
questionFN = './Inputs/SEmHuS_Random_Competency_Question.csv'
questionDT = pd.read_csv(questionFN)

In [None]:
# Output path and empty frame that collects one row of predictions per question.
# The competency loop fills it in the order actor, agency, sector, place, year,
# month, reason, SDG (so 'affiliation' holds the Agency prediction) and then
# writes it to answerFN.
answerFN = './Inputs/SEmHuS_Random_Competency_Answer.csv'
answerDT = pd.DataFrame(columns=['actor', 'affiliation', 'sector', 'place', 'year', 'month', 'reason', 'SDG'])

In [None]:
print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
print('This is a data sub-set of 10 Competency Questions to test the model\n')
print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')

for idx, row in enumerate(questionDT.itertuples()):

    # Lowercase and clean the raw question.
    actText = preprocess.clean_text(str(row.question).lower())

    # Repeat the cleaned question and keep the first `padding_len` words.
    words = ((actText + ' ') * padding_len).split()
    preText = ' '.join(words[:padding_len])

    # Post-padding/truncation, consistent with the Twitter and free-text cells.
    # Out-of-vocabulary words are dropped by the tokenizer, so the sequence can
    # be shorter than `padding_len` and the padding side does matter.
    preText = np.array(texts_tok.texts_to_sequences([preText]))
    preText = pad_sequences(preText, maxlen=padding_len, padding='post', truncating='post')

    outActor, outAgency, outSector, outPlace, outYear, outMonth, outReason, outGoal, outStat = model.predict(preText, verbose=0)

    # Highest score of each head: reported as the confidence and used as the
    # selection threshold for the predicted label(s).
    topActor = np.sort(outActor[0])[-1]
    topAgency = np.sort(outAgency[0])[-1]
    topSector = np.sort(outSector[0])[-1]
    topPlace = np.sort(outPlace[0])[-1]
    topYear = np.sort(outYear[0])[-1]
    topMonth = np.sort(outMonth[0])[-1]
    topReason = np.sort(outReason[0])[-1]
    topGoal = np.sort(outGoal[0])[-1]

    preActor = labActor_tok.inverse_transform(np.where(outActor >= topActor, 1, 0))
    preAgency = labAgency_tok.inverse_transform(np.where(outAgency >= topAgency, 1, 0))
    preSector = labSector_tok.inverse_transform(np.where(outSector >= topSector, 1, 0))
    prePlace = labPlace_tok.inverse_transform(np.where(outPlace >= topPlace, 1, 0))
    preYear = labYear_tok.inverse_transform(np.where(outYear >= topYear, 1, 0))
    preMonth = labMonth_tok.inverse_transform(np.where(outMonth >= topMonth, 1, 0))
    preReason = labReason_tok.inverse_transform(np.where(outReason >= topReason, 1, 0))
    preGoal = labGoal_tok.inverse_transform(np.where(outGoal >= topGoal, 1, 0))

    print(f'The text used in this sample is: {actText} ......')
    print(f'Overall confidence in this prediction is: {round(outStat[0][0]*100, 2)}%\n')

    print(f'Predicted Actor is:\t{preActor} | Confidence: {round(topActor * 100, 2)}%')
    print(f'Predicted Agency is:\t{preAgency} | Confidence: {round(topAgency * 100, 2)}%')
    print(f'Predicted Sector is:\t{preSector} | Confidence: {round(topSector * 100, 2)}%')
    print(f'Predicted Place is:\t{prePlace} | Confidence: {round(topPlace * 100, 2)}%')
    print(f'Predicted Year is:\t{preYear} | Confidence: {round(topYear * 100, 2)}%')
    print(f'Predicted Month is:\t{preMonth} | Confidence: {round(topMonth * 100, 2)}%')
    print(f'Predicted Reason is:\t{preReason} | Confidence: {round(topReason * 100, 2)}%')
    print(f'Predicted SDG is:\t{preGoal} | Confidence: {round(topGoal * 100, 2)}%')
    print('\n------------------------------------------------------------------------------------------------\n')

    # One answer row per question, in the column order of answerDT.
    answerDT.loc[idx] = [preActor, preAgency, preSector, prePlace, preYear, preMonth, preReason, preGoal]

answerDT.to_csv(answerFN)


++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

This is a data sub-set of 10 Competency Questions to test the model

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

The text used in this sample is: country earthquake 2007 ......
Overall confidence in this prediction is: 98.99%

Predicted Actor is:	[('International Organization for Migration',)] | Confidence: 99.38%
Predicted Agency is:	[('International Organization',)] | Confidence: 98.27%
Predicted Sector is:	[('Shelter and Non-Food Items',)] | Confidence: 46.85%
Predicted Place is:	[('Uganda',)] | Confidence: 58.07%
Predicted Year is:	[('2007',)] | Confidence: 96.38%
Predicted Month is:	[('May',)] | Confidence: 59.82%
Predicted Reason is:	[('Earthquake',)] | Confidence: 99.11%
Predicted SDG is:	[('Sustainable Cities and Communities',)] | Confidence: 54.39%

---------------------------------------------------------------------------

## 3.4. Random Questions

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
print('Free text query to try the model and assess the quality of the answers')
print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')

# num_answer = 5

actText = 'Pacific Tropical Cyclone unicef'

# Repeat the query and keep the first `padding_len` words, then clean the result.
words = ((actText + ' ') * padding_len).split()
padText = ' '.join(words[:padding_len])
padText = preprocess.clean_text(padText)

preText = np.array(texts_tok.texts_to_sequences([padText]))
preText = pad_sequences(preText, maxlen=padding_len, padding='post', truncating='post')

outActor, outAgency, outSector, outPlace, outYear, outMonth, outReason, outGoal, outStat = model.predict(preText, verbose=0)

# Highest score of each head: reported as the confidence and used as the
# selection threshold for the predicted label(s).
topActor = np.sort(outActor[0])[-1]
topAgency = np.sort(outAgency[0])[-1]
topSector = np.sort(outSector[0])[-1]
topPlace = np.sort(outPlace[0])[-1]
topYear = np.sort(outYear[0])[-1]
topMonth = np.sort(outMonth[0])[-1]
topReason = np.sort(outReason[0])[-1]
topGoal = np.sort(outGoal[0])[-1]

preActor = labActor_tok.inverse_transform(np.where(outActor >= topActor, 1, 0))
preAgency = labAgency_tok.inverse_transform(np.where(outAgency >= topAgency, 1, 0))
preSector = labSector_tok.inverse_transform(np.where(outSector >= topSector, 1, 0))
prePlace = labPlace_tok.inverse_transform(np.where(outPlace >= topPlace, 1, 0))
preYear = labYear_tok.inverse_transform(np.where(outYear >= topYear, 1, 0))
preMonth = labMonth_tok.inverse_transform(np.where(outMonth >= topMonth, 1, 0))
preReason = labReason_tok.inverse_transform(np.where(outReason >= topReason, 1, 0))
preGoal = labGoal_tok.inverse_transform(np.where(outGoal >= topGoal, 1, 0))

print(f'The text used in this example is: {actText} ......')
print(f'Overall confidence in this prediction is: {round(outStat[0][0]*100, 2)}%\n')

print(f'Predicted Actor is:\t{preActor} | Confidence: {round(topActor * 100, 2)}%')
print(f'Predicted Agency is:\t{preAgency} | Confidence: {round(topAgency * 100, 2)}%')
print(f'Predicted Sector is:\t{preSector} | Confidence: {round(topSector * 100, 2)}%')
print(f'Predicted Place is:\t{prePlace} | Confidence: {round(topPlace * 100, 2)}%')
print(f'Predicted Year is:\t{preYear} | Confidence: {round(topYear * 100, 2)}%')
print(f'Predicted Month is:\t{preMonth} | Confidence: {round(topMonth * 100, 2)}%')
print(f'Predicted Reason is:\t{preReason} | Confidence: {round(topReason * 100, 2)}%')
print(f'Predicted SDG is:\t{preGoal} | Confidence: {round(topGoal * 100, 2)}%')
print('\n--------------------------------------------------------------------------------------------------\n')

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Free text query to try the model and assess the quality of the answers
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

The text used in this example is: Pacific Tropical Cyclone unicef ......
Overall confidence in this prediction is: 99.07%

Predicted Actor is:	[("UN Children's Fund",)] | Confidence: 94.83%
Predicted Agency is:	[('International Organization',)] | Confidence: 99.79%
Predicted Sector is:	[('Education',)] | Confidence: 32.02%
Predicted Place is:	[('Samoa',)] | Confidence: 31.53%
Predicted Year is:	[('2007',)] | Confidence: 31.11%
Predicted Month is:	[('December',)] | Confidence: 38.77%
Predicted Reason is:	[('Tropical Cyclone',)] | Confidence: 99.74%
Predicted SDG is:	[('Sustainable Cities and Communities',)] | Confidence: 62.7%

-----------------------------------------------------------------------------------------------

## 3.5. Draw Answers

In [None]:
# Plotting dependencies for the answer visualisation, plus the global seaborn theme
# (ticks style, enlarged fonts) applied to every figure drawn afterwards.
from sklearn.manifold import TSNE
import seaborn
from  matplotlib import pyplot
seaborn.set(context='notebook', style='ticks', palette='deep', font='sans-serif', font_scale=1.4, color_codes=False, rc=None)

In [None]:
# 2-D t-SNE with a fixed seed. The cell that calls fit_transform creates an
# identical instance again, so this one is not strictly needed.
tsne = TSNE(n_components=2, random_state=0)

In [None]:
# Use the whole dataset for the answer vectors; this is an alias of normDF, not a copy.
randDF = normDF

In [None]:
def _top_k_mask(out, actual):
    """Return a 0/1 mask over `out` marking every score >= the k-th highest one.

    k is the number of distinct labels in `actual`, but at least 1: with k == 0
    the index `-0` would pick the lowest score as threshold and mark every class.
    """
    k = max(1, len(set(actual)))
    return np.where(out >= np.sort(out[0])[-k], 1, 0)


ansVector = []

ansActor, ansAgency, ansSector, ansPlace, ansYear, ansMonth, ansReason, ansGoal = [], [], [], [], [], [], [], []
conActor, conAgency, conSector, conPlace, conYear, conMonth, conReason, conGoal = [], [], [], [], [], [], [], []

for idx, row in enumerate(randDF.itertuples()):

    actText = str(row.text)

    # Label cells are stringified Python lists; literal_eval parses them without
    # executing arbitrary code the way eval() would.
    actActor = ast.literal_eval(row.source_name)
    actAgency = ast.literal_eval(row.source_type)
    actSector = ast.literal_eval(row.theme)
    actPlace = ast.literal_eval(row.country_name)
    actYear = ast.literal_eval(row.year_created)
    actMonth = ast.literal_eval(row.month_created)
    actReason = ast.literal_eval(row.disaster_type)
    actGoal = ast.literal_eval(row.development_goal)

    # Post-padding/truncation, consistent with the evaluation cells of section 2.4
    # (the Keras defaults pad and truncate at the front of the sequence instead).
    preText = texts_tok.texts_to_sequences([actText])
    preText = pad_sequences(preText, maxlen=padding_len, padding='post', truncating='post')

    outActor, outAgency, outSector, outPlace, outYear, outMonth, outReason, outGoal, outStat = model.predict(preText, verbose=0)

    # Select as many top-scoring classes per head as the row has actual labels.
    maxActor = _top_k_mask(outActor, actActor)
    maxAgency = _top_k_mask(outAgency, actAgency)
    maxSector = _top_k_mask(outSector, actSector)
    maxPlace = _top_k_mask(outPlace, actPlace)
    maxYear = _top_k_mask(outYear, actYear)
    maxMonth = _top_k_mask(outMonth, actMonth)
    maxReason = _top_k_mask(outReason, actReason)
    maxGoal = _top_k_mask(outGoal, actGoal)

    preActor = labActor_tok.inverse_transform(maxActor)
    preAgency = labAgency_tok.inverse_transform(maxAgency)
    preSector = labSector_tok.inverse_transform(maxSector)
    prePlace = labPlace_tok.inverse_transform(maxPlace)
    preYear = labYear_tok.inverse_transform(maxYear)
    preMonth = labMonth_tok.inverse_transform(maxMonth)
    preReason = labReason_tok.inverse_transform(maxReason)
    preGoal = labGoal_tok.inverse_transform(maxGoal)

    # Concatenate the raw scores of all eight heads into one vector per document.
    outVector = [outActor[0], outAgency[0], outSector[0], outPlace[0], outYear[0], outMonth[0], outReason[0], outGoal[0]]
    ansVector.append(list(itertools.chain(*outVector)))

    # First selected label of each head.
    # NOTE(review): when several labels are selected this is the first one in the
    # binarizer's class order, not necessarily the one whose score is stored in
    # the con* lists below — confirm this is the intended label for the plots.
    ansActor.append(preActor[0][0])
    ansAgency.append(preAgency[0][0])
    ansSector.append(preSector[0][0])
    ansPlace.append(prePlace[0][0])
    ansYear.append(preYear[0][0])
    ansMonth.append(preMonth[0][0])
    ansReason.append(preReason[0][0])
    ansGoal.append(preGoal[0][0])

    # Highest score of each head.
    conActor.append(np.sort(outActor[0])[-1])
    conAgency.append(np.sort(outAgency[0])[-1])
    conSector.append(np.sort(outSector[0])[-1])
    conPlace.append(np.sort(outPlace[0])[-1])
    conYear.append(np.sort(outYear[0])[-1])
    conMonth.append(np.sort(outMonth[0])[-1])
    conReason.append(np.sort(outReason[0])[-1])
    conGoal.append(np.sort(outGoal[0])[-1])

In [None]:
from keras import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [None]:
# Build the sample matrix first, then read its feature width: `arr` has to
# exist before its shape can be inspected on a fresh kernel.
arr = np.asarray(ansVector, dtype=np.float32)
arr_shape = np.shape(arr)[-1]

In [None]:
def autoencoder(data):
    """Train a dense autoencoder on ``data`` and return its bottleneck encoding.

    The network is fitted to reconstruct ``data`` from itself (mean squared
    error, Adam, 5 epochs, batch size 128). The activations of the 300-unit
    layer named ``bottleneck`` are then computed for every row of ``data``.

    Args:
        data: Array of samples; the size of its last axis sets the width of
            the input and output layers.

    Returns:
        The ``bottleneck`` layer output for ``data`` (one 300-wide row per
        sample).
    """
    # Imported locally so the function does not rely on an import cell that
    # may or may not have been executed elsewhere in the notebook.
    from keras.models import Model

    # Derive the feature width from the argument itself rather than from a
    # notebook-level variable, so the function works for any input matrix.
    n_features = np.shape(data)[-1]

    m = Sequential()
    m.add(Dense(512,  activation='tanh', input_shape=(n_features,)))
    m.add(Dense(128,  activation='sigmoid'))
    m.add(Dense(300,    activation='linear', name="bottleneck"))
    m.add(Dense(128,  activation='elu'))
    m.add(Dense(512,  activation='elu'))
    m.add(Dense(n_features,  activation='sigmoid'))

    m.compile(loss='mean_squared_error', optimizer = Adam())

    m.fit(data, data, batch_size=128, epochs=5, verbose=1)

    # Only the encoding is returned, so the full reconstruction pass that used
    # to be computed (and discarded) here is no longer run.
    encoder = Model(m.input, m.get_layer('bottleneck').output)
    return encoder.predict(data)

In [None]:
# Encode every output vector with the autoencoder's bottleneck layer.
res = autoencoder(data=arr)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Reduce the output vectors to two dimensions for plotting; random_state is
# fixed so the t-SNE run is seeded.
# NOTE(review): t-SNE is fitted on `arr`, not on the autoencoder encoding
# `res` — confirm this is intended.
tsne = TSNE(n_components=2, random_state=0)
reduced_vector = tsne.fit_transform(arr)

# First and second embedding columns become the scatter-plot coordinates.
X_dim, Y_dim = reduced_vector[:, 0], reduced_vector[:, 1]

In [None]:
# One row per sample: the decoded label for each attribute, the autoencoder
# encoding, and the 2-D t-SNE coordinates used by the plots below.
answer_df = pd.DataFrame(
    data={
        # decoded labels
        'Actor': ansActor,
        'Agency': ansAgency,
        'Sector': ansSector,
        'Place': ansPlace,
        'Year': ansYear,
        'Month': ansMonth,
        'Reason': ansReason,
        'SDG': ansGoal,
        # bottleneck encoding, stored as a plain list per row
        'Vector': res.tolist(),
        # plot coordinates
        'X': X_dim,
        'Y': Y_dim,
    }
)

### 3.5.0. Actor Plot

In [None]:
# Scatter of the 2-D coordinates coloured by actor; hue order follows the
# label encoder's class list.
# NOTE(review): unlike the other attribute plots in this section, no legend
# is attached here — confirm whether that is deliberate.
_actor = labActor_tok.classes_.tolist()

fg = seaborn.FacetGrid(
    data=answer_df,
    hue='Actor',
    hue_order=_actor,
    height=10,
    aspect=1.61,
)
fg.map(pyplot.scatter, 'X', 'Y')

<seaborn.axisgrid.FacetGrid at 0x7f4cb7c47490>

### 3.5.1. Agency Plot

In [None]:
# Scatter of the 2-D coordinates coloured by agency; hue order follows the
# label encoder's class list.
_agency = labAgency_tok.classes_.tolist()

fg = seaborn.FacetGrid(
    data=answer_df,
    hue='Agency',
    hue_order=_agency,
    height=10,
    aspect=1.61,
)
fg.map(pyplot.scatter, 'X', 'Y')
fg.add_legend()

<seaborn.axisgrid.FacetGrid at 0x7f4ca40d2410>

### 3.5.2. Sector Plot

In [None]:
# Scatter of the 2-D coordinates coloured by sector; hue order follows the
# label encoder's class list.
_sector = labSector_tok.classes_.tolist()

fg = seaborn.FacetGrid(
    data=answer_df,
    hue='Sector',
    hue_order=_sector,
    height=10,
    aspect=1.61,
)
fg.map(pyplot.scatter, 'X', 'Y')
fg.add_legend()

<seaborn.axisgrid.FacetGrid at 0x7f4ca409ff10>

### 3.5.3. Year Plot

In [None]:
# Scatter of the 2-D coordinates coloured by year; hue order follows the
# label encoder's class list.
_year = labYear_tok.classes_.tolist()

fg = seaborn.FacetGrid(
    data=answer_df,
    hue='Year',
    hue_order=_year,
    height=10,
    aspect=1.61,
)
fg.map(pyplot.scatter, 'X', 'Y')
fg.add_legend()

<seaborn.axisgrid.FacetGrid at 0x7f4ca3a48cd0>

### 3.5.4. Month Plot

In [None]:
# Scatter of the 2-D coordinates coloured by month; hue order follows the
# label encoder's class list.
_month = labMonth_tok.classes_.tolist()

fg = seaborn.FacetGrid(
    data=answer_df,
    hue='Month',
    hue_order=_month,
    height=10,
    aspect=1.61,
)
fg.map(pyplot.scatter, 'X', 'Y')
fg.add_legend()

<seaborn.axisgrid.FacetGrid at 0x7f4ca3a1edd0>

### 3.5.5. Reasons Plot

In [None]:
# Scatter of the 2-D coordinates coloured by reason; hue order follows the
# label encoder's class list.
_reason = labReason_tok.classes_.tolist()

fg = seaborn.FacetGrid(
    data=answer_df,
    hue='Reason',
    hue_order=_reason,
    height=10,
    aspect=1.61,
)
fg.map(pyplot.scatter, 'X', 'Y')
fg.add_legend()

<seaborn.axisgrid.FacetGrid at 0x7f4ca38ed810>

### 3.5.6. SDG Plot

In [None]:
# Scatter of the 2-D coordinates coloured by SDG; hue order follows the
# label encoder's class list.
_goal = labGoal_tok.classes_.tolist()

fg = seaborn.FacetGrid(
    data=answer_df,
    hue='SDG',
    hue_order=_goal,
    height=10,
    aspect=1.61,
)
fg.map(pyplot.scatter, 'X', 'Y')
fg.add_legend()

<seaborn.axisgrid.FacetGrid at 0x7f4ca37d0990>

# 4. Knowledge Extraction

In [None]:
# Maximum number of (distance, word) results returned by nearest_word() and
# nearest_vector() below.
num_word = 5

## 4.1. Paraphrasing

In [None]:
from scipy.spatial import distance

In [None]:
# Peek at the first key of dictEmbed (prints nothing if it is empty).
for i in itertools.islice(dictEmbed, 1):
    print(i)

people


In [None]:
def nearest_word(word):
    """Return the vocabulary entries closest to ``word`` in embedding space.

    Looks up the embedding of ``word`` in ``dictEmbed`` and delegates the
    ranking to ``nearest_vector``, so both entry points share a single search
    implementation instead of two copies of the same loop.

    Args:
        word: Key of ``dictEmbed`` to use as the query.

    Returns:
        List of ``(distance, word)`` tuples sorted by ascending Euclidean
        distance, at most ``num_word`` long. The query word is itself part of
        the vocabulary, so it is ranked with distance 0.

    Raises:
        KeyError: If ``word`` is not a key of ``dictEmbed``.
    """
    return nearest_vector(dictEmbed[word])

def nearest_vector(vector):
    """Rank the vocabulary by Euclidean distance to ``vector``.

    Args:
        vector: Query embedding; it is subtracted from every value stored in
            ``dictEmbed``.

    Returns:
        List of ``(distance, word)`` tuples in ascending order, truncated to
        the first ``num_word`` entries.
    """
    ranked = sorted(
        (np.linalg.norm(vector - candidate), token)
        for token, candidate in dictEmbed.items()
    )
    return ranked[:num_word]

In [None]:
from scipy import spatial

In [None]:
# Word-vector arithmetic query: plain words and "+word" terms are added,
# "-word" terms are subtracted, and the vocabulary is then ranked by
# Euclidean distance to the resulting vector.
find_nearest = 'development -undp +wfp'
top_k = 10  # number of closest vocabulary entries to print

ntr_word = []
pos_word = []
neg_word = []

# split() without an argument collapses runs of whitespace, so stray spaces in
# the query never yield an empty token (which would raise KeyError below).
key_words = find_nearest.lower().split()

for word in key_words:
    if word.startswith('+'):
        pos_word.append(dictEmbed[word[1:]])
    elif word.startswith('-'):
        neg_word.append(dictEmbed[word[1:]])
    else:
        ntr_word.append(dictEmbed[word])

# np.sum over an empty list gives the scalar 0.0, which broadcasts in the
# arithmetic below, so a query lacking one kind of term still works.
pos_vec = np.sum(pos_word, axis=0)
neg_vec = np.sum(neg_word, axis=0)
ntr_vec = np.sum(ntr_word, axis=0)

sub_vect = np.add(ntr_vec, np.subtract(pos_vec, neg_vec))

# (distance, word) pairs so that sorting orders by distance first.
result = [
    (np.linalg.norm(sub_vect - obj_vect), obj_name)
    for obj_name, obj_vect in dictEmbed.items()
]
result.sort()
result = result[:top_k]

for d, r in result:
    print(f'{r} ({round(d, 4)})')

food (3.95)
development (4.1486)
emergency (4.2584)
children (4.3085)
nutrition (4.3287)
humanitarian (4.3358)
supplies (4.3369)
government (4.3412)
rice (4.3646)
areas (4.3652)
