# Importings

In [2]:
import os
import random
from tqdm.notebook import tqdm
from config import BaseConfig
from models.utils import load_pkl, load_json, save_pkl, save_json
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier


warnings.filterwarnings('ignore')
CONFIG = BaseConfig().get_args()
label_encoder = load_pkl(CONFIG.path_saved_le)

# Helpers

* data loader
* evaluation methods

In [None]:
def data_loader(data, vectorizer, label_encoder):
    head_ents = [pairs[0] for pairs in tqdm(data)]
    tail_ents = [pairs[1] for pairs in tqdm(data)]
    labels = [pairs[2] for pairs in tqdm(data)]
    
    X = np.concatenate((vectorizer.transform(head_ents),  
                        vectorizer.transform(tail_ents)), axis=1)
    
    Y = label_encoder.transform(labels)
    return X, Y

def evaluation_method(y_true, y_pred):
    f1 = f1_score(y_true, y_pred, average='macro')
    acc = accuracy_score(y_true, y_pred)
    pre = precision_score(y_true, y_pred, average='macro')
    rec = recall_score(y_true, y_pred, average='macro')
    
    clf_report = classification_report(y_true, y_pred)
    return {
                "y-true": [int(l) for l in list(y_true)], "y-pred": [int(l) for l in list(y_pred)],
                "f1": f1, "accuracy": acc, 
                "precision": pre, "recall": rec, 
                "clf-report": clf_report
            }

# Loading Dataset

In [3]:
train_data = load_json(CONFIG.train_path)
test_data = load_json(CONFIG.test_path)
dev_data = load_json(CONFIG.dev_path)

train_rel = [[example['tail']['word'], example['head']['word'], example['relation']]
             for example in tqdm(train_data)]

dev_rel = [[example['tail']['word'], example['head']['word'], example['relation']]
           for example in tqdm(dev_data)]

test_rel = [[example['tail']['word'], example['head']['word'], example['relation']]
            for example in tqdm(test_data)]

  0%|          | 0/534277 [00:00<?, ?it/s]

  0%|          | 0/114506 [00:00<?, ?it/s]

  0%|          | 0/114565 [00:00<?, ?it/s]

In [4]:
train_vocabulary = [pairs[0].lower() for pairs in tqdm(train_rel)] + [pairs[1].lower() for pairs in tqdm(train_rel)]

print(f"size of train vocabulary: {len(train_vocabulary)}")
print(f"size of unique train vocabulary: {len(list(set(train_vocabulary)))}")

  0%|          | 0/534277 [00:00<?, ?it/s]

  0%|          | 0/534277 [00:00<?, ?it/s]

size of train vocabulary: 1068554
size of unique train vocabulary: 51947


# Feature Extraction (LSA: TFIDF + SVD)

In [5]:
vectorizer = Pipeline(steps=[('tfidf', TfidfVectorizer()), ('svd', TruncatedSVD(n_components=50, n_iter=7))])

vectorizer.fit(train_vocabulary)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('svd', TruncatedSVD(n_components=50, n_iter=7))])

In [7]:
X_train_vec, y_train = data_loader(train_rel, vectorizer ,label_encoder)
X_dev_vec, y_dev= data_loader(dev_rel, vectorizer ,label_encoder)
X_test_vec, y_test = data_loader(test_rel, vectorizer ,label_encoder)

  0%|          | 0/534277 [00:00<?, ?it/s]

  0%|          | 0/534277 [00:00<?, ?it/s]

  0%|          | 0/534277 [00:00<?, ?it/s]

  0%|          | 0/114506 [00:00<?, ?it/s]

  0%|          | 0/114506 [00:00<?, ?it/s]

  0%|          | 0/114506 [00:00<?, ?it/s]

  0%|          | 0/114565 [00:00<?, ?it/s]

  0%|          | 0/114565 [00:00<?, ?it/s]

  0%|          | 0/114565 [00:00<?, ?it/s]

In [38]:
# # Save models and datasets
# save_pkl(os.path.join(CONFIG.pre_trained_dir, "tfidf-svd-vectorizer.sav"), vectorizer)

# dataset = {
#     "x-train": X_train_vec, "y-train":y_train, 
#     "x-test":X_test_vec, "y-test":y_test,
#     "x-dev": X_dev_vec, "y-dev":y_dev
# }

# save_pkl(os.path.join(CONFIG.pre_trained_dir, "tfidf_svd_dataset.pkl"), dataset)

# Model 1: LSA + LogisticRegression

In [8]:
lr_model = LogisticRegression()

In [None]:
lr_model.fit(X_train_vec, y_train)

In [43]:
y_train_pred = lr_model.predict(X_train_vec)
y_dev_pred = lr_model.predict(X_dev_vec)
y_test_pred = lr_model.predict(X_test_vec)

In [80]:
lr_results = {
    "train": evaluation_method(y_train, y_train_pred),
    "dev": evaluation_method(y_dev, y_dev_pred),
    "test": evaluation_method(y_test, y_test_pred)
}

print(f"TRAIN, F1-Score: {lr_results['train']['f1']}, Accuracy: {lr_results['train']['accuracy']}")
print(f"DEV, F1-Score: {lr_results['dev']['f1']}, Accuracy: {lr_results['dev']['accuracy']}")
print(f"TEST, F1-Score: {lr_results['test']['f1']}, Accuracy: {lr_results['test']['accuracy']}")

save_json(os.path.join("assets/predictions", "lsa-lr-baseline.json"), lr_results)

TRAIN, F1-Score: 0.09035886701731136, Accuracy: 0.45075681715664345
DEV, F1-Score: 0.08982119442949407, Accuracy: 0.4512514628054425
TEST, F1-Score: 0.08994936980797953, Accuracy: 0.45060009601536244


# Model 2: LSA + MLP

In [10]:
mlp_model = MLPClassifier(hidden_layer_sizes=(500,300, 250), max_iter=50, verbose=True)

In [None]:
mlp_model.fit(X_train_vec, y_train)

Iteration 1, loss = 2.19500436
Iteration 2, loss = 1.95286585
Iteration 3, loss = 1.84193755
Iteration 4, loss = 1.76079521
Iteration 5, loss = 1.70094362
Iteration 6, loss = 1.65756611
Iteration 7, loss = 1.62188848
Iteration 8, loss = 1.59232209
Iteration 9, loss = 1.57010624
Iteration 10, loss = 1.54919664
Iteration 11, loss = 1.53078800
Iteration 12, loss = 1.51297543
Iteration 13, loss = 1.49516405
Iteration 14, loss = 1.48378470
Iteration 15, loss = 1.47077423
Iteration 16, loss = 1.45814666
Iteration 17, loss = 1.44630844
Iteration 18, loss = 1.43538401
Iteration 19, loss = 1.42420774
Iteration 20, loss = 1.41623964
Iteration 21, loss = 1.40755203
Iteration 22, loss = 1.39794950
Iteration 23, loss = 1.39048937
Iteration 24, loss = 1.38152967
Iteration 25, loss = 1.37546609
Iteration 26, loss = 1.36854050
Iteration 27, loss = 1.35978068
Iteration 28, loss = 1.35330498
Iteration 29, loss = 1.34672524
Iteration 30, loss = 1.34174098
Iteration 31, loss = 1.33502608
Iteration 32, los

In [None]:
y_train_pred = mlp_model.predict(X_train_vec)
y_dev_pred = mlp_model.predict(X_dev_vec)
y_test_pred = mlp_model.predict(X_test_vec)

In [15]:
mlp_results = {
    "train": evaluation_method(y_train, y_train_pred),
    "dev": evaluation_method(y_dev, y_dev_pred),
    "test": evaluation_method(y_test, y_test_pred)
}

print(f"TRAIN, F1-Score: {mlp_results['train']['f1']}, Accuracy: {mlp_results['train']['accuracy']}")
print(f"DEV, F1-Score: {mlp_results['dev']['f1']}, Accuracy: {mlp_results['dev']['accuracy']}")
print(f"TEST, F1-Score: {mlp_results['test']['f1']}, Accuracy: {mlp_results['test']['accuracy']}")

save_json(os.path.join("assets/predictions", "lsa-mlp-baseline.json"), mlp_results)

TRAIN, F1-Score: 0.42479097677442734, Accuracy: 0.6700419445343895
DEV, F1-Score: 0.39509147010784373, Accuracy: 0.6602798106649433
TEST, F1-Score: 0.4022318090656121, Accuracy: 0.6603500196395059
