In [1]:
import pandas as pd
import json
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the pre-processed training data that the Voting notebook wrote to disk
# (it carries the serialized ELMo embeddings used further down).
df = pd.read_csv(filepath_or_buffer='elmo_output.csv')

In [3]:
# EDA sanity check: how many examples each relation label has
df.loc[:, 'label'].value_counts()

INHIBITOR                 2454
SUBSTRATE                  802
INDIRECT-DOWNREGULATOR     757
INDIRECT-UPREGULATOR       665
ACTIVATOR                  580
ANTAGONIST                 434
PRODUCT-OF                 363
AGONIST                    267
DOWNREGULATOR              152
UPREGULATOR                 84
SUBSTRATE_PRODUCT-OF        19
AGONIST-ACTIVATOR           15
AGONIST-INHIBITOR            4
Name: label, dtype: int64

In [4]:
# Load the pre-processed test file, also generated in the Voting notebook
test = pd.read_csv(filepath_or_buffer='test_csv.csv')

In [8]:
# Function to convert ELMo embeddings string to numeric format
def convert_elmo_string_to_array(elmo_str):
    """Parse a comma-separated string of numbers into a 1-D NumPy float array.

    Each comma-delimited token is converted with ``float``; a token that is
    not a valid float literal raises ``ValueError``.
    """
    values = list(map(float, elmo_str.split(',')))
    return np.array(values)

# Function to calculate the novelty feature: the relative position of the two entities
def _text_or_empty(value):
    """Return ``str(value)``, or ``''`` for the missing-value placeholders None / float NaN."""
    if value is None or (isinstance(value, float) and np.isnan(value)):
        return ""
    return str(value)


def calculate_relative_positions(sentence, entity1, entity2):
    """Mean combined distance of every word to the two entities, as a scalar feature.

    Each entity's position is the character offset of its first occurrence in
    ``sentence`` divided by the sentence length in characters. Each word's
    position is its index divided by the number of whitespace-separated words.
    The feature is the mean over all words of
    ``|word_pos - entity1_pos| + |word_pos - entity2_pos|``.

    Parameters
    ----------
    sentence : str
        The sentence text. None / NaN is treated as an empty sentence.
    entity1, entity2 : str
        Entity mentions; other values are converted with ``str`` and stripped.
        None / NaN is treated as a missing entity.

    Returns
    -------
    float
        The mean combined distance, or 0.0 when the sentence has no words or
        either entity is missing, empty, or not found in the sentence.
    """
    sentence = _text_or_empty(sentence)
    # Missing values must not be stringified to "nan"/"None": that text can
    # match inside real words (e.g. "nanomolar") and yield a bogus position.
    entity1 = _text_or_empty(entity1).strip()
    entity2 = _text_or_empty(entity2).strip()

    words = sentence.split()
    # An empty entity would "match" at offset 0 of any sentence, and an empty
    # sentence would divide by zero below, so both are handled as "not found".
    if not words or not entity1 or not entity2:
        return 0.0

    entity1_idx = sentence.find(entity1)
    entity2_idx = sentence.find(entity2)
    if entity1_idx == -1 or entity2_idx == -1:
        return 0.0  # Default feature value when an entity is absent

    n_chars = len(sentence)
    entity1_pos = entity1_idx / n_chars
    entity2_pos = entity2_idx / n_chars

    # Normalized position of each word: index / word count
    word_pos = np.arange(len(words)) / len(words)
    combined = np.abs(word_pos - entity1_pos) + np.abs(word_pos - entity2_pos)
    return float(np.mean(combined))

# --- Training frame: decode the ELMo strings and add the relative-position feature
df['elmo_embeddings'] = df['elmo_embeddings_str'].apply(convert_elmo_string_to_array)
df['relative_position_features'] = df.apply(
    lambda row: calculate_relative_positions(row['text'], row['E1'], row['E2']),
    axis=1,
)

# --- Test frame: same two derived columns
test['elmo_embeddings'] = test['elmo_embeddings_str'].apply(convert_elmo_string_to_array)
test['relative_position_features'] = test.apply(
    lambda row: calculate_relative_positions(row['text'], row['E1'], row['E2']),
    axis=1,
)

# --- Feature matrices: one row per example, the embedding vector followed by
# the scalar relative-position feature as the last column
X = np.vstack([
    np.hstack((vector, position))
    for vector, position in zip(df['elmo_embeddings'], df['relative_position_features'])
])
test_val = np.vstack([
    np.hstack((vector, position))
    for vector, position in zip(test['elmo_embeddings'], test['relative_position_features'])
])

# Targets come from the 'label' column of each frame
y = df['label'].values
test_la = test['label'].values

# Hold out 20% of the training frame as a validation split (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardize features: statistics are fitted on the training split only and
# then applied unchanged to the validation split and the test set
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled, test_scaled = (
    scaler.transform(split) for split in (X_train, X_test, test_val)
)

# RBF-kernel SVM fitted on the scaled training split
svm_model = SVC(kernel='rbf').fit(X_train_scaled, y_train)

# Score the validation split first, then the separate test set
predictions = svm_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, predictions)
predictions_test = svm_model.predict(test_scaled)
accuracy_2 = accuracy_score(test_la, predictions_test)

print("Accuracy:", accuracy)
print("Accuracy:", accuracy_2)

Accuracy: 0.7090909090909091
Accuracy: 0.5332948976650331


In [9]:
from sklearn.metrics import classification_report

# Validation-split metrics: `y_test` and `predictions` are produced by the SVM
# training cell. zero_division=0 makes the score of a class that is never
# predicted an explicit 0.0 instead of relying on the warn-and-zero default,
# which emits an UndefinedMetricWarning for every affected average.
report = classification_report(y_test, predictions, output_dict=True, zero_division=0)

# Printing the classification report
print(classification_report(y_test, predictions, zero_division=0))

# Accessing F1 score, precision, and recall for each class.
# The class labels are relation names (e.g. 'INHIBITOR'), not digit strings,
# so a str.isdigit() filter would skip every class; instead the aggregate rows
# of the report dict are excluded by key.
summary_rows = {'accuracy', 'micro avg', 'macro avg', 'weighted avg', 'samples avg'}
for label, metrics in report.items():
    if label in summary_rows:
        continue
    print(f"Class: {label}, Precision: {metrics['precision']}, Recall: {metrics['recall']}, F1-score: {metrics['f1-score']}")


                        precision    recall  f1-score   support

             ACTIVATOR       0.65      0.45      0.53       112
               AGONIST       0.57      0.40      0.47        42
     AGONIST-ACTIVATOR       0.00      0.00      0.00         3
     AGONIST-INHIBITOR       0.00      0.00      0.00         2
            ANTAGONIST       0.88      0.79      0.83        99
         DOWNREGULATOR       0.77      0.52      0.62        33
INDIRECT-DOWNREGULATOR       0.66      0.61      0.63       151
  INDIRECT-UPREGULATOR       0.58      0.66      0.62       129
             INHIBITOR       0.72      0.92      0.80       466
            PRODUCT-OF       0.66      0.58      0.62        77
             SUBSTRATE       0.84      0.68      0.75       181
  SUBSTRATE_PRODUCT-OF       0.00      0.00      0.00         7
           UPREGULATOR       0.67      0.11      0.19        18

              accuracy                           0.71      1320
             macro avg       0.54     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
from sklearn.metrics import classification_report

# Test-set metrics: `test_la` and `predictions_test` are produced by the SVM
# training cell. zero_division=0 makes the score of a class that is never
# predicted an explicit 0.0 instead of relying on the warn-and-zero default,
# which emits an UndefinedMetricWarning for every affected average.
report = classification_report(test_la, predictions_test, output_dict=True, zero_division=0)

# Printing the classification report
print(classification_report(test_la, predictions_test, zero_division=0))

# Accessing F1 score, precision, and recall for each class.
# The class labels are relation names (e.g. 'INHIBITOR'), not digit strings,
# so a str.isdigit() filter would skip every class; instead the aggregate rows
# of the report dict are excluded by key.
summary_rows = {'accuracy', 'micro avg', 'macro avg', 'weighted avg', 'samples avg'}
for label, metrics in report.items():
    if label in summary_rows:
        continue
    print(f"Class: {label}, Precision: {metrics['precision']}, Recall: {metrics['recall']}, F1-score: {metrics['f1-score']}")


                        precision    recall  f1-score   support

             ACTIVATOR       0.32      0.20      0.24       292
               AGONIST       0.57      0.21      0.31       182
     AGONIST-ACTIVATOR       0.00      0.00      0.00         4
     AGONIST-INHIBITOR       0.00      0.00      0.00        12
            ANTAGONIST       0.74      0.56      0.64       293
         DOWNREGULATOR       0.00      0.00      0.00        72
INDIRECT-DOWNREGULATOR       0.41      0.34      0.37       340
  INDIRECT-UPREGULATOR       0.40      0.33      0.36       334
             INHIBITOR       0.55      0.91      0.69      1255
            PRODUCT-OF       0.60      0.18      0.28       191
             SUBSTRATE       0.59      0.42      0.49       453
           UPREGULATOR       0.00      0.00      0.00        41

              accuracy                           0.53      3469
             macro avg       0.35      0.26      0.28      3469
          weighted avg       0.51     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
