In [2]:
# Import libraries
from __future__ import print_function
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from torch.nn.functional import sigmoid, relu
from scipy.cluster.hierarchy import dendrogram, linkage

In [4]:
def _read_names(path):
    """Return the whitespace-stripped lines of the text file at `path` as a NumPy array."""
    with open(path, 'r') as handle:
        return np.array([line.strip() for line in handle])

# Names of the items, relations and attributes, one per line in each file.
names_items = _read_names('data/sem_items.txt')
names_relations = _read_names('data/sem_relations.txt')
names_attributes = _read_names('data/sem_attributes.txt')

# Sizes of each name list; later cells use them to slice and index the data.
nobj, nrel, nattributes = len(names_items), len(names_relations), len(names_attributes)

# Show each list under its heading.
for heading, names in (
    ('List of items:', names_items),
    ("List of relations:", names_relations),
    ("List of attributes:", names_attributes),
):
    print(heading)
    print(names)

List of items:
['Pine' 'Oak' 'Rose' 'Daisy' 'Robin' 'Canary' 'Sunfish' 'Salmon']
List of relations:
['ISA' 'Is' 'Can' 'Has']
List of attributes:
['Living thing' 'Plant' 'Animal' 'Tree' 'Flower' 'Bird' 'Fish' 'Pine'
 'Oak' 'Rose' 'Daisy' 'Robin' 'Canary' 'Sunfish' 'Salmon' 'Pretty' 'Big'
 'Living' 'Green' 'Red' 'Yellow' 'Grow' 'Move' 'Swim' 'Fly' 'Sing' 'Skin'
 'Roots' 'Leaves' 'Bark' 'Branch' 'Petals' 'Wings' 'Feathers' 'Gills'
 'Scales']


In [5]:
# Load the pattern matrix. The first nobj+nrel columns of every row are the
# input (item indicators followed by relation indicators); the remaining
# columns are the attribute targets.
D = np.loadtxt('data/sem_data.txt')
input_pats = D[:,:nobj+nrel]
input_pats = torch.tensor(input_pats,dtype=torch.float)
output_pats = D[:,nobj+nrel:]
output_pats = torch.tensor(output_pats,dtype=torch.float)
N = input_pats.shape[0] # number of training patterns

# Turn the first pattern into boolean masks so it can index the name arrays.
input_v = input_pats[0,:].numpy().astype('bool')
output_v = output_pats[0,:].numpy().astype('bool')
print('Example input pattern:')
print(input_v.astype('int'))
print('Example output pattern:')
print(output_v.astype('int'))
print("")
print("Which encodes...")
# Split the input mask at nobj (not a literal 8) so the decode stays correct
# when the number of items in data/sem_items.txt changes.
print('Item ',end='')
print(names_items[input_v[:nobj]])
print('Relation ',end='')
print(names_relations[input_v[nobj:]])
print('Attributes ',end='')
print(names_attributes[output_v])

Example input pattern:
[1 0 0 0 0 0 0 0 1 0 0 0]
Example output pattern:
[1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

Which encodes...
Item ['Pine']
Relation ['ISA']
Attributes ['Living thing' 'Plant' 'Tree' 'Pine']


In [6]:
# Function to generate questions and answers
def generate_questions_answers(names_items, names_relations, names_attributes, data_matrix):
    """Build one yes/no question per (item, relation, attribute) triple.

    For item ``i`` and relation ``j`` the row ``len(names_relations) * i + j``
    of ``data_matrix`` is read. Its first ``len(names_items) +
    len(names_relations)`` columns are skipped; the column ``k`` after that
    offset supplies the answer for attribute ``k``.

    Args:
        names_items: Sequence of item names.
        names_relations: Sequence of relation names.
        names_attributes: Sequence of attribute names.
        data_matrix: Row-indexable 2-D table (NumPy array or list of rows)
            laid out as described above.

    Returns:
        A ``(questions, answers)`` pair of equal-length lists. Each question
        is the lower-cased string ``"<relation> <item> <attribute>"`` and each
        answer is the matching table entry converted with ``int``.
    """
    # Derive the layout from the arguments instead of a literal 4 and the
    # notebook globals, so the function works for any list sizes.
    n_relations = len(names_relations)
    attribute_offset = len(names_items) + n_relations

    questions = []
    answers = []
    for i, item in enumerate(names_items):
        for j, relation in enumerate(names_relations):
            row = data_matrix[n_relations * i + j]
            for k, attribute in enumerate(names_attributes):
                questions.append(f"{relation.lower()} {item.lower()} {attribute.lower()}")
                answers.append(int(row[attribute_offset + k]))
    return questions, answers

# Build every question/answer pair from the loaded pattern matrix.
questions, answers = generate_questions_answers(
    names_items, names_relations, names_attributes, data_matrix=D
)
# Report the data set size and how many answers fall in each class.
count_0, count_1 = answers.count(0), answers.count(1)
print(f"Dimensions: {len(questions)}\ncount_0, count_1 = {count_0}, {count_1}")

Dimensions: 1152
count_0, count_1 = 1059, 93


In [7]:
import random
# Pick one (item, relation) pair at random; `rand` is 1-based.
rand = random.randint(1, nobj*nrel)
# The questions for one pair occupy nattributes consecutive positions.
first, last = nattributes*(rand-1), nattributes*rand
for q, a in zip(questions[first:last], answers[first:last]):
    verdict = 'Yes' if a == 1 else 'No'
    print(f"Q: {q},\tA: {verdict}")

Q: has canary living thing,	A: No
Q: has canary plant,	A: No
Q: has canary animal,	A: No
Q: has canary tree,	A: No
Q: has canary flower,	A: No
Q: has canary bird,	A: No
Q: has canary fish,	A: No
Q: has canary pine,	A: No
Q: has canary oak,	A: No
Q: has canary rose,	A: No
Q: has canary daisy,	A: No
Q: has canary robin,	A: No
Q: has canary canary,	A: No
Q: has canary sunfish,	A: No
Q: has canary salmon,	A: No
Q: has canary pretty,	A: No
Q: has canary big,	A: No
Q: has canary living,	A: No
Q: has canary green,	A: No
Q: has canary red,	A: No
Q: has canary yellow,	A: No
Q: has canary grow,	A: No
Q: has canary move,	A: No
Q: has canary swim,	A: No
Q: has canary fly,	A: No
Q: has canary sing,	A: No
Q: has canary skin,	A: Yes
Q: has canary roots,	A: No
Q: has canary leaves,	A: No
Q: has canary bark,	A: No
Q: has canary branch,	A: No
Q: has canary petals,	A: No
Q: has canary wings,	A: Yes
Q: has canary feathers,	A: Yes
Q: has canary gills,	A: No
Q: has canary scales,	A: No


In [None]:
import numpy as np
from sklearn.utils import resample

# Work on NumPy arrays so boolean masks can select each class.
questions_array = np.array(questions)
answers_array = np.array(answers)

# Split by label: answer 0 is handled as the majority class, answer 1 as the minority.
is_majority = answers_array == 0
is_minority = answers_array == 1
majority_questions, majority_labels = questions_array[is_majority], answers_array[is_majority]
minority_questions, minority_labels = questions_array[is_minority], answers_array[is_minority]

# Draw minority samples with replacement until there are as many as majority samples.
minority_questions_upsampled, minority_labels_upsampled = resample(
    minority_questions, minority_labels,
    replace=True, n_samples=len(majority_questions), random_state=123,
)

# Majority samples come first, followed by the upsampled minority samples.
oversampled_questions = np.concatenate((majority_questions, minority_questions_upsampled))
oversampled_answers = np.concatenate((majority_labels, minority_labels_upsampled))

In [None]:
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
import torch

# Simple tokenizer
def tokenize(questions):
    """Lower-case every question and split it on whitespace.

    Returns a list with one token list per input question, in input order.
    """
    token_lists = []
    for text in questions:
        token_lists.append(text.lower().split())
    return token_lists

# Build a vocabulary
tokenized_questions = tokenize(oversampled_questions)
vocabulary = set(word for sentence in tokenized_questions for word in sentence)
# Number the words in sorted order. Iterating the set directly can yield a
# different order on every interpreter run (string hash randomization), which
# would change every index and make encodings irreproducible across restarts.
word_to_index = {word: i + 1 for i, word in enumerate(sorted(vocabulary))}  # start indexing from 1
word_to_index['<pad>'] = 0  # Add padding token

# Encode the questions
def encode_questions(questions, vocab, max_length):
    """Encode tokenized questions as a zero-padded matrix of vocabulary indices.

    Args:
        questions: Sequence of token lists, one per question.
        vocab: Mapping from token to integer index.
        max_length: Number of columns in the result.

    Returns:
        Integer NumPy array of shape ``(len(questions), max_length)``. Row
        ``i`` holds the indices of the tokens of question ``i`` that are in
        ``vocab``, in order. Tokens missing from ``vocab`` are skipped, tokens
        beyond ``max_length`` are dropped, and unused trailing cells stay 0.
    """
    encoded = np.zeros((len(questions), max_length), dtype=int)
    for row, tokens in enumerate(questions):
        # Filter first, then size the destination slice from the filtered
        # list; sizing it from len(tokens) fails as soon as one token is
        # unknown or the question is longer than max_length.
        indices = [vocab[word] for word in tokens if word in vocab][:max_length]
        if indices:
            encoded[row, :len(indices)] = indices
    return encoded

# Every question is padded to the token count of the longest one.
max_length = max(map(len, tokenized_questions))
encoded_questions = encode_questions(tokenized_questions, word_to_index, max_length)

# Wrap the encoded questions and their labels for shuffled mini-batch iteration.
question_tensor = torch.tensor(encoded_questions)
answer_tensor = torch.tensor(oversampled_answers)
dataset = TensorDataset(question_tensor, answer_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
import torch.nn as nn
import torch.optim as optim

class LSTMClassifier(nn.Module):
    """Binary classifier: token embedding -> LSTM -> linear unit -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # Index 0 is declared as the padding index of the embedding table.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=0
        )
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.out = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x):
        """Map a batch-first tensor of token indices to one sigmoid output per row."""
        embedded = self.embedding(x)
        # Only the final hidden state is used; the per-step outputs are discarded.
        _, (final_hidden, _) = self.lstm(embedded)
        logits = self.out(final_hidden[-1])
        return torch.sigmoid(logits)

# One embedding row per entry of word_to_index.
model = LSTMClassifier(vocab_size=len(word_to_index), embedding_dim=50, hidden_dim=100)
# Binary cross-entropy on the model's sigmoid outputs, minimized with Adam.
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

num_epochs = 20
for epoch in range(num_epochs):
    # Training pass: one optimizer step per mini-batch.
    model.train()
    # The loop variables are deliberately NOT named `questions`: that name
    # holds the full question list that earlier cells read, and rebinding it
    # here would break those cells on re-run.
    for batch_questions, batch_labels in dataloader:
        optimizer.zero_grad()
        # squeeze(1) removes only the trailing unit dimension, so a batch that
        # holds a single sample still gives a 1-D tensor matching the labels.
        output = model(batch_questions).squeeze(1)
        loss = criterion(output, batch_labels.float())
        loss.backward()
        optimizer.step()

    # Evaluate (you might want to split your data or use a separate validation set)
    # NOTE: this iterates the same dataloader used for training, so the
    # metrics below describe fit to the training data, not generalization.
    model.eval()
    with torch.no_grad():
        predictions = []
        truths = []
        for batch_questions, batch_labels in dataloader:
            output = model(batch_questions).squeeze(1)
            # Rounding the sigmoid output turns it into a 0/1 prediction.
            predictions.extend(output.round().numpy())
            truths.extend(batch_labels.numpy())

        acc = accuracy_score(truths, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(truths, predictions, average='binary')
        print(f"Epoch {epoch+1}: Accuracy: {acc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")

Epoch 1: Accuracy: 0.66, Precision: 0.66, Recall: 0.66, F1: 0.66
Epoch 2: Accuracy: 0.72, Precision: 0.70, Recall: 0.78, F1: 0.73
Epoch 3: Accuracy: 0.84, Precision: 0.78, Recall: 0.95, F1: 0.86
Epoch 4: Accuracy: 0.92, Precision: 0.87, Recall: 0.98, F1: 0.92
Epoch 5: Accuracy: 0.95, Precision: 0.94, Recall: 0.97, F1: 0.95
Epoch 6: Accuracy: 0.98, Precision: 0.97, Recall: 0.99, F1: 0.98
Epoch 7: Accuracy: 0.99, Precision: 0.98, Recall: 1.00, F1: 0.99
Epoch 8: Accuracy: 0.99, Precision: 0.99, Recall: 1.00, F1: 0.99
Epoch 9: Accuracy: 1.00, Precision: 0.99, Recall: 1.00, F1: 1.00
Epoch 10: Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1: 1.00
Epoch 11: Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1: 1.00
Epoch 12: Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1: 1.00
Epoch 13: Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1: 1.00
Epoch 14: Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1: 1.00
Epoch 15: Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1: 1.00
Epoch 16: Accuracy:

In [None]:
import re

def clean_text(text):
    """Return `text` with every character that is neither a word character nor whitespace removed."""
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub('', text)

def preprocess_question(question, vocab, max_length):
    """Encode one free-text question as a zero-padded index tensor of shape (1, max_length).

    The question is stripped of punctuation by `clean_text`, lower-cased and
    split on whitespace. Tokens missing from `vocab` are skipped, and at most
    `max_length` indices are kept.
    """
    tokens = clean_text(question).lower().split()
    # Keep known tokens only, then cap the sequence at max_length entries.
    indices = [vocab[token] for token in tokens if token in vocab][:max_length]
    encoded = np.zeros(max_length, dtype=int)
    if indices:
        encoded[:len(indices)] = indices
    # Add batch dimension
    return torch.tensor(encoded).unsqueeze(0)

def predict_question(question, model, vocab, max_length):
    """Return 1 if the model's output for `question` is above 0.5, otherwise 0.

    Puts `model` into evaluation mode as a side effect.
    """
    model.eval()
    batch = preprocess_question(question, vocab, max_length)
    # Gradients are not needed for a single forward pass.
    with torch.no_grad():
        probability = model(batch).item()
    return int(probability > 0.5)

# Example usage: ask the trained model one free-text question.
question = "Isa Canary living thing?"
predicted_label = predict_question(
    question=question,
    model=model,
    vocab=word_to_index,
    max_length=max_length,
)
print(f"The predicted label for the question '{question}' is: {predicted_label}")

The predicted label for the question 'Isa Canary living thing?' is: 1
