In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

# Define your RNN or transformer model
class MisusedWordDetector(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(MisusedWordDetector, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        output = self.fc(output[:, -1, :])  # Take only the last output in the sequence
        return output

# Example data (replace with your own dataset)
sentences = ["There they saw something extraordinary, far exceeding what they knew.",
             "Their knowledge of those facts was incomplete.",
             "They're going to learn something new from the ML course.",
             "They're they saw something extraordinary, far exceeding what they knew.",
             "There knowledge of those facts was incomplete.",
             "Their going to learn something new from the ML course.",
             "There they discovered an ancient artifact, surpassing all expectations.",
             "Their understanding of the subject matter was limited.",
             "They're exploring innovative concepts in the field of artificial intelligence.",
             "They're convinced they saw something extraordinary, beyond their comprehension.",
             "There exists a gap in their knowledge regarding these complex facts.",
             "Their pursuit of learning is leading them to new insights in the ML course.",
             "There was a moment of silence as they absorbed the breathtaking view.",
             "Their expertise in the subject matter was apparent in their insightful analysis.",
             "They're eagerly anticipating the upcoming advancements in technology.",
             "They're certain they witnessed something extraordinary, beyond ordinary comprehension.",
             "There seems to be a discrepancy in their understanding of these intricate details.",
             "Their commitment to learning is driving them towards excellence in the ML course.",
             "There, amidst the chaos, they found a sense of calm and clarity.",
             "Their meticulous approach to problem-solving sets them apart in their field.",
             "They're determined to gain new insights and skills from the evolving world of AI.",
             "They're sharing their knowledge with others to foster a collaborative learning environment."
             ]

labels = [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]  # 0: Correct, 1: Incorrect

# Tokenize sentences and convert to numerical representations
vocab = set(word for sentence in sentences for word in sentence.split())
word_to_index = {word: idx for idx, word in enumerate(vocab)}
indexed_sentences = [[word_to_index[word] for word in sentence.split()] for sentence in sentences]

# Ensure consistent lengths of input sequences
max_len = max(len(sentence) for sentence in indexed_sentences)
padded_sentences = [sentence + [0] * (max_len - len(sentence)) for sentence in indexed_sentences]

# Convert to PyTorch tensors
X = torch.tensor(padded_sentences, dtype=torch.long)
y = torch.tensor(labels, dtype=torch.float)

# Ensure consistent lengths of input sequences
max_len = max(len(sentence) for sentence in indexed_sentences)
padded_sentences = [sentence + [0] * (max_len - len(sentence)) for sentence in indexed_sentences]

# Check and adjust the lengths if necessary
if X.size(0) != len(y):
    num_samples = min(X.size(0), len(y))
    X = X[:num_samples]
    y = y[:num_samples]

# Use train_test_split for consistent splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure that y_train and y_test have the correct size
y_train = y_train.unsqueeze(1)  # Add unsqueeze(1) to match output size
y_test = y_test.unsqueeze(1)

# Create DataLoader for training
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Create DataLoader for testing
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# Instantiate the model, loss function, and optimizer
vocab_size = len(vocab)
embedding_dim = 50
hidden_dim = 20
output_dim = 1
model = MisusedWordDetector(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluation loop
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    test_loss = criterion(test_outputs, y_test)

# Calculate accuracy
predicted_labels = (torch.sigmoid(test_outputs) >= 0.5).float()  # Applying threshold
correct_predictions = (predicted_labels == y_test).float()
test_accuracy = correct_predictions.mean().item()

print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Test the inference function
test_sentence = "They're they saw something extraordinary, far exceeding what they knew."
indexed_test_sentence = [word_to_index[word] for word in test_sentence.split()]
padded_test_sentence = indexed_test_sentence + [0] * (max_len - len(indexed_test_sentence))
input_tensor = torch.tensor([padded_test_sentence], dtype=torch.long)
with torch.no_grad():
    output = model(input_tensor)
prediction = torch.sigmoid(output.squeeze()).item()

print(f"Is the test sentence misused? {'Yes' if prediction < 0.5 else 'No'}")


Test Accuracy: 100.00%
Is the test sentence misused? Yes
