In [8]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

import re
import string
import nltk
import contractions
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# Precompile regex patterns
punctuation_pattern = re.compile(f"[{string.punctuation}’]")
specific_chars_pattern = re.compile(r'[\'\"]')
multi_space_pattern = re.compile(r'\s{2,}') 

# Initialize objects for reuse
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [2]:
def load_and_prepare_training_data(claims_file_path, evidence_file_path):
    """
    Loads claims and evidence data from specified JSON files, flattens the evidence list in the claims data,
    and merges the claims with their corresponding evidence based on evidence IDs. Returns a DataFrame ready for training.

    Parameters:
    - claims_file_path: str, path to the JSON file containing claims data.
    - evidence_file_path: str, path to the JSON file containing evidence data.

    Returns:
    - pd.DataFrame: A Pandas DataFrame ready for training, where each row represents a claim-evidence pair.
    """
    # Load and flatten claims data
    claims_df = pd.read_json(claims_file_path, orient='index')
    claims_df['evidences'] = claims_df.agg(lambda df: df['evidences'] if isinstance(df['evidences'], list) else [df['evidences']], axis=1)
    claims_df = claims_df.explode('evidences')
    claims_df.reset_index(inplace=True)
    claims_df.columns = ['claim', 'claim_text', 'claim_label', 'evidence']
    
    # Load evidence data
    evidence_df = pd.read_json(evidence_file_path, orient='index')
    evidence_df.reset_index(inplace=True)
    evidence_df.columns = ['evidence', 'evidence_text']
    
    # Merge claims and evidence data
    train_df = pd.merge(claims_df, evidence_df, on='evidence', how='inner')
    train_df['claim'] = train_df.agg(lambda df: int(df['claim'].split('-')[1]), axis=1)
    train_df['evidence'] = train_df.agg(lambda df: int(df['evidence'].split('-')[1]), axis=1)
    
    return train_df

In [5]:
def get_wordnet_pos(treebank_tag):
    """Converts treebank POS tags to WordNet POS tags."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def preprocess_text(text):
    """Preprocesses the given text."""
    try:
        # Expand contractions
        text = contractions.fix(text)
    except:
        pass 

    # Convert text to lowercase
    text = text.lower()

    # Remove specific characters like '"' and "'"
    # Remove punctuation
    text = specific_chars_pattern.sub('', text)
    text = punctuation_pattern.sub("", text)

    # Remove stopwords
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize each token based on its POS tag
    lemmatized_tokens = []
    for word, tag in nltk.pos_tag(tokens):
        wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN
        lemmatized_token = lemmatizer.lemmatize(word, pos=wordnet_pos)
        lemmatized_tokens.append(lemmatized_token)

    text = ' '.join(lemmatized_tokens)

    # Replace two or more spaces with a single space
    text = multi_space_pattern.sub(' ', text)

    # Strip leading and trailing spaces
    text = text.strip()

    return text

In [6]:
# Usage example
claims_file_path = '../data/dev-claims.json'
evidence_file_path = '../data/evidence.json'
train_df = load_and_prepare_training_data(claims_file_path, evidence_file_path)
train_df.tail()

Unnamed: 0,claim,claim_text,claim_label,evidence,evidence_text
486,1426,Many of the world’s coral reefs are already ba...,NOT_ENOUGH_INFO,288294,Aquaculture is showing promise as a potentiall...
487,1426,Many of the world’s coral reefs are already ba...,NOT_ENOUGH_INFO,946262,This can rapidly result in transitions to barr...
488,698,A recent study led by Lawrence Livermore Natio...,REFUTES,660755,"A 2007 study by David Douglass and coworkers, ..."
489,1021,"The corals may save themselves, as many other ...",SUPPORTS,242575,The poleward migration of coral species refers...
490,1021,"The corals may save themselves, as many other ...",SUPPORTS,1175280,"One way, however, that corals ""might escape oc..."


In [7]:
train_df_processed = train_df.copy()
train_df_processed.claim_text = train_df_processed.agg(lambda df: preprocess_text(df.claim_text), axis=1)
train_df_processed.evidence_text = train_df_processed.agg(lambda df: preprocess_text(df.evidence_text), axis=1)
train_df_processed.tail()

Unnamed: 0,claim,claim_text,claim_label,evidence,evidence_text
486,1426,many world coral reef already barren state con...,NOT_ENOUGH_INFO,288294,aquaculture show promise potentially effective...
487,1426,many world coral reef already barren state con...,NOT_ENOUGH_INFO,946262,rapidly result transition barren landscape rel...
488,698,recent study lead lawrence livermore national ...,REFUTES,660755,2007 study david douglas coworkers conclude 22...
489,1021,coral may save many creature attempt move towa...,SUPPORTS,242575,poleward migration coral specie refers phenome...
490,1021,coral may save many creature attempt move towa...,SUPPORTS,1175280,one way however coral might escape ocean warm ...


In [86]:
# evidence_df = pd.read_json('../data/evidence.json', orient='index')
# evidence_df.reset_index(inplace=True)
# evidence_df.columns = ['evidence', 'evidence_text']
# evidence_df['evidence_text'] = evidence_df['evidence_text'].progress_apply(preprocess_text)
# evidence_df.tail()

  0%|          | 0/1208827 [00:00<?, ?it/s]

Unnamed: 0,evidence,evidence_text
1208822,evidence-1208822,also property contribute garage apartment
1208823,evidence-1208823,class fn org fyrde 6110 volda
1208824,evidence-1208824,dragon storm game roleplay game collectible ca...
1208825,evidence-1208825,state zeriuani great realm tradition relate tr...
1208826,evidence-1208826,storyline revolve around giant plesiosaur akin...


In [20]:
output_path = '../data/processed_evidence.csv'
# evidence_df.to_csv(output_path, index=False)
evidence_df = pd.read_csv(output_path, dtype={'evidence_text': str}).dropna()
evidence_df.head()

Unnamed: 0,evidence,evidence_text
0,evidence-0,john bennet lawes english entrepreneur agricul...
1,evidence-1,lindberg begin professional career age 16 even...
2,evidence-2,boston lady cambridge vampire weekend
3,evidence-3,gerald francis goyer born october 20 1936 prof...
4,evidence-4,detect abnormality oxytocinergic function schi...


In [32]:
# Assuming preprocessing is done, directly using 'evidence_text'
evidence_corpus = evidence_df['evidence_text']

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')

# Fit and transform the evidence corpus
tfidf_matrix_evidence = vectorizer.fit_transform(evidence_corpus)

In [35]:
vectorizer.transform(train_df_processed['claim_text'])

<1x6976016 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming `train_df_processed` is your DataFrame and already loaded

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Combine claim_text and evidence_text for vectorization
all_texts = pd.concat([train_df_processed['claim_text'], train_df_processed['evidence_text']])
vectorizer.fit(all_texts)

# Transform texts
claims_tfidf = vectorizer.transform(train_df_processed['claim_text'])
evidence_tfidf = vectorizer.transform(train_df_processed['evidence_text'])

In [53]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch.nn as nn
import numpy as np
from tqdm.auto import tqdm

In [62]:
class ClaimEvidenceDataset(Dataset):
    def __init__(self, claims, evidences, labels=None):
        self.claims = claims
        self.evidences = evidences
        self.labels = labels

    def __len__(self):
        return len(self.claims)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.claims[idx], self.evidences[idx], self.labels[idx]
        return self.claims[idx], self.evidences[idx]

# Load data
df = pd.read_csv('../data/dev-train.csv')

# Preprocess and split data
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
claims_tfidf = vectorizer.fit_transform(df['claim_text']).toarray()
evidences_tfidf = vectorizer.transform(df['evidence_text']).toarray()
labels = df['evidence'].values

X_train_claims, X_test_claims, X_train_evidences, X_test_evidences, y_train, y_test = train_test_split(
    claims_tfidf, evidences_tfidf, labels, test_size=0.1, random_state=42)

train_dataset = ClaimEvidenceDataset(X_train_claims, X_train_evidences, y_train)
test_dataset = ClaimEvidenceDataset(X_test_claims, X_test_evidences, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [63]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, dropout):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x, (hn, cn) = self.lstm(x)
        x = self.dropout(hn[-1])
        return x

class FactCheckFNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FactCheckFNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x


In [67]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=100, early_stopping_tolerance=0.0001):
    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for claims, evidences, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
            optimizer.zero_grad()
            # Ensure claims are tensors of the correct dtype and on the correct device
            claims_tensor = claims.float()
            # Adjust the labels tensor to match the output dimensions and dtype
            labels_tensor = labels.float().unsqueeze(1)
            outputs = model(claims_tensor)
            loss = criterion(outputs, labels_tensor)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        val_loss = evaluate_model(model, val_loader, criterion)
        print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}, Val Loss: {val_loss}')

        if val_loss < best_val_loss - early_stopping_tolerance:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter > 3:  # Stop after 3 epochs without improvement
                print("Early stopping triggered")
                break

def evaluate_model(model, loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for claims, evidences, labels in loader:
            # Ensure claims are tensors of the correct dtype and on the correct device
            claims_tensor = claims.float()
            # Adjust the labels tensor to match the output dimensions and dtype
            labels_tensor = labels.float().unsqueeze(1)
            outputs = model(claims_tensor)
            loss = criterion(outputs, labels_tensor)
            total_loss += loss.item()
    return total_loss / len(loader)

In [None]:
# Assuming the FNN model is for binary classification
fnn_model = FactCheckFNN(input_dim=X_train_claims.shape[1], hidden_dim=128, output_dim=1)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(fnn_model.parameters())


train_model(fnn_model, train_loader, test_loader, criterion, optimizer)

In [69]:
def predict_relevant_evidence(claim_tfidf, evidence_tfidf_matrix, threshold=0.45, max_outputs=6):
    similarity_scores = cosine_similarity(claim_tfidf.reshape(1, -1), evidence_tfidf_matrix)[0]
    relevant_indices = np.where(similarity_scores >= threshold)[0]
    
    if len(relevant_indices) == 0:
        most_relevant_index = np.argmax(similarity_scores)
        return [f'evidence-{most_relevant_index}']
    else:
        sorted_indices = np.argsort(similarity_scores[relevant_indices])[::-1]
        return [f'evidence-{idx}' for idx in sorted_indices[:max_outputs]]

In [70]:
# Example usage
claim_index = 0  # Index of the claim to test
predicted_evidences = predict_relevant_evidence(X_test_claims[claim_index], X_train_evidences)
print(predicted_evidences)

['evidence-1', 'evidence-0']


In [81]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

# Load data
df = pd.read_csv('data.csv')

# Initialize TF-IDF Vectorizer
# Initialize TF-IDF Vectorizer without specifying max_features
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')

# Fit and transform the claim texts
claims_tfidf = vectorizer.fit_transform(df['claim_text']).toarray()

# Dynamically determine the actual size of TF-IDF vectors
input_dim = claims_tfidf.shape[1]

# Prepare TF-IDF for claims
claims_tfidf = vectorizer.fit_transform(df['claim_text']).toarray()

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(claims_tfidf, df['evidence'], test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Dataset
class TFIDFDataset(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __len__(self):
        return len(self.X_data)
    
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]

train_dataset = TFIDFDataset(X_train_tensor, y_train_tensor)
test_dataset = TFIDFDataset(X_test_tensor, y_test_tensor)

batch_size = 32
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [82]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        # Reshape input to [batch_size, 1, input_dim] to represent a sequence of length 1
        x = x.unsqueeze(1)
        
        # Initialize hidden state and cell state
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(x.device)
        
        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))
        
        # Pass the output of the last time step to the classifier
        out = self.dropout(out[:, -1, :])
        out = self.fc(out)
        return out

# Instantiate the model with the correct input_dim
model = LSTMClassifier(input_dim=input_dim, hidden_dim=128, output_dim=len(np.unique(df['evidence'])), num_layers=2, dropout=0.5)

In [83]:
def train_model(model, train_loader, criterion, optimizer, num_epochs=100, early_stopping_tolerance=0.0001):
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}')
        # Implement early stopping logic here based on validation loss

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_model(model, train_loader, criterion, optimizer)


IndexError: Target 797867 is out of bounds.

In [87]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Load data
df = pd.read_csv('data.csv')

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit the LabelEncoder on the entire 'evidence' column before splitting
df['evidence_encoded'] = label_encoder.fit_transform(df['evidence'])

# Now, when you split the data, use 'evidence_encoded' as your labels
X_train, X_test, y_train, y_test = train_test_split(df['claim_text'], df['evidence_encoded'], test_size=0.01, random_state=42)

In [88]:
tfidf_vectorizer = TfidfVectorizer(max_features=50000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

452               another global warming myth come crash
84     late ipcc report ar5 show global mean temperat...
434    unlikely scenario sudden become probable thoug...
474    ipcc report warn last week world “ nowhere nea...
428    protect restore forest would reduce 18 emissio...
Name: claim_text, dtype: object

In [94]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from torch.nn.utils.rnn import pad_sequence

# Load dataset
df = pd.read_csv('data.csv')

# Simple text cleaning (for demonstration; consider more thorough preprocessing)
df['claim_text'] = df['claim_text'].str.lower()
df['evidence_text'] = df['evidence_text'].str.lower()

# Split dataset
train_texts, test_texts, train_labels, test_labels = train_test_split(df[['claim_text', 'evidence_text']], df['evidence'], test_size=0.01, random_state=42)

# Tokenize text and build vocabulary
vocab = Counter()
for text in pd.concat([train_texts['claim_text'], train_texts['evidence_text']]):
    vocab.update(text.split())

vocab_size = len(vocab)
word_to_idx = {word: idx+1 for idx, (word, _) in enumerate(vocab.items())}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Function to encode texts
def encode_text(text, word_to_idx):
    return [word_to_idx[word] for word in text.split() if word in word_to_idx]

# Custom Dataset class
class ClaimEvidenceDataset(Dataset):
    def __init__(self, claims, evidences, labels, word_to_idx):
        self.claims = [encode_text(claim, word_to_idx) for claim in claims]
        self.evidences = [encode_text(evidence, word_to_idx) for evidence in evidences]
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return torch.tensor(self.claims[idx]), torch.tensor(self.evidences[idx]), torch.tensor(self.labels[idx])

# Create datasets and dataloaders
train_dataset = ClaimEvidenceDataset(train_texts['claim_text'], train_texts['evidence_text'], train_labels, word_to_idx)
test_dataset = ClaimEvidenceDataset(test_texts['claim_text'], test_texts['evidence_text'], test_labels, word_to_idx)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=lambda x: x)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=lambda x: x)


In [95]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, text):
        embedded = self.embedding(text)
        lstm_out, (hidden, _) = self.lstm(embedded)
        hidden = hidden[-1]
        out = self.fc(hidden)
        return out


In [96]:
# Parameters
embedding_dim = 100
hidden_dim = 256
output_dim = 1  # Change according to your task, e.g., binary classification

# Initialize model, loss function, and optimizer
model = LSTMModel(vocab_size + 1, embedding_dim, hidden_dim, output_dim)
criterion = nn.BCEWithLogitsLoss()  # For binary classification
optimizer = optim.Adam(model.parameters())

# Training loop
num_epochs = 5  # Example epoch count
for epoch in range(num_epochs):
    model.train()
    for texts, _, labels in train_loader:
        # Here you'd need to adjust the data handling to fit the LSTM input requirements
        predictions = model(texts)
        loss = criterion(predictions, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")


ValueError: too many values to unpack (expected 3)

In [111]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from torch.nn.utils.rnn import pad_sequence

# Load dataset
df = pd.read_csv('data.csv')

# Simple text cleaning
df['claim_text'] = df['claim_text'].str.lower().fillna('')
df['evidence_text'] = df['evidence_text'].str.lower().fillna('')

# Ensure labels are numeric if they are not already
label_mapping = {'true': 1, 'false': 0}  # Adjust according to your data
df['claim_label'] = df['claim']

# Split dataset
train_texts, test_texts, train_labels, test_labels = train_test_split(df[['claim_text', 'evidence_text']], df['claim_label'], test_size=0.01, random_state=42)

# Tokenize text and build vocabulary
vocab = Counter()
for text in pd.concat([train_texts['claim_text'], train_texts['evidence_text']]):
    vocab.update(text.split())

vocab_size = len(vocab)
word_to_idx = {word: idx+1 for idx, (word, _) in enumerate(vocab.items())}

# Function to encode texts
def encode_text(text, word_to_idx):
    return [word_to_idx[word] for word in text.split() if word in word_to_idx]

# Custom Dataset class
class ClaimEvidenceDataset(Dataset):
    def __init__(self, claims, evidences, labels, word_to_idx):
        self.claims = [encode_text(claim, word_to_idx) for claim in claims]
        self.evidences = [encode_text(evidence, word_to_idx) for evidence in evidences]
        self.labels = labels.to_numpy().astype(float)  # Ensure labels are float
        self.indices = np.arange(len(labels))  # Keep track of indices

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.claims[idx], self.evidences[idx], self.labels[idx], self.indices[idx]

# collate_fn function
def collate_batch(batch):
    claim_list, evidence_list, label_list, indices_list = [], [], [], []
    for (claim, evidence, label, idx) in batch:
        claim_tensor = torch.tensor(claim, dtype=torch.long)
        evidence_tensor = torch.tensor(evidence, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.float32)
        claim_list.append(claim_tensor)
        evidence_list.append(evidence_tensor)
        label_list.append(label_tensor)
        indices_list.append(idx)
    claim_list = pad_sequence(claim_list, batch_first=True, padding_value=0)
    evidence_list = pad_sequence(evidence_list, batch_first=True, padding_value=0)
    label_list = torch.stack(label_list)
    return claim_list, evidence_list, label_list, torch.tensor(indices_list, dtype=torch.long)

# Initialize datasets and dataloaders
train_dataset = ClaimEvidenceDataset(train_texts['claim_text'], train_texts['evidence_text'], train_labels, word_to_idx)
test_dataset = ClaimEvidenceDataset(test_texts['claim_text'], test_texts['evidence_text'], test_labels, word_to_idx)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)

# Model definition
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, text):
        embedded = self.embedding(text)
        lstm_out, (hidden, _) = self.lstm(embedded)
        hidden = hidden[-1]
        out = self.fc(hidden)
        return out

# Initialize model, loss function, and optimizer
model = LSTMModel(vocab_size + 1, embedding_dim, hidden_dim, output_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training loop with gradient clipping
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for claims, _, labels, _ in train_loader:
        predictions = model(claims).squeeze(1)
        loss = criterion(predictions, labels)
        
        optimizer.zero_grad()
        loss.backward()
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

# Evaluation function corrected to handle four items
def evaluate(model, data_loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        # Corrected to unpack four items
        for claims, _, labels, _ in data_loader:
            predictions = model(claims).squeeze(1)
            loss = criterion(predictions, labels)
            total_loss += loss.item()
    return total_loss / len(data_loader)

# Training loop corrected for unpacking four items
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    # Corrected to unpack four items
    for claims, _, labels, _ in train_loader:
        predictions = model(claims).squeeze(1)
        loss = criterion(predictions, labels)
        
        optimizer.zero_grad()
        loss.backward()
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

# Predict function with evidence_id retrieval updated to return up to 6 results
def predict_with_evidence_id(model, claims, dataset):
    model.eval()
    claims_encoded = [encode_text(claim, word_to_idx) for claim in claims]
    claims_encoded = pad_sequence([torch.tensor(claim) for claim in claims_encoded], batch_first=True, padding_value=0)
    evidence_ids = []
    with torch.no_grad():
        predictions = model(claims_encoded).squeeze(1)
        probabilities = torch.sigmoid(predictions)
        top_prob, top_idx = torch.topk(probabilities, k=min(6, len(claims))) 
        for idx in top_idx:
            _, _, _, evidence_id = dataset[idx.item()]
            evidence_ids.append(evidence_id)


    return top_prob, evidence_ids

# Example of using the evaluation function
val_loss = evaluate(model, test_loader)
print(f"Validation Loss: {val_loss}")

# Example of using the predict function
claims_new = ["[South Australia] has the most expensive electricity in the world."]
top_probabilities, evidence_ids = predict_with_evidence_id(model, claims_new, test_dataset)
print(f"Top Probabilities: {top_probabilities}")
print(f"Evidence IDs: {evidence_ids}")

Epoch 1, Loss: -675.6174755096436
Epoch 2, Loss: -2623.859085083008
Epoch 3, Loss: -5114.236389160156
Epoch 4, Loss: -7488.131774902344
Epoch 5, Loss: -10036.855499267578
Epoch 1, Loss: -11897.787048339844
Epoch 2, Loss: -13246.202209472656
Epoch 3, Loss: -14702.699890136719
Epoch 4, Loss: -15765.975280761719
Epoch 5, Loss: -16069.664428710938
Validation Loss: -10508.384765625
Top Probabilities: tensor([0.4876])
Evidence IDs: [0]


In [112]:
df

Unnamed: 0,claim,claim_text,claim_label,evidence,evidence_text
0,752,south australia expensive electricity world,752,67732,citation need south australia high retail pric...
1,530,south australia win unreliable grid world outs...,530,67732,citation need south australia high retail pric...
2,752,south australia expensive electricity world,752,572512,south australia high power price world
3,375,3 per cent total annual global emission carbon...,375,996421,2011 unep green economy report state aagricult...
4,375,3 per cent total annual global emission carbon...,375,1080858,market share 30 potentially clean electricity ...
...,...,...,...,...,...
486,1426,many world coral reef already barren state con...,1426,288294,aquaculture show promise potentially effective...
487,1426,many world coral reef already barren state con...,1426,946262,rapidly result transition barren landscape rel...
488,698,recent study lead lawrence livermore national ...,698,660755,2007 study david douglas coworkers conclude 22...
489,1021,coral may save many creature attempt move towa...,1021,242575,poleward migration coral specie refers phenome...
