
## a. BASIC TEXT ENTAILMENT USING SIMPLE RULE-BASED METHODS

In [4]:
# 1. Import necessary libraries and load dataset
from datasets import load_dataset
import pandas as pd
import nltk
from sklearn.metrics import accuracy_score

# Download the tokenizer data that nltk.word_tokenize relies on.
# 'punkt' is the resource older NLTK releases load; newer releases look for
# 'punkt_tab' instead, so both are requested to keep a fresh environment working.
nltk.download('punkt')
nltk.download('punkt_tab')

# Hand-written sample pairs (CNN/DailyMail isn't well-suited for entailment tasks,
# so a tiny illustrative dataset is built here instead).
premises = [
    "The cat is on the mat.",
    "The sun is shining brightly.",
    "The game is over.",
]
hypotheses = [
    "The mat has a cat.",
    "The sky is bright.",
    "The players are done playing.",
]
# Gold entailment labels, one per (premise, hypothesis) pair (True/False)
gold_labels = [False, True, True]

data = pd.DataFrame({
    'sentence1': premises,
    'sentence2': hypotheses,
    'label': gold_labels,
})

# 2. Preprocess the data: tokenize and convert to lowercase
def preprocess(text):
    """Lowercase ``text`` and split it into tokens.

    Args:
        text: The sentence to tokenize (a string).

    Returns:
        Whatever ``nltk.word_tokenize`` produces for the lowercased text.
    """
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    return tokens

# Tokenize both sentence columns, storing the result next to each source column
# as '<column>_tokens'.
for sentence_column in ('sentence1', 'sentence2'):
    data[f'{sentence_column}_tokens'] = data[sentence_column].apply(preprocess)

# 3. Define simple rule-based method for text entailment
def simple_rule_based_entailment(s1, s2):
    """Predict entailment by token containment.

    Args:
        s1: Tokens of the first sentence (the premise).
        s2: Tokens of the second sentence (the hypothesis).

    Returns:
        True when every distinct token of ``s2`` also occurs in ``s1``
        (duplicates and order are ignored), otherwise False. An empty ``s2``
        therefore yields True.
    """
    # Tokens of s2 that never appear in s1; none left means s2 is covered by s1.
    uncovered = set(s2).difference(s1)
    return not uncovered

# Run the rule-based check on every (premise tokens, hypothesis tokens) pair
token_pairs = zip(data['sentence1_tokens'], data['sentence2_tokens'])
data['prediction'] = [
    simple_rule_based_entailment(premise_tokens, hypothesis_tokens)
    for premise_tokens, hypothesis_tokens in token_pairs
]

# 4. Evaluate the model: share of pairs where the prediction equals the gold label
accuracy = accuracy_score(data['label'], data['prediction'])
print(f'Accuracy: {accuracy}')

# Show the sentences next to their gold label and prediction for reference
columns_to_show = ['sentence1', 'sentence2', 'label', 'prediction']
print(data[columns_to_show])

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Accuracy: 0.3333333333333333
                      sentence1                      sentence2  label  \
0        The cat is on the mat.             The mat has a cat.  False   
1  The sun is shining brightly.             The sky is bright.   True   
2             The game is over.  The players are done playing.   True   

   prediction  
0       False  
1       False  
2       False  


## b. NATURAL LANGUAGE INFERENCE WITH BERT

In [11]:
# Step 1: Import Required Libraries
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Step 2: Load the Dataset
dataset = load_dataset('snli')

# Peek at the training split to understand its structure:
# first its feature schema, then its first 5 examples.
train_split = dataset['train']
print(train_split.features)
print(train_split[:5])

# Step 3: Preprocess the Data
# The tokenizer is created once here and used by preprocess_function below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_function(examples):
    """Tokenize premise/hypothesis pairs with the module-level ``tokenizer``.

    Args:
        examples: A mapping with ``'premise'`` and ``'hypothesis'`` entries
            (used with ``batched=True``, so each entry holds a batch of texts).

    Returns:
        The tokenizer output for the pairs, truncated and padded to a fixed
        length of 128 tokens.
    """
    tokenizer_options = dict(truncation=True, padding='max_length', max_length=128)
    return tokenizer(examples['premise'], examples['hypothesis'], **tokenizer_options)

# Tokenize every split of the dataset (train, validation, and test) in batches
encoded_dataset = dataset.map(preprocess_function, batched=True)

# Show the splits and their columns after tokenization
print(encoded_dataset)

# Step 4: Inspect the label column directly to understand its structure
print("Label examples:")
train_labels = encoded_dataset['train']['label']
print(train_labels[0:5])  # first 5 labels of the training split

# Step 5: Identify unique labels
unique_labels = set(train_labels)
print(f"Unique labels in the dataset: {unique_labels}")

# Step 6: Define label mapping and handle unexpected labels
# Identity mapping over the three class ids; adjust as necessary for other label schemes.
label_dict = {class_id: class_id for class_id in range(3)}

# Step 7: Map the labels correctly, handle unexpected labels
def map_labels(example):
    """Build the ``labels`` field for one example from its raw ``label``.

    Args:
        example: A mapping that contains a ``'label'`` entry.

    Returns:
        A one-key dict ``{'labels': value}`` where ``value`` is the entry of
        the module-level ``label_dict`` for the raw label, or ``-1`` when the
        raw label has no entry there.
    """
    raw_label = example['label']
    if raw_label in label_dict:
        mapped_label = label_dict[raw_label]
    else:
        mapped_label = -1  # sentinel for labels missing from label_dict
    return {'labels': mapped_label}

# Add the 'labels' column produced by map_labels.
encoded_dataset = encoded_dataset.map(map_labels)

# Drop every example whose label could not be mapped. map_labels marks those with -1
# (the unique-label check above reports -1 among the raw labels), and -1 is not a
# valid class index for the 3-class classifier trained on this data.
encoded_dataset = encoded_dataset.filter(lambda example: example['labels'] != -1)

# Set the format for PyTorch.
# 'token_type_ids' is included so that training sees the same premise/hypothesis
# segment ids that the tokenizer also returns for the single example at prediction time.
encoded_dataset.set_format(
    type='torch',
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
)

# Step 8: Load the Pre-Trained BERT Model with a 3-way classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Step 9: Set Up Training Arguments and Trainer
# The settings are collected in one dict so they can be read (and tuned) at a glance.
training_settings = {
    'output_dir': './results',            # Output directory
    'evaluation_strategy': 'epoch',       # Evaluate once per epoch
    'per_device_train_batch_size': 16,    # Batch size for training
    'per_device_eval_batch_size': 16,     # Batch size for evaluation
    'num_train_epochs': 3,                # Number of training epochs
    'weight_decay': 0.01,                 # Weight decay strength
    'logging_dir': './logs',              # Directory for logs
}
training_args = TrainingArguments(**training_settings)

# Initialize the Trainer with the model, the training arguments,
# and the train / validation splits of the encoded dataset.
train_split_encoded = encoded_dataset['train']
validation_split_encoded = encoded_dataset['validation']
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split_encoded,
    eval_dataset=validation_split_encoded,
)

# Step 10: Train the Model
trainer.train()

# Step 11: Evaluate the Model on the validation split given to the Trainer
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Step 12: Make Predictions
premise = "A man inspects the uniform of a figure in some East Asian country."
hypothesis = "The man is sleeping."

# Tokenize the input example as a (premise, hypothesis) pair
inputs = tokenizer(premise, hypothesis, return_tensors='pt', padding=True, truncation=True, max_length=128)

# Put the input tensors on the same device as the model; otherwise the forward pass
# fails with a device mismatch whenever the model is not on the CPU.
inputs = inputs.to(model.device)

# Get model prediction
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    # Take the argmax over the class dimension (the last one) of the single example.
    predicted_label = torch.argmax(outputs.logits, dim=-1).item()

# Convert prediction to human-readable label.
# The order follows the dataset's ClassLabel names printed when the dataset was loaded:
# ['entailment', 'neutral', 'contradiction'].
label_map = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
print(f"Predicted Label: {label_map[predicted_label]}")

README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/413k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

{'premise': Value(dtype='string', id=None), 'hypothesis': Value(dtype='string', id=None), 'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)}
{'premise': ['A person on a horse jumps over a broken down airplane.', 'A person on a horse jumps over a broken down airplane.', 'A person on a horse jumps over a broken down airplane.', 'Children smiling and waving at camera', 'Children smiling and waving at camera'], 'hypothesis': ['A person is training his horse for a competition.', 'A person is at a diner, ordering an omelette.', 'A person is outdoors, on a horse.', 'They are smiling at their parents', 'There are children present'], 'label': [1, 2, 0, 1, 0]}


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/550152 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 550152
    })
})
Label examples:
[1, 2, 0, 1, 0]
Unique labels in the dataset: {0, 1, 2, -1}


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/550152 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 23.12 MiB is free. Process 30294 has 15.86 GiB memory in use. Of the allocated memory 398.46 MiB is allocated by PyTorch, and 33.54 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)