In [None]:
!pip install textstat
!pip install fasttext
!pip install gensim
!pip install datasets



In [None]:
import nltk
# Fetch the NLTK corpora/tokenizer data into the local nltk_data directory.
# NOTE(review): no visible cell calls NLTK directly (tokenisation below uses spaCy
# and transformers' BasicTokenizer) - presumably these are needed by a dependency
# such as textstat; confirm before removing any of them.
nltk.download('words')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
# Last expression of the cell: its return value (True) is what the notebook displays.
nltk.download('cmudict')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


True

In [None]:
import re
import numpy as np
import textstat
import spacy

# Load the spaCy pipeline once at module scope; calculate_sentence_complexity()
# reuses this shared `nlp` object on every call.
# NOTE(review): assumes the 'en_core_web_sm' model is already installed in the
# kernel environment - no install/download step for it is visible in this notebook.
nlp = spacy.load('en_core_web_sm')

def remove_possessive_markers(sentence):
    """Strip possessive markers and stray apostrophes from a piece of text.

    Args:
        sentence: The text to clean. Non-string values are converted with ``str``.

    Returns:
        str: The text with every ``'s`` and every remaining ``'`` removed.
    """
    text = sentence if isinstance(sentence, str) else str(sentence)
    # The alternation tries "'s" first so the trailing "s" goes with its apostrophe.
    return re.sub(r"'s|'", "", text)

def syllable_count(word):
    """Estimate the number of syllables in ``word``.

    Each maximal run of the letters a/e/i/o/u/y (case-insensitive) counts as one
    syllable; a word without any such run still counts as one syllable.

    Args:
        word: The word to inspect.

    Returns:
        int: The estimated syllable count, never less than 1.
    """
    vowel_groups = re.findall(r'[aeiouy]+', word, re.IGNORECASE)
    return max(1, len(vowel_groups))

def normalize_score(score, min_value, max_value):
    """Linearly rescale ``score`` so ``min_value`` maps to 0 and ``max_value`` to 1.

    The result is not clipped, so scores outside the range fall outside [0, 1].

    Args:
        score: The raw value.
        min_value: The value that maps to 0.
        max_value: The value that maps to 1.

    Returns:
        The rescaled value, or 0.5 when the range is degenerate
        (``max_value == min_value``).
    """
    if max_value == min_value:
        return 0.5
    return (score - min_value) / (max_value - min_value)

def calculate_sentence_complexity(sentence):
    """Compute a vector of normalised readability/complexity metrics for a text.

    The text is stripped of possessive markers, parsed with the module-level
    spaCy pipeline ``nlp`` and scored with several ``textstat`` readability
    formulas. Each textstat score is rescaled with ``normalize_score`` using the
    fixed ranges hard-coded below (results are not clipped to [0, 1]).

    Args:
        sentence: The text to analyse. Non-string values are converted to ``str``.

    Returns:
        numpy.ndarray: 10 values, in this order:
            0. Coleman-Liau index (normalised)
            1. Gunning fog index (normalised)
            2. Flesch reading ease (normalised and inverted, so higher = harder)
            3. Dale-Chall score (normalised)
            4. Automated readability index (normalised)
            5. vocabulary complexity (unique lemmas / word count)
            6. complex-word ratio (words with >= 3 estimated syllables / word count)
            7. sentence length (word count / 100)
            8. maximum dependency depth (normalised to the range 1..8)
            9. weighted overall complexity score
    """
    # Remove possessive markers
    sentence = str(remove_possessive_markers(sentence))

    # Tokenize the sentence
    doc = nlp(sentence)
    tokens = [token for token in doc if not token.is_punct and not token.is_space
              and not token.like_email and not token.like_url and not token.text.endswith('.')]

    # Get unique lemmas (taken over the whole doc, not only the filtered word tokens)
    unique_lemmas = set(token.lemma_ for token in doc if token.text != '.')

    # Count syllables once per word; a word is "complex" at three or more syllables
    syllables_per_word = [syllable_count(token.text) for token in tokens]
    total_words = len(syllables_per_word)
    complex_word_count = sum(1 for syllables in syllables_per_word if syllables >= 3)

    # Calculate metrics
    total_complex_ratio_norm = complex_word_count / total_words if total_words != 0 else 0
    # default=1 keeps an empty doc (e.g. an empty input string) from raising
    # "max() arg is an empty sequence"; depth 1 normalises to 0 below.
    max_dependency_depth = max((len(list(token.ancestors)) + 1 for token in doc), default=1)

    # Normalize and calculate metrics
    coleman_liau_index = normalize_score(textstat.coleman_liau_index(sentence), -8, 28)
    gunning_fog_index = normalize_score(textstat.gunning_fog(sentence), 0, 32)
    flesch_reading_ease_score = 1 - normalize_score(textstat.flesch_reading_ease(sentence), -73, 121)
    dale_chall_readability_score = normalize_score(textstat.dale_chall_readability_score(sentence), 0, 20)
    automated_readability_index = normalize_score(textstat.automated_readability_index(sentence), -5, 23)
    vocabulary_complexity = len(unique_lemmas) / total_words if total_words != 0 else 0
    sentence_length = total_words / 100
    dependency_depth = normalize_score(max_dependency_depth, 1, 8)

    # NOTE(review): the overall score weights the ratio of unique *surface forms*
    # over every doc token (punctuation included), not `vocabulary_complexity`
    # (unique lemmas) that is reported at index 5. Kept as-is so existing scores
    # do not change - confirm which of the two is intended.
    unique_surface_ratio = len(set(token.text for token in doc)) / total_words if total_words != 0 else 0

    # Calculate overall complexity score
    overall_complexity_score = (
        coleman_liau_index * 0.15 +
        gunning_fog_index * 0.15 +
        flesch_reading_ease_score * 0.15 +
        dale_chall_readability_score * 0.15 +
        automated_readability_index * 0.1 +
        unique_surface_ratio * 0.1 +
        total_complex_ratio_norm * 0.1 +
        sentence_length * 0.05 +
        dependency_depth * 0.05
    )

    # Return the numpy array of metric values
    metric_values = np.array([
        coleman_liau_index,
        gunning_fog_index,
        flesch_reading_ease_score,
        dale_chall_readability_score,
        automated_readability_index,
        vocabulary_complexity,
        total_complex_ratio_norm,
        sentence_length,
        dependency_depth,
        overall_complexity_score
    ])

    return metric_values

In [None]:
import gensim.downloader as api
# Fetch the 'glove-wiki-gigaword-50' vectors through gensim's downloader API.
# The resulting object is passed to prepare_input_features(), which looks
# tokens up in it and reads its `vector_size` attribute.
# NOTE(review): the first call presumably downloads the model over the network - confirm.
glove_vectors = api.load('glove-wiki-gigaword-50')
!pip install -q tfds-nightly tensorflow matplotlib

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import os
import tempfile

# Scratch directory that will hold the on-disk tf.data cache files
tmp_dir = tempfile.mkdtemp()

# Fetch the SQuAD training and validation splits (same order as before: train first)
train_dataset, val_dataset = [
    tfds.load('squad', split=split_name) for split_name in ('train', 'validation')
]

def text_generator(dataset):
    """Yield plain-Python fields for every example in a SQuAD-style dataset.

    Args:
        dataset: Iterable of example dicts with the keys ``question``,
            ``context``, ``title``, ``id`` and ``answers`` (itself holding
            ``answer_start`` and ``text``). Each leaf value must expose
            ``.numpy()``.

    Yields:
        tuple: ``(question, context, answer_start, answer_text, title, id)``,
        with the byte strings decoded as UTF-8. Only the first listed answer of
        each example is used.
    """
    def _as_text(value):
        # Scalar string field -> bytes -> str
        return value.numpy().decode('utf-8')

    for example in dataset:
        answers = example['answers']
        yield (
            _as_text(example['question']),
            _as_text(example['context']),
            answers['answer_start'].numpy()[0],
            answers['text'].numpy()[0].decode('utf-8'),
            _as_text(example['title']),
            _as_text(example['id']),
        )

def _make_cached_dataset(source_dataset, cache_name):
    """Wrap ``text_generator(source_dataset)`` in a disk-cached, prefetching tf.data pipeline.

    ``source_dataset`` is captured as a function argument, so the pipeline keeps
    reading the dataset it was built from even if the module-level names
    ``train_dataset`` / ``val_dataset`` are rebound later in the notebook.

    Args:
        source_dataset: The dataset handed to ``text_generator``.
        cache_name: File-name prefix of the cache inside ``tmp_dir``.

    Returns:
        tf.data.Dataset yielding
        ``(question, context, answer_start, answer_text, title, id)`` tensors.
    """
    return tf.data.Dataset.from_generator(
        lambda: text_generator(source_dataset),
        output_types=(tf.string, tf.string, tf.int32, tf.string, tf.string, tf.string)
    ).cache(os.path.join(tmp_dir, cache_name)).prefetch(tf.data.AUTOTUNE)


# Build the cached pipelines. Nothing is read or written at this point:
# `.prefetch` does not iterate the dataset, so the cache files under `tmp_dir`
# are only produced once a pipeline is actually iterated.
cached_train_dataset = _make_cached_dataset(train_dataset, 'train')
cached_val_dataset = _make_cached_dataset(val_dataset, 'val')

Downloading and preparing dataset 33.51 MiB (download: 33.51 MiB, generated: 94.06 MiB, total: 127.58 MiB) to /root/tensorflow_datasets/squad/v1.1/3.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/87599 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/squad/v1.1/incomplete.CMRV81_3.0.0/squad-train.tfrecord*...:   0%|        …

Generating validation examples...:   0%|          | 0/10570 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/squad/v1.1/incomplete.CMRV81_3.0.0/squad-validation.tfrecord*...:   0%|   …

Dataset squad downloaded and prepared to /root/tensorflow_datasets/squad/v1.1/3.0.0. Subsequent calls will reuse this data.


In [None]:
def _decode_utf8(tensor):
    """Turn a scalar string tensor into a Python ``str``."""
    return tensor.numpy().decode('utf-8')


# Materialise the cached training split as a list of plain-Python dicts.
# The loop variables carry a `_t` suffix so that this cell no longer rebinds the
# builtin `id` (or generic names such as `question` / `context`) at module scope.
decoded_data = []

for question_t, context_t, answer_start_t, answer_text_t, title_t, id_t in cached_train_dataset:
    decoded_data.append({
        'question': _decode_utf8(question_t),
        'context': _decode_utf8(context_t),
        'answer_start': int(answer_start_t.numpy()),
        'answer_text': _decode_utf8(answer_text_t),
        'title': _decode_utf8(title_t),
        'id': _decode_utf8(id_t)
    })

In [None]:
import numpy as np
import torch
from transformers import BasicTokenizer

# Lower-casing BasicTokenizer stored in the module-level name `tokenizer`;
# print_max_length_examples() reads this global to measure lengths in tokens.
tokenizer = BasicTokenizer(do_lower_case=True)

def print_max_length_examples(dataset):
    """Print the longest question and the longest context found in ``dataset``.

    Lengths are measured in tokens produced by the module-level ``tokenizer``.
    When several texts share the maximum length, the first one seen is reported.

    Args:
        dataset: Iterable of 6-tuples whose first two items are the question and
            context; both must expose ``.numpy()`` returning UTF-8 bytes.

    Returns:
        None. The findings are written to standard output.
    """
    longest_question, longest_question_len = None, 0
    longest_context, longest_context_len = None, 0

    for question_tensor, context_tensor, _, _, _, _ in dataset:
        question_text = question_tensor.numpy().decode('utf-8')
        context_text = context_tensor.numpy().decode('utf-8')

        question_len = len(tokenizer.tokenize(question_text))
        context_len = len(tokenizer.tokenize(context_text))

        # Strict comparison: ties keep the earlier example.
        if question_len > longest_question_len:
            longest_question, longest_question_len = question_text, question_len

        if context_len > longest_context_len:
            longest_context, longest_context_len = context_text, context_len

    print(f"Maximum question length: {longest_question_len} tokens")
    print("Question with maximum length:")
    print(longest_question)
    print()

    print(f"Maximum context length: {longest_context_len} tokens")
    print("Context with maximum length:")
    print(longest_context)

# Report the longest question/context (in BasicTokenizer tokens) of each split.
# The 60 / 815 figures shown in the captured output are the values used as the
# default max_question_length / max_context_length of prepare_input_features().
# NOTE(review): only one pair of maxima appears in the saved output - confirm
# that the validation split does not exceed them.
print_max_length_examples(cached_train_dataset)
print_max_length_examples(cached_val_dataset)

Maximum question length: 60 tokens
Question with maximum length:
Who defines the Near East as including Armenia, Azerbaijan, Bahrain, Cyprus, Egypt. Georgia, Iran, Iraq, Isreal, Jordan, Kuwait, Lebanon, Libya, Oman, Palestine, Qatar, Saudi Arabia, Sudan, Syria, Turkey, the United Arab Emirates, the West Bank, and Yemen?

Maximum context length: 815 tokens
Context with maximum length:
The sky is usually clear above the desert and the sunshine duration is extremely high everywhere in the Sahara. Most of the desert enjoys more than 3,600 h of bright sunshine annually or over 82% of the time and a wide area in the eastern part experiences in excess of 4,000 h of bright sunshine a year or over 91% of the time, and the highest values are very close to the theoretical maximum value. A value of 4,300 h or 98% of the time would be recorded in Upper Egypt (Aswan, Luxor) and in the Nubian Desert (Wadi Halfa). The annual average direct solar irradiation is around 2,800 kWh/(m2 year) in the Great D

In [None]:
import numpy as np
import torch
from transformers import BasicTokenizer

# Module-level tokenizer read by prepare_input_features(); same configuration
# as the BasicTokenizer created in the earlier length-statistics cell.
tokenizer = BasicTokenizer(do_lower_case=True)

def get_word_embedding(word, glove_vectors):
    """Look up the embedding of ``word``, falling back to a zero vector.

    Args:
        word: The token to look up.
        glove_vectors: Mapping-like vector store that raises ``KeyError`` for
            unknown words and exposes a ``vector_size`` attribute.

    Returns:
        The stored vector for ``word``, or ``np.zeros(glove_vectors.vector_size)``
        when the word is not in the vocabulary.
    """
    try:
        vector = glove_vectors[word]
    except KeyError:
        # Out-of-vocabulary token
        vector = np.zeros(glove_vectors.vector_size)
    return vector

def prepare_input_features(question_str, context_str, glove_vectors, max_question_length=60, max_context_length=815):
    """Build fixed-size per-token feature matrices for a question/context pair.

    Every token (from the module-level ``tokenizer``) is represented by its
    GloVe embedding concatenated with the complexity metrics of the *whole*
    text it belongs to, so the metric part is identical for all tokens of one
    text. Rows beyond the number of tokens stay zero (padding); tokens beyond
    the maximum length are dropped (truncation).

    Args:
        question_str: The question text.
        context_str: The context paragraph.
        glove_vectors: Vector store passed on to ``get_word_embedding``; its
            ``vector_size`` attribute determines the embedding width.
        max_question_length: Number of rows in the question matrix.
        max_context_length: Number of rows in the context matrix.

    Returns:
        dict with two ``torch.Tensor`` values:
            ``question_features`` of shape ``(max_question_length, D)`` and
            ``context_features`` of shape ``(max_context_length, D)``, where
            ``D = glove_vectors.vector_size + number of complexity metrics``.
    """
    def _padded_features(tokens, metrics, max_length):
        # The width is derived from the inputs rather than from the last token
        # processed, so a text without any tokens yields an all-zero matrix
        # instead of failing on an unbound variable.
        feature_dim = glove_vectors.vector_size + metrics.shape[-1]
        padded = np.zeros((max_length, feature_dim))
        # Slicing implements the truncation; longer inputs used to raise IndexError.
        for row, word in enumerate(tokens[:max_length]):
            padded[row] = np.concatenate([get_word_embedding(word, glove_vectors), metrics])
        return padded

    # Tokenize questions and contexts
    question_tokens = tokenizer.tokenize(question_str)
    context_tokens = tokenizer.tokenize(context_str)

    # Calculate metrics for question and context
    question_metrics = calculate_sentence_complexity(question_str)
    context_metrics = calculate_sentence_complexity(context_str)

    padded_question_embeddings = _padded_features(question_tokens, question_metrics, max_question_length)
    padded_context_embeddings = _padded_features(context_tokens, context_metrics, max_context_length)

    # Convert the numpy arrays to PyTorch tensors
    question_features_tensor = torch.tensor(padded_question_embeddings)
    context_features_tensor = torch.tensor(padded_context_embeddings)

    return {'question_features': question_features_tensor, 'context_features': context_features_tensor}

def _features_for(examples):
    """Run prepare_input_features on every decoded example in ``examples``."""
    return [
        prepare_input_features(example['question'], example['context'], glove_vectors)
        for example in examples
    ]


# Features for the first 8000 decoded examples (training portion)
train_features = _features_for(decoded_data[:8000])

# Features for the next 2000 decoded examples (held-out portion).
# This used to re-read the first 2000 training examples, so the "validation"
# features were a subset of the training ones. The 8000:10000 slice is disjoint
# from the training slice and matches the train/eval split made in
# load_preprocess_squad().
val_features = _features_for(decoded_data[8000:10000])

In [None]:
# The feature tensors only exist inside prepare_input_features(); inspect the
# ones stored for the first training example instead of the function's locals.
sample_features = train_features[0]
print("Question features tensor shape:", sample_features['question_features'].shape)
print("Context features tensor shape:", sample_features['context_features'].shape)

NameError: name 'question_features_tensor' is not defined

In [None]:
import torch
import torch.nn as nn
import tensorflow as tf
import tensorflow_datasets as tfds
from transformers import BertTokenizer, BertForQuestionAnswering, BertConfig, BertModel
from tqdm import tqdm
from transformers import RobertaTokenizerFast

class CustomEmbeddingLayer(nn.Module):
    """Linear projection of the last input dimension onto ``output_dim``.

    Args:
        output_dim: Size of the projected feature dimension.
        input_dim: Size of the incoming feature dimension. When omitted, the
            projection is created lazily and its input size is inferred from
            the first tensor passed to ``forward``.
    """

    def __init__(self, output_dim, input_dim=None):
        super(CustomEmbeddingLayer, self).__init__()
        # No input has been seen yet. The constructor used to read this
        # attribute before it existed, so the layer could not be instantiated.
        self.current_input_shape = None
        self.input_dim = input_dim
        if input_dim is None:
            self.projection = nn.LazyLinear(output_dim)
        else:
            self.projection = nn.Linear(input_dim, output_dim)

    def get_input_dim(self):
        """Return the feature size of the most recent input.

        Falls back to the ``input_dim`` given at construction (possibly ``None``)
        when ``forward`` has not been called yet.
        """
        if self.current_input_shape is not None:
            return self.current_input_shape[-1]
        return self.input_dim

    def forward(self, x):
        """Project ``x`` along its last dimension and remember its shape."""
        self.current_input_shape = x.shape
        projected_embeddings = self.projection(x)
        return projected_embeddings


# Custom Question Answering Model
class CustomQAModel(nn.Module):
    """BERT span-extraction model conditioned on hand-crafted text features.

    The per-token question/context feature matrices are mean-pooled into one
    vector each, concatenated, projected to the BERT hidden size and added to
    every position of the BERT sequence output. A linear head then produces
    start and end logits for each token.

    Args:
        config: ``BertConfig`` used to build the encoder.
        feature_dim: Width of one row of the question/context feature matrices.
            NOTE(review): the default of 60 assumes 50-dimensional GloVe vectors
            plus the 10 complexity metrics produced by prepare_input_features -
            confirm if other embeddings are used.
    """

    def __init__(self, config, feature_dim=60):
        super(CustomQAModel, self).__init__()
        self.bert = BertModel(config)
        self.hidden_size = config.hidden_size
        # Maps [pooled question features ; pooled context features] to hidden size.
        self.feature_projection = nn.Linear(2 * feature_dim, self.hidden_size)
        self.qa_outputs = nn.Linear(self.hidden_size, 2)

    @staticmethod
    def _pool_token_features(features):
        """Average the non-padding rows of a feature batch.

        Assumes ``features`` is laid out as (batch, tokens, feature_dim) with
        all-zero rows as padding - TODO confirm against the data pipeline.
        """
        is_real_token = features.abs().sum(dim=-1) > 0
        real_token_count = is_real_token.sum(dim=1, keepdim=True).clamp(min=1)
        return features.sum(dim=1) / real_token_count

    def forward(self, input_dict):
        """Run the model on one batch.

        Args:
            input_dict: dict with ``input_ids``, ``attention_mask``,
                ``question_features`` and ``context_features``; optionally
                ``token_type_ids``, ``start_positions`` and ``end_positions``.

        Returns:
            tuple ``(total_loss, start_logits, end_logits)``. ``total_loss`` is
            ``None`` unless both position tensors are present.
        """
        input_ids = input_dict['input_ids']
        sequence_length = input_ids.size(-1)
        # Flatten any extra singleton dimension (e.g. (B, 1, L) -> (B, L)) using
        # the tensor's own size rather than a global batch size, so the final,
        # smaller batch of an epoch works too.
        input_ids = input_ids.reshape(-1, sequence_length)
        attention_mask = input_dict['attention_mask'].reshape(-1, sequence_length)
        token_type_ids = input_dict.get('token_type_ids')
        if token_type_ids is not None:
            token_type_ids = token_type_ids.reshape(-1, sequence_length)

        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs.last_hidden_state

        # *** Integration Point ***
        pooled_features = torch.cat(
            [
                self._pool_token_features(input_dict['question_features']),
                self._pool_token_features(input_dict['context_features']),
            ],
            dim=-1,
        ).to(sequence_output.dtype)
        # Broadcast the projected feature vector over every token position.
        sequence_output = sequence_output + self.feature_projection(pooled_features).unsqueeze(1)

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        start_positions = input_dict.get('start_positions')
        end_positions = input_dict.get('end_positions')

        total_loss = None
        if start_positions is not None and end_positions is not None:
            if start_positions.dim() > 1:
                start_positions = start_positions.squeeze(-1)
            if end_positions.dim() > 1:
                end_positions = end_positions.squeeze(-1)

            # Positions outside the sequence are clamped onto `ignored_index`,
            # which the loss then skips.
            ignored_index = start_logits.size(-1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        return total_loss, start_logits, end_logits

# Load and preprocess the SQuAD dataset
def load_preprocess_squad(decoded_data, num_train=8000, num_eval=2000):
    """Tokenise decoded SQuAD examples and split them into train/eval lists.

    Only the first ``num_train + num_eval`` examples are processed. Each
    processed example carries BERT inputs, token-level answer positions and the
    question/context feature matrices from ``prepare_input_features`` (which
    reads the module-level ``glove_vectors``).

    Args:
        decoded_data: List of dicts with ``question``, ``context``,
            ``answer_start`` (character offset into the context) and
            ``answer_text``.
        num_train: Number of leading examples used for training.
        num_eval: Number of following examples used for evaluation.

    Returns:
        tuple ``(train_dataset, eval_dataset)`` of lists of dicts.
    """
    # The fast tokenizer is needed for character-to-token offset mapping.
    from transformers import BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

    def preprocess_squad(example):
        question_str = example['question']
        context_str = example['context']
        answer_start = example['answer_start']
        answer_text = example['answer_text']

        # Tokenize question and context
        inputs = tokenizer(
            question_str, context_str,
            max_length=512, truncation=True, padding='max_length',
            return_offsets_mapping=True, return_tensors='pt'
        )

        # Token index 0 ([CLS]) is the fallback label when the answer cannot be
        # located (missing answer, or answer truncated away).
        start_positions = 0
        end_positions = 0

        has_answer = (
            isinstance(answer_start, int) and answer_start >= 0
            and isinstance(answer_text, str) and len(answer_text) > 0
        )
        if has_answer:
            # The labels must be *token* indices, because the model scores token
            # positions; `answer_start` is a character offset into the context.
            start_char = answer_start
            end_char = answer_start + len(answer_text)
            offsets = inputs['offset_mapping'][0].tolist()
            sequence_ids = inputs.sequence_ids(0)
            context_token_indices = [i for i, seq_id in enumerate(sequence_ids) if seq_id == 1]
            if context_token_indices:
                first_ctx = context_token_indices[0]
                last_ctx = context_token_indices[-1]
                if offsets[first_ctx][0] <= start_char and offsets[last_ctx][1] >= end_char:
                    token_idx = first_ctx
                    while token_idx <= last_ctx and offsets[token_idx][0] <= start_char:
                        token_idx += 1
                    start_positions = token_idx - 1

                    token_idx = last_ctx
                    while token_idx >= first_ctx and offsets[token_idx][1] >= end_char:
                        token_idx -= 1
                    end_positions = token_idx + 1

        # Hand-crafted features; cast to float32 to match the model weights.
        features = prepare_input_features(question_str, context_str, glove_vectors)

        # Drop the leading batch dimension added by return_tensors='pt' so the
        # DataLoader collates each field to (batch, 512).
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'token_type_ids': inputs['token_type_ids'].squeeze(0),
            'start_positions': start_positions,
            'end_positions': end_positions,
            'question_features': features['question_features'].to(torch.float32),
            'context_features': features['context_features'].to(torch.float32)
        }

    # Preprocess only the examples that end up in one of the two splits
    processed_examples = [
        preprocess_squad(example) for example in decoded_data[:num_train + num_eval]
    ]

    # Split dataset into training and validation sets
    train_dataset = processed_examples[:num_train]
    eval_dataset = processed_examples[num_train:num_train + num_eval]

    return train_dataset, eval_dataset

# Training and evaluation functions
def train_epoch(model, train_dataloader, optimizer):
    """Train ``model`` for one pass over ``train_dataloader``.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a module-level ``device`` variable.

    Args:
        model: Module called as ``model(batch_dict)`` and returning
            ``(loss, start_logits, end_logits)``.
        train_dataloader: Iterable of dict batches.
        optimizer: Optimizer over the model's parameters.

    Returns:
        float: Mean loss over the batches, or NaN if there were no batches.
    """
    model.train()
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    num_batches = 0
    for batch in tqdm(train_dataloader, desc='Training'):
        batch_dict = {}
        for k, v in batch.items():
            if isinstance(v, torch.Tensor):
                batch_dict[k] = v.to(target_device)
            elif isinstance(v, list):
                batch_dict[k] = torch.tensor(v, device=target_device)
            else:
                batch_dict[k] = torch.tensor([v], device=target_device)

        optimizer.zero_grad()
        loss, _, _ = model(batch_dict)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches else float('nan')

def eval_model(model, eval_dataloader):
    """Compute the mean loss of ``model`` over ``eval_dataloader`` without gradients.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a module-level ``device`` variable.

    Args:
        model: Module called as ``model(batch_dict)`` and returning
            ``(loss, start_logits, end_logits)``.
        eval_dataloader: Iterable of dict batches.

    Returns:
        float: Mean loss over the batches, or NaN if there were no batches
        (NaN never compares as an improvement over a previous best loss).
    """
    model.eval()
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    num_batches = 0
    for batch in eval_dataloader:
        batch_dict = {}
        for k, v in batch.items():
            if isinstance(v, torch.Tensor):
                batch_dict[k] = v.to(target_device)
            elif isinstance(v, list):
                batch_dict[k] = torch.tensor(v, device=target_device)
            else:
                batch_dict[k] = torch.tensor([v], device=target_device)
        with torch.no_grad():
            loss, _, _ = model(batch_dict)
        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches else float('nan')

# Main training loop
def train_model(model, train_dataloader, eval_dataloader, optimizer, num_epochs=3):
    """Train for ``num_epochs`` epochs and checkpoint the best model.

    After every epoch the validation loss is computed; whenever it is strictly
    lower than the best one seen so far, the model's ``state_dict`` is written
    to ``best_model.pth`` in the working directory.

    Args:
        model: The model to train.
        train_dataloader: Batches handed to ``train_epoch``.
        eval_dataloader: Batches handed to ``eval_model``.
        optimizer: Optimizer handed to ``train_epoch``.
        num_epochs: Number of passes over the training data.
    """
    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_dataloader, optimizer)
        val_loss = eval_model(model, eval_dataloader)
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss}, Val Loss: {val_loss}')

        improved = val_loss < best_val_loss
        if improved:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pth')

# Load datasets and model
# `batch_size` is also read as a module-level global inside CustomQAModel.forward
# (it is used to reshape input_ids), so it must be defined before training starts.
batch_size = 8
# NOTE(review): this rebinds the module-level `train_dataset`, which earlier held
# the TFDS training split and is referenced by the lambda that feeds
# cached_train_dataset - confirm nothing re-reads that split after this cell.
train_dataset, eval_dataset = load_preprocess_squad(decoded_data)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=batch_size)
# `device` is read as a global by train_epoch() and eval_model().
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_config = BertConfig.from_pretrained('bert-base-uncased')
model = CustomQAModel(config=model_config)
# CustomQAModel builds its encoder from the config only, so the pretrained
# 'bert-base-uncased' weights are copied into it explicitly here.
model.bert.load_state_dict(BertModel.from_pretrained('bert-base-uncased').state_dict())
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Train the model (train_model writes the best checkpoint to 'best_model.pth')
train_model(model, train_dataloader, eval_dataloader, optimizer)

NameError: name 'question_features_tensor' is not defined

In [None]:
import torch
from transformers import BertTokenizer
from tqdm import tqdm
from collections import defaultdict
from sklearn.metrics import f1_score

def predict_answer_spans(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and collect the start/end logits.

    The model is switched to evaluation mode first: a freshly constructed
    ``nn.Module`` starts in training mode, which would leave dropout active and
    make the predictions non-deterministic.

    Args:
        model: Module called as ``model(batch)`` and returning
            ``(loss, start_logits, end_logits)``.
        dataloader: Iterable of dict batches.
        device: Device the tensor values of each batch are moved to.

    Returns:
        tuple ``(all_start_logits, all_end_logits)``: CPU tensors with the
        per-batch logits concatenated along dimension 0.
    """
    model.eval()
    all_start_logits = []
    all_end_logits = []
    for batch in tqdm(dataloader, desc='Predicting'):
        # Only tensors can be moved; any other value is passed through unchanged.
        batch = {
            key: value.to(device) if isinstance(value, torch.Tensor) else value
            for key, value in batch.items()
        }
        with torch.no_grad():
            _, start_logits, end_logits = model(batch)
        all_start_logits.append(start_logits.cpu())
        all_end_logits.append(end_logits.cpu())
    all_start_logits = torch.cat(all_start_logits, dim=0)
    all_end_logits = torch.cat(all_end_logits, dim=0)
    return all_start_logits, all_end_logits

def get_predicted_answers(tokenizer, start_logits, end_logits, dataloader):
    """Decode the highest-scoring answer span of every example.

    ``dataloader`` must yield the examples in the same order that produced
    ``start_logits`` / ``end_logits`` (i.e. it must not shuffle).

    Args:
        tokenizer: Object with a ``decode(token_ids)`` method.
        start_logits: Tensor of shape (num_examples, sequence_length).
        end_logits: Tensor of shape (num_examples, sequence_length).
        dataloader: Iterable of dict batches containing ``input_ids``.

    Returns:
        list[str]: One decoded answer per example. The answer is decoded from an
        empty span when the predicted end precedes the predicted start.
    """
    predicted_answers = []
    example_index = 0  # running row into the logits; independent of batch size
    for batch in dataloader:
        for input_ids in batch['input_ids']:
            # Flatten so that slicing runs along the token axis even when each
            # example is stored as (1, sequence_length); slicing that shape
            # directly would cut the size-1 leading axis instead.
            token_ids = input_ids.reshape(-1)
            start_idx = torch.argmax(start_logits[example_index]).item()
            end_idx = torch.argmax(end_logits[example_index]).item()
            answer_tokens = token_ids[start_idx : end_idx + 1]
            predicted_answers.append(tokenizer.decode(answer_tokens))
            example_index += 1
    return predicted_answers

def _normalize_answer(text):
    """Lower-case, drop punctuation and the articles a/an/the, collapse whitespace."""
    import re
    import string

    punctuation = set(string.punctuation)
    text = ''.join(ch for ch in str(text).lower() if ch not in punctuation)
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())


def calculate_accuracy_f1_score(predicted_answers, eval_dataset):
    """Score predicted answers against the ground truth of ``eval_dataset``.

    Both sides are normalised (case, punctuation, articles, whitespace) before
    comparison, following the convention of the SQuAD evaluation script, so a
    lower-cased decoded prediction can match a mixed-case reference. Token
    overlap is counted with multiplicity, so repeated tokens are scored
    correctly.

    Args:
        predicted_answers: List of predicted answer strings.
        eval_dataset: Sequence of dicts with an ``answer_text`` entry, aligned
            with ``predicted_answers``.

    Returns:
        tuple ``(accuracy, avg_f1_score)``: the exact-match rate over all
        references and the mean token-level F1 over the compared pairs. Both
        are 0.0 when there is nothing to compare.
    """
    from collections import Counter

    ground_truth_answers = [example['answer_text'] for example in eval_dataset]
    if not ground_truth_answers:
        return 0.0, 0.0

    normalized_pairs = [
        (_normalize_answer(pred), _normalize_answer(truth))
        for pred, truth in zip(predicted_answers, ground_truth_answers)
    ]

    correct_predictions = sum(pred == truth for pred, truth in normalized_pairs)
    accuracy = correct_predictions / len(ground_truth_answers)

    f1_scores = []
    for pred, truth in normalized_pairs:
        pred_tokens = pred.split()
        truth_tokens = truth.split()
        # Multiset intersection: each token counts as often as it occurs in both.
        num_common = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())
        precision = num_common / len(pred_tokens) if len(pred_tokens) > 0 else 0
        recall = num_common / len(truth_tokens) if len(truth_tokens) > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1)
    avg_f1_score = sum(f1_scores) / len(f1_scores) if f1_scores else 0.0

    return accuracy, avg_f1_score

# Load tokenizer
# NOTE(review): this rebinds the module-level `tokenizer`, which earlier cells set
# to a BasicTokenizer that prepare_input_features() reads as a global - re-running
# the feature cells after this one would tokenise differently.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load datasets and model
eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=batch_size)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CustomQAModel(config=model_config)
# map_location lets a checkpoint written on a GPU be restored on a CPU-only machine.
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.to(device)
# Inference mode: disables dropout so that predictions are deterministic.
model.eval()

# Predict answer spans
start_logits, end_logits = predict_answer_spans(model, eval_dataloader, device)

# Get predicted answers
predicted_answers = get_predicted_answers(tokenizer, start_logits, end_logits, eval_dataloader)

# Calculate accuracy and F1 score.
# The tokenised `eval_dataset` entries carry no 'answer_text'; the reference
# answers live in the decoded examples that load_preprocess_squad() turned into
# the evaluation split (decoded_data[8000:10000]).
eval_references = decoded_data[8000:10000]
# NOTE(review): the result name `f1_score` shadows sklearn's f1_score imported above.
accuracy, f1_score = calculate_accuracy_f1_score(predicted_answers, eval_references)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1_score}')