In [64]:
!pip install textstat
!pip install fasttext
!pip install gensim
!pip install datasets



In [None]:
import nltk
nltk.download('words')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('cmudict')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [None]:
import re
import numpy as np
import textstat
import spacy

nlp = spacy.load('en_core_web_sm')

def remove_possessive_markers(sentence):
    if not isinstance(sentence, str):
        sentence = str(sentence)
    return re.sub(r"'s|'", "", sentence)

def syllable_count(word):
    return max(1, len(re.findall(r'[aeiouy]+', word, re.IGNORECASE)))

def normalize_score(score, min_value, max_value):
    return (score - min_value) / (max_value - min_value) if max_value != min_value else 0.5

def calculate_sentence_complexity(sentence):
    # Remove possessive markers
    sentence = str(remove_possessive_markers(sentence))

    # Tokenize the sentence
    doc = nlp(sentence)
    #tokens = nltk.word_tokenize(sentence)
    tokens = [token for token in doc if not token.is_punct and not token.is_space
              and not token.like_email and not token.like_url and not token.text.endswith('.')]

    # Get unique lemmas
    unique_lemmas = set(token.lemma_ for token in doc if token.text != '.')

    # Calculate word metrics
    word_metrics = [(token.text, syllable_count(token.text), 'complex' if syllable_count(token.text) >= 3 else 'simple') for token in tokens]

    # Calculate total syllables, total words, and complex word count
    total_syllables = sum(metric[1] for metric in word_metrics)
    total_words = len(word_metrics)
    complex_word_count = sum(1 for _, syllables, _ in word_metrics if syllables >= 3)

    # Calculate metrics
    total_complex_ratio_norm = complex_word_count / total_words if total_words != 0 else 0
    max_dependency_depth = max(len(list(token.ancestors)) + 1 for token in doc)

    # Normalize and calculate metrics
    coleman_liau_index = normalize_score(textstat.coleman_liau_index(sentence), -8, 28)
    gunning_fog_index = normalize_score(textstat.gunning_fog(sentence), 0, 32)
    flesch_reading_ease_score = 1 - normalize_score(textstat.flesch_reading_ease(sentence), -73, 121)
    dale_chall_readability_score = normalize_score(textstat.dale_chall_readability_score(sentence), 0, 20)
    automated_readability_index = normalize_score(textstat.automated_readability_index(sentence), -5, 23)
    vocabulary_complexity = len(unique_lemmas) / total_words if total_words != 0 else 0
    sentence_length = total_words / 100
    dependency_depth = normalize_score(max_dependency_depth, 1, 8)

    # Calculate overall complexity score
    overall_complexity_score = (
        coleman_liau_index * 0.15 +
        gunning_fog_index * 0.15 +
        flesch_reading_ease_score * 0.15 +
        dale_chall_readability_score * 0.15 +
        automated_readability_index * 0.1 +
        (len(set(token.text for token in doc)) / total_words if total_words != 0 else 0) * 0.1 +
        total_complex_ratio_norm * 0.1 +
        sentence_length * 0.05 +
        dependency_depth * 0.05
    )

    # Return the numpy array of metric values
    metric_values = np.array([
        coleman_liau_index,
        gunning_fog_index,
        flesch_reading_ease_score,
        dale_chall_readability_score,
        automated_readability_index,
        vocabulary_complexity,
        total_complex_ratio_norm,
        sentence_length,
        dependency_depth,
        overall_complexity_score
    ])

    return metric_values

In [None]:
import gensim.downloader as api

In [None]:
glove_vectors = api.load('glove-wiki-gigaword-50')  # For 50-dimensional vectors
from datasets import load_dataset

# Load SQuAD dataset
squad_dataset = load_dataset('squad')

In [None]:
import numpy as np
import fasttext
from nltk.tokenize import word_tokenize
import torch

# Prepare input features for the first example in the training set
question = squad_dataset['train'][0]['question']
context = squad_dataset['train'][0]['context']

def get_word_embedding(word, glove_vectors):
    try:
        return glove_vectors[word]
    except KeyError:
        return np.zeros(glove_vectors.vector_size)

def prepare_input_features(question, context, glove_vectors, embedding_dim, batch_size):
    # Calculate metrics for question and context
    question_metrics = calculate_sentence_complexity(question)
    context_metrics = calculate_sentence_complexity(context)

    # Tokenize questions and contexts
    question_tokenized = word_tokenize(' '.join(question))
    context_tokenized = word_tokenize(' '.join(context))

    #print("Max Question Length:", max_question_length)
    #print("Max Context Length:", max_context_length)

    # Prepare input features for question
    question_features = []
    for start_index in range(0, len(question_tokenized), batch_size):
        end_index = min(start_index + batch_size, len(question_tokenized))
        batch_question_tokenized = question_tokenized[start_index:end_index]
        batch_question_embeddings = []
        for word in batch_question_tokenized:
            embedding = get_word_embedding(word, glove_vectors)
            combined_feature = np.concatenate([embedding, question_metrics])
            batch_question_embeddings.append(combined_feature)

        # Pad or truncate the batch_question_embeddings to the maximum length across the dataset
        padded_batch_question_embeddings = np.zeros((max_question_length, combined_feature.shape[-1]))
        for i, embedding in enumerate(batch_question_embeddings):
            padded_batch_question_embeddings[i, :embedding.shape[0]] = embedding

        #print("Shape of batch_question_embeddings before padding/truncation:", np.array(batch_question_embeddings).shape)
        #print("Shape of padded_batch_question_embeddings:", padded_batch_question_embeddings.shape)

        question_features.append(padded_batch_question_embeddings)

    # Prepare input features for context
    context_features = []
    for start_index in range(0, len(context_tokenized), batch_size):
        end_index = min(start_index + batch_size, len(context_tokenized))
        batch_context_tokenized = context_tokenized[start_index:end_index]
        batch_context_embeddings = []
        for word in batch_context_tokenized:
            embedding = get_word_embedding(word, glove_vectors)
            combined_feature = np.concatenate([embedding, context_metrics])
            batch_context_embeddings.append(combined_feature)

        # Pad or truncate the batch_context_embeddings to the maximum length across the dataset
        padded_batch_context_embeddings = np.zeros((max_context_length, combined_feature.shape[-1]))
        for i, embedding in enumerate(batch_context_embeddings):
            padded_batch_context_embeddings[i, :embedding.shape[0]] = embedding

        #print("Shape of batch_context_embeddings before padding/truncation:", np.array(batch_context_embeddings).shape)
        #print("Shape of padded_batch_context_embeddings:", padded_batch_context_embeddings.shape)

        context_features.append(padded_batch_context_embeddings)

    # Convert lists of numpy arrays to a single numpy array
    question_features_array = np.concatenate(question_features, axis=0)
    context_features_array = np.concatenate(context_features, axis=0)

    #print("Length of question_features just before it's returned:", len(question_features_array))
    #print("Length of context_features just before it's returned:", len(context_features_array))

    # Convert the numpy arrays to PyTorch tensors
    question_features_tensor = torch.tensor(question_features_array)
    context_features_tensor = torch.tensor(context_features_array)

    return {'question_features': question_features_tensor, 'context_features': context_features_tensor}

# Preprocess the dataset
batch_size = 8
embedding_dim = glove_vectors.vector_size
features = prepare_input_features(question, context, glove_vectors, embedding_dim, batch_size)

In [None]:
def find_max_lengths(dataset):
    max_question_length = 0
    max_context_length = 0
    for example in dataset:
        question_tokenized = word_tokenize(' '.join(example['question']))
        context_tokenized = word_tokenize(' '.join(example['context']))
        max_question_length = max(max_question_length, len(question_tokenized))
        max_context_length = max(max_context_length, len(context_tokenized))
    return max_question_length, max_context_length

# Find maximum lengths for the train dataset
train_dataset = squad_dataset['train']
max_question_length, max_context_length = find_max_lengths(train_dataset)

In [None]:
print(max_question_length)
print(max_context_length)

In [None]:
question_features_tensor = features['question_features']
context_features_tensor = features['context_features']

In [None]:
print("Question features tensor shape:", question_features_tensor.shape)
print("Context features tensor shape:", context_features_tensor.shape)

In [None]:
import numpy as np

question_features_np = question_features_tensor.numpy()
context_features_np = context_features_tensor.numpy()

print("First few question features:")
print(question_features_np[:5])

print("\nFirst few context features:")
print(context_features_np[:5])

In [None]:
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaForQuestionAnswering, RobertaConfig
from datasets import load_dataset
from tqdm import tqdm

class CustomEmbeddingLayer(nn.Module):
    def __init__(self, output_dim):  # Change: Remove input_dim
        super(CustomEmbeddingLayer, self).__init__()
        self.projection = nn.Linear(self.get_input_dim(), output_dim)  # Change: Dynamic calculation

    def get_input_dim(self): # New helper method
        # Assuming your features are always the last dimension of the input
        return self.current_input_shape[-1]

    def forward(self, x):
        self.current_input_shape = x.shape  # Store the input shape
        projected_embeddings = self.projection(x)
        return projected_embeddings

# Custom Question Answering Model
class CustomQAModel(nn.Module):
    def __init__(self, config):
        super(CustomQAModel, self).__init__()
        self.hidden_size = config.hidden_size
        self.qa_outputs = nn.Linear(self.hidden_size, 2)

    def forward(self, input_ids, attention_mask, token_type_ids, start_positions=None, end_positions=None, question_features=None, context_features=None):
        # Use question_features and context_features directly as input
        combined_question_input = question_features
        combined_context_input = context_features

        # Feed the combined input to the QA model
        logits = self.qa_outputs(combined_input)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            ignored_index = start_logits.size(-1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        return total_loss, start_logits, end_logits

# Load and preprocess the SQuAD dataset
def load_preprocess_squad():
  squad_dataset = load_dataset('squad')
  tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

  def preprocess_squad(examples):
      questions = [q.strip() for q in examples['question']]
      contexts = [c.strip() for c in examples['context']]
      answers = examples['answers']

      # Extract start positions and calculate end positions
      start_positions = [answer['answer_start'][0] for answer in answers]  # Extracting the first element from the list
      end_positions = [start + len(answer['text'][0]) for start, answer in zip(start_positions, answers)]  # Extracting the first element from the list

      input_features = prepare_input_features(question, context, glove_vectors, embedding_dim, batch_size)

      model_inputs = tokenizer(examples['question'], examples['context'], max_length=512, truncation=True, padding='max_length', return_tensors='pt')
      output = {
          'input_ids': model_inputs['input_ids'],
          'attention_mask': model_inputs['attention_mask'],
          'start_positions': start_positions,
          'end_positions': end_positions,
          'question_features': input_features['question_features'],
          'context_features': input_features['context_features']
      }

      # Print the length of the question_features column
      print(f"Length of question_features: {len(output['question_features'])}")

      return output

  # Select a random subset of 10000 samples
  train_dataset = squad_dataset['train'].shuffle(seed=42).select(range(10000))  # Seed for reproducibility
  eval_dataset = squad_dataset['validation'].map(preprocess_squad, batched=True, remove_columns=squad_dataset['validation'].column_names)

  return train_dataset, eval_dataset

# Training and evaluation functions
def train_epoch(model, train_dataloader, optimizer):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc='Training'):
        batch = {k: v.to(device) for k, v in batch.items()}
        loss, _, _ = model(**batch)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    return total_loss / len(train_dataloader)

def eval_model(model, eval_dataloader):
    model.eval()
    total_loss = 0
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            loss, start_logits, end_logits = model(**batch)
        total_loss += loss.item()
    return total_loss / len(eval_dataloader)

# Main training loop
def train_model(model, train_dataloader, eval_dataloader, optimizer, num_epochs=3):
    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_dataloader, optimizer)
        val_loss = eval_model(model, eval_dataloader)
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss}, Val Loss: {val_loss}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pth')

# Load datasets and model
batch_size = 8
train_dataset, eval_dataset = load_preprocess_squad()
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=batch_size)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_config = RobertaConfig.from_pretrained('roberta-base')
model = CustomQAModel.from_pretrained('roberta-base', config=model_config)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Train the model
train_model(model, train_dataloader, eval_dataloader, optimizer)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Length of question_features: 1848


ArrowInvalid: Column 4 named question_features expected length 1000 but got length 1848

In [None]:
for example in squad_dataset['train'].select(range(5)):
    print(example['answers'])
    print('---')

{'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}
---
{'text': ['a copper statue of Christ'], 'answer_start': [188]}
---
{'text': ['the Main Building'], 'answer_start': [279]}
---
{'text': ['a Marian place of prayer and reflection'], 'answer_start': [381]}
---
{'text': ['a golden statue of the Virgin Mary'], 'answer_start': [92]}
---


In [None]:
print(squad_dataset['train'].column_names)

['id', 'title', 'context', 'question', 'answers']


In [None]:
import numpy as np
import fasttext
from nltk.tokenize import word_tokenize
import torch

# Prepare input features for the first example in the training set
question = squad_dataset['train'][0]['question']
context = squad_dataset['train'][0]['context']

def get_word_embedding(word, glove_vectors):
    try:
        return glove_vectors[word]
    except KeyError:
        return np.zeros(glove_vectors.vector_size)

def get_positional_encodings(sequence_length, embedding_dim):
    pe = np.zeros((sequence_length, embedding_dim))
    position = np.arange(0, sequence_length, dtype=np.float32)
    div_term = np.exp(np.arange(0, embedding_dim, 2, dtype=np.float32) * (-np.log(10000.0) / embedding_dim))
    position = position[:, np.newaxis]  # Convert to column vector to enable broadcasting
    pe[:, 0::2] = np.sin(position * div_term)
    pe[:, 1::2] = np.cos(position * div_term)
    return pe

def prepare_input_features(question, context, glove_vectors, embedding_dim, batch_size):
    # Calculate metrics for question and context
    question_metrics = calculate_sentence_complexity(question)
    context_metrics = calculate_sentence_complexity(context)

    # Tokenize questions and contexts
    question_tokenized = word_tokenize(' '.join(question))
    context_tokenized = word_tokenize(' '.join(context))

    # Get maximum sequence lengths for questions and contexts
    max_question_length = max(len(question_tokenized), 1)  # Ensure minimum length of 1
    max_context_length = max(len(context_tokenized), 1)  # Ensure minimum length of 1

    # Prepare input features for question
    question_features = []
    for start_index in range(0, len(question_tokenized), batch_size):
        end_index = min(start_index + batch_size, len(question_tokenized))
        batch_question_tokenized = question_tokenized[start_index:end_index]
        batch_question_embeddings = []
        for word in batch_question_tokenized:
            embedding = get_word_embedding(word, glove_vectors)
            positional_encoding = get_positional_encodings(max_question_length, embedding_dim)
            combined_feature = np.concatenate([embedding, positional_encoding.flatten(), question_metrics])
            batch_question_embeddings.append(combined_feature)
        question_features.extend(batch_question_embeddings)

    # Prepare input features for context
    context_features = []
    for start_index in range(0, len(context_tokenized), batch_size):
        end_index = min(start_index + batch_size, len(context_tokenized))
        batch_context_tokenized = context_tokenized[start_index:end_index]
        batch_context_embeddings = []
        for word in batch_context_tokenized:
            embedding = get_word_embedding(word, glove_vectors)
            positional_encoding = get_positional_encodings(max_question_length, embedding_dim)
            combined_feature = np.concatenate([embedding, positional_encoding.flatten(), context_metrics])
            batch_context_embeddings.append(combined_feature)
        context_features.extend(batch_context_embeddings)

    # Convert lists to tensors
    question_features_tensor = torch.tensor(question_features)
    context_features_tensor = torch.tensor(context_features)

    # Return input features as a dictionary
    return {'question_features': question_features_tensor, 'context_features': context_features_tensor}

embedding_dim = glove_vectors.vector_size
batch_size = 100
features = prepare_input_features(question, context, glove_vectors, embedding_dim, batch_size)

In [None]:
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaForQuestionAnswering, RobertaConfig
from datasets import load_dataset
from tqdm import tqdm

# Custom Embedding Layer
class CustomEmbeddingLayer(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(CustomEmbeddingLayer, self).__init__()
        self.projection = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        projected_embeddings = self.projection(x)
        return projected_embeddings

# Custom Question Answering Model
class CustomQAModel(RobertaForQuestionAnswering):
    def __init__(self, config):
        super(CustomQAModel, self).__init__(config)
        self.roberta = RobertaForQuestionAnswering(config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)
        self.custom_embedding = CustomEmbeddingLayer(input_dim=config.hidden_size + 11, output_dim=config.hidden_size)

    def forward(self, input_ids, attention_mask, token_type_ids, start_positions=None, end_positions=None, question_features=None, context_features=None):
        outputs = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs[0]

        # Concatenate token embeddings with question and context features
        question_metrics_tensor = question_features.repeat(sequence_output.shape[1], 1).transpose(0, 1)
        context_metrics_tensor = context_features.repeat(sequence_output.shape[1], 1).transpose(0, 1)
        combined_question_input = torch.cat((sequence_output, question_metrics_tensor), dim=2)
        combined_context_input = torch.cat((sequence_output, context_metrics_tensor), dim=2)

        # Project combined inputs to the expected embedding dimension
        combined_question_input = self.custom_embedding(combined_question_input)
        combined_context_input = self.custom_embedding(combined_context_input)

        # Feed the combined input to the QA model
        logits = self.qa_outputs(combined_input)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            ignored_index = start_logits.size(-1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        return total_loss, start_logits, end_logits

# Load and preprocess the SQuAD dataset
def load_preprocess_squad():
  squad_dataset = load_dataset('squad')
  tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

  def preprocess_squad(examples):
    questions = [q.strip() for q in examples['question']]
    contexts = [c.strip() for c in examples['context']]
    input_features = prepare_input_features(questions, contexts, model)  # Call your function from the second program

    model_inputs = tokenizer(examples['question'], examples['context'], max_length=512, truncation=True, padding='max_length', return_tensors='pt')
    output = {
        'input_ids': model_inputs['input_ids'],
        'attention_mask': model_inputs['attention_mask'],
        'token_type_ids': model_inputs['token_type_ids'],
        'start_positions': examples['answer_start'],
        'end_positions': examples['answer_end'],
        'question_features': input_features['question_features'],
        'context_features': input_features['context_features']
    }
    return output

  # Select a random subset of 10000 samples
  train_dataset = squad_dataset['train'].shuffle(seed=42).select(range(10000))  # Seed for reproducibility
  eval_dataset = squad_dataset['validation'].map(preprocess_squad, batched=True, remove_columns=squad_dataset['validation'].column_names)

  return train_dataset, eval_dataset

# Training and evaluation functions
def train_epoch(model, train_dataloader, optimizer):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc='Training'):
        batch = {k: v.to(device) for k, v in batch.items()}
        loss, _, _ = model(**batch)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    return total_loss / len(train_dataloader)

def eval_model(model, eval_dataloader):
    model.eval()
    total_loss = 0
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            loss, start_logits, end_logits = model(**batch)
        total_loss += loss.item()
    return total_loss / len(eval_dataloader)

# Main training loop
def train_model(model, train_dataloader, eval_dataloader, optimizer, num_epochs=3):
    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_dataloader, optimizer)
        val_loss = eval_model(model, eval_dataloader)
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss}, Val Loss: {val_loss}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pth')

# Load datasets and model
train_dataset, eval_dataset = load_preprocess_squad()
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=8)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_config = RobertaConfig.from_pretrained('roberta-base')
model = CustomQAModel.from_pretrained('roberta-base', config=model_config)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Train the model
train_model(model, train_dataloader, eval_dataloader, optimizer)


Sentence Metrics: {'coleman_liau_index': 0.5763888888888888, 'gunning_fog_index': 0.3584375, 'flesch_reading_ease_score': 0.3616494845360825, 'dale_chall_readability_score': 0.475, 'automated_readability_index': 0.5392857142857143, 'vocabulary_complexity': 1.0, 'total_complex_word_ratio': 0.25, 'sentence_length': 0.12, 'dependency_depth': 0.42857142857142855, 'overall_complexity_score': 0.48041185720422186, 'complexity': 'Moderate', 'word_metrics': [('Consuming', 3, 'complex'), ('cannabis', 3, 'complex'), ('can', 1, 'simple'), ('lead', 1, 'simple'), ('a', 1, 'simple'), ('person', 2, 'simple'), ('to', 1, 'simple'), ('hallucinations', 5, 'complex'), ('and', 1, 'simple'), ('disturb', 2, 'simple'), ('one', 2, 'simple'), ('mind', 1, 'simple')]}
