In [0]:
!pip install transformers
!pip install pytorch_lightning

In [0]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [0]:
!mkdir train_files
!cp /content/drive/My\ Drive/Que_Gen/train/cont_que_gen_model_train_data.pickle .

In [0]:
combine_cq =[]

## SQUAD dataset 
with open('/content/drive/My Drive/Que_Gen/data_corpus/squad/train-v1.1.json','rb') as outfile:
	squad_train = json.load(outfile)
with open('/content/drive/My Drive/Que_Gen/data_corpus/squad/dev-v1.1.json','rb') as outfile:
	squad_dev  = json.load(outfile)
    
print(len(squad_train))

num=0

print("SQUAD Examples")
print(len(squad_train))
print(len(squad_dev))
for i in range(len(squad_train['data'])):
	for j in range(len(squad_train['data'][i]['paragraphs'])):
		for k in range(len(squad_train['data'][i]['paragraphs'][j]['qas'])):

			combine_cq.append(
                  { "question":squad_train['data'][i]['paragraphs'][j]['qas'][k]['question'],
                    "answer": squad_train['data'][i]['paragraphs'][j]['qas'][k]['answers'][0]['text'],
                    "answer_start":squad_train['data'][i]['paragraphs'][j]['qas'][k]['answers'][0]['answer_start'],
                    "context":squad_train['data'][i]['paragraphs'][j]['context']
                  }
				)
print("SQAUD examples collected train")         
print(len(combine_cq))

2
SQUAD Examples
2
2
SQAUD examples collected train
87599


In [0]:
#MSMarco
combine_cq=[]
with open('/content/drive/My Drive/Que_Gen/data_corpus/ms_marco/train_v2.1.json',"rb") as outfile:
 	 ms_train = json.load(outfile)

j=0
for key in list(ms_train['answers'].keys()):
	for element in ms_train['passages'][key]:
		if element['is_selected']==1:
			context = element['passage_text'].rstrip()
			answer = ms_train['answers'][key][0]
			question = ms_train['query'][key]
			if any(word in question.lower() for word in ["who","what","when","why","how","can","where","?","which","whom"]):
			    combine_cq.append({"question":question,"context":context,"answer":answer,"answer_start":'None'})
			
print("MS Marco examples collected dev")         
print(len(combine_cq))
print("Writing MS Marco examples to files") 

MS Marco examples collected dev
378401
Writing MS Marco examples to files


In [0]:
combine_cq[0]

{'answer': 'The immediate impact of the success of the manhattan project was the only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.',
 'answer_start': 'None',
 'context': 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.',
 'question': ')what was the immediate impact of the success of the manhattan project?'}

In [0]:
new =[]
for ex in combine_cq:
  sents = sent_tokenize(ex['context'])
  answer = ""
  for sent in sents:
    if ex['answer'] in sent:
      answer = sent
  if answer!="":
    ex['input'] = "answer: "+answer+" passage: "+ex['context']
    new.append(ex)

In [0]:
len(new)

127428

In [0]:
new[100]

{'answer': '$56.44 – $111.32 per hour.',
 'answer_start': 'None',
 'context': 'The median hourly wage for a general surgeon ranges from between $56.44 – $111.32 per hour. Some of the best paying states for general surgeons are New York $300,000, Houston $264,000, Miami $255,000, Los Angeles $287,000 and Seattle $280,000. general surgeon salary.',
 'input': 'answer: The median hourly wage for a general surgeon ranges from between $56.44 – $111.32 per hour. passage: The median hourly wage for a general surgeon ranges from between $56.44 – $111.32 per hour. Some of the best paying states for general surgeons are New York $300,000, Houston $264,000, Miami $255,000, Los Angeles $287,000 and Seattle $280,000. general surgeon salary.',
 'question': 'how much does a surgeon make an hour'}

In [0]:
data =[]


In [0]:
count = 0
for ex in new_data:
  if len(ex['input'].split(" "))<380:
    count = count +1
    data.append(ex)

In [0]:
len(data)

214613

In [0]:
with open("/content/drive/My Drive/new_data.pkl","wb") as f:
  pickle.dump(data,f)

In [0]:
import pickle
with open("/content/drive/My Drive/new_data.pkl","rb") as f:
  data = pickle.load(f)

In [0]:
len(data)

214613

In [0]:
len(model_train_data)

260524

In [0]:
from collections import Counter
start =[]
for ex in model_train_data:
  start.append(ex['question'].split(" ")[0].lower())
Counter(start).most_common()

In [0]:
def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [0]:
class T5FineTuner(pl.LightningModule):
  def __init__(self, hparams):
    super(T5FineTuner, self).__init__()
    self.hparams = hparams
    
    self.model = T5ForConditionalGeneration.from_pretrained("/content/drive/My Drive/t5_qg_model_with_answer")
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
    self.model.cuda()
  
  def is_logger(self):
    return self.trainer.proc_rank <= 0
  
  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None
  ):
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        lm_labels=lm_labels,
    )

  def _step(self, batch):
    lm_labels = batch["target_ids"]
    lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        lm_labels=lm_labels,
        decoder_attention_mask=batch['target_mask']
    )

    loss = outputs[0]

    return loss

  def training_step(self, batch, batch_idx):
    loss = self._step(batch)

    tensorboard_logs = {"train_loss": loss}
    return {"loss": loss, "log": tensorboard_logs}
  
  def training_epoch_end(self, outputs):
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    tensorboard_logs = {"avg_train_loss": avg_train_loss}
    return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def validation_step(self, batch, batch_idx):
    loss = self._step(batch)
    return {"val_loss": loss}
  
  def validation_epoch_end(self, outputs):
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    "Prepare optimizer and schedule (linear warmup and decay)"

    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    self.opt = optimizer
    return [optimizer]
  
  def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
    if self.trainer.use_tpu:
      xm.optimizer_step(optimizer)
    else:
      optimizer.step()
    optimizer.zero_grad()
    self.lr_scheduler.step()
  
  def get_tqdm_dict(self):
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    #train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
    dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
    t_total = (
        (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    #val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

In [0]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  def on_validation_end(self, trainer, pl_module):
    logger.info("***** Validation results *****")
    if pl_module.is_logger():
      metrics = trainer.callback_metrics
      # Log results
      for key in sorted(metrics):
        if key not in ["log", "progress_bar"]:
          logger.info("{} = {}\n".format(key, str(metrics[key])))

  def on_test_end(self, trainer, pl_module):
    logger.info("***** Test results *****")

    if pl_module.is_logger():
      metrics = trainer.callback_metrics

      # Log and save results to file
      output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
      with open(output_test_results_file, "w") as writer:
        for key in sorted(metrics):
          if key not in ["log", "progress_bar"]:
            logger.info("{} = {}\n".format(key, str(metrics[key])))
            writer.write("{} = {}\n".format(key, str(metrics[key])))

In [0]:
args_dict = dict(
    data_dir="", # path for data files
    output_dir="", # path to save the checkpoints
    model_name_or_path='t5-base',
    tokenizer_name_or_path='t5-base',
    max_seq_length=512,
    learning_rate=1e-5,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=4,
    eval_batch_size=4,
    num_train_epochs=2,
    gradient_accumulation_steps=32,
    n_gpu=1,
    early_stop_callback=False,
    fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)

In [0]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model from cache at /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f


In [0]:
data[0]

In [0]:
from torch.utils.data import Dataset, DataLoader


class Dataset_QG(Dataset):
  def __init__(self,max_len=512,type="train"):
      self.max_len = max_len
      #self.tokenizer = tokenizer
      self.inputs = []
      self.targets = []
      if type=="val":
        self.build(data[:1000])
      else:
        self.build(data)

  def build(self,model_train_data):
    for i,ex in enumerate(model_train_data):
      
      context = ex['input']+" </s>"
      question = ex['question'] + " </s>"
      
      self.inputs.append(context)
      self.targets.append(question)   
  def __len__(self):
    return len(self.inputs)
  
  def __getitem__(self, index):
    src= self.inputs[index].replace("\n"," ").replace("\t"," ")
    tgt = self.targets[index]
    #print(src)
    tokenized_inputs = tokenizer.encode_plus(
      src, max_length=self.max_len, pad_to_max_length=True, return_tensors="pt"
      )
    tokenized_targets = tokenizer.encode_plus(
            tgt, max_length=25, pad_to_max_length=True, return_tensors="pt"
          )

    source_ids = tokenized_inputs["input_ids"].squeeze()
    target_ids = tokenized_targets["input_ids"].squeeze()

    src_mask    = tokenized_inputs["attention_mask"].squeeze()  # might need to squeeze
    target_mask = tokenized_targets["attention_mask"].squeeze()  # might need to squeeze

    return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}

In [0]:
train_dataset = Dataset_QG()

In [0]:
val_dataset = Dataset_QG(type="val")

In [0]:
args_dict.update({'data_dir': 'aclImdb', 'output_dir': '/content/drive/My Drive/QG_T5', 'num_train_epochs':1})
args = argparse.Namespace(**args_dict)

checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=5
)

train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    early_stop_callback=False,
    precision= 16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback()],
)



In [0]:
model = T5FineTuner(args)

INFO:transformers.configuration_utils:loading configuration file /content/drive/My Drive/t5_qg_model_with_answer/config.json
INFO:transformers.configuration_utils:Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": 

In [0]:
trainer = pl.Trainer(**train_params)

INFO:lightning:GPU available: True, used: True
INFO:lightning:CUDA_VISIBLE_DEVICES: [0]


In [0]:
trainer.fit(model)

INFO:lightning:
    | Name                                                                  | Type                       | Params
-----------------------------------------------------------------------------------------------------------------
0   | model                                                                 | T5ForConditionalGeneration | 222 M 
1   | model.shared                                                          | Embedding                  | 24 M  
2   | model.encoder                                                         | T5Stack                    | 109 M 
3   | model.encoder.block                                                   | ModuleList                 | 84 M  
4   | model.encoder.block.0                                                 | T5Block                    | 7 M   
5   | model.encoder.block.0.layer                                           | ModuleList                 | 7 M   
6   | model.encoder.block.0.layer.0                                     

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_val_loss = tensor(1.5869, device='cuda:0')

INFO:__main__:loss = tensor(1.5163, device='cuda:0')

INFO:__main__:train_loss = tensor(1.5163, device='cuda:0')

INFO:__main__:val_loss = tensor(1.5869, device='cuda:0')






1

In [0]:
model.model.save_pretrained('/content/drive/My Drive/t5_qg_model_with_answer2')

INFO:transformers.configuration_utils:Configuration saved in /content/drive/My Drive/t5_qg_model_with_answer2/config.json
INFO:transformers.modeling_utils:Model weights saved in /content/drive/My Drive/t5_qg_model_with_answer2/pytorch_model.bin


In [0]:
,input = "type: when"+ " context: "+ 'Stairs, stair stringers, and stair guards shall meet the requirements shown in Figure 27 through Figure 34 and Table 6 except where amended by the local jurisdiction. All stringers shall be a minimum of 2x12. Stair stringers shall not span more than the dimensions shown in Figure 28. If the stringer span exceeds these dimensions, then a 4x4 post may be provided to support the stringer and shorten its span length. The 4x4 post shall be notched and bolted to the stringer with (2) 1⁄2" diameter through- bolts with washers per Figure 8. The post shall be centered on a 12" diameter or 10" square, 6" thick footing. The footing shall be constructed as shown in Figure 34 and attached to the post as shown in Figure 12. An intermediate landing may also be provided to shorten </s>'

In [0]:
inp = tokenizer.batch_encode_plus([input],max_length=512, pad_to_max_length=True, return_tensors="pt").to('cuda')

In [0]:
ids = inp['input_ids']

In [0]:
typ = inp['attention_mask']

In [0]:
op = model.model.generate(input_ids=ids,attention_mask=typ,max_length=40,topk=1)

In [0]:
[tokenizer.decode(id) for id in op]

['when to attach stair stringers to a stair']

In [0]:
!pip install PyMuPDF

Collecting PyMuPDF
[?25l  Downloading https://files.pythonhosted.org/packages/9c/5c/a43e9bd5c182b6125c301504f711e603cc8beb57ccbaad32caea82135e6d/PyMuPDF-1.16.18-cp36-cp36m-manylinux2010_x86_64.whl (5.7MB)
[K     |████████████████████████████████| 5.7MB 2.4MB/s 
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.16.18


In [0]:
import fitz
max_chunk_words =100
chunk_stride =50
def chunk_text(file_text):
	"""
	This function takes text(either paragraph or a complete page text )
	and chunks them into multiple smaller pieces

	"""
	page_text = file_text.replace("  "," ").replace('\n',' ')
	page_tokens = page_text.split(' ')
	num_page_words = len(page_text.split(" "))
	page_chunks = []
	start_offset = 0

	while start_offset < num_page_words:
	    remaining_length = num_page_words -start_offset
	    if remaining_length>max_chunk_words:
	        length = max_chunk_words
	    else:
	        length = remaining_length
	    page_chunks.append(" ".join(page_tokens[start_offset:(start_offset+length)]))

	    if start_offset + length == num_page_words:
	        break
	    start_offset = start_offset + min(remaining_length,chunk_stride)

	return page_chunks

In [0]:
doc = fitz.open("/content/biological_classification.pdf")
toc = doc.getToC()
totalpages = doc.pageCount
skip = [0]
final_paralist=[]
for page_no in range(totalpages):
    print("---processing page {} of {}----".format(page_no,totalpages))
    if page_no in skip:
        continue
    page = doc.loadPage(page_no)
    html = page.getText("html")
    blocks = page.getText("blocks")
    para_list = [b[4] for b in blocks]
    
    for para in para_list:
      if len(para.split(" "))<10:
        continue
      if len(para.split(" ")) >100:
        chunks =chunk_text(para)
        final_paralist.extend(chunks)
      else:
        final_paralist.append(para)

---processing page 0 of 13----
---processing page 1 of 13----
---processing page 2 of 13----
---processing page 3 of 13----
---processing page 4 of 13----
---processing page 5 of 13----
---processing page 6 of 13----
---processing page 7 of 13----
---processing page 8 of 13----
---processing page 9 of 13----
---processing page 10 of 13----
---processing page 11 of 13----
---processing page 12 of 13----


In [0]:
for para in final_paralist:
  input = "type: when"+ " context: "+ para + "</s>"
  inp = tokenizer.batch_encode_plus([input],max_length=512, pad_to_max_length=True, return_tensors="pt").to('cuda')
  ids = inp['input_ids']
  typ = inp['attention_mask']
  op = model.model.generate(input_ids=ids,attention_mask=typ,max_length=40,topk=1)
  print([tokenizer.decode(id) for id in op])

['when was the monera kingdom defined']
['when did monera get its name']
['when did the classification system start']
['when did the algae come into being']
['when does the heterotrophic phase start']
['when were fungi placed in the animal kingdom']
['when did chlamydomonas come together']
['when was the classification system changed']
['when did the kingdom monera begin']
['when does a bacterial bacterium synthesise itself']
['when do methanogens live in the gut']
['when are eubacteria found']
['when does a bacterium bloom']
['when does ammonia use nitrates']
['when does a bacterium get a spore']
['when do protista form']
['when do diatoms form']
['when did diatoms leave a soap box']
['when do red tidal algae appear']
['when do euglenoids grow']
['when do spores form']
['when do protozoans live']
['when are protozoans found']
['when did amoeba eat their prey?']
['when do parasitic protozoans cause sleep sickness']
['when is a cilia a ciliated organism']
['when does a spore form in the

In [0]:
for para in final_paralist:
  input = "type: what"+ " context: "+ para + "</s>"
  inp = tokenizer.batch_encode_plus([input],max_length=512, pad_to_max_length=True, return_tensors="pt").to('cuda')
  ids = inp['input_ids']
  typ = inp['attention_mask']
  op = model.model.generate(input_ids=ids,attention_mask=typ,max_length=40,topk=1)
  print([tokenizer.decode(id) for id in op])

['what is the classification of the five kingdoms']
['what is the eukaryotic kingdom monera']
['what were the five kingdoms']
['what type of organisms are in the cyanobacteria']
['what is heterotrophic']
['what kingdoms are fungi']
['what is the difference between the three kingdoms']
['what is the classification of plants and animals']
['what is the shape of bacilli']
['what is the structure of bacteria']
['what are the biogas produced by a bacterium']
['what are the cyanobacteria']
['what is the difference between a colonial and anabaena']
['what are the most important nutrients in nature']
['what are citrus canker diseases']
['what is the protistan kingdom']
['what is diatoms']
['what is the name of the diatom']
['what is the color of a marine organism']
['what are the pigments of euglenoids']
['what is a slime mould']
['what are protozoans']
['what is a protozoa']
['what is the name of the marine species']
['what is the difference between parasitic and free-living protozoans']
['wh

In [0]:
for para in final_paralist:
  input = "type: how"+ " context: "+ para + "</s>"
  inp = tokenizer.batch_encode_plus([input],max_length=512, pad_to_max_length=True, return_tensors="pt").to('cuda')
  ids = inp['input_ids']
  typ = inp['attention_mask']
  op = model.model.generate(input_ids=ids,attention_mask=typ,max_length=40,topk=1)
  print([tokenizer.decode(id) for id in op])

['how many kingdoms are in a kingdom']
['how many kingdoms are there in the monera system']
['how did the kingdom of algae classification work']
['how did the classification of fungi differ from the plants']
['how many years is heterotrophic']
['how did fungi get to kingdom fungi']
['how was chlamydomonas placed']
['how is classification of the kingdoms']
['how many bacteria are in the kingdom of monera']
['how do bacteria eat']
['how do bacteria survive in harsh environments']
['how many different bacteria are there?']
['how do heterotrophic bacteria work']
['how do nitrites and ammonia work']
['how do bacterial diseases occur']
['how do protists form']
['how do diatoms float in water']
['how do diatoms get to the soil']
['how do they look like in the ocean']
['how do euglenoids behave']
['how do slime moulds grow']
['how many types of protozoans are there']
['how do protozoans live']
['how do marine forms form']
['how do parasites cause sleeping sickness']
['how do cilia move']
['how