In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/
%cd My\ Drive/
%cd NLP
%ls

/content/drive
/content/drive/My Drive
/content/drive/.shortcut-targets-by-id/1rIZhBK8iImh_zFMtVBkm5XqK4x9VxDIr/NLP
Brainstorming.ipynb   Model.ipynb           questions.gsheet
[0m[01;34mcache_dir[0m/            [01;34moutputs[0m/              rnn.ipynb
DataCollection.ipynb  pca_question.png      [01;34mruns[0m/
data.csv              pca_story.png         [01;34mtmp[0m/
data-mc.csv           [01;34mquestion_generation[0m/  ValidationDataCollection.ipynb
data-validation.csv   questions.csv


In [3]:
!pip install transformers --quiet
!pip install sentencepiece --quiet
!pip install datasets --quiet
!pip install sklearn --quiet
!pip install pandas --quiet
!pip install transformers datasets accelerate nvidia-ml-py3 --quiet
!python -m nltk.downloader punkt
!pip install simpletransformers --quiet
!pip3 install torch torchvision torchaudio

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
!git clone https://github.com/patil-suraj/question_generation.git
%cd question_generation

fatal: destination path 'question_generation' already exists and is not an empty directory.
/content/drive/MyDrive/NLP/question_generation


In [4]:
from pynvml import *
import pandas as pd
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
#from pipelines import pipeline
from transformers import Trainer
from transformers import TrainingArguments
from google.colab import files

In [None]:
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [None]:
print_gpu_utilization()

GPU memory occupied: 0 MB.


In [19]:
data = pd.read_csv('data.csv')

data = data.rename(columns={'story': 'input_text', 'question': 'target_text'})

# using only 25% of our training data to hopefully reduce RAM usage
data = data.sample(frac = 0.001) 

eval_df = pd.read_csv('data-validation.csv')

eval_df = eval_df.rename(columns={'story': 'input_text', 'question': 'target_text'})

In [20]:
eval_df.head()
data.head()

Unnamed: 0,index,target_text,input_text,answers
62233,61033,What business activity made the Gupta Empire p...,The high points of this cultural creativity ar...,"{'text': ['Strong trade ties'], 'answer_start'..."
57136,55936,Religious diversity in North Carolina has gene...,"Currently, the rapid influx of northerners and...","{'text': ['increased'], 'answer_start': [188]}"
34069,32869,How tall is the cathedral's tower?,The Valencia Cathedral was called Iglesia Mayo...,"{'text': ['58 m'], 'answer_start': [1143]}"
10136,8936,Where has interest outside of those areas main...,"Overall, however, Whitehead's influence is ver...",{'text': ['through the work of his students an...
83817,82617,What would the Chinese use to continue Confuci...,Chinese generals and officials such as Zuo Zon...,"{'text': ['western military technology'], 'ans..."


In [7]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# the following 2 hyperparameters are task-specific
max_source_length = 512
max_target_length = 128

input_sequences = []
output_sequences = []

for item in data['input_text']:
  input_sequences.append(item)

for item in data['target_text']:
  output_sequences.append(item)

In [8]:
# encode the inputs
task_prefix = "create questions for the passage: "

encoding = tokenizer(
    [task_prefix + sequence for sequence in input_sequences],
    padding="longest",
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
)
input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

In [9]:
# encode the targets
target_encoding = tokenizer(
    output_sequences, padding="longest", max_length=max_target_length, truncation=True
)
labels = target_encoding.input_ids

# replace padding token id's of the labels by -100
labels = torch.tensor(labels)
# labels[labels == tokenizer.pad_token_id] = -100

In [10]:
# forward pass
outputs = model(input_ids=input_ids, labels=labels)

In [21]:
print(eval_df['target_text'])

0      If the rain arrived on Saturday, what day did ...
1                   What was Stewart doing at the beach?
2                    What did the bug want from Stewart?
3      What was the bug that Stewart found at the beach?
4                           What does Mommy do as a job?
                             ...                        
195              What day was story time at the library?
196                    What is the name of the pet bird?
197                 Why can't Andy play fetch with Lucy?
198       On what day of the week is Martha pet-sitting?
199                     What is Martha's favorite color?
Name: target_text, Length: 200, dtype: object


In [22]:
print(eval_df['input_text'][0])

One sunny morning, Stewart chose to go to the beach. He knew that a rainstorm was going to arrive on Saturday and wanted to enjoy some sun before it arrived. \newline\newlineStewart laid on his belly and began reading his newspaper. As he was enjoying the sun and the newspaper, he saw a bug crawl across his blanket. At first Stewart thought it was an ant, but the bug was moving too slowly to be an ant. As Stewart leaned in for a closer look, the bug cried out, "Excuse me, sir! Can you help me?" Stewart was surprised and jumped to his feet. "You can talk!" he shouted. "Yes," said the bug. "Please help me! As you might know, there is a rainstorm coming into town. Unfortunately, I do not have a place to call home. I need to find somewhere to sleep before the rain washes me away!" Stewart felt sad. "Gee," he cried. "That sounds terrible!"\newline\newlineStewart thought about the problem. "Perhaps you can stay at my house," he said. The bug jumped for joy and said, "Oh, that would be wonder

In [24]:
test_story = eval_df['input_text'][0]
test_input_ids = tokenizer(test_story, return_tensors="pt").input_ids
test_question = eval_df['target_text'][0]
test_label = tokenizer(test_question, return_tensors="pt").input_ids
test_output = model(input_ids=test_input_ids, labels=test_label)
#print(test_output)

In [25]:
test_output_1 = model.generate(test_input_ids)

In [26]:
print(tokenizer.decode(test_output_1[0], skip_special_tokens=True))

, he saw a bug crawl across his blanket and jumped to his feet


In [27]:
loss = outputs.loss

In [29]:
# testing
test_input_sequences = []

for item in eval_df['input_text']:
  test_input_sequences.append(item)

encoding = tokenizer(
    [task_prefix + sequence for sequence in test_input_sequences],
    padding="longest",
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
)
test_input_ids, test_attention_mask = encoding.input_ids, encoding.attention_mask

In [30]:
test_outputs = model.generate(test_input_ids)

tcmalloc: large alloc 1677721600 bytes == 0x39f520000 @  0x7f82a8d76b6b 0x7f82a8d96379 0x7f81ca66250e 0x7f81ca6547c2 0x7f82045c610f 0x7f82045c6a51 0x7f82045c6aa4 0x7f82045c6bb2 0x7f820518ec28 0x7f82051c8132 0x7f82048f0141 0x7f82048dcd8c 0x7f8205194147 0x7f820519419f 0x7f8205013fc7 0x7f8205014092 0x7f8205f812c3 0x7f8205f81b52 0x7f8205056216 0x7f82048d448c 0x7f82048d568a 0x7f820531ce7f 0x7f82051060a6 0x7f827fadff13 0x593784 0x548c51 0x51566f 0x549e0e 0x4bca8a 0x59c019 0x595ef6
tcmalloc: large alloc 1677721600 bytes == 0x403520000 @  0x7f82a8d76b6b 0x7f82a8d96379 0x7f81ca66250e 0x7f81ca6547c2 0x7f82045c51a7 0x7f82045c5449 0x7f82045c54a6 0x7f82045c5599 0x7f820518ec07 0x7f82051c782f 0x7f820460b7e0 0x7f820460ccbb 0x7f820460e074 0x7f82047bcdaf 0x7f82051a9b43 0x7f82051a9bf2 0x7f8204f38b39 0x7f8205edca72 0x7f8205edd1c5 0x7f8204f64f4a 0x7f827fa7f06f 0x7f827fa7f366 0x593a14 0x594cd3 0x531a7c 0x4d12af 0x5122db 0x549e0e 0x4bca8a 0x59c019 0x595ef6
tcmalloc: large alloc 1677721600 bytes == 0x46752000

In [31]:
for item in test_outputs:
  print(tokenizer.decode(item, skip_special_tokens=True))

True
True
True
True
True
True
True
True
Joey is always friends with him.
Joey is always friends with him.
Joey is always friends with him.
Joey is always friends with him.
True
True
True
True
Samantha
Samantha
Samantha
Samantha
True
True
True
True
I am going to get a hamster in a few weeks, or maybe
I am going to get a hamster in a few weeks, or maybe
I am going to get a hamster in a few weeks, or maybe
I am going to get a hamster in a few weeks, or maybe
Jenny
Jenny
Jenny
Jenny
Jen put on her favorite dress because it had cats on it. She liked it because it had
Jen put on her favorite dress because it had cats on it. She liked it because it had
Jen put on her favorite dress because it had cats on it. She liked it because it had
Jen put on her favorite dress because it had cats on it. She liked it because it had
True
True
True
True
True
True
True
True
Pester
Pester
Pester
Pester
Jimmy was a duck. He was wet, but he didn't care.
Jimmy was a duck. He was wet, but he didn't care.
Jimmy wa

In [16]:
print(loss)

tensor(2.2582, grad_fn=<NllLossBackward0>)


In [None]:
from simpletransformers.t5 import T5Model

In [None]:
model_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "max_seq_length": 128,
    "train_batch_size": 8,
    "num_train_epochs": 1,
    "save_eval_checkpoints": True,
    "save_steps": -1,
    "use_multiprocessing": False,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 15000,
    "evaluate_during_training_verbose": True,
    "fp16": False,
}

model = T5Model("t5", "t5-small", args=model_args)

In [None]:
model.train_model(data, eval_data=eval_df)

  0%|          | 0/89 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/12 [00:00<?, ?it/s]

FileNotFoundError: ignored

In [None]:
test_story = "My mommy has a really cool job! She makes costumes for movie stars. She works in the attic of our house, so I get to see her work all the time. I watch every movie I can, and try to learn all of the different costumes by heart. Mommy says if I work hard and keep up my practice, one day I'll get to to make costumes for movie stars too! That job sounds like heaven.\newline\newlineOne day I was in the attic, helping Mommy make a boot for a costume. They were covered in little beads, and mom had to sew them on. She kept dropping the needles on the ground. Then I was helping by picking them up. I was also helping by moving the lamp around so Mommy could see the boot better. \newline\newline Ouch! I said. I accidentally stuck my finger with the needle! \newline\newline My mommy looked at my finger, and gave it a kiss. Welcome to the life of a costumer!"
input_ids = tokenizer(test_story, return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

ewline ewline My mommy has a really cool job!


In [None]:
# forward pass -- EXCEEDING RAM HERE
#loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
# loss = model(input_ids=input_ids, labels=labels).loss

In [None]:
default_args = {
    "output_dir": "tmp",
    "evaluation_strategy": "steps",
    "num_train_epochs": 1,
    "log_level": "error",
    "report_to": "none",
}

In [None]:
from datasets import load_dataset
#dataset = load_dataset('csv', data_files='data.csv')
dataset = load_dataset('sagnikrayc/mctest', 'mc500', split='train') 

Downloading builder script:   0%|          | 0.00/8.49k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/5.19k [00:00<?, ?B/s]

Downloading and preparing dataset mc_test/mc500 (download: 1.13 MiB, generated: 2.74 MiB, post-processed: Unknown size, total: 3.86 MiB) to /root/.cache/huggingface/datasets/sagnikrayc___mc_test/mc500/1.0.0/90d0767a5628921c59c5a1e8d4b25dbf398d2e0fb6300d96bd4c364a3859e93f...


Downloading data:   0%|          | 0.00/1.18M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1200 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/200 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/600 [00:00<?, ? examples/s]

Dataset mc_test downloaded and prepared to /root/.cache/huggingface/datasets/sagnikrayc___mc_test/mc500/1.0.0/90d0767a5628921c59c5a1e8d4b25dbf398d2e0fb6300d96bd4c364a3859e93f. Subsequent calls will reuse this data.


In [None]:
dataset.shape

(1200, 7)

In [None]:
training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=dataset)
result = trainer.train()
print_summary(result)



IndexError: ignored

In [None]:
# use GPU efficiently
class TaskDataset(torch.utils.data.Dataset):
  def __init__(self, inputids, attnmasks, labels):
    self.inputids = inputids
    self.attnmasks = attnmasks
    self.labels = labels

  def __getitem__(self, idx):
    item = {}
    item['input_ids'] = self.inputids[idx]
    item['attention_mask'] = self.attnmasks[idx]
    item['labels'] = self.labels[idx]
    return item

  def __len__(self):
    return len(self.labels)

In [None]:
'''trainer = Trainer(
		model=model,
		args=training_args,
		train_dataset=train_loader,
		eval_dataset=test_loader,
		compute_metrics=compute_metrics
		)'''

# fine-tune model
# trainer.train()

'trainer = Trainer(\n\t\tmodel=model,\n\t\targs=training_args,\n\t\ttrain_dataset=train_loader,\n\t\teval_dataset=test_loader,\n\t\tcompute_metrics=compute_metrics\n\t\t)'

In [None]:
# hyper-parameters for our model
numepochs = 5

# large number to avoid saving models
logging_steps = 10000000
save_steps = 1000000

In [None]:
training_args = TrainingArguments(
		output_dir=' ',
		num_train_epochs=numepochs,
		warmup_steps=1000,
		weight_decay=0.01,
    per_device_train_batch_size=32,
		logging_dir=' ',
		logging_steps=logging_steps,
		save_steps=save_steps
		)

In [None]:
data_v = pd.read_csv('data-validation.csv')

In [None]:
data_v = data_v['story']
data_v = data_v.drop_duplicates()
data_v.shape

(50,)

In [None]:
story_array = data_v
story_array.size

50

In [None]:
nlp = pipeline("e2e-qg")

In [None]:
question_array = []

for story in story_array:
  question_array = np.append(question_array, nlp(story))

In [None]:
question_array.size
question_df = pd.DataFrame(question_array)
questions = question_df.to_csv('questions.csv')
files.download('questions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=0).fit(question_df)

ValueError: ignored