In [None]:
# Environment setup (shell escapes): install the third-party packages imported by the cells below.
!pip install -U transformers
!pip install sentencepiece
# Download the NLTK "punkt" data package (see the [nltk_data] log output below this cell).
!python -m nltk.downloader punkt
!pip install langdetect

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Sample passage in English. The trailing backslashes continue the string literal
# across source lines without inserting newlines into the text.
text1 = "Gravity (from Latin gravitas, meaning 'weight'), or gravitation, is a natural phenomenon by which all \
things with mass or energy—including planets, stars, galaxies, and even light—are brought toward (or gravitate toward) \
one another. On Earth, gravity gives weight to physical objects, and the Moon's gravity causes the ocean tides. \
The gravitational attraction of the original gaseous matter present in the Universe caused it to begin coalescing \
and forming stars and caused the stars to group together into galaxies, so gravity is responsible for many of \
the large-scale structures in the Universe. Gravity has an infinite range, although its effects become increasingly \
weaker as objects get further away"

# Sample passage in French. The leading spaces on the continuation lines are part of the string.
text2 = "Je m’appelle Jessica. Je suis une fille, je suis française et j’ai treize ans.\
 Je vais à l’école à Nice, mais j’habite à Cagnes-Sur-Mer. J’ai deux frères. Le premier s’appelle Thomas, \
 il a quatorze ans. Le second s’appelle Yann et il a neuf ans. Mon papa est italien et il est fleuriste. \
 Ma mère est allemande et est avocate. Mes frères et moi parlons français, italien et allemand à la maison.\
  Nous avons une grande maison avec un chien, un poisson et deux chats."

# Sample passage in Hindi (same text as the Hindi training paragraph used further down).
text3 = "सील से भरी हुई यह छोटी सी कोठरी, जिसकी दीवारों से गरीबी असहाय के असमर्थ साथी की तरह चिपटी हुई है।\
छत इतनी टूटी-फूटी जैसे गिरने ही वाली हो। किसी जमाने में इसमें एक लैंप लटक रहा था, जिसका कंकाल मात्र आज भी टंगा हुआ है।"

# Sample passage in Korean. NOTE(review): the text stops mid-sentence — confirm whether it was truncated on paste.
text4 = "모든 사람은 교육을 받을 권리를 가진다 . 교육은 최소한 초등 및 기초단계에서는 무상이어야 한다. 초등교육은 의무적이어야 한다. 기술 및 직업교육은 일반적으로 접근이 가능하여야 하며, 고등교육은 모든 사람에게 실력에 근거하여 동등하게 접근 가능하여야 한다.교육은 인격의 완전한 발전과 인권과 기본적"

## Single task QA

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import re

# Sample dataset in Hindi: a single paragraph whose sentences end with the danda ("।").
# The backslash joins the two source lines, so there is no space after the first danda.
paragraphs = [
  "सील से भरी हुई यह छोटी सी कोठरी, जिसकी दीवारों से गरीबी असहाय के असमर्थ साथी की तरह चिपटी हुई है।\
छत इतनी टूटी-फूटी जैसे गिरने ही वाली हो। किसी जमाने में इसमें एक लैंप लटक रहा था, जिसका कंकाल मात्र आज भी टंगा हुआ है।"
]

# Function to clean a Hindi sentence
def clean_hindi_sentence(sentence):
    """Strip punctuation, ASCII letters and digits from a sentence.

    Args:
        sentence: Raw sentence text.

    Returns:
        The sentence with only word characters, combining marks and whitespace
        left, minus ``a-z``/``A-Z`` and all digits, with surrounding whitespace
        removed.
    """
    import unicodedata  # local import so the cell's import block stays untouched

    kept_chars = []
    for ch in sentence:
        # str.isalnum() / "_" / str.isspace() are exactly what the regex classes
        # \w and \s accept. \w does NOT accept combining marks (Unicode category
        # M*), so the previous r'[^\w\s]' filter deleted Devanagari vowel signs,
        # anusvara and virama and corrupted every Hindi word. Keep them explicitly.
        if (
            ch.isalnum()
            or ch == '_'
            or ch.isspace()
            or unicodedata.category(ch).startswith('M')
        ):
            kept_chars.append(ch)
    sentence = ''.join(kept_chars)  # Punctuation (including the danda) is gone
    sentence = re.sub(r'[a-zA-Z]', '', sentence)  # Remove English characters
    sentence = re.sub(r'\d+', '', sentence)  # Remove digits
    return sentence.strip()  # Remove leading/trailing whitespaces

# Function to convert a paragraph to a list of sentences
def paragraph_to_sentences(paragraph):
    """Split a paragraph into sentences at every danda ("।").

    A danda ends a sentence whether or not whitespace follows it (the sample
    paragraph joins two sentences with no space in between). The danda itself
    is dropped and empty pieces are discarded.

    Args:
        paragraph: Paragraph text.

    Returns:
        List of non-empty sentence strings without their terminating danda.
    """
    pieces = re.split(r'।\s*', paragraph)
    return [piece.strip() for piece in pieces if piece.strip()]

# Flatten every paragraph into one list of sentences
sentences = [
    sentence
    for paragraph in paragraphs
    for sentence in paragraph_to_sentences(paragraph)
]

# Normalise each sentence before tokenising
cleaned_sentences = list(map(clean_hindi_sentence, sentences))

# Build the vocabulary: ids 0 and 1 are reserved, every new word gets the next free id
word2idx = {"<PAD>": 0, "<UNK>": 1}
for cleaned in cleaned_sentences:
    for token in cleaned.split():
        word2idx.setdefault(token, len(word2idx))
# Reverse lookup (id -> word), same insertion order as word2idx
idx2word = {index: token for token, index in word2idx.items()}

# Encode each sentence as a list of word ids (unknown words map to <UNK>)
sequences = [
    [word2idx.get(token, word2idx["<UNK>"]) for token in cleaned.split()]
    for cleaned in cleaned_sentences
]

# Right-pad every sequence to the length of the longest one
max_len = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding="post", value=word2idx["<PAD>"])

# Inputs drop the last token, targets drop the first: each step predicts the following token
X = padded_sequences[:, :-1]
y = to_categorical(padded_sequences[:, 1:], num_classes=len(word2idx))

# Embedding -> LSTM (one output per timestep) -> softmax over the vocabulary
model = Sequential([
    Embedding(input_dim=len(word2idx), output_dim=100, input_length=max_len-1),
    LSTM(256, return_sequences=True),
    Dense(len(word2idx), activation="softmax"),
])
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X, y, batch_size=32, epochs=50)
# Generate text: repeatedly predict the word that follows the current context
seed_text = "एक लैंप जिसका कंकाल मात्र आज भी टंगा हुआ है।"
next_words = 10
context_len = max_len - 1  # the model was built with input_length=max_len-1

# Encode the seed exactly like the training sentences: clean first, then map to ids
seed_sequence = [
    word2idx.get(word, word2idx["<UNK>"])
    for word in clean_hindi_sentence(seed_text).split()
]
if not seed_sequence:
    # Nothing survived cleaning; start from a single unknown token
    seed_sequence = [word2idx["<UNK>"]]

for _ in range(next_words):
    # Keep only the most recent tokens that fit the model input, padded on the right as in training
    window = seed_sequence[-context_len:]
    seed_padded = pad_sequences([window], maxlen=context_len, padding="post", value=word2idx["<PAD>"])

    # The model returns one vocabulary distribution per timestep (return_sequences=True);
    # the next word is read at the position of the last real token, not from the flattened output.
    predicted = model.predict(seed_padded)[0]
    predicted_idx = int(np.argmax(predicted[len(window) - 1]))

    # Convert the predicted index to a word
    predicted_word = idx2word.get(predicted_idx, "<UNK>")

    # Extend the context and the text once per predicted word
    seed_sequence.append(predicted_idx)
    seed_text += " " + predicted_word

# Show the generated continuation
print(seed_text)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Sample dataset: three French paragraphs, each consisting of a single sentence
paragraphs = [
  "Je suis allé à la plage avec mes amis et j'ai nagé dans la mer.",
  "Le chat est monté sur le toit et a attrapé une souris.",
  "La cuisine française est renommée pour sa gastronomie raffinée."
]

# Function to convert a paragraph to a list of sentences
def paragraph_to_sentences(paragraph):
    """Split a paragraph into sentences on ". ".

    The final sentence used to keep its trailing period, so the same word was
    tokenised both with and without a "." depending on its position. Every
    sentence is now returned without the terminating period, and empty pieces
    are dropped.

    Args:
        paragraph: Paragraph text.

    Returns:
        List of non-empty sentence strings without a trailing period.
    """
    sentences = []
    for piece in paragraph.split('. '):
        piece = piece.strip().rstrip('.').strip()
        if piece:
            sentences.append(piece)
    return sentences

# Convert the paragraphs to a list of sentences
sentences = []
for paragraph in paragraphs:
    sentences.extend(paragraph_to_sentences(paragraph))

# Tokenize the sentences: ids 0 and 1 are reserved, every new word gets the next free id
word2idx = {"<PAD>": 0, "<UNK>": 1}
idx2word = {0: "<PAD>", 1: "<UNK>"}
for sentence in sentences:
    for word in sentence.split():
        if word not in word2idx:
            word2idx[word] = len(word2idx)
            idx2word[len(idx2word)] = word

# Convert the sentences to sequences of word indices (unknown words map to <UNK>)
sequences = [
    [word2idx.get(word, word2idx["<UNK>"]) for word in sentence.split()]
    for sentence in sentences
]

# Pad the sequences to a fixed length
max_len = max(len(sequence) for sequence in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding="post", value=word2idx["<PAD>"])

# Inputs: every token except the last one
X = padded_sequences[:, :-1]

# Targets: one-hot encoding of the token that FOLLOWS each input position.
# Only real tokens are marked; padded positions stay all-zero, so they add
# nothing to the categorical cross-entropy sum.
y_onehot = np.zeros((len(sequences), max_len, len(word2idx)))
for i, sequence in enumerate(sequences):
    for j, word_idx in enumerate(sequence):
        y_onehot[i, j, word_idx] = 1
# Shift by one step. The previous slice [:, :-1, :] made the targets identical
# to the inputs, so the network only learned to copy its input.
y = y_onehot[:, 1:, :]

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(word2idx), output_dim=50, input_length=max_len-1))
model.add(LSTM(50, return_sequences=True))
model.add(Dense(len(word2idx), activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
# Train the LSTM model
model.fit(X, y, epochs=100)

# Function to generate a question from a sentence
def generate_question(sentence):
    """Build a template question around the word the model predicts after ``sentence``.

    Uses the notebook-level ``word2idx``, ``idx2word``, ``max_len`` and ``model``.

    Args:
        sentence: Input sentence; words missing from the vocabulary are ignored.

    Returns:
        ``What is <word> in the sentence "<sentence>"?`` or an empty string when
        no word of the sentence is known or the predicted id has no word.
    """
    context_len = max_len - 1  # input length the model was built with

    # Convert the words to word indices. The vocabulary is case-sensitive and may
    # hold tokens with or without a trailing period, so try the word as written
    # first and only then its normalised forms.
    sequence = []
    for word in sentence.split():
        for candidate in (word, word.lower(), word.rstrip('.!?'), word.lower().rstrip('.!?')):
            if candidate in word2idx:
                sequence.append(word2idx[candidate])
                break
    if not sequence:
        return ""

    # Keep the most recent tokens and pad on the right, matching the training data
    sequence = sequence[-context_len:]
    padded = pad_sequences([sequence], maxlen=context_len, padding="post", value=word2idx["<PAD>"])

    # The model emits one vocabulary distribution per timestep; read the one at the
    # last real token. A flat argmax over the whole output is not a vocabulary id.
    prediction = model.predict(padded)
    predicted_word_idx = int(np.argmax(prediction[0, len(sequence) - 1]))
    predicted_word = idx2word.get(predicted_word_idx, "")

    # Generate the question
    if predicted_word:
        question = f"What is {predicted_word} in the sentence \"{sentence}\"?"
    else:
        question = ""
    return question

# Test the function: run generate_question on one French sentence
# (not one of the training paragraphs) and print the resulting question.
sentence = "le ciel est bleu"

question = generate_question(sentence)
print(question)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import re

# Sample dataset: a single Korean paragraph (this cell was copied from the Hindi one).
# The backslash joins the two source lines, so there is no space after the first period.
paragraphs = [
  "이 봉인된 작은 감방의 벽에는 무력한 자의 무력한 동반자처럼 가난이 달라붙어 있습니다.\
지붕이 무너질 정도로 무너져 내렸습니다. 옛날 옛적에 등불이 걸려 있었는데 그 해골이 오늘날에도 여전히 매달려 있습니다."
]

# Function to clean a sentence (named for Hindi; this cell applies it to Korean text)
def clean_hindi_sentence(sentence):
    """Strip punctuation, ASCII letters and digits from a sentence.

    Args:
        sentence: Raw sentence text.

    Returns:
        The sentence with only word characters, combining marks and whitespace
        left, minus ``a-z``/``A-Z`` and all digits, with surrounding whitespace
        removed.
    """
    import unicodedata  # local import so the cell's import block stays untouched

    kept_chars = []
    for ch in sentence:
        # str.isalnum() / "_" / str.isspace() are exactly what the regex classes
        # \w and \s accept. \w does NOT accept combining marks (Unicode category
        # M*), so the previous r'[^\w\s]' filter deleted e.g. Devanagari vowel
        # signs and corrupted the words. Keep combining marks explicitly.
        if (
            ch.isalnum()
            or ch == '_'
            or ch.isspace()
            or unicodedata.category(ch).startswith('M')
        ):
            kept_chars.append(ch)
    sentence = ''.join(kept_chars)  # Punctuation is gone
    sentence = re.sub(r'[a-zA-Z]', '', sentence)  # Remove English characters
    sentence = re.sub(r'\d+', '', sentence)  # Remove digits
    return sentence.strip()  # Remove leading/trailing whitespaces

# Function to convert a paragraph to a list of sentences
def paragraph_to_sentences(paragraph):
    """Split a paragraph into sentences at every period or danda.

    The paragraph in this cell is Korean and ends its sentences with ".", so
    the previous danda-only delimiter ("। ") never matched and the whole
    paragraph became one sentence. A terminator counts whether or not
    whitespace follows it; terminators are dropped and empty pieces discarded.

    Args:
        paragraph: Paragraph text.

    Returns:
        List of non-empty sentence strings without their terminator.
    """
    pieces = re.split(r'[।.]\s*', paragraph)
    return [piece.strip() for piece in pieces if piece.strip()]

# Flatten every paragraph into one list of sentences
sentences = [
    sentence
    for paragraph in paragraphs
    for sentence in paragraph_to_sentences(paragraph)
]

# Normalise each sentence before tokenising
cleaned_sentences = list(map(clean_hindi_sentence, sentences))

# Build the vocabulary: ids 0 and 1 are reserved, every new word gets the next free id
word2idx = {"<PAD>": 0, "<UNK>": 1}
for cleaned in cleaned_sentences:
    for token in cleaned.split():
        word2idx.setdefault(token, len(word2idx))
# Reverse lookup (id -> word), same insertion order as word2idx
idx2word = {index: token for token, index in word2idx.items()}

# Encode each sentence as a list of word ids (unknown words map to <UNK>)
sequences = [
    [word2idx.get(token, word2idx["<UNK>"]) for token in cleaned.split()]
    for cleaned in cleaned_sentences
]

# Right-pad every sequence to the length of the longest one
max_len = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding="post", value=word2idx["<PAD>"])

# Inputs drop the last token, targets drop the first: each step predicts the following token
X = padded_sequences[:, :-1]
y = to_categorical(padded_sequences[:, 1:], num_classes=len(word2idx))

# Embedding -> LSTM (one output per timestep) -> softmax over the vocabulary
model = Sequential([
    Embedding(input_dim=len(word2idx), output_dim=100, input_length=max_len-1),
    LSTM(256, return_sequences=True),
    Dense(len(word2idx), activation="softmax"),
])
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X, y, batch_size=32, epochs=50)
# Generate text: repeatedly predict the word that follows the current context
seed_text = "뼈대만 매달려 있는 램프."
next_words = 10
context_len = max_len - 1  # the model was built with input_length=max_len-1

# Encode the seed exactly like the training sentences: clean first, then map to ids
seed_sequence = [
    word2idx.get(word, word2idx["<UNK>"])
    for word in clean_hindi_sentence(seed_text).split()
]
if not seed_sequence:
    # Nothing survived cleaning; start from a single unknown token
    seed_sequence = [word2idx["<UNK>"]]

for _ in range(next_words):
    # Keep only the most recent tokens that fit the model input, padded on the right as in training
    window = seed_sequence[-context_len:]
    seed_padded = pad_sequences([window], maxlen=context_len, padding="post", value=word2idx["<PAD>"])

    # The model returns one vocabulary distribution per timestep (return_sequences=True);
    # the next word is read at the position of the last real token, not from the flattened output.
    predicted = model.predict(seed_padded)[0]
    predicted_idx = int(np.argmax(predicted[len(window) - 1]))

    # Convert the predicted index to a word
    predicted_word = idx2word.get(predicted_idx, "<UNK>")

    # Extend the context and the text once per predicted word
    seed_sequence.append(predicted_idx)
    seed_text += " " + predicted_word

# Show the generated continuation
print(seed_text)


In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Sample dataset: three Korean paragraphs, each consisting of a single sentence
paragraphs = [
  "내 가장 친한 친구는 자연이 매우 사랑스럽고 부모님, 담임 선생님, 이웃 등 모두에게 사랑 받고 있습니다.",
  "그는 시간을 잘 지키고 제 시간에 학교에 옵니다.",
  "그는 항상 자신의 집안일을 적시에 정기적으로 완료하고 저를 도와줍니다."
]

# Function to convert a paragraph to a list of sentences
def paragraph_to_sentences(paragraph):
    """Split a paragraph into sentences on ". ".

    The final sentence used to keep its trailing period, so the same word was
    tokenised both with and without a "." depending on its position. Every
    sentence is now returned without the terminating period, and empty pieces
    are dropped.

    Args:
        paragraph: Paragraph text.

    Returns:
        List of non-empty sentence strings without a trailing period.
    """
    sentences = []
    for piece in paragraph.split('. '):
        piece = piece.strip().rstrip('.').strip()
        if piece:
            sentences.append(piece)
    return sentences

# Convert the paragraphs to a list of sentences
sentences = []
for paragraph in paragraphs:
    sentences.extend(paragraph_to_sentences(paragraph))

# Tokenize the sentences: ids 0 and 1 are reserved, every new word gets the next free id
word2idx = {"<PAD>": 0, "<UNK>": 1}
idx2word = {0: "<PAD>", 1: "<UNK>"}
for sentence in sentences:
    for word in sentence.split():
        if word not in word2idx:
            word2idx[word] = len(word2idx)
            idx2word[len(idx2word)] = word

# Convert the sentences to sequences of word indices (unknown words map to <UNK>)
sequences = [
    [word2idx.get(word, word2idx["<UNK>"]) for word in sentence.split()]
    for sentence in sentences
]

# Pad the sequences to a fixed length
max_len = max(len(sequence) for sequence in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding="post", value=word2idx["<PAD>"])

# Inputs: every token except the last one
X = padded_sequences[:, :-1]

# Targets: one-hot encoding of the token that FOLLOWS each input position.
# Only real tokens are marked; padded positions stay all-zero, so they add
# nothing to the categorical cross-entropy sum.
y_onehot = np.zeros((len(sequences), max_len, len(word2idx)))
for i, sequence in enumerate(sequences):
    for j, word_idx in enumerate(sequence):
        y_onehot[i, j, word_idx] = 1
# Shift by one step. The previous slice [:, :-1, :] made the targets identical
# to the inputs, so the network only learned to copy its input.
y = y_onehot[:, 1:, :]

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(word2idx), output_dim=50, input_length=max_len-1))
model.add(LSTM(50, return_sequences=True))
model.add(Dense(len(word2idx), activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
# Train the LSTM model
model.fit(X, y, epochs=100)

# Function to generate a question from a sentence
def generate_question(sentence):
    """Build a template question around the word the model predicts after ``sentence``.

    Uses the notebook-level ``word2idx``, ``idx2word``, ``max_len`` and ``model``.

    Args:
        sentence: Input sentence; words missing from the vocabulary are ignored.

    Returns:
        ``What is <word> in the sentence "<sentence>"?`` or an empty string when
        no word of the sentence is known or the predicted id has no word.
    """
    context_len = max_len - 1  # input length the model was built with

    # Convert the words to word indices. The vocabulary is case-sensitive and may
    # hold tokens with or without a trailing period, so try the word as written
    # first and only then its normalised forms.
    sequence = []
    for word in sentence.split():
        for candidate in (word, word.lower(), word.rstrip('.!?'), word.lower().rstrip('.!?')):
            if candidate in word2idx:
                sequence.append(word2idx[candidate])
                break
    if not sequence:
        return ""

    # Keep the most recent tokens and pad on the right, matching the training data
    sequence = sequence[-context_len:]
    padded = pad_sequences([sequence], maxlen=context_len, padding="post", value=word2idx["<PAD>"])

    # The model emits one vocabulary distribution per timestep; read the one at the
    # last real token. A flat argmax over the whole output is not a vocabulary id.
    prediction = model.predict(padded)
    predicted_word_idx = int(np.argmax(prediction[0, len(sequence) - 1]))
    predicted_word = idx2word.get(predicted_word_idx, "")

    # Generate the question
    if predicted_word:
        question = f"What is {predicted_word} in the sentence \"{sentence}\"?"
    else:
        question = ""
    return question

# Test the function: run generate_question on the third training paragraph
# and print the resulting question.
sentence = "그는 항상 자신의 집안일을 적시에 정기적으로 완료하고 저를 도와줍니다."

question = generate_question(sentence)
print(question)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Sample dataset: three English paragraphs; the first and third contain two sentences each
paragraphs = [
  "The sky is blue. The grass is green.",
  "The quick brown fox jumps over the lazy dog.",
  "Roses are red. Violets are blue."
]

# Function to convert a paragraph to a list of sentences
def paragraph_to_sentences(paragraph):
    """Split a paragraph into sentences on ". ".

    The final sentence used to keep its trailing period, so the same word was
    tokenised both with and without a "." depending on its position (e.g.
    "blue" and "blue."). Every sentence is now returned without the
    terminating period, and empty pieces are dropped.

    Args:
        paragraph: Paragraph text.

    Returns:
        List of non-empty sentence strings without a trailing period.
    """
    sentences = []
    for piece in paragraph.split('. '):
        piece = piece.strip().rstrip('.').strip()
        if piece:
            sentences.append(piece)
    return sentences

# Convert the paragraphs to a list of sentences
sentences = []
for paragraph in paragraphs:
    sentences.extend(paragraph_to_sentences(paragraph))

# Tokenize the sentences: ids 0 and 1 are reserved, every new word gets the next free id
word2idx = {"<PAD>": 0, "<UNK>": 1}
idx2word = {0: "<PAD>", 1: "<UNK>"}
for sentence in sentences:
    for word in sentence.split():
        if word not in word2idx:
            word2idx[word] = len(word2idx)
            idx2word[len(idx2word)] = word

# Convert the sentences to sequences of word indices (unknown words map to <UNK>)
sequences = [
    [word2idx.get(word, word2idx["<UNK>"]) for word in sentence.split()]
    for sentence in sentences
]

# Pad the sequences to a fixed length
max_len = max(len(sequence) for sequence in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding="post", value=word2idx["<PAD>"])

# Inputs: every token except the last one
X = padded_sequences[:, :-1]

# Targets: one-hot encoding of the token that FOLLOWS each input position.
# Only real tokens are marked; padded positions stay all-zero, so they add
# nothing to the categorical cross-entropy sum.
y_onehot = np.zeros((len(sequences), max_len, len(word2idx)))
for i, sequence in enumerate(sequences):
    for j, word_idx in enumerate(sequence):
        y_onehot[i, j, word_idx] = 1
# Shift by one step. The previous slice [:, :-1, :] made the targets identical
# to the inputs, so the network only learned to copy its input.
y = y_onehot[:, 1:, :]

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(word2idx), output_dim=50, input_length=max_len-1))
model.add(LSTM(50, return_sequences=True))
model.add(Dense(len(word2idx), activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
# Train the LSTM model
model.fit(X, y, epochs=100)

# Function to generate a question from a sentence
def generate_question(sentence):
    """Build a template question around the word the model predicts after ``sentence``.

    Uses the notebook-level ``word2idx``, ``idx2word``, ``max_len`` and ``model``.

    Args:
        sentence: Input sentence; words missing from the vocabulary are ignored.

    Returns:
        ``What is <word> in the sentence "<sentence>"?`` or an empty string when
        no word of the sentence is known or the predicted id has no word.
    """
    context_len = max_len - 1  # input length the model was built with

    # Convert the words to word indices. The vocabulary is case-sensitive ("The"
    # and "the" are different entries) and may hold tokens with or without a
    # trailing period, so try the word as written first and only then its
    # normalised forms.
    sequence = []
    for word in sentence.split():
        for candidate in (word, word.lower(), word.rstrip('.!?'), word.lower().rstrip('.!?')):
            if candidate in word2idx:
                sequence.append(word2idx[candidate])
                break
    if not sequence:
        return ""

    # Keep the most recent tokens and pad on the right, matching the training data
    sequence = sequence[-context_len:]
    padded = pad_sequences([sequence], maxlen=context_len, padding="post", value=word2idx["<PAD>"])

    # The model emits one vocabulary distribution per timestep; read the one at the
    # last real token. A flat argmax over the whole output is not a vocabulary id.
    prediction = model.predict(padded)
    predicted_word_idx = int(np.argmax(prediction[0, len(sequence) - 1]))
    predicted_word = idx2word.get(predicted_word_idx, "")

    # Generate the question
    if predicted_word:
        question = f"What is {predicted_word} in the sentence \"{sentence}\"?"
    else:
        question = ""
    return question

# Test the function: run generate_question on the first sentence of the
# training data and print the resulting question.
sentence = "The sky is blue."
question = generate_question(sentence)
print(question)

In [None]:
import itertools
import logging
from typing import Optional, Dict, Union
from nltk import sent_tokenize
from langdetect import detect
import torch
from transformers import(
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
)
# Module-level logger for this cell
logger = logging.getLogger(__name__)
import random
import nltk
from nltk.corpus import wordnet
from transformers import pipeline
# Download the WordNet corpus used by the synset lookups in QGPipeline.__call__
nltk.download('wordnet')
# GPT-2 text-generation pipeline; QGPipeline.__call__ calls it once per distractor candidate
generator = pipeline('text-generation', model='gpt2')
# Install the pinned googletrans release right before importing it (shell escape)
!pip install googletrans==3.1.0a0
from googletrans import Translator
ans=[]
class QGPipeline:
    """Poor man's QG pipeline

    Turns a passage into multiple-choice items. ``__call__`` detects the input
    language, translates the passage with googletrans (its default target
    language), extracts answer spans with ``ans_model``, generates one question
    per answer with ``model``, collects WordNet-based distractors and translates
    question, answer and distractors back to the detected language.

    Relies on names defined at notebook level: ``detect``, ``Translator``,
    ``sent_tokenize``, ``wordnet``, ``generator`` and ``logger``.
    """
    def __init__(
        self,
        model: PreTrainedModel,
        tokenizer: PreTrainedTokenizer,
        ans_model: PreTrainedModel,
        ans_tokenizer: PreTrainedTokenizer,
        qg_format: str,
        use_cuda: bool
    ):
        """Store the models/tokenizers and move the models to the target device.

        Args:
            model: Question-generation model (T5 or BART conditional generation).
            tokenizer: Tokenizer matching ``model``.
            ans_model: Answer-extraction model; may be the same object as ``model``.
            ans_tokenizer: Tokenizer matching ``ans_model``.
            qg_format: ``"prepend"`` for ``answer: ... context: ...`` inputs,
                anything else for highlight (``<hl>``) inputs.
            use_cuda: Use CUDA when it is available.

        Raises:
            AssertionError: If ``model`` is neither a T5 nor a BART
                conditional-generation model.
        """
        self.model = model
        self.tokenizer = tokenizer

        self.ans_model = ans_model
        self.ans_tokenizer = ans_tokenizer

        self.qg_format = qg_format

        self.device = "cuda" if torch.cuda.is_available() and use_cuda else "cpu"
        self.model.to(self.device)

        if self.ans_model is not self.model:
            self.ans_model.to(self.device)

        assert self.model.__class__.__name__ in ["T5ForConditionalGeneration", "BartForConditionalGeneration"]

        if "T5ForConditionalGeneration" in self.model.__class__.__name__:
            self.model_type = "t5"
        else:
            self.model_type = "bart"

    def _generate_distractors(self, target_word, num_distractors):
        """Return up to ``num_distractors`` WordNet relatives of ``target_word``.

        Candidates are the synonyms, antonyms, hyponyms and hypernyms of every
        synset of ``target_word``. Each candidate is paired with a GPT-2
        continuation of a fixed prompt and the pairs are ordered by that text.
        """
        related = set()
        for syn in wordnet.synsets(target_word):
            for lemma in syn.lemmas():
                related.add(lemma.name())
                if lemma.antonyms():
                    related.add(lemma.antonyms()[0].name())
            for hypo in syn.hyponyms():
                for lemma in hypo.lemmas():
                    related.add(lemma.name())
            for hyper in syn.hypernyms():
                for lemma in hyper.lemmas():
                    related.add(lemma.name())

        # Generate candidate distractors using the language model
        scored = []
        for candidate in related:
            if candidate == target_word:
                continue
            try:
                generated_text = generator(
                    f"Which is more related to {target_word}? {target_word} or {candidate}",
                    max_length=20,
                    num_return_sequences=1,
                    do_sample=True,
                )[0]['generated_text'].strip()
            except Exception as exc:
                # Best effort: a failing candidate is skipped, but no longer silently
                logger.warning("Skipping distractor candidate %r: %s", candidate, exc)
                continue
            scored.append((candidate, generated_text))

        # NOTE(review): the order is the alphabetical order of the generated text,
        # which is what the original "ranking" did — confirm this is intended.
        scored.sort(key=lambda pair: pair[1])
        return [candidate for candidate, _ in scored[:num_distractors]]

    def __call__(self, inputs: str):
        """Generate multiple-choice questions for ``inputs``.

        Args:
            inputs: Passage in any language ``langdetect`` and googletrans handle.

        Returns:
            A list with one single-element list per generated question. Each
            element is a dict with keys ``question``, ``distractors`` (the answer
            options, including the correct answer at a random position) and
            ``answer``, all translated back to the detected input language.
            Every item is also printed. Returns ``[]`` when no answer is found.
        """
        source_lang = detect(inputs)
        translator = Translator()
        inputs = translator.translate(inputs).text
        inputs = " ".join(inputs.split())
        sents, answers = self._extract_answers(inputs)
        flat_answers = list(itertools.chain(*answers))

        if len(flat_answers) == 0:
            return []

        if self.qg_format == "prepend":
            qg_examples = self._prepare_inputs_for_qg_from_answers_prepend(inputs, answers)
        else:
            qg_examples = self._prepare_inputs_for_qg_from_answers_hl(sents, answers)

        if not qg_examples:
            # Every extracted answer was unusable; nothing to tokenize
            return []

        qg_inputs = [example['source_text'] for example in qg_examples]
        questions = self._generate_questions(qg_inputs)

        # Results are collected locally: a module-level accumulator kept stale
        # entries around whenever a previous call raised half-way through.
        results = []
        for example, que in zip(qg_examples, questions):
            words = example['answer'].split()
            if not words:
                continue
            # Distractors are looked up for the last word of the answer
            distractors = self._generate_distractors(words[-1], num_distractors=3)

            # Translate answer and question back to the language of the input
            example['answer'] = translator.translate(example['answer'], dest=source_lang).text
            que = translator.translate(que, dest=source_lang).text

            if not distractors:
                # Hard-coded fallbacks when WordNet yields nothing
                if source_lang == 'hi':
                    distractors = ['जिजीविषा', 'प्रेमशक्त', 'तमक']
                else:
                    distractors = ['Morrow', 'Kerfuffle', 'Crapulous']
            distractors = [
                translator.translate(distractor, dest=source_lang).text
                for distractor in distractors
            ]

            # randint is inclusive on both ends, so the answer may also land last
            distractors.insert(random.randint(0, len(distractors)), example['answer'])
            results.append([{'question': que, 'distractors': distractors, 'answer': example['answer']}])

        for item in results:
            print(item)
        return results

    def _generate_questions(self, inputs):
        """Run the QG model on prepared source texts and decode one question each."""
        inputs = self._tokenize(inputs, padding=True, truncation=True)

        outs = self.model.generate(
            input_ids=inputs['input_ids'].to(self.device),
            attention_mask=inputs['attention_mask'].to(self.device),
            max_length=32,
            num_beams=4,
        )

        questions = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
        return questions

    def _extract_answers(self, context):
        """Extract candidate answers for every sentence of ``context``.

        Returns:
            ``(sents, answers)`` where ``answers[i]`` is the list of raw answer
            strings decoded for ``sents[i]`` (special tokens are kept).
        """
        sents, inputs = self._prepare_inputs_for_ans_extraction(context)
        # The answer model must see ids produced by its own tokenizer
        inputs = self._tokenize(inputs, padding=True, truncation=True, tokenizer=self.ans_tokenizer)

        outs = self.ans_model.generate(
            input_ids=inputs['input_ids'].to(self.device),
            attention_mask=inputs['attention_mask'].to(self.device),
            max_length=32,
        )

        dec = [self.ans_tokenizer.decode(ids, skip_special_tokens=False) for ids in outs]
        answers = [item.split('<sep>') for item in dec]
        # The piece after the last <sep> is not an answer
        answers = [i[:-1] for i in answers]

        return sents, answers

    def _tokenize(self,
        inputs,
        padding=True,
        truncation=True,
        add_special_tokens=True,
        max_length=512,
        tokenizer=None
    ):
        """Batch-encode ``inputs`` into PyTorch tensors.

        Args:
            inputs: List of source strings.
            padding: Pad every item to ``max_length`` when truthy.
            truncation: Forwarded to the tokenizer.
            add_special_tokens: Forwarded to the tokenizer.
            max_length: Maximum (and padded) sequence length.
            tokenizer: Tokenizer to use; defaults to ``self.tokenizer``.
        """
        tokenizer = self.tokenizer if tokenizer is None else tokenizer
        # The deprecated ``pad_to_max_length`` flag is dropped: ``padding`` already
        # carries the same information and takes precedence when both are given.
        inputs = tokenizer.batch_encode_plus(
            inputs,
            max_length=max_length,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            padding="max_length" if padding else False,
            return_tensors="pt"
        )
        return inputs

    def _prepare_inputs_for_ans_extraction(self, text):
        """Build one ``extract answers:`` prompt per sentence, highlighting that sentence."""
        sents = sent_tokenize(text)

        inputs = []
        for i in range(len(sents)):
            source_text = "extract answers:"
            for j, sent in enumerate(sents):
                if i == j:
                    sent = "<hl> %s <hl>" % sent
                source_text = "%s %s" % (source_text, sent)
                source_text = source_text.strip()

            if self.model_type == "t5":
                source_text = source_text + " </s>"
            inputs.append(source_text)

        return sents, inputs

    def _prepare_inputs_for_qg_from_answers_hl(self, sents, answers):
        """Build ``generate question:`` prompts with the answer span highlighted.

        Returns:
            List of dicts with keys ``answer`` (lower-cased answer text) and
            ``source_text``. Answers that are empty or cannot be located in
            their sentence are skipped.
        """
        inputs = []
        for i, answer in enumerate(answers):
            if len(answer) == 0:
                continue
            for answer_text in answer:
                sent = sents[i].lower()
                sents_copy = sents[:]
                # Only the first decoded answer starts with "<pad>"; slicing off five
                # characters unconditionally chopped the start of every later answer.
                answer_text = answer_text.replace("<pad>", "").strip().lower()
                if not answer_text:
                    continue

                ans_start_idx = sent.find(answer_text)
                if ans_start_idx == -1:
                    # The model produced text that is not a span of the sentence
                    logger.warning("Answer %r not found in sentence %r; skipping", answer_text, sent)
                    continue

                sent = f"{sent[:ans_start_idx]} <hl> {answer_text} <hl> {sent[ans_start_idx + len(answer_text): ]}"
                sents_copy[i] = sent

                source_text = " ".join(sents_copy)
                source_text = f"generate question: {source_text}"
                if self.model_type == "t5":
                    source_text = source_text + " </s>"

                inputs.append({"answer": answer_text, "source_text": source_text})

        return inputs

    def _prepare_inputs_for_qg_from_answers_prepend(self, context, answers):
        """Build ``answer: ... context: ...`` prompts, one per extracted answer."""
        flat_answers = list(itertools.chain(*answers))
        examples = []
        for answer in flat_answers:
            source_text = f"answer: {answer} context: {context}"
            if self.model_type == "t5":
                source_text = source_text + " </s>"

            examples.append({"answer": answer, "source_text": source_text})
        return examples


class MultiTaskQAQGPipeline(QGPipeline):
    """QGPipeline variant that also answers questions.

    A plain ``str`` input is treated as a passage for question generation; any
    other input is indexed with ``"question"`` and ``"context"`` and answered.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments to ``QGPipeline.__init__``."""
        super().__init__(**kwargs)

    def __call__(self, inputs: Union[Dict, str]):
        """Dispatch to question answering or question generation by input type."""
        if type(inputs) is not str:
            # do qa
            return self._extract_answer(inputs["question"], inputs["context"])
        # do qg
        return super().__call__(inputs)

    def _prepare_inputs_for_qa(self, question, context):
        """Format the QA prompt; T5 models get an explicit end-of-sequence marker."""
        eos_suffix = " </s>" if self.model_type == "t5" else ""
        return f"question: {question}  context: {context}" + eos_suffix

    def _extract_answer(self, question, context):
        """Generate and decode the answer to ``question`` given ``context``."""
        encoded = self._tokenize(
            [self._prepare_inputs_for_qa(question, context)],
            padding=False,
        )

        generated = self.model.generate(
            input_ids=encoded['input_ids'].to(self.device),
            attention_mask=encoded['attention_mask'].to(self.device),
            max_length=16,
        )

        return self.tokenizer.decode(generated[0], skip_special_tokens=True)


class E2EQGPipeline:
    """End-to-end question generation with a single seq2seq model.

    The passage is wrapped in a ``generate questions: ...`` prompt, one
    sequence is generated, and the decoded text is split on ``<sep>`` into
    individual questions.

    Annotations on ``__init__`` are written as strings (forward references) so
    that defining the class does not require the transformers names to be
    bound yet.
    """

    def __init__(
        self,
        model: "PreTrainedModel",
        tokenizer: "PreTrainedTokenizer",
        use_cuda: bool
    ):
        """Store the model/tokenizer and move the model to the chosen device.

        Args:
            model: a ``T5ForConditionalGeneration`` or
                ``BartForConditionalGeneration`` instance.
            tokenizer: tokenizer matching ``model``.
            use_cuda: use CUDA when it is available; otherwise run on CPU.

        Raises:
            AssertionError: if ``model`` is not one of the supported classes.
        """
        # Validate first so an unsupported model is rejected before it is
        # moved to a device.
        model_class_name = model.__class__.__name__
        assert model_class_name in ["T5ForConditionalGeneration", "BartForConditionalGeneration"], (
            f"Unsupported model class {model_class_name}"
        )

        self.model = model
        self.tokenizer = tokenizer

        self.device = "cuda" if torch.cuda.is_available() and use_cuda else "cpu"
        self.model.to(self.device)

        self.model_type = "t5" if "T5ForConditionalGeneration" in model_class_name else "bart"

        self.default_generate_kwargs = {
            "max_length": 256,
            "num_beams": 4,
            "length_penalty": 1.5,
            "no_repeat_ngram_size": 3,
            "early_stopping": True,
        }

    def __call__(self, context: str, **generate_kwargs):
        """Generate questions for ``context``.

        Args:
            context: the passage to generate questions from.
            **generate_kwargs: overrides for ``self.default_generate_kwargs``.
                Only the keys given here are replaced; every other default is
                kept, so a single setting can be changed without restating
                the rest.

        Returns:
            A list of stripped question strings.
        """
        inputs = self._prepare_inputs_for_e2e_qg(context)

        # Build a fresh dict so the stored defaults are never mutated.
        merged_generate_kwargs = {**self.default_generate_kwargs, **generate_kwargs}

        outs = self.model.generate(
            input_ids=inputs['input_ids'].to(self.device),
            attention_mask=inputs['attention_mask'].to(self.device),
            **merged_generate_kwargs
        )

        prediction = self.tokenizer.decode(outs[0], skip_special_tokens=True)
        # Everything after the final "<sep>" is discarded -- presumably an
        # empty tail or a question cut off by max_length; confirm against the
        # model's output format.
        return [question.strip() for question in prediction.split("<sep>")[:-1]]

    def _prepare_inputs_for_e2e_qg(self, context):
        """Wrap ``context`` in the e2e prompt and tokenize it without padding."""
        source_text = f"generate questions: {context}"
        if self.model_type == "t5":
            source_text = source_text + " </s>"

        return self._tokenize([source_text], padding=False)

    def _tokenize(
        self,
        inputs,
        padding=True,
        truncation=True,
        add_special_tokens=True,
        max_length=512
    ):
        """Batch-encode ``inputs`` into PyTorch tensors.

        ``padding=True`` pads every sequence to ``max_length``;
        ``padding=False`` disables padding. The legacy ``pad_to_max_length``
        flag is no longer passed: ``padding`` already carries that choice.
        """
        return self.tokenizer.batch_encode_plus(
            inputs,
            max_length=max_length,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            padding="max_length" if padding else False,
            return_tensors="pt"
        )


# Registry of the task names accepted by `pipeline`.
# For each task:
#   "impl"    -> the pipeline class instantiated for that task
#   "default" -> model identifiers used when the caller supplies none
#                ("model" for the main model, "ans_model" for the separate
#                answer model that only "question-generation" uses)
SUPPORTED_TASKS = {
    "question-generation": {
        "impl": QGPipeline,
        "default": {
            "model": "valhalla/t5-small-qg-hl",
            "ans_model": "valhalla/t5-base-qa-qg-hl",
        }
    },
    "multitask-qa-qg": {
        "impl": MultiTaskQAQGPipeline,
        "default": {
            "model": "valhalla/t5-base-qa-qg-hl",
        }
    },
    "e2e-qg": {
        "impl": E2EQGPipeline,
        "default": {
            "model": "valhalla/t5-small-e2e-qg",
        }
    }
}

def _resolve_tokenizer(model, tokenizer):
    """Return an instantiated tokenizer for ``model``.

    ``tokenizer`` may be ``None`` (inferred from ``model`` when that is a
    string), a string identifier/path, a ``(name, kwargs)`` tuple, or an
    already-built tokenizer object, which is returned unchanged.

    Raises:
        Exception: if ``tokenizer`` is ``None`` and ``model`` is not a string,
            because there is then nothing to infer the tokenizer from.
    """
    if tokenizer is None:
        if isinstance(model, str):
            tokenizer = model
        else:
            # Impossible to guess what is the right tokenizer here
            raise Exception(
                "Impossible to guess which tokenizer to use. "
                "Please provided a PretrainedTokenizer class or a path/identifier to a pretrained tokenizer."
            )

    if isinstance(tokenizer, tuple):
        # For tuple we have (tokenizer name, {kwargs})
        return AutoTokenizer.from_pretrained(tokenizer[0], **tokenizer[1])
    if isinstance(tokenizer, str):
        return AutoTokenizer.from_pretrained(tokenizer)
    return tokenizer


def pipeline(
    task: str,
    model = None,
    tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
    qg_format: Optional[str] = "highlight",
    ans_model = None,
    ans_tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
    use_cuda: Optional[bool] = True,
    **kwargs,
):
    """Build the pipeline object registered for ``task`` in ``SUPPORTED_TASKS``.

    Args:
        task: one of the keys of ``SUPPORTED_TASKS``.
        model: model instance or identifier/path; the task default when ``None``.
        tokenizer: tokenizer instance, identifier/path, or ``(name, kwargs)``
            tuple; inferred from ``model`` when ``None`` and ``model`` is a string.
        qg_format: forwarded to the pipeline class (not used for ``"e2e-qg"``).
        ans_model: answer model for ``"question-generation"``; when ``None``
            the task's default answer model AND its tokenizer are loaded, and
            any ``ans_tokenizer`` passed by the caller is replaced.
        ans_tokenizer: tokenizer for ``ans_model``, resolved like ``tokenizer``.
        use_cuda: forwarded to the pipeline class.
        **kwargs: accepted for call compatibility; not used by this function.

    Raises:
        KeyError: if ``task`` is not in ``SUPPORTED_TASKS``.
        Exception: if a tokenizer has to be inferred from a non-string model.
    """
    # Retrieve the task
    if task not in SUPPORTED_TASKS:
        raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))

    targeted_task = SUPPORTED_TASKS[task]
    task_class = targeted_task["impl"]

    # Use the default model for the task if no model is provided
    if model is None:
        model = targeted_task["default"]["model"]

    # The tokenizer must be resolved while `model` may still be a string,
    # i.e. before the model itself is instantiated.
    tokenizer = _resolve_tokenizer(model, tokenizer)

    # Instantiate model if needed
    if isinstance(model, str):
        model = AutoModelForSeq2SeqLM.from_pretrained(model)

    if task == "question-generation":
        if ans_model is None:
            # load default ans model together with its own tokenizer
            ans_model = targeted_task["default"]["ans_model"]
            ans_tokenizer = AutoTokenizer.from_pretrained(ans_model)
        else:
            ans_tokenizer = _resolve_tokenizer(ans_model, ans_tokenizer)

        if isinstance(ans_model, str):
            ans_model = AutoModelForSeq2SeqLM.from_pretrained(ans_model)

    if task == "e2e-qg":
        return task_class(model=model, tokenizer=tokenizer, use_cuda=use_cuda)
    elif task == "question-generation":
        return task_class(model=model, tokenizer=tokenizer, ans_model=ans_model, ans_tokenizer=ans_tokenizer, qg_format=qg_format, use_cuda=use_cuda)
    else:
        # Remaining tasks reuse the main model/tokenizer as the answer model/tokenizer.
        return task_class(model=model, tokenizer=tokenizer, ans_model=model, ans_tokenizer=tokenizer, qg_format=qg_format, use_cuda=use_cuda)
# Show `ans` and how many items it holds.
# NOTE(review): `ans` is not assigned anywhere in the nearby code -- confirm
# it is defined by an earlier cell before relying on Restart & Run All.
print(ans)
print(len(ans))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


[]
0


In [None]:
# Build the question-generation pipeline; with no model arguments, `pipeline`
# falls back to the task's default model and default answer model.
nlp = pipeline("question-generation")

In [None]:
# Hand-written fixtures for the highlight-formatting experiment in the next cell:
# `answers[i]` lists the answer strings for `sents[i]`. Each answer carries a
# leading "<pad> " marker -- presumably mimicking raw decoder output; confirm.
answers = [['<pad> Python'], ['<pad> Guido van Rossum']]
sents = ['Python is an interpreted, high-level, general-purpose programming language.', "Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace."]



In [None]:
# Build highlight-format question-generation inputs from the `answers` / `sents`
# fixtures: the answer is wrapped in <hl> markers inside its own sentence and
# the sentences are then joined back into one passage.
inputs = []
for i, answer in enumerate(answers):
    print(answer)
    if len(answer) == 0:
        continue
    for answer_text in answer:
        print(answer_text)
        sent = sents[i]
        sents_copy = sents[:]

        # Remove the "<pad>" marker but keep the WHOLE answer phrase.
        # Splitting on spaces and taking element 1 kept only the first word,
        # so "Guido van Rossum" was highlighted as just "Guido".
        answer_text = answer_text.replace("<pad>", "").strip()
        print(answer_text)
        print(sent)
        # str.index raises ValueError when the answer is not in the sentence.
        ans_start_idx = sent.index(answer_text)

        sent = f"{sent[:ans_start_idx]} <hl> {answer_text} <hl> {sent[ans_start_idx + len(answer_text): ]}"
        sents_copy[i] = sent

        source_text = " ".join(sents_copy)
        source_text = f"generate question: {source_text}"
        # No " </s>" suffix is appended here; there is no `self.model_type`
        # to consult outside the pipeline class.

        inputs.append({"answer": answer_text, "source_text": source_text})

['<pad> Python']
<pad> Python
Python
Python is an interpreted, high-level, general-purpose programming language.
['<pad> Guido van Rossum']
<pad> Guido van Rossum
Guido
Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace.


To use the t5-base model instead, pass its path (or model identifier) through the `model` parameter.

In [None]:
# Rebind `nlp` to a pipeline that uses the t5-base question-generation model;
# the answer model is still the task default because `ans_model` is not given.
nlp = pipeline("question-generation", model="valhalla/t5-base-qg-hl")

In [None]:
!pip install anvil-uplink

Collecting argparse (from anvil-uplink)
  Using cached argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Installing collected packages: argparse
Successfully installed argparse-1.4.0


In [None]:
import os
from getpass import getpass

import anvil.server

# The uplink key is a credential and must not be stored in the notebook.
# It is read from the ANVIL_UPLINK_KEY environment variable; if that is unset
# or empty, the user is prompted for it without echoing the input.
# NOTE(review): a key was previously hardcoded in this cell -- it should be
# treated as leaked and rotated in the Anvil app settings.
anvil_uplink_key = os.environ.get("ANVIL_UPLINK_KEY") or getpass("Anvil uplink key: ")
anvil.server.connect(anvil_uplink_key)

Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default Environment" as SERVER


In [None]:
def add_backslash_after_word_count(text, word_count):
    """Re-join the whitespace-separated words of ``text`` with a backslash after every ``word_count``-th word.

    Each word is followed by a single space, except every ``word_count``-th
    word (counting from 1), which is followed by a backslash instead. The
    separator after the final word is kept, so the result ends with either a
    space or a backslash. Text without any words yields ``''``.

    Raises:
        ZeroDivisionError: if ``word_count`` is 0 and ``text`` contains words.
    """
    return ''.join(
        token + ('\\' if position % word_count == 0 else ' ')
        for position, token in enumerate(text.split(), start=1)
    )


In [None]:
# NOTE(review): `mylist` is not referenced by any of the surrounding cells --
# confirm it is still needed, otherwise drop this cell.
mylist = []

In [None]:
# def format_data(data):
#     result = ""
#     for element in data:
#         if isinstance(element, dict):
#             for key, value in element.items():
#                 result += f"{key}: {value}\n"
#             result += "\n"
#     return result

def _format_record(record):
    """Render one dict as ``key: value`` lines followed by a blank line.

    List values are rendered as ``[a, b, c]`` using ``str()`` on each item.
    """
    lines = []
    for key, value in record.items():
        if isinstance(value, list):
            value = "[" + ', '.join(map(str, value)) + "]"
        lines.append(f"{key}: {value}\n")
    return "".join(lines) + "\n"


def format_data(data):
    """Format dicts, or lists of dicts, as plain text.

    Args:
        data: iterable whose elements are dicts or lists of dicts. Elements
            of any other type are ignored.

    Returns:
        A string with one ``key: value`` line per dict entry and a blank line
        after each dict; ``""`` when nothing was formatted.
    """
    chunks = []
    for element in data:
        if isinstance(element, list):
            chunks.extend(_format_record(record) for record in element)
        elif isinstance(element, dict):
            chunks.append(_format_record(element))
    return "".join(chunks)

In [None]:
@anvil.server.callable
def gen_ques(text):
    """Run the global ``nlp`` pipeline on ``text`` and return the result as formatted text.

    Registered through ``@anvil.server.callable``. The incoming text is
    printed twice with a ``hello`` marker in between as a simple trace, then
    the pipeline output is passed through ``format_data``.
    """
    print(text)
    print('hello')
    # modified_text=add_backslash_after_word_count(text,10)
    print(text)
    generated = nlp(text)
    return format_data(generated)


In [None]:
# Keep serving the functions registered with @anvil.server.callable.
# NOTE(review): presumably this call blocks until the kernel is interrupted,
# so no later cell runs -- confirm against the Anvil Uplink docs.
anvil.server.wait_forever()

Uniform Civil Code is a proposal to create and enforce personal laws of citizens in India that apply equally to all citizens regardless of their religion, gender and sexual orientation. At present, the personal laws of different communities are governed by their religious texts. The implementation of a Uniform Civil Code across the country is one of the controversial promises made by India's ruling Bharatiya Janata Party.It is an important issue regarding secularism in Indian politics and remains disputed by India's political left, Muslim groups and other conservative religious groups and sects in defense of Sharia and religious customs. Personal law is different from public law and covers marriage, divorce. , inheritance, adoption and maintenance.
hello
Uniform Civil Code is a proposal to create and enforce personal laws of citizens in India that apply equally to all citizens regardless of their religion, gender and sexual orientation. At present, the personal laws of different commun

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

[{'question': 'What is a proposal to create and enforce personal laws of citizens in india that apply equally to all citizens regardless of their religion, gender and sexual orientation?', 'distractors': ['ASCII', 'American_Standard_Code_for_Information_Interchange', 'uniform civil code', 'samurai degree'], 'answer': 'uniform civil code'}]
[{'question': 'What are the personal laws of different communities governed by?', 'distractors': ['religious texts', 'book', 'column', 'cookie'], 'answer': 'religious texts'}]
[{'question': 'What does the bharatiya janata party defend?', 'distractors': ['sharia', 'Islamic_law', 'red', 'hudud'], 'answer': 'sharia'}]
[{'question': 'What does personal law cover?', 'distractors': ['break', 'break_up', 'marriage, divorce', 'disassociate'], 'answer': 'marriage, divorce'}]
[{'question': 'What type of law does the Uniform Civil Code cover?', 'distractors': ['X-linked_dominant_inheritance', 'X-linked_recessive_inheritance', 'accretion', 'inheritance'], 'answe