In [2]:
import sklearn
import spacy
import random
from collections import Counter

In [4]:
nlp = spacy.load('en_core_web_sm')

In [5]:
text  = '''spaCy is an advanced open-source library for Natural Language Processing (NLP) in Python, designed to help developers build applications that process and understand large volumes of text. NLP involves the interaction between computers and human languages, enabling machines to read, interpret, and generate human language in a valuable way. spaCy stands out due to its speed, efficiency, and ease of use, providing pre-trained models, word vectors, and an extensive suite of tools for various NLP tasks such as tokenization, part-of-speech tagging, named entity recognition, dependency parsing, and lemmatization.

One of spaCy's strengths is its industrial-grade architecture that emphasizes performance and accuracy. This makes it suitable for production use in large-scale applications. spaCy is also designed to be extensible, allowing users to create custom components and pipelines tailored to specific needs. Its compatibility with other NLP libraries, such as scikit-learn, TensorFlow, and PyTorch, enables seamless integration with machine learning workflows.

The library includes various pre-trained models that cater to different languages and NLP tasks. One of the most commonly used models is "en_core_web_sm," a small English model that provides essential functionalities for many NLP applications. This model includes components for tokenization, part-of-speech tagging, dependency parsing, named entity recognition, and lemmatization, all trained on web text to ensure broad coverage and robustness. Despite its relatively small size, "en_core_web_sm" offers a good balance between performance and resource efficiency, making it ideal for developers who need a lightweight solution for common NLP tasks.

Using "en_core_web_sm" with spaCy is straightforward, involving just a few lines of code to load the model and process text. This ease of use allows developers to quickly prototype and deploy NLP applications. Additionally, spaCy's user-friendly documentation and active community support make it accessible to both beginners and experienced practitioners. The library's design also supports fine-tuning and transfer learning, enabling users to adapt pre-trained models to specific domains or tasks.

'''

In [6]:
doc = nlp(text)

In [8]:
# extracting sentences from text:
sentences = [sent.text for sent in doc.sents]

In [40]:
# extracting few random sentences:
num_of_questions = 5
selected_sentences = random.sample(sentences,min(num_of_questions,len(sentences)))

In [41]:
selected_sentences

["The library's design also supports fine-tuning and transfer learning, enabling users to adapt pre-trained models to specific domains or tasks.\n\n",
 'This makes it suitable for production use in large-scale applications.',
 'NLP involves the interaction between computers and human languages, enabling machines to read, interpret, and generate human language in a valuable way.',
 'This ease of use allows developers to quickly prototype and deploy NLP applications.',
 "One of spaCy's strengths is its industrial-grade architecture that emphasizes performance and accuracy."]

In [42]:
mcqs = []

for sentence in selected_sentences:
    sentence = sentence.lower()
    sent_doc = nlp(sentence) #creating doc of each sentence from selected sentence
    nouns = [token.text for token in sent_doc if token.pos_ == "NOUN"] #seperating nouns from each sentence
    if len(nouns)<2:
        continue        #excluding nouns having less than 2 letters
    # print(nouns)

    noun_counts = Counter(nouns) #getting count of occurance
    # print(noun_counts)

    if noun_counts:
        subject = noun_counts.most_common(1)[0][0]
        # print(subject)
        answer_choices = [subject]
        question_stem = sentence.replace(subject,"________")
        # print(question_stem)

        for _ in range(3):
            distractor = random.choice(list(set(nouns)-set([subject])))
            answer_choices.append(distractor)

        random.shuffle(answer_choices)
        # print(answer_choices)

        correct_answer = chr(64 + answer_choices.index(subject)+1) #convert index to letter
        mcqs.append((question_stem,answer_choices,correct_answer))
        

In [43]:
mcqs

[("the ________'s design also supports fine-tuning and transfer learning, enabling users to adapt pre-trained models to specific domains or tasks.\n\n",
  ['domains', 'library', 'users', 'domains'],
  'B'),
 ('this makes it suitable for ________ use in large-scale applications.',
  ['use', 'applications', 'use', 'production'],
  'D'),
 ('nlp involves the ________ between computers and human languages, enabling machines to read, interpret, and generate human language in a valuable way.',
  ['machines', 'interaction', 'machines', 'way'],
  'B'),
 ('this ________ of use allows developers to quickly prototype and deploy nlp applications.',
  ['applications', 'ease', 'developers', 'developers'],
  'B'),
 ("one of ________'s strengths is its industrial-grade architecture that emphasizes performance and accuracy.",
  ['architecture', 'spacy', 'accuracy', 'accuracy'],
  'B')]

# A function to do the above task

In [46]:
def generate_mcqs(text,num_of_questions=5):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    selected_sentences = random.sample(sentences,min(num_of_questions,len(sentences)))
    mcqs = []
    for sentence in selected_sentences:
        sentence = sentence.lower()
        sent_doc = nlp(sentence)
        nouns = [token.text for token in sent_doc if token.pos_ == "NOUN"]
        if len(nouns)<2:
            continue 
        noun_counts = Counter(nouns)
        if noun_counts:
            subject = noun_counts.most_common(1)[0][0]
            answer_choices = [subject]
            question_stem = sentence.replace(subject,"________")
            for _ in range(3):
                distractor = random.choice(list(set(nouns)-set([subject])))
                answer_choices.append(distractor)
            random.shuffle(answer_choices)
            correct_answer = chr(64 + answer_choices.index(subject)+1) #convert index to letter
            mcqs.append((question_stem,answer_choices,correct_answer))
    return mcqs
    

In [49]:
# testing:
generate_mcqs(text,3)

[("one of ________'s strengths is its industrial-grade architecture that emphasizes performance and accuracy.",
  ['performance', 'architecture', 'grade', 'spacy'],
  'D'),
 ('the ________ includes various pre-trained models that cater to different languages and nlp tasks.',
  ['library', 'tasks', 'models', 'languages'],
  'A'),
 ('________ is an advanced open-source library for natural language processing (nlp) in python, designed to help developers build applications that process and understand large volumes of text.',
  ['developers', 'spacy', 'applications', 'developers'],
  'B')]