<a href="https://colab.research.google.com/github/shreyas074/MCQ-Generator-using-NLP/blob/main/MULTIPLE_CHOICE_QUESTION_GENERATOR_FROM_GIVEN_TEXT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SETUP: Install & Load Libraries

In [1]:
!pip install spacy textblob nltk -q
!python -m spacy download en_core_web_sm -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [6]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag, ne_chunk
from nltk.tree import Tree
import random
import spacy
from textblob import TextBlob
import re
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# NLTK & SpaCy Setup Functions

In [7]:
def install_nltk_data():
    """Download the NLTK resources used for tokenising, tagging and NE chunking.

    ``nltk.download`` normally reports a failed download by returning
    ``False`` instead of raising, so both the return value and exceptions are
    treated as failure. Progress is printed; nothing is returned and no
    exception escapes (the download is best-effort).
    """
    required_packages = [
        'punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker',
        'words', 'wordnet', 'punkt_tab', 'averaged_perceptron_tagger_eng'
    ]

    for pkg in required_packages:
        failure = None
        try:
            if not nltk.download(pkg, quiet=True):
                failure = "downloader reported failure"
        except Exception as e:
            failure = e

        if failure is None:
            print(f"Downloaded {pkg}")
            continue

        print(f"Failed to download {pkg}: {failure}")
        # Fall back to the older tagger resource name when the "_eng"
        # variant is unavailable.
        if pkg == 'averaged_perceptron_tagger_eng':
            try:
                if nltk.download('averaged_perceptron_tagger', quiet=True):
                    print("Downloaded fallback tagger")
            except Exception as fallback_error:
                print(f"Failed to download fallback tagger: {fallback_error}")

def load_spacy_model():
    """Load spaCy's ``en_core_web_sm`` pipeline, downloading it if missing.

    Returns:
        The loaded pipeline, or ``None`` when spaCy is not installed or the
        model could not be downloaded and loaded ("basic mode").
    """
    try:
        import spacy
        nlp = spacy.load('en_core_web_sm')
        print("SpaCy model loaded")
        return nlp
    except OSError:
        print("SpaCy model not found, attempting download...")
        import subprocess
        import sys
        try:
            # Use the interpreter running this kernel: a bare "python" may
            # resolve to a different environment, so the model would be
            # installed where this process cannot import it.
            subprocess.run(
                [sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm'],
                check=True,
            )
            nlp = spacy.load('en_core_web_sm')
            print("SpaCy model downloaded and loaded")
            return nlp
        except Exception as exc:
            # Still best-effort, but report why instead of hiding it.
            print(f"Could not download SpaCy model, using basic mode ({exc})")
            return None
    except ImportError:
        print("SpaCy not installed, using basic mode")
        return None

# TextProcessor Class (Core MCQ Logic)

In [23]:
class TextProcessor:
    """Builds fill-in-the-blank multiple-choice questions from free text.

    Facts (answer candidates) are extracted with spaCy when a pipeline is
    supplied and with NLTK otherwise. For each fact, wrong options
    ("distractors") are gathered from WordNet, hand-written category lists,
    the surrounding context and per-entity-type defaults.
    """

    # NLTK's chunker names entity classes differently from spaCy; the lookup
    # tables in this class are keyed by the spaCy-style labels.
    _NLTK_LABEL_ALIASES = {
        'ORGANIZATION': 'ORG',
        'LOCATION': 'LOC',
        'FACILITY': 'FAC',
        'GSP': 'GPE',
    }

    # Non-alphanumeric characters tolerated inside a distractor, so options
    # such as "July 4", "10%" or "$100" are not discarded.
    _DISTRACTOR_PUNCTUATION = frozenset(" .,'’&%$€£₹-")

    def __init__(self, spacy_model):
        """Store the spaCy pipeline (or ``None``) and build the lookup tables."""
        self.nlp = spacy_model
        # One-entry cache of (text, sentences) used by get_sentence_context.
        self._sentence_cache = None

        self.question_starters = {
            'PERSON': ['Who', 'Which person', 'What individual'],
            'ORG': ['Which organization', 'What company', 'Which group', 'Who runs', 'Which institution'],
            'GPE': ['Where', 'Which place', 'Which region', 'Which country', 'What location'],
            'LOC': ['Where', 'Which geographic location', 'Which area'],
            'DATE': ['When', 'On which date', 'Which year', 'In what time period'],
            'TIME': ['At what time', 'When exactly', 'During which hour'],
            'EVENT': ['What event', 'Which historical event', 'What happened'],
            'WORK_OF_ART': ['Which artwork', 'What piece of art', 'Which painting or sculpture'],
            'LAW': ['Which law', 'Which regulation', 'What legal act'],
            'LANGUAGE': ['Which language', 'What language was spoken'],
            'MONEY': ['How much money', 'What amount', 'What was the cost'],
            'PERCENT': ['What percentage', 'How much', 'What proportion'],
            'QUANTITY': ['What amount', 'How much', 'How many units'],
            'ORDINAL': ['Which number', 'What position', 'What rank'],
            'CARDINAL': ['How many', 'What is the count', 'What number'],
            'FAC': ['Which facility', 'Which structure', 'Which building'],
            'PRODUCT': ['What product', 'Which item', 'What object'],
            'OBJECT': ['What', 'Which item', 'What thing'],
            'DESCRIPTION': ['What was described as', 'How was it characterized', 'What appeared to be'],
            'ACTION': ['What happened', 'What action was taken', 'How did it happen']
        }

        self.semantic_groups = {
            'colors': ['red', 'blue', 'green', 'yellow', 'purple', 'orange', 'black', 'white', 'brown', 'pink', 'gray', 'violet', 'indigo', 'cyan'],
            'animals': ['elephant', 'tiger', 'lion', 'bear', 'wolf', 'fox', 'rabbit', 'deer', 'monkey', 'zebra', 'giraffe', 'panda', 'leopard', 'kangaroo'],
            'birds': ['sparrow', 'eagle', 'owl', 'parrot', 'pigeon', 'peacock', 'crow', 'hawk', 'swan'],
            'plants': ['oak', 'pine', 'maple', 'birch', 'cedar', 'palm', 'willow', 'bamboo', 'fern', 'rose', 'cactus', 'orchid', 'tulip'],
            'body_parts': ['head', 'arm', 'leg', 'hand', 'foot', 'eye', 'ear', 'nose', 'mouth', 'finger', 'toe', 'knee', 'shoulder'],
            'food': ['apple', 'banana', 'orange', 'bread', 'rice', 'meat', 'fish', 'milk', 'cheese', 'egg', 'butter', 'pizza', 'pasta', 'burger'],
            'weather': ['sunny', 'rainy', 'cloudy', 'windy', 'stormy', 'snowy', 'foggy', 'humid', 'dry', 'cold', 'hot', 'chilly'],
            'emotions': ['happy', 'sad', 'angry', 'excited', 'worried', 'calm', 'surprised', 'afraid', 'proud', 'confused', 'anxious', 'bored'],
            'materials': ['wood', 'metal', 'plastic', 'glass', 'stone', 'paper', 'fabric', 'rubber', 'leather', 'ceramic', 'cotton', 'silk'],
            'vehicles': ['car', 'bus', 'train', 'plane', 'boat', 'bicycle', 'motorcycle', 'truck', 'helicopter', 'ship', 'scooter', 'submarine'],
            'buildings': ['house', 'school', 'hospital', 'church', 'store', 'factory', 'office', 'library', 'museum', 'theater', 'stadium'],
            'occupations': ['doctor', 'teacher', 'engineer', 'scientist', 'farmer', 'artist', 'actor', 'nurse', 'police', 'firefighter', 'pilot'],
            'technologies': ['internet', 'computer', 'robot', 'satellite', 'AI', 'blockchain', 'smartphone', 'drone'],
            'sports': ['football', 'cricket', 'tennis', 'basketball', 'volleyball', 'hockey', 'badminton', 'golf'],
            'countries': ['India', 'USA', 'France', 'Germany', 'China', 'Japan', 'Brazil', 'Canada', 'Australia', 'Italy'],
            'instruments': ['guitar', 'piano', 'violin', 'drums', 'flute', 'trumpet', 'saxophone'],
            'clothing': ['shirt', 'jeans', 'dress', 'skirt', 'jacket', 'hat', 'socks', 'scarf', 'gloves'],
            'professions': ['lawyer', 'dentist', 'architect', 'chef', 'writer', 'photographer'],
            'programming_languages': ['Python', 'Java', 'C++', 'JavaScript', 'Ruby', 'Go', 'Swift'],
        }

    def extract_facts(self, text):
        """Return answer-candidate dicts, using spaCy when available, else NLTK."""
        if self.nlp:
            return self.extract_with_spacy(text)
        else:
            return self.extract_with_nltk(text)

    def extract_with_spacy(self, text):
        """Extract entity and noun-chunk facts from ``text`` with spaCy.

        Each fact is a dict with keys ``answer``, ``sentence``, ``type`` and
        ``context``. Only PERSON/ORG/GPE/DATE/TIME entities are kept; noun
        chunks are recorded with type ``'OBJECT'``.
        """
        facts = []
        doc = self.nlp(text)
        for sent in doc.sents:
            sentence = sent.text.strip()
            # The context only depends on the sentence, so compute it once
            # instead of once per fact.
            context = self.get_sentence_context(text, sent.text, 2)
            sent_doc = self.nlp(sent.text)
            for ent in sent_doc.ents:
                if len(ent.text.strip()) > 1 and ent.label_ in ['PERSON', 'ORG', 'GPE', 'DATE', 'TIME']:
                    facts.append({
                        'answer': ent.text.strip(),
                        'sentence': sentence,
                        'type': ent.label_,
                        'context': context
                    })
            for chunk in sent_doc.noun_chunks:
                if len(chunk.text.strip()) > 2 and chunk.text.lower() not in ['the', 'a', 'an', 'this', 'that']:
                    facts.append({
                        'answer': chunk.text.strip(),
                        'sentence': sentence,
                        'type': 'OBJECT',
                        'context': context
                    })
        return facts

    def extract_with_nltk(self, text):
        """Extract entity and noun facts from ``text`` with NLTK.

        Chunker labels are mapped onto the spaCy-style labels used by
        ``question_starters`` and ``get_type_alternatives`` (for example
        ``ORGANIZATION`` becomes ``ORG``). At most three nouns per sentence
        are added as ``'OBJECT'`` facts. A sentence that fails to process is
        reported and skipped.
        """
        facts = []
        sentences = sent_tokenize(text)
        for sentence in sentences:
            try:
                tokens = word_tokenize(sentence)
                tagged = pos_tag(tokens)
                entities = ne_chunk(tagged, binary=False)
                context = self.get_sentence_context(text, sentence, 2)
                for chunk in entities:
                    if isinstance(chunk, Tree):
                        entity_text = ' '.join([token for token, pos in chunk.leaves()])
                        label = chunk.label()
                        entity_type = self._NLTK_LABEL_ALIASES.get(label, label)
                        if len(entity_text) > 1:
                            facts.append({
                                'answer': entity_text,
                                'sentence': sentence,
                                'type': entity_type,
                                'context': context
                            })
                nouns = [word for word, pos in tagged if pos in ['NN', 'NNP', 'NNS', 'NNPS']
                         and len(word) > 2 and word.lower() not in ['the', 'and', 'for', 'are', 'was']]
                for noun in nouns[:3]:
                    facts.append({
                        'answer': noun,
                        'sentence': sentence,
                        'type': 'OBJECT',
                        'context': context
                    })
            except Exception as e:
                print(f"Error processing sentence: {e}")
                continue
        return facts

    def _split_sentences(self, text):
        """Sentence-tokenise ``text``, reusing the result for repeated calls."""
        cached = getattr(self, '_sentence_cache', None)
        if cached is None or cached[0] != text:
            cached = (text, sent_tokenize(text))
            self._sentence_cache = cached
        return cached[1]

    def get_sentence_context(self, text, target_sentence, window_size=2):
        """Return ``target_sentence`` joined with up to ``window_size`` neighbours per side.

        Falls back to ``target_sentence`` itself when it is not among the
        sentences produced by NLTK's tokenizer for ``text``.
        """
        sentences = self._split_sentences(text)
        try:
            # Callers may pass a sentence with surrounding whitespace; strip
            # it so the lookup is not defeated by a stray newline.
            target_idx = sentences.index(target_sentence.strip())
        except ValueError:
            return target_sentence
        start = max(0, target_idx - window_size)
        end = min(len(sentences), target_idx + window_size + 1)
        return ' '.join(sentences[start:end])

    def find_wordnet_alternatives(self, word):
        """Collect WordNet-related words for every whitespace-separated part of ``word``.

        For each synset of each part this gathers synonyms, antonyms,
        hypernyms, the first five hyponyms, part-meronyms and part-holonyms.
        Results are title-cased, longer than two characters and differ from
        the part itself.
        """
        alternatives = set()

        for part in word.lower().split():
            for syn in wordnet.synsets(part):
                own_lemmas = list(syn.lemmas())
                lemmas = list(own_lemmas)
                for lemma in own_lemmas:
                    lemmas.extend(lemma.antonyms())

                related_synsets = (
                    list(syn.hypernyms())
                    + list(syn.hyponyms()[:5])
                    + list(syn.part_meronyms())
                    + list(syn.part_holonyms())
                )
                for related in related_synsets:
                    lemmas.extend(related.lemmas())

                for lemma in lemmas:
                    candidate = lemma.name().replace("_", " ").title()
                    if candidate.lower() != part and len(candidate) > 2:
                        alternatives.add(candidate)

        return alternatives

    @staticmethod
    def _matches_item(token, item):
        """True when ``token`` is ``item`` or its regular plural (both lower-case)."""
        return token == item or token == item + 's' or token == item + 'es'

    def find_category_alternatives(self, word):
        """Return up to five other members of the first semantic group ``word`` belongs to.

        Matching is done on whole, lower-cased tokens of ``word``, so
        capitalised entries such as ``India`` or ``AI`` are found and ``ear``
        no longer matches inside ``year``. Entries that already carry capitals
        keep their spelling; lower-case entries are title-cased.
        """
        alternatives = set()
        tokens = set(re.findall(r"[\w+#]+", word.lower()))

        for items in self.semantic_groups.values():
            matched = {
                item for item in items
                if any(self._matches_item(token, item.lower()) for token in tokens)
            }
            if matched:
                pool = [
                    item if item != item.lower() else item.title()
                    for item in items if item not in matched
                ]
                alternatives.update(random.sample(pool, min(5, len(pool))))
                break

        return alternatives

    def rank_by_similarity(self, target, candidates):
        """Order ``candidates`` by spaCy similarity to ``target``, most similar first.

        Returns the candidates as a list in their incoming order when no
        pipeline is available, the input is empty, or scoring fails.
        """
        if not self.nlp or not candidates:
            return list(candidates)

        try:
            target_doc = self.nlp(target)
            ranked = [
                (candidate, target_doc.similarity(self.nlp(candidate)))
                for candidate in candidates
            ]
            ranked.sort(key=lambda pair: pair[1], reverse=True)
            return [candidate for candidate, _ in ranked]
        except Exception:
            # Ranking is a nicety; fall back to the unranked candidates.
            return list(candidates)

    def create_distractors(self, answer, sentence, full_context, answer_type):
        """Return up to 15 ranked wrong options for ``answer``.

        ``sentence`` is accepted for interface compatibility but not used.
        """
        distractors = set()

        # WordNet and hand-written category alternatives
        distractors.update(self.find_wordnet_alternatives(answer))
        distractors.update(self.find_category_alternatives(answer))

        if self.nlp:
            try:
                doc = self.nlp(full_context)
                if answer_type in ('PERSON', 'ORG', 'GPE'):
                    # Other entities of the same kind from the context
                    distractors.update(ent.text for ent in doc.ents
                                       if ent.label_ == answer_type and ent.text != answer)
                else:
                    distractors.update(chunk.text for chunk in doc.noun_chunks
                                       if chunk.text != answer and len(chunk.text) > 2)
            except Exception as e:
                print(f"SpaCy error: {e}")

        # Nouns from the context and per-type defaults
        distractors.update(self.get_basic_alternatives(answer, full_context))
        distractors.update(self.get_type_alternatives(answer_type))

        cleaned = self.clean_distractors(distractors, answer)
        ranked = self.rank_by_similarity(answer, list(cleaned))

        return ranked[:15]

    def get_basic_alternatives(self, answer, context):
        """Return title-cased nouns from ``context`` (NLTK tags) other than ``answer``."""
        alternatives = set()
        try:
            tokens = word_tokenize(context)
            tagged = pos_tag(tokens)
            for word, pos in tagged:
                if pos.startswith("NN") and word.lower() != answer.lower() and len(word) > 2:
                    alternatives.add(word.title())
        except Exception:
            # Best-effort source (e.g. NLTK data missing); the other
            # distractor sources still apply.
            pass
        return alternatives

    def get_type_alternatives(self, answer_type):
        """Return the stock distractors for ``answer_type`` (empty set if unknown)."""
        type_alts = {
            'PERSON': ['Alexander Smith', 'Maria Garcia', 'James Wilson', 'Emma Davis', 'John Clark', 'Olivia Miller'],
            'GPE': ['Chicago', 'Berlin', 'Tokyo', 'Nairobi', 'Paris', 'Mumbai', 'Toronto', 'Beijing'],
            'ORG': ['World Health Organization', 'NASA', 'Google', 'UNICEF', 'Harvard University', 'Greenpeace'],
            'LOC': ['Himalayas', 'Amazon Rainforest', 'Pacific Ocean', 'Sahara Desert', 'Grand Canyon'],
            'DATE': ['2019', '2022', 'July 4', 'October 10', 'last month', 'next year', 'two years ago'],
            'TIME': ['10 AM', 'midnight', 'noon', '3 PM', 'evening', 'early morning'],
            'EVENT': ['World War II', 'Olympics', 'COVID-19 Pandemic', 'French Revolution', 'Earthquake in Haiti'],
            'PRODUCT': ['iPhone', 'Tesla Model 3', 'Samsung TV', 'Kindle', 'PlayStation 5'],
            'LANGUAGE': ['French', 'Spanish', 'Hindi', 'Mandarin', 'German', 'Arabic'],
            'LAW': ['Constitution Act', 'Civil Rights Act', 'GDPR', 'Right to Education', 'Labor Law'],
            'WORK_OF_ART': ['Mona Lisa', 'The Starry Night', 'Hamlet', 'The Thinker', 'The Last Supper'],
            'MONEY': ['$100', '₹5000', '€50', '£20', '200 yen'],
            'PERCENT': ['10%', '50%', '75%', '33%'],
            'QUANTITY': ['3 liters', '100 grams', '20 units', '5 miles'],
            'ORDINAL': ['first', 'second', 'third', 'fourth', 'fifth'],
            'CARDINAL': ['one', 'two', 'three', 'four', 'five', 'six'],
            'FAC': ['Golden Gate Bridge', 'Eiffel Tower', 'Empire State Building', 'Burj Khalifa'],
            'OBJECT': ['tablet', 'backpack', 'water bottle', 'notebook', 'camera'],
        }
        return set(type_alts.get(answer_type, []))

    def _is_presentable(self, text):
        """True for short options made of letters, digits and a little punctuation."""
        if len(text) > 50 or not any(ch.isalnum() for ch in text):
            return False
        return all(ch.isalnum() or ch in self._DISTRACTOR_PUNCTUATION for ch in text)

    def clean_distractors(self, distractors, answer):
        """Filter raw distractors down to options that can be shown next to ``answer``.

        Drops blanks, anything equal to, containing or contained in the answer
        (case-insensitively), anything sharing more than half of its words
        with the answer, and strings with unexpected characters. Options that
        differ only by case are collapsed to a single spelling.
        """
        kept = {}
        answer_lower = answer.lower()
        answer_words = set(answer_lower.split())

        for dist in distractors:
            if not dist or len(dist.strip()) < 2:
                continue

            dist_clean = dist.strip()
            dist_lower = dist_clean.lower()
            dist_words = set(dist_lower.split())

            if dist_lower == answer_lower:
                continue

            if dist_lower in answer_lower or answer_lower in dist_lower:
                continue

            overlap_ratio = len(answer_words.intersection(dist_words)) / max(len(answer_words), len(dist_words))
            if overlap_ratio > 0.5:
                continue

            if not self._is_presentable(dist_clean):
                continue

            # One option per case-insensitive spelling; pick the
            # lexicographically smallest so the choice is deterministic.
            previous = kept.get(dist_lower)
            if previous is None or dist_clean < previous:
                kept[dist_lower] = dist_clean

        return set(kept.values())

    def build_question(self, fact):
        """Turn ``fact`` into a question string with the answer blanked out.

        Whole-word occurrences of the answer are replaced first, so the answer
        is not leaked through a longer word such as ``Indian``. Plain
        substring replacement is used only when no whole-word occurrence
        exists. The sentence's closing punctuation is dropped before the
        final ``?`` is appended.
        """
        answer = fact['answer']
        sentence = fact['sentence']
        answer_type = fact['type']
        starters = self.question_starters.get(answer_type, ['What'])
        starter = random.choice(starters)

        if answer and answer in sentence:
            pattern = r'(?<!\w)' + re.escape(answer) + r'(?!\w)'
            blanked, hits = re.subn(pattern, "_____", sentence)
            if not hits:
                blanked = sentence.replace(answer, "_____")
            # Trim after blanking so an answer ending in "." is still found.
            blanked = blanked.strip().rstrip('.!?').rstrip()
            question = f"{starter} fills in the blank: {blanked}?"
        else:
            stem = sentence.strip().rstrip('.!?').rstrip()
            question = f"{starter} is mentioned in: {stem}?"

        return re.sub(r'\s+', ' ', question).strip()

    def generate_questions(self, text, num_questions=5):
        """Generate up to ``num_questions`` MCQ dicts from ``text``.

        Each result has keys ``question``, ``options`` (the answer plus three
        distractors, shuffled), ``answer`` and ``correct_index``. Facts are
        tried in priority order: PERSON/ORG/GPE first, then longer answers. A
        fact is skipped when its answer was already used or fewer than three
        distractors are found. Returns an empty list when no facts are
        extracted.
        """
        print("Extracting information from text...")
        facts = self.extract_facts(text)
        if not facts:
            print("No suitable content found for questions.")
            return []

        facts.sort(key=lambda x: (
            1 if x['type'] in ['PERSON', 'ORG', 'GPE'] else 0,
            len(x['answer']),
            -x['sentence'].count(x['answer'])
        ), reverse=True)

        questions = []
        used_answers = set()

        print(f"Creating {num_questions} questions...")
        for fact in facts:
            if len(questions) >= num_questions:
                break

            answer = fact['answer']
            if answer.lower() in used_answers:
                continue

            distractors = self.create_distractors(
                answer, fact['sentence'], fact['context'], fact['type']
            )

            if len(distractors) < 3:
                print(f"Insufficient options for '{answer}' (found {len(distractors)})")
                continue

            question_text = self.build_question(fact)
            options = [answer] + distractors[:3]
            random.shuffle(options)

            questions.append({
                "question": question_text,
                "options": options,
                "answer": answer,
                "correct_index": options.index(answer)
            })

            used_answers.add(answer.lower())
            print(f"Created question for '{answer}'")

        return questions

# Sample Text

In [25]:
# Built-in passage used by main() when the user picks the sample option or
# enters no text of their own.
SAMPLE_TEXT = """Mahatma Gandhi, whose full name was Mohandas Karamchand Gandhi, was one of the most influential leaders in India’s struggle for independence. He was born on October 2, 1869, in Porbandar, a coastal town in present-day Gujarat, India. Gandhi studied law in London and later went to South Africa, where he faced racial discrimination. His experiences there deeply shaped his philosophy of nonviolent resistance, known as Satyagraha.

Returning to India in 1915, Gandhi quickly became a prominent figure in the Indian National Congress. He led several mass movements against British rule, such as the Non-Cooperation Movement in 1920, the Salt March in 1930, and the Quit India Movement in 1942. These campaigns were characterized by peaceful protests, boycotts of British goods, and civil disobedience.

The Salt March, also known as the Dandi March, was a major act of defiance against British salt laws. Gandhi walked over 240 miles from Sabarmati Ashram to the coastal village of Dandi in Gujarat, where he made salt from seawater, inspiring millions to do the same. This nonviolent act became a symbol of Indian resistance.

Throughout his life, Gandhi emphasized the values of truth, nonviolence, and simplicity. He believed in self-reliance and promoted the use of khadi, a hand-spun cloth, as a way to boycott British textiles. He also worked to eradicate untouchability and improve the status of marginalized communities in India.

Gandhi's impact was not limited to India. His methods inspired global leaders like Martin Luther King Jr. in the United States and Nelson Mandela in South Africa. Both of them used Gandhian principles in their own fights against oppression.

India finally gained independence on August 15, 1947. However, the country was also partitioned into India and Pakistan, leading to widespread violence. Gandhi worked tirelessly to stop the communal riots and promote peace between Hindus and Muslims. On January 30, 1948, Mahatma Gandhi was assassinated in New Delhi by Nathuram Godse, a Hindu nationalist who opposed his efforts for harmony.

Today, Gandhi is remembered as the 'Father of the Nation' in India. His birthday, October 2, is celebrated as Gandhi Jayanti and is also observed as the International Day of Non-Violence. Statues of Gandhi can be found across the world, and his teachings continue to inspire people seeking justice and equality.
"""

# MCQ GENERATOR INTERFACE

In [27]:
from IPython.display import display, HTML, clear_output

def create_mcqs(text, num_questions=5):
    """Prepare the NLP resources and return MCQs generated from ``text``.

    Downloads the NLTK data, loads the spaCy model, builds a TextProcessor
    around it and returns what its ``generate_questions`` produces for
    ``text`` and ``num_questions``.
    """
    install_nltk_data()
    return TextProcessor(load_spacy_model()).generate_questions(text, num_questions)

def main():
    """Interactive entry point: collect text, generate MCQs and render them as HTML.

    Asks whether to use the sample text or pasted text (finished by an empty
    line or end-of-input), asks how many questions to create (default 5),
    then displays one HTML card per question with the correct option marked.
    """
    # Local import: only this renderer needs HTML escaping.
    from html import escape

    display(HTML("""
    <style>
    .mcq-container { background: linear-gradient(135deg, #2c3e50 0%, #34495e 100%); padding: 30px; border-radius: 15px; margin: 20px 0; }
    .mcq-title { color: white; font-size: 28px; font-weight: bold; text-align: center; margin-bottom: 20px; text-shadow: 2px 2px 4px rgba(0,0,0,0.3); }
    .question-card { background: white; border-radius: 15px; padding: 25px; margin: 15px 0; box-shadow: 0 8px 25px rgba(0,0,0,0.1); border-left: 5px solid #34495e; }
    .question-text { font-size: 18px; font-weight: 600; color: #2c3e50; margin-bottom: 15px; }
    .option { font-size: 16px; padding: 8px 0; color: #34495e; }
    .correct { color: #27ae60; font-weight: bold; }
    </style>
    <div class="mcq-container">
        <div class="mcq-title">Multiple Choice Question Generator</div>
    </div>
    """))

    print("Choose input method:")
    print("1. Use sample text")
    print("2. Enter your own custom text")
    choice = input("Enter choice (1 or 2): ").strip()

    if choice == '2':
        print("\nPaste your text (end with Ctrl+D or an empty line):")
        user_lines = []
        try:
            while True:
                line = input()
                if not line.strip():
                    break
                user_lines.append(line)
        except EOFError:
            # Ctrl+D is an advertised way to finish: keep what was entered.
            pass
        # Joined outside the try so `text` is always bound, even on EOF.
        text = "\n".join(user_lines).strip()
        if not text:
            print("No text entered. Using sample text.")
            text = SAMPLE_TEXT
    else:
        text = SAMPLE_TEXT
        print("\nUsing default sample text.\n")

    try:
        num_questions = int(input("How many questions to generate? (default 5): ").strip() or "5")
    except (ValueError, EOFError):
        # Non-numeric reply or closed input: use the documented default.
        num_questions = 5

    print("\nGenerating questions...\n")
    mcqs = create_mcqs(text, num_questions)

    if not mcqs:
        print("Unable to generate questions. Try longer or more informative text.")
        return

    print(f"Generated {len(mcqs)} questions:\n")

    for idx, mcq in enumerate(mcqs, 1):
        option_rows = []
        for i, option in enumerate(mcq['options'], 1):
            is_correct = option == mcq["answer"]
            css_class = "option correct" if is_correct else "option"
            marker = " [CORRECT]" if is_correct else ""
            # Option and question text derive from user input, so escape them
            # before embedding in markup. chr(64 + i) yields A, B, C, ...
            option_rows.append(
                f'<div class="{css_class}">{chr(64 + i)}. {escape(str(option))}{marker}</div>'
            )
        question_html = escape(str(mcq['question']))
        display(HTML(f"""
        <div class="question-card">
            <div class="question-text">Question {idx}: {question_html}</div>
            {''.join(option_rows)}
        </div>
        """))

# Start the interactive generator only when this code is executed directly
# (module name "__main__"), not when it is imported.
if __name__ == "__main__":
    main()


Choose input method:
1. Use sample text
2. Enter your own custom text
Enter choice (1 or 2): 2

Paste your text (end with Ctrl+D or an empty line):
In 1969, Neil Armstrong became the first human to walk on the Moon as part of NASA's Apollo 11 mission. The historic event was broadcast to millions of people across the world and marked a major milestone in space exploration. Buzz Aldrin, his fellow astronaut, also walked on the lunar surface. The mission launched from Kennedy Space Center in Florida and returned safely to Earth after eight days.  NASA, the United States’ national space agency, has conducted numerous missions since then, including the Mars Rover missions and the Hubble Space Telescope project. In recent years, private companies like SpaceX, founded by Elon Musk, have begun to play a major role in space exploration, launching rockets and resupplying the International Space Station.  Meanwhile, climate change has become a pressing global issue. In 2015, 195 countries signed 

In [None]:
# WARNING: sends signal 9 to the current Python process, i.e. running this
# cell terminates the kernel that executes it.
# NOTE(review): presumably kept as a manual "restart runtime" helper after
# installing packages — confirm, and keep it out of a "Run All" pass.
import os
os.kill(os.getpid(), 9)