In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
from nltk.chunk import RegexpParser
import random

# Download required NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Sample input text
input_text = """
Google was founded on September 4, 1998, by American computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California. Together, they own about 14% of its publicly listed shares and control 56% of its stockholder voting power through super-voting stock. The company went public via an initial public offering (IPO) in 2004. In 2015, Google was reorganized as a wholly owned subsidiary of Alphabet Inc. Google is Alphabet's largest subsidiary and is a holding company for Alphabet's internet properties and interests. Sundar Pichai was appointed CEO of Google on October 24, 2015, replacing Larry Page, who became the CEO of Alphabet. On December 3, 2019, Pichai also became the CEO of Alphabet.[14]
"""

def extract_keywords(text):
    # Tokenize sentences
    sentences = sent_tokenize(text)
    keywords = set()
    
    # Define a chunk grammar to identify noun phrases
    grammar = r"""
        NP: {<DT>?<JJ>*<NN.*>+}  # Noun phrase
    """
    chunk_parser = RegexpParser(grammar)
    
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        tagged = pos_tag(tokens)
        tree = chunk_parser.parse(tagged)
        
        for subtree in tree.subtrees():
            if subtree.label() == 'NP':
                # Join the leaves to form the noun phrase
                noun_phrase = ' '.join(word for word, tag in subtree.leaves())
                # Filter out short words and common stopwords
                if len(noun_phrase) > 3:
                    keywords.add(noun_phrase.lower())
    
    return list(keywords)

def generate_distractors(keyword):
    distractors = set()
    keyword_synsets = wn.synsets(keyword, pos=wn.NOUN)
    
    if not keyword_synsets:
        return ['algorithm', 'software', 'computer']  # Default distractors if no synsets are found
    
    # Use the first synset
    keyword_synset = keyword_synsets[0]
    
    # Get hypernyms and hyponyms
    hypernyms = keyword_synset.hypernyms()
    hyponyms = keyword_synset.hyponyms()
    
    # Add hypernyms and hyponyms to distractors
    for hypernym in hypernyms:
        for lemma in hypernym.lemmas():
            distractors.add(lemma.name().replace('_', ' '))
    for hyponym in hyponyms:
        for lemma in hyponym.lemmas():
            distractors.add(lemma.name().replace('_', ' '))
    
    # Remove the keyword itself from distractors
    distractors.discard(keyword)
    
    # If not enough distractors, add some defaults
    if len(distractors) < 3:
        default_distractors = ['algorithm', 'software', 'computer']
        distractors.update(default_distractors)
    
    return random.sample(distractors, min(3, len(distractors)))

def generate_mcqs(text):
    # Tokenize sentences
    sentences = sent_tokenize(text)
    
    # Extract keywords
    keywords = extract_keywords(text)
    
    # Generate MCQs
    mcqs = []
    for keyword in keywords:
        for sentence in sentences:
            if keyword in sentence.lower():
                distractors = generate_distractors(keyword)
                question = sentence.replace(keyword, '_____')
                options = [keyword] + distractors
                random.shuffle(options)
                
                mcqs.append({
                    "question": question,
                    "options": options,
                    "correct_answer": keyword
                })
                break  # Move to the next keyword after creating one question
    
    return mcqs

# Generate MCQs
mcqs = generate_mcqs(input_text)

# Print the MCQs in a formatted way
for i, mcq in enumerate(mcqs, 1):
    print(f"\nQuestion {i}:")
    print(mcq['question'])
    print("\nOptions:")
    for j, option in enumerate(['A', 'B', 'C', 'D'][:len(mcq['options'])]):
        print(f"{option}) {mcq['options'][j]}")
    print(f"\nCorrect Answer: {mcq['correct_answer']}")
    print("-" * 80)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\dsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\dsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!



Question 1:

Google was founded on September 4, 1998, by American computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California.

Options:
A) computer
B) algorithm
C) software
D) sergey brin

Correct Answer: sergey brin
--------------------------------------------------------------------------------

Question 2:

Google was founded on September 4, 1998, by American computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California.

Options:
A) phd students
B) computer
C) software
D) algorithm

Correct Answer: phd students
--------------------------------------------------------------------------------

Question 3:
The company went public via _____ (IPO) in 2004.

Options:
A) algorithm
B) computer
C) software
D) an initial public offering

Correct Answer: an initial public offering
--------------------------------------------------------------------------------

Question 4:

Google was fou

since Python 3.9 and will be removed in a subsequent version.
  return random.sample(distractors, min(3, len(distractors)))


In [2]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
from nltk.chunk import RegexpParser
import random

# Download required NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Sample input text
input_text = """
Google was founded on September 4, 1998, by American computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California. Together, they own about 14% of its publicly listed shares and control 56% of its stockholder voting power through super-voting stock. The company went public via an initial public offering (IPO) in 2004. In 2015, Google was reorganized as a wholly owned subsidiary of Alphabet Inc. Google is Alphabet's largest subsidiary and is a holding company for Alphabet's internet properties and interests. Sundar Pichai was appointed CEO of Google on October 24, 2015, replacing Larry Page, who became the CEO of Alphabet. On December 3, 2019, Pichai also became the CEO of Alphabet.
"""

def extract_keywords(text):
    # Tokenize sentences
    sentences = sent_tokenize(text)
    keywords = set()
    
    # Define a chunk grammar to identify noun phrases
    grammar = r"""
        NP: {<DT>?<JJ>*<NN.*>+}  # Noun phrase
    """
    chunk_parser = RegexpParser(grammar)
    
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        tagged = pos_tag(tokens)
        tree = chunk_parser.parse(tagged)
        
        for subtree in tree.subtrees():
            if subtree.label() == 'NP':
                # Join the leaves to form the noun phrase
                noun_phrase = ' '.join(word for word, tag in subtree.leaves())
                # Filter out short words and common stopwords
                if len(noun_phrase) > 3:
                    keywords.add(noun_phrase.lower())
    
    return list(keywords)

def generate_distractors(keyword):
    distractors = set()
    keyword_synsets = wn.synsets(keyword, pos=wn.NOUN)
    
    if not keyword_synsets:
        return ['algorithm', 'software', 'computer']  # Default distractors if no synsets are found
    
    # Use the first synset
    keyword_synset = keyword_synsets[0]
    
    # Get hypernyms and hyponyms
    hypernyms = keyword_synset.hypernyms()
    hyponyms = keyword_synset.hyponyms()
    
    # Add hypernyms and hyponyms to distractors
    for hypernym in hypernyms:
        for lemma in hypernym.lemmas():
            distractors.add(lemma.name().replace('_', ' '))
    for hyponym in hyponyms:
        for lemma in hyponym.lemmas():
            distractors.add(lemma.name().replace('_', ' '))
    
    # Remove the keyword itself from distractors
    distractors.discard(keyword)
    
    # If not enough distractors, add some defaults
    if len(distractors) < 3:
        default_distractors = ['algorithm', 'software', 'computer']
        distractors.update(default_distractors)
    
    return random.sample(distractors, min(3, len(distractors)))

def generate_mcqs(text):
    # Tokenize sentences
    sentences = sent_tokenize(text)
    
    # Extract keywords
    keywords = extract_keywords(text)
    
    # Generate MCQs
    mcqs = []
    for keyword in keywords:
        for sentence in sentences:
            if keyword in sentence.lower():
                distractors = generate_distractors(keyword)
                question = sentence.lower().replace(keyword, '_____')
                options = [keyword] + distractors
                random.shuffle(options)
                
                mcqs.append({
                    "question": question.capitalize(),
                    "options": options,
                    "correct_answer": keyword
                })
                break  # Move to the next keyword after creating one question
    
    return mcqs

# Generate MCQs
mcqs = generate_mcqs(input_text)

# Print the MCQs in a formatted way
for i, mcq in enumerate(mcqs, 1):
    print(f"\nQuestion {i}:")
    print(mcq['question'])
    print("\nOptions:")
    for j, option in enumerate(['A', 'B', 'C', 'D'][:len(mcq['options'])]):
        print(f"{option}) {mcq['options'][j]}")
    print(f"\nCorrect Answer: {mcq['correct_answer']}")
    print("-" * 80)



Question 1:

google was founded on september 4, 1998, by american computer scientists larry page and _____ while they were phd students at stanford university in california.

Options:
A) software
B) algorithm
C) sergey brin
D) computer

Correct Answer: sergey brin
--------------------------------------------------------------------------------

Question 2:

google was founded on september 4, 1998, by american computer scientists larry page and sergey brin while they were _____ at stanford university in california.

Options:
A) algorithm
B) phd students
C) computer
D) software

Correct Answer: phd students
--------------------------------------------------------------------------------

Question 3:
The company went public via _____ (ipo) in 2004.

Options:
A) software
B) an initial public offering
C) computer
D) algorithm

Correct Answer: an initial public offering
--------------------------------------------------------------------------------

Question 4:

google was founded on septe

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\dsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\dsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
since Python 3.9 and will be removed in a subsequent version.
  return random.sample(distractors, min(3, len(distractors)))


In [3]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
from nltk.chunk import RegexpParser
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import random
import re

# Download required NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')

class AdvancedMCQGenerator:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.ne_chunker = nltk.ne_chunk
        
    def extract_named_entities(self, text):
        """Extract named entities using NLTK"""
        sentences = sent_tokenize(text)
        entities = []
        
        for sentence in sentences:
            tokens = word_tokenize(sentence)
            pos_tags = pos_tag(tokens)
            ne_tree = self.ne_chunker(pos_tags)
            
            for subtree in ne_tree:
                if hasattr(subtree, 'label'):
                    entity_text = ' '.join([token for token, pos in subtree.leaves()])
                    entities.append({
                        'text': entity_text,
                        'type': subtree.label(),
                        'context': sentence  # Use full sentence as context
                    })
        return entities

    def generate_smart_distractors(self, concept):
        """Generate contextually relevant distractors"""
        distractors = set()
        
        # Predefined similar entities
        similar_entities = {
            'PERSON': {
                'Larry Page': ['Mark Zuckerberg', 'Bill Gates', 'Steve Jobs'],
                'Sergey Brin': ['Jeff Bezos', 'Elon Musk', 'Tim Cook'],
                'Sundar Pichai': ['Satya Nadella', 'Tim Cook', 'Andy Jassy']
            },
            'ORGANIZATION': {
                'Google': ['Microsoft', 'Apple', 'Meta', 'Amazon'],
                'Alphabet': ['Meta Platforms', 'Microsoft Corporation', 'Apple Inc.'],
                'Stanford University': ['MIT', 'Harvard University', 'Berkeley']
            }
        }
        
        # Try predefined entities first
        entity_type = concept['type']
        if entity_type in similar_entities:
            if concept['text'] in similar_entities[entity_type]:
                distractors.update(similar_entities[entity_type][concept['text']])
        
        # For dates
        if re.search(r'\d{4}', concept['text']):
            year_match = re.search(r'\d{4}', concept['text'])
            year = int(year_match.group())
            nearby_years = [str(year + i) for i in [-1, 1, 2]]
            distractors.update([
                concept['text'].replace(str(year), ny)
                for ny in nearby_years
            ])
        
        # Add WordNet-based distractors
        main_word = concept['text'].split()[-1]
        synsets = wn.synsets(main_word)
        if synsets:
            for synset in synsets[:2]:
                for lemma in synset.lemmas():
                    if lemma.name() != main_word:
                        distractors.add(lemma.name().replace('_', ' '))
        
        # Add generic distractors if needed
        if len(distractors) < 3:
            generic_distractors = {
                'PERSON': ['John Smith', 'Michael Johnson', 'David Brown'],
                'ORGANIZATION': ['Tech Corp', 'Global Systems', 'Digital Solutions'],
                'GPE': ['London', 'Tokyo', 'Paris'],
                'DATE': ['2005', '2010', '2012']
            }
            distractors.update(generic_distractors.get(entity_type, 
                             ['algorithm', 'software', 'computer']))
        
        # Filter and limit distractors
        distractors = [d for d in distractors if d.lower() != concept['text'].lower()]
        return distractors[:3]

    def _safe_split(self, text, delimiter, entity):
        """Safely split text and handle edge cases"""
        parts = text.split(entity)
        if len(parts) < 2:
            return text, ""  # Return original text if split fails
        return parts[0], parts[1]

    def _get_context_parts(self, context, entity_text):
        """Extract before and after parts of context safely"""
        before, after = self._safe_split(context, entity_text)
        # Clean up the parts and handle periods
        after = after.split('.')[0] if '.' in after else after
        return before.strip(), after.strip()

    def generate_question_variants(self, entity, context):
        """Generate different types of questions for the same concept"""
        before_ctx, after_ctx = self._get_context_parts(context, entity['text'])
        
        if not after_ctx:  # If entity is at the end, use before context
            if entity['type'] == 'PERSON':
                return f"Which individual {before_ctx}?"
            elif entity['type'] == 'ORGANIZATION':
                return f"Which company {before_ctx}?"
            else:
                return f"What {entity['type'].lower()} {before_ctx}?"
        
        # Choose question type based on context availability
        if len(before_ctx) > 0 and len(after_ctx) > 0:
            templates = [
                f"Based on the context, which {entity['type'].lower()} {after_ctx}?",
                f"In relation to {before_ctx}, which {entity['type'].lower()} {after_ctx}?",
                f"Who or what {after_ctx} after {before_ctx}?"
            ]
        else:
            templates = [
                f"Which {entity['type'].lower()} is associated with {after_ctx}?",
                f"What {entity['type'].lower()} {after_ctx}?",
                f"Select the correct {entity['type'].lower()} that {after_ctx}."
            ]
        
        return random.choice(templates)

    def generate_mcqs(self, text):
        """Generate advanced MCQs from the input text"""
        entities = self.extract_named_entities(text)
        mcqs = []
        
        for entity in entities:
            distractors = self.generate_smart_distractors(entity)
            if distractors:  # Only create question if we have distractors
                try:
                    question = self.generate_question_variants(entity, entity['context'])
                    options = [entity['text']] + distractors
                    random.shuffle(options)
                    
                    mcqs.append({
                        'question': question,
                        'options': options,
                        'correct_answer': entity['text'],
                        'type': entity['type']
                    })
                except Exception as e:
                    continue  # Skip this entity if question generation fails
        
        return mcqs

# Example usage
if __name__ == "__main__":
    input_text = """
    Google was founded on September 4, 1998, by American computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California. Together, they own about 14% of its publicly listed shares and control 56% of its stockholder voting power through super-voting stock. The company went public via an initial public offering (IPO) in 2004. In 2015, Google was reorganized as a wholly owned subsidiary of Alphabet Inc. Google is Alphabet's largest subsidiary and is a holding company for Alphabet's internet properties and interests. Sundar Pichai was appointed CEO of Google on October 24, 2015, replacing Larry Page, who became the CEO of Alphabet. On December 3, 2019, Pichai also became the CEO of Alphabet.
    """
    
    mcq_generator = AdvancedMCQGenerator()
    mcqs = mcq_generator.generate_mcqs(input_text)
    
    # Print the MCQs in a formatted way
    for i, mcq in enumerate(mcqs, 1):
        print(f"\nQuestion {i}:")
        print(mcq['question'])
        print("\nOptions:")
        for j, option in enumerate(['A', 'B', 'C', 'D'][:len(mcq['options'])]):
            print(f"{option}) {mcq['options'][j]}")
        print(f"\nCorrect Answer: {mcq['correct_answer']}")
        print(f"Question Type: {mcq['type']}")
        print("-" * 80)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\dsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\dsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dsoni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-