In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import spacy
import random

# Load spaCy model with word vectors
try:
    nlp = spacy.load("en_core_web_md")  # Use "en_core_web_md" or "en_core_web_lg" for word vectors
except OSError as exc:
    # spacy.load raises OSError when the pipeline package cannot be found.
    # Re-raise the same exception type with the install command so a fresh
    # environment fails with an actionable message.
    raise OSError(
        "spaCy model 'en_core_web_md' is not installed. "
        "Run `python -m spacy download en_core_web_md` and then re-run this cell."
    ) from exc

# Function to preprocess text
def preprocess_text(text):
    """Split raw text into a list of clean sentence strings.

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    sentence spans from ``doc.sents`` are returned as plain strings.

    Args:
        text: Raw input text; may contain line breaks.

    Returns:
        list[str]: One string per sentence, with surrounding whitespace
        removed. Sentences that are empty after stripping are omitted.
    """
    doc = nlp(text)
    # A sentence span can end with the line break / trailing spaces of the
    # source text; strip them so question stems built from these strings do
    # not carry stray newlines.
    stripped = (sent.text.strip() for sent in doc.sents)
    return [sentence for sentence in stripped if sentence]

# Function to create training data for LSTM
def create_training_data(sentences, tokenizer, max_length):
    """Encode sentences as fixed-length integer sequences.

    Args:
        sentences: Iterable of sentence strings to encode.
        tokenizer: Object exposing ``texts_to_sequences`` (a fitted Keras
            ``Tokenizer`` in this notebook).
        max_length: Target length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` applied to the encoded sentences,
        padded at the end (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(sentences),
        maxlen=max_length,
        padding='post',
    )

# LSTM Model for learning sentence structures
def build_lstm_model(vocab_size, max_length, embedding_dim):
    """Assemble and compile the stacked-LSTM network.

    Layers, in order: an embedding layer, a 128-unit LSTM that returns the
    full sequence, dropout at rate 0.2, a 64-unit LSTM, a 64-unit ReLU dense
    layer, and a softmax dense layer with ``vocab_size`` outputs.

    Args:
        vocab_size: Size of the embedding input and of the softmax output.
        max_length: Passed to the embedding layer as ``input_length``.
        embedding_dim: Width of the embedding vectors.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric). The model is not trained here.
    """
    network = Sequential()

    # Recurrent encoder: token ids -> embeddings -> two stacked LSTMs.
    network.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    network.add(LSTM(128, return_sequences=True))
    network.add(Dropout(0.2))
    network.add(LSTM(64))

    # Classification head over the vocabulary.
    network.add(Dense(64, activation='relu'))
    network.add(Dense(vocab_size, activation='softmax'))

    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Function to find similar words using spaCy
def find_similar_words(word, num_similar=3):
    """Return exactly ``num_similar`` distractor words for ``word``.

    Candidates are the alphabetic entries with a vector yielded by iterating
    the module-level ``nlp.vocab``, ranked by ``Lexeme.similarity`` to
    ``word`` (highest first).

    Args:
        word: The target word to find neighbours for.
        num_similar: Number of distractors to return. Values <= 0 yield an
            empty list.

    Returns:
        list[str]: ``num_similar`` strings. Case variants of ``word`` itself
        are excluded and candidates are de-duplicated case-insensitively. If
        ``word`` is unknown, has no vector, or too few candidates exist, the
        remaining slots are filled with the ``"[Distractor]"`` placeholder so
        callers can rely on the length.
    """
    placeholder = "[Distractor]"
    if num_similar <= 0:
        return []

    word_token = nlp.vocab[word] if word in nlp.vocab else None
    if word_token is None or not word_token.has_vector:
        return [placeholder] * num_similar  # Return placeholders if no vector is found

    # Compute similarity with other words in vocab, skipping the target word
    # in any casing so the correct answer never reappears as a distractor.
    target_key = word.lower()
    scored = []
    for token in nlp.vocab:
        if not (token.is_alpha and token.has_vector):
            continue
        if token.text.lower() == target_key:
            continue
        scored.append((token.text, word_token.similarity(token)))

    # Highest similarity first; the sort is stable, so ties keep vocab order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the best-scoring spelling of each word (e.g. only one of
    # "text" / "Text") until enough distractors are collected.
    chosen = []
    seen = set()
    for candidate, _ in scored:
        key = candidate.lower()
        if key in seen:
            continue
        seen.add(key)
        chosen.append(candidate)
        if len(chosen) == num_similar:
            break

    # Pad so the result always has exactly num_similar entries.
    chosen.extend([placeholder] * (num_similar - len(chosen)))
    return chosen

# Function to generate MCQs using LSTM and spaCy word embeddings
def generate_mcqs_lstm(text, tokenizer, max_length, model, num_questions=5):
    """Build fill-in-the-blank multiple-choice questions from ``text``.

    Sentences are visited in random order. For each sentence containing at
    least one token tagged ``NOUN``, one noun is picked at random, every
    token whose text equals that noun is replaced by a blank, and the noun is
    mixed with distractors from ``find_similar_words``.

    Args:
        text: Source text to draw sentences from.
        tokenizer: Unused by the current implementation; kept so existing
            callers keep working.
        max_length: Unused by the current implementation.
        model: Unused by the current implementation.
        num_questions: Maximum number of questions to produce. Values <= 0
            yield an empty list.

    Returns:
        list[tuple[str, list[str], str]]: ``(question_stem, answer_choices,
        correct_answer)`` triples, where ``correct_answer`` is the letter
        ("A", "B", ...) of the correct entry in ``answer_choices``. Fewer
        than ``num_questions`` items are returned only when the text does not
        contain enough sentences with a noun.
    """
    blank = "______"
    if num_questions <= 0:
        return []

    sentences = preprocess_text(text)
    # Shuffle all sentences instead of sampling num_questions up front, so a
    # sentence without nouns does not use up one of the question slots.
    shuffled_sentences = random.sample(sentences, len(sentences))

    mcqs = []
    for sentence in shuffled_sentences:
        if len(mcqs) >= num_questions:
            break

        doc = nlp(sentence)
        nouns = [token.text for token in doc if token.pos_ == "NOUN"]
        if not nouns:
            continue

        subject = random.choice(nouns)
        # Blank whole tokens only. A plain str.replace would also hit the
        # noun when it is a substring of a longer word.
        question_stem = "".join(
            blank + token.whitespace_ if token.text == subject else token.text_with_ws
            for token in doc
        )

        # Generate similar words using spaCy
        similar_words = find_similar_words(subject, num_similar=3)

        answer_choices = [subject] + similar_words
        random.shuffle(answer_choices)
        # 65 == ord("A"): map the position of the correct answer to its letter.
        correct_answer = chr(65 + answer_choices.index(subject))

        mcqs.append((question_stem, answer_choices, correct_answer))

    return mcqs

# Example usage
text = """Deep learning is a subset of machine learning that uses neural networks. LSTMs are useful for processing sequential data like text. 
Natural language processing involves techniques like tokenization and named entity recognition."""

# Tokenizer setup
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocess_text(text))
# +1 presumably because Tokenizer word indices start at 1, leaving 0 for padding.
vocab_size = len(tokenizer.word_index) + 1
max_length = 20

# Build the LSTM model. It is only constructed and compiled here; nothing in
# this cell fits it (training would require a large dataset).
model = build_lstm_model(vocab_size, max_length, embedding_dim=100)

# Generate MCQs
mcqs = generate_mcqs_lstm(text, tokenizer, max_length, model, num_questions=3)
for i, (q, choices, ans) in enumerate(mcqs, 1):
    print(f"Q{i}: {q}")
    # Label the options A, B, C, ... from however many choices came back
    # instead of indexing four fixed positions, which fails on shorter lists.
    options = "  ".join(f"{chr(65 + idx)}) {choice}" for idx, choice in enumerate(choices))
    print(f" {options}")
    print(f"Correct Answer: {ans}\n")





Q1: Natural language processing involves techniques like ______ and named entity recognition.
 A) sequential  B) recognition  C) processing  D) tokenization
Correct Answer: D

Q2: LSTMs are useful for processing sequential ______ like text. 

 A) processing  B) tokenization  C) data  D) networks
Correct Answer: C

Q3: Deep ______ is a subset of machine ______ that uses neural networks.
 A) processing  B) techniques  C) recognition  D) learning
Correct Answer: D



In [2]:
!python -m spacy download en_core_web_md


Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     -- ------------------------------------- 2.6/42.8 MB 18.9 MB/s eta 0:00:03
     ------- -------------------------------- 7.6/42.8 MB 22.4 MB/s eta 0:00:02
     ----------- --------------------------- 12.6/42.8 MB 23.2 MB/s eta 0:00:02
     ---------------- ---------------------- 18.4/42.8 MB 24.6 MB/s eta 0:00:01
     ----------------------- --------------- 26.0/42.8 MB 27.9 MB/s eta 0:00:01
     ----------------------------- --------- 32.5/42.8 MB 28.3 MB/s eta 0:00:01
     --------------------------------- ----- 37.2/42.8 MB 27.2 MB/s eta 0:00:01
     --------------------------------------  42.7/42.8 MB 27.7 MB/s eta 0:00:01
     --------------------------------------- 42.8/42.8 MB 26.7 MB/s eta 0:00:00
[38;5;2m✔ Download and installa