In [1]:
"""
Advanced NLP Approach for Automated Knowledge Gap Detection in Student Responses
-------------------------------------------------------------------------------
This script demonstrates:
    - Advanced data pre-processing (tokenization, stopword removal, normalization, lemmatization)
    - Semantic similarity analysis using Sentence-BERT
    - Knowledge gap detection by comparing key concepts
    - Detailed feedback generation for open-ended responses
    - Visualization of similarity scores and missing concepts

Requirements:
    - Python 3.x
    - spaCy (and the English model: en_core_web_sm)
    - nltk
    - sentence-transformers
    - scikit-learn
    - numpy
    - matplotlib
    - seaborn
    - wordcloud

To install required packages, you can run:
    pip install spacy nltk sentence-transformers scikit-learn numpy matplotlib seaborn wordcloud
    python -m spacy download en_core_web_sm
"""

'\nAdvanced NLP Approach for Automated Knowledge Gap Detection in Student Responses\n-------------------------------------------------------------------------------\nThis script demonstrates:\n    - Advanced data pre-processing (tokenization, stopword removal, normalization, lemmatization)\n    - Semantic similarity analysis using Sentence-BERT\n    - Knowledge gap detection by comparing key concepts\n    - Detailed feedback generation for open-ended responses\n    - Visualization of similarity scores and missing concepts\n\nRequirements:\n    - Python 3.x\n    - spaCy (and the English model: en_core_web_sm)\n    - nltk\n    - sentence-transformers\n    - scikit-learn\n    - numpy\n    - matplotlib\n    - seaborn\n    - wordcloud\n\nTo install required packages, you can run:\n    pip install spacy nltk sentence-transformers scikit-learn numpy matplotlib seaborn wordcloud\n    python -m spacy download en_core_web_sm\n'

In [None]:
import re
import spacy
import nltk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Request each NLTK resource this notebook depends on (one download call per
# resource, in this order). 'punkt' / 'punkt_tab' back nltk.word_tokenize.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# English stopword set consulted by preprocess_text when filtering tokens.
STOPWORDS = set(stopwords.words("english"))

In [None]:
# Shared WordNet lemmatizer instance; preprocess_text calls lemmatizer.lemmatize on each kept token.
lemmatizer = WordNetLemmatizer()

In [None]:
# spaCy pipeline for the en_core_web_sm English model; extract_key_concepts uses it
# for noun chunks, part-of-speech tags and lemmas.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Sentence-BERT encoder ('all-MiniLM-L6-v2') used to embed responses and reference
# answers before computing cosine similarity.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Open-ended prompts. Position i here is intended to line up with
# reference_answers[i] and student_responses[i].
questions = [
    "Explain the process of photosynthesis.",
    "Describe the significance of the water cycle in Earth's ecosystem.",
    "Discuss the impact of the Industrial Revolution on modern society."
]

In [None]:
# Model answers, one per question. They are compared position-by-position with
# student_responses (same index = same question).
reference_answers = [
    "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods from carbon dioxide and water. It involves the chlorophyll in leaves and generates oxygen as a byproduct.",
    "The water cycle is crucial as it distributes fresh water across the globe, supporting life. It involves processes like evaporation, condensation, precipitation, and infiltration, maintaining ecological balance.",
    "The Industrial Revolution marked a major turning point in history; it led to advancements in technology, manufacturing, and transportation, significantly influencing modern society's economic and social structures."
]

In [None]:
# Student answers to evaluate, one per question. Index i is scored against
# reference_answers[i]; generate_feedback addresses them by this index.
student_responses = [
    "Plants use sunlight to make food from carbon dioxide and water, releasing oxygen.",
    "Water evaporates, forms clouds, and comes back as rain, which is important for life.",
    "The Industrial Revolution changed how things were made and had effects on today."
]


In [None]:
def preprocess_text(text):
    """
    Normalize raw text into a space-separated string of lemmatized content words.

    Steps: lowercase the text, replace every character other than a-z or
    whitespace with a space, tokenize with NLTK, drop tokens found in
    STOPWORDS, and lemmatize the remaining tokens.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: The kept, lemmatized tokens joined by single spaces (an empty
        string when no token survives).
    """
    text = text.lower()
    # Replace (rather than delete) non-alphabetic characters so punctuation
    # acts as a word boundary: "society's" -> "society s" and
    # "well-known" -> "well known", instead of the fused tokens "societys"
    # and "wellknown" that neither the stopword filter nor the lemmatizer can
    # recognise. A stray "s" left by a possessive is expected to be removed by
    # the stopword filter below.
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in STOPWORDS]
    return " ".join(tokens)

In [None]:
# Clean each corpus with preprocess_text; list order (and therefore the
# response/reference pairing by index) is preserved.
preprocessed_responses = list(map(preprocess_text, student_responses))
preprocessed_references = list(map(preprocess_text, reference_answers))
preprocessed_questions = list(map(preprocess_text, questions))

In [None]:
# Semantic Similarity Analysis
# Embed the preprocessed responses first, then the preprocessed reference answers.
# NOTE(review): the encoder receives stopword-stripped text here; confirm whether
# encoding the raw sentences would be preferable for this model.
response_embeddings, reference_embeddings = (
    sbert_model.encode(corpus)
    for corpus in (preprocessed_responses, preprocessed_references)
)

In [None]:
# Cosine similarity between each response embedding and the reference
# embedding at the same position, stored as plain Python floats.
similarity_scores = [
    util.cos_sim(resp_emb, ref_emb).item()
    for resp_emb, ref_emb in zip(response_embeddings, reference_embeddings)
]

In [None]:
# Report one line per response, numbered from 1, with the score to 4 decimals.
print("\nSemantic Similarity Scores:")
score_lines = [
    f"Response {idx+1} Similarity: {score:.4f}"
    for idx, score in enumerate(similarity_scores)
]
for line in score_lines:
    print(line)

In [None]:
# Knowledge Gap Detection
# --------------------------
def extract_key_concepts(text):
    """
    Collect the key concepts of `text` as a set of lowercase lemma strings.

    A concept is either the lemma of a noun chunk found by the spaCy pipeline
    or the lemma of an individual token tagged NOUN or PROPN.
    """
    parsed = nlp(text)
    chunk_lemmas = {span.lemma_.lower() for span in parsed.noun_chunks}
    noun_lemmas = {
        tok.lemma_.lower() for tok in parsed if tok.pos_ in ('NOUN', 'PROPN')
    }
    return chunk_lemmas | noun_lemmas

In [None]:
# Identify missing concepts in student responses
def identify_missing_concepts(response_text, reference_text):
    """
    Return the set of concepts extracted from `reference_text` that are not
    among the concepts extracted from `response_text`.
    """
    found = extract_key_concepts(response_text)
    expected = extract_key_concepts(reference_text)
    return {concept for concept in expected if concept not in found}

In [None]:
# Detailed Feedback Generation
# --------------------------
def generate_feedback(response_idx, threshold=0.75):
    """
    Build a plain-text feedback message for one student response.

    Args:
        response_idx (int): Index of the response in the module-level lists
            (student_responses, similarity_scores, preprocessed_responses,
            preprocessed_references).
        threshold (float): Similarity score at or above which the response is
            considered to cover the key concepts.

    Returns:
        str: The student's response and its similarity score, followed by
        praise (score >= threshold), an alphabetically sorted list of missing
        key concepts, or a generic prompt to elaborate when no missing concept
        was detected.
    """
    sim_score = similarity_scores[response_idx]
    student_text = student_responses[response_idx]

    feedback = f"Your Response:\n{student_text}\n"
    feedback += f"\nSimilarity Score: {sim_score:.4f}\n"
    if sim_score >= threshold:
        feedback += "Great work! Your response covers the key concepts.\n"
        return feedback

    # NOTE(review): concepts are extracted from the preprocessed (stopword-stripped)
    # text; confirm the spaCy parser gives sensible noun chunks on such input.
    missing_concepts = identify_missing_concepts(
        preprocessed_responses[response_idx],
        preprocessed_references[response_idx],
    )
    if missing_concepts:
        feedback += "Your response is missing the following key concepts:\n"
        # Sort before joining: set iteration order for strings can differ
        # between interpreter sessions, which made the feedback text
        # non-reproducible across kernel restarts.
        feedback += ", ".join(sorted(missing_concepts)) + "\n"
    else:
        feedback += "Your response could be elaborated further to include more details.\n"
    return feedback

In [None]:
# Print the feedback message for every student response, numbered from 1.
print("\nDetailed Feedback:")
for idx, _ in enumerate(student_responses):
    print(f"\nFeedback for Response {idx+1}:\n{generate_feedback(idx)}")

In [None]:
# Visualization of Similarity Scores and Missing Concepts
# --------------------------
def plot_similarity_scores(similarity_scores, threshold=0.75):
    """
    Draw a bar chart of per-response similarity scores with a threshold line.

    Args:
        similarity_scores: Sequence of scores, one per response, in response
            order; bars are labelled "Response 1", "Response 2", ...
        threshold (float): Height of the dashed red reference line. Defaults
            to 0.75, the same default generate_feedback uses, so the chart and
            the textual feedback agree unless a caller overrides one of them.
    """
    labels = [f'Response {i+1}' for i in range(len(similarity_scores))]
    fig, ax = plt.subplots(figsize=(8, 6))
    # Map the labels to `hue` as well as `x`: seaborn 0.13 deprecates passing
    # `palette` without `hue`. dodge=False keeps one full-width bar per label.
    sns.barplot(x=labels, y=similarity_scores, hue=labels,
                palette='viridis', dodge=False, ax=ax)
    # The hue legend would only repeat the x tick labels, so drop it if seaborn drew one.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    ax.axhline(y=threshold, color='red', linestyle='--')
    ax.set_xlabel('Student Responses')
    ax.set_ylabel('Similarity Score')
    ax.set_title('Semantic Similarity Scores of Student Responses')
    ax.set_ylim(0, 1)
    plt.show()

plot_similarity_scores(similarity_scores)

In [None]:
# Visualize Missing Concepts
def visualize_missing_concepts(response_idx):
    """
    Show a word cloud of the key concepts missing from one student response.

    Args:
        response_idx (int): Index of the response in preprocessed_responses /
            preprocessed_references.

    Nothing is drawn when no missing concept is detected.
    """
    # Imported here so the rest of the notebook does not require the wordcloud package.
    from wordcloud import WordCloud

    missing_concepts = identify_missing_concepts(preprocessed_responses[response_idx], preprocessed_references[response_idx])
    # Build an explicit concept -> weight mapping instead of joining the
    # concepts into one string: joining split multi-word concepts into separate
    # words and counted a word once per concept containing it. Iterating in
    # sorted order keeps the input to WordCloud stable across runs.
    frequencies = {concept: 1 for concept in sorted(missing_concepts) if concept.strip()}
    if not frequencies:
        return

    # Fixed random_state so the layout is the same on every run.
    wordcloud = WordCloud(
        width=800, height=400, background_color='white', random_state=42
    ).generate_from_frequencies(frequencies)
    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Missing Concepts in Response {response_idx+1}')
    plt.show()

In [None]:
# Draw the missing-concepts word cloud for every student response, in order.
for idx, _ in enumerate(student_responses):
    visualize_missing_concepts(idx)