In [1]:
!pip install sentence-transformers numpy scikit-learn pandas
!pip install pandas torch transformers tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [2]:
# Bloom taxomony

In [3]:
# finding bloom tags for each learning objective

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import sys
import os

# Suppress TensorFlow and transformers warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow logging
import tensorflow as tf
tf.get_logger().setLevel('ERROR')  # Suppress TensorFlow warnings

# Suppress sentence transformers progress bars
from tqdm.auto import tqdm
tqdm.pandas(disable=True)

# Initialize the MiniLM model for semantic embedding
model = SentenceTransformer('all-MiniLM-L6-v2')

# Bloom's Taxonomy categories and their example keywords
blooms_keywords = {
    'remember': ['define', 'identify', 'recall', 'recognize', 'state', 'memorize'],
    'understand': ['comprehend', 'explain', 'summarize', 'infer', 'translate', 'discuss'],
    'apply': ['apply', 'demonstrate', 'use', 'execute', 'implement', 'solve'],
    'analyze': ['analyze', 'compare', 'contrast', 'distinguish', 'differentiate', 'examine'],
    'evaluate': ['evaluate', 'justify', 'assess', 'support', 'judge', 'rate'],
    'create': ['create', 'design', 'develop', 'generate', 'plan', 'formulate']
}

# Generate Bloom's category centroid embeddings
def generate_bloom_centroids():
    centroids = {}
    for level, keywords in blooms_keywords.items():
        embeddings = model.encode(keywords, show_progress_bar=False)
        centroids[level] = np.mean(embeddings, axis=0)
    return centroids

bloom_centroids = generate_bloom_centroids()

# Function to classify Bloom's level based on cosine similarity
def classify_blooms_level(text):
    """Classify Bloom's Taxonomy level for a given learning objective."""
    if not text or pd.isna(text):
        return None  # Skip empty LOs

    text_embedding = model.encode([text], show_progress_bar=False)[0]
    similarities = {level: cosine_similarity([text_embedding], [centroid])[0][0] for level, centroid in bloom_centroids.items()}

    return max(similarities, key=similarities.get)

# Function to transform input CSV and classify Bloom's taxonomy
def transform_and_classify(input_file, output_file):
    """Transform CSV: Each LO gets its own row, with Bloom's classification, skipping empty LOs."""
    df = pd.read_csv(input_file)

    transformed_rows = []
    total_courses = len(df)
    total_lo_count = 0

    for course_idx, row in df.iterrows():
        course_title = row['course_title']
        program_id = row['program_id']
        url = row['url']  # Include URL
        lo_list = row['lo'].split(';') if pd.notna(row['lo']) else []  # Handle NaN LO field

        lo_num = 0  # Track valid LOs per course

        for lo in lo_list:
            lo = lo.strip()
            if not lo:  # Skip empty LOs
                continue

            lo_num += 1  # Increment only for valid LOs
            bloom_level = classify_blooms_level(lo)

            transformed_rows.append({
                'program_id': program_id,
                'course_title': course_title,
                'url': url,
                'lo_num': lo_num,
                'lo': lo,
                'bloom': bloom_level
            })

            total_lo_count += 1

        progress_percentage = (course_idx + 1) / total_courses * 100
        sys.stdout.write(f'\rProgress: {progress_percentage:.2f}% ({course_idx + 1}/{total_courses} courses)')
        sys.stdout.flush()

    transformed_df = pd.DataFrame(transformed_rows)


    transformed_df.rename(columns={
        "bloom_category": "bloom_from_lo",
        "module_bloom_category": "bloom_from_module"
    }, inplace=True)


    # Save the final cleaned output
    transformed_df.to_csv(output_file, index=False)

    sys.stdout.write(f'\nProcessing complete. Processed {total_lo_count} learning objectives from {total_courses} courses.\n')
    sys.stdout.write(f'Output saved to {output_file}\n')

    return transformed_df

# Paths to input and output files
input_file = 'course_scraped_with_lo.csv'
output_file = 'courses_with_blooms_per_lo.csv'

# Transform and classify learning objectives
result_df = transform_and_classify(input_file, output_file)

# Print the first few rows of the final output
print(result_df.head())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Progress: 100.00% (2186/2186 courses)
Processing complete. Processed 14289 learning objectives from 2186 courses.
Output saved to courses_with_blooms_per_lo.csv
   program_id                      course_title  \
0           0  Systems and Application Security   
1           0  Systems and Application Security   
2           0  Systems and Application Security   
3           0  Systems and Application Security   
4           0  Systems and Application Security   

                                                 url  lo_num  \
0  https://www.coursera.org/learn/systems-and-app...       1   
1  https://www.coursera.org/learn/systems-and-app...       2   
2  https://www.coursera.org/learn/systems-and-app...       3   
3  https://www.coursera.org/learn/systems-and-app...       4   
4  https://www.coursera.org/learn/systems-and-app...       5   

                                                  lo   bloom  
0                   Systems and Application Security   apply  
1        Course 7 - S

In [4]:
# similarity score (miniLM) - without blooms

import pandas as pd
import torch
import numpy as np
import time
from tqdm import tqdm
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# Load the MiniLM model and tokenizer
print("Loading MiniLM model...")
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Mean Pooling function for creating sentence embeddings
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Function to encode texts using MiniLM
def encode_texts(texts):
    # Tokenize texts
    encoded_input = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform mean pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    return sentence_embeddings

# Load the datasets
print("Loading CSV files...")
courses_df = pd.read_csv('course_scraped_with_lo.csv')
modules_df = pd.read_csv('module_scraped_with_content.csv')

# Define similarity threshold
similarity_threshold = 0.5

# Function to calculate alignment metrics
def calculate_alignment_metrics(course_row):
    # Extract learning objectives for this course
    course_url = course_row['url']

    if pd.isna(course_row['lo']) or course_row['lo'].strip() == '':
        return {
            'coverage_percentage': None,
            'mean_max_similarity': None,
            'num_learning_objectives': 0,
            'num_modules': 0,
            'num_module_contents': 0
        }

    learning_objectives = [lo.strip() for lo in course_row['lo'].split(';') if lo.strip()]

    # Get all modules for this course
    course_modules = modules_df[modules_df['url'] == course_url]

    if len(course_modules) == 0 or len(learning_objectives) == 0:
        return {
            'coverage_percentage': None,
            'mean_max_similarity': None,
            'num_learning_objectives': len(learning_objectives),
            'num_modules': 0,
            'num_module_contents': 0
        }

    # Extract all module contents as individual items
    all_module_contents = []
    for _, module_row in course_modules.iterrows():
        if pd.notna(module_row['module_content']) and module_row['module_content'].strip() != '':
            contents = [content.strip() for content in module_row['module_content'].split(';') if content.strip()]
            all_module_contents.extend(contents)

    if len(all_module_contents) == 0:
        return {
            'coverage_percentage': 0.0,
            'mean_max_similarity': 0.0,
            'num_learning_objectives': len(learning_objectives),
            'num_modules': len(course_modules),
            'num_module_contents': 0
        }

    # Encode all learning objectives and module contents
    lo_embeddings = encode_texts(learning_objectives)
    module_embeddings = encode_texts(all_module_contents)

    # Calculate cosine similarity
    cosine_scores = torch.mm(lo_embeddings, module_embeddings.T)

    # Calculate metrics
    max_similarities = []
    covered_los = 0

    # For each learning objective, find its highest similarity with any module content
    detailed_results = []
    for i in range(len(learning_objectives)):
        # Find the highest similarity for this learning objective
        max_sim_for_lo = torch.max(cosine_scores[i]).item()
        max_similarities.append(max_sim_for_lo)

        # Find the index of the best matching module content
        max_sim_idx = torch.argmax(cosine_scores[i]).item()
        best_match = all_module_contents[max_sim_idx]

        # Check if this learning objective is covered by any module content
        is_covered = max_sim_for_lo >= similarity_threshold
        if is_covered:
            covered_los += 1

        # Store detailed result for this learning objective
        detailed_results.append({
            'learning_objective': learning_objectives[i],
            'max_similarity': max_sim_for_lo,
            'best_match_content': best_match,
            'is_covered': is_covered
        })

    coverage_percentage = (covered_los / len(learning_objectives)) * 100 if learning_objectives else 0
    mean_max_similarity = np.mean(max_similarities) if max_similarities else 0

    return {
        'coverage_percentage': coverage_percentage,
        'mean_max_similarity': mean_max_similarity,
        'num_learning_objectives': len(learning_objectives),
        'num_modules': len(course_modules),
        'num_module_contents': len(all_module_contents),
        'detailed_results': detailed_results
    }

# Process each course
print(f"Processing {len(courses_df)} courses...")
start_time = time.time()
results = []
detailed_all = []

# Process in batches to avoid memory issues
batch_size = 10
num_batches = (len(courses_df) + batch_size - 1)//batch_size

# Set up progress tracking with just a simple progress bar
with tqdm(total=len(courses_df), desc="Analyzing courses") as pbar:
    for i in range(0, len(courses_df), batch_size):
        batch = courses_df.iloc[i:i+batch_size]

        for idx, course_row in batch.iterrows():
            metrics = calculate_alignment_metrics(course_row)

            result = {
                'program_id': course_row['program_id'],
                'course_title': course_row['course_title'],
                'url': course_row['url'],
                'num_learning_objectives': metrics['num_learning_objectives'],
                'num_modules': metrics['num_modules'],
                'num_module_contents': metrics['num_module_contents'],
                'coverage_percentage': metrics['coverage_percentage'],
                'mean_max_similarity': metrics['mean_max_similarity']
            }
            results.append(result)

            # Create detailed results for this course
            if 'detailed_results' in metrics:
                for detail in metrics['detailed_results']:
                    detailed = {
                        'program_id': course_row['program_id'],
                        'course_title': course_row['course_title'],
                        'url': course_row['url'],
                        'learning_objective': detail['learning_objective'],
                        'max_similarity': detail['max_similarity'],
                        'best_match_content': detail['best_match_content'],
                        'is_covered': detail['is_covered']
                    }
                    detailed_all.append(detailed)

            pbar.update(1)

# Create results DataFrames
results_df = pd.DataFrame(results)
detailed_df = pd.DataFrame(detailed_all)

# Save results
results_df.to_csv('course_module_alignment_summary_minilm.csv', index=False)
detailed_df.to_csv('course_module_alignment_detailed_minilm.csv', index=False)

total_time = time.time() - start_time
print("\nAnalysis complete!")
print(f"Total processing time: {total_time/60:.1f} minutes ({total_time/len(courses_df):.2f} seconds per course)")
print(f"Summary results saved to 'course_module_alignment_summary_minilm.csv'")
print(f"Detailed results saved to 'course_module_alignment_detailed_minilm.csv'")

# Print summary statistics
print("\nSummary statistics:")
print(f"Total courses analyzed: {len(results_df)}")
valid_results = results_df[results_df['coverage_percentage'].notna()]
if len(valid_results) > 0:
    print(f"Average coverage percentage: {valid_results['coverage_percentage'].mean():.2f}%")
    print(f"Average mean max similarity: {valid_results['mean_max_similarity'].mean():.4f}")

    # Count courses with good alignment (>50% coverage and >0.5 mean similarity)
    good_alignment = valid_results[
        (valid_results['coverage_percentage'] >= 50) &
        (valid_results['mean_max_similarity'] >= 0.5)
    ]
    print(f"Courses with good alignment: {len(good_alignment)}/{len(valid_results)} ({len(good_alignment)/len(valid_results)*100:.1f}%)")


Loading MiniLM model...
Loading CSV files...
Processing 2186 courses...


Analyzing courses: 100%|██████████| 2186/2186 [16:41<00:00,  2.18it/s]



Analysis complete!
Total processing time: 16.7 minutes (0.46 seconds per course)
Summary results saved to 'course_module_alignment_summary_minilm.csv'
Detailed results saved to 'course_module_alignment_detailed_minilm.csv'

Summary statistics:
Total courses analyzed: 2186
Average coverage percentage: 62.12%
Average mean max similarity: 0.5601
Courses with good alignment: 1433/2185 (65.6%)


In [5]:
# find bloom tag for best-match module content of each learning objectives

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import sys
import os

# Suppress TensorFlow and transformers warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow logging
import tensorflow as tf
tf.get_logger().setLevel('ERROR')  # Suppress TensorFlow warnings

# Suppress sentence transformers progress bars
from tqdm.auto import tqdm
tqdm.pandas(disable=True)

# Initialize the MiniLM model for semantic embedding
model = SentenceTransformer('all-MiniLM-L6-v2')

# Bloom's Taxonomy categories and their example keywords
blooms_keywords = {
    'remember': ['define', 'identify', 'recall', 'recognize', 'state', 'memorize'],
    'understand': ['comprehend', 'explain', 'summarize', 'infer', 'translate', 'discuss'],
    'apply': ['apply', 'demonstrate', 'use', 'execute', 'implement', 'solve'],
    'analyze': ['analyze', 'compare', 'contrast', 'distinguish', 'differentiate', 'examine'],
    'evaluate': ['evaluate', 'justify', 'assess', 'support', 'judge', 'rate'],
    'create': ['create', 'design', 'develop', 'generate', 'plan', 'formulate']
}

# Generate Bloom's category centroid embeddings
def generate_bloom_centroids():
    centroids = {}
    for level, keywords in blooms_keywords.items():
        embeddings = model.encode(keywords, show_progress_bar=False)
        centroids[level] = np.mean(embeddings, axis=0)
    return centroids

bloom_centroids = generate_bloom_centroids()

# Function to classify Bloom's level based on cosine similarity
def classify_blooms_level(text):
    """Classify Bloom's Taxonomy level for a given text."""
    if not text or pd.isna(text):
        return None  # Skip empty texts

    text_embedding = model.encode([text], show_progress_bar=False)[0]
    similarities = {level: cosine_similarity([text_embedding], [centroid])[0][0] for level, centroid in bloom_centroids.items()}

    return max(similarities, key=similarities.get)

# Function to transform input CSV and classify Bloom's taxonomy for `best_match_content`
def transform_and_classify(input_file, output_file):
    """Transform CSV: Each row gets Bloom's classification added."""
    df = pd.read_csv(input_file)

    transformed_rows = []
    total_courses = len(df)
    total_content_count = 0

    for course_idx, row in df.iterrows():
        # Keep all original columns
        program_id = row['program_id']
        course_title = row['course_title']
        url = row['url']
        learning_objective = row['learning_objective']
        max_similarity = row['max_similarity']
        best_match_content = row['best_match_content']
        is_covered = row['is_covered']

        if pd.notna(best_match_content):  # Only process if `best_match_content` is not NaN
            best_match_content = best_match_content.strip()

            # Classify Bloom's taxonomy level
            bloom_level = classify_blooms_level(best_match_content)

            transformed_rows.append({
                'program_id': program_id,
                'course_title': course_title,
                'url': url,
                'learning_objective': learning_objective,
                'max_similarity': max_similarity,
                'best_match_content': best_match_content,
                'is_covered': is_covered,
                'bloom': bloom_level
            })

            total_content_count += 1

        progress_percentage = (course_idx + 1) / total_courses * 100
        sys.stdout.write(f'\rProgress: {progress_percentage:.2f}% ({course_idx + 1}/{total_courses} courses)')
        sys.stdout.flush()

    transformed_df = pd.DataFrame(transformed_rows)

    # Save the final cleaned output
    transformed_df.to_csv(output_file, index=False)

    sys.stdout.write(f'\nProcessing complete. Processed {total_content_count} content from {total_courses} courses.\n')
    sys.stdout.write(f'Output saved to {output_file}\n')

    return transformed_df

# Paths to input and output files (same file for input and output)
input_file = 'course_module_alignment_detailed_minilm.csv'
output_file = input_file  # Same as input file to overwrite it

# Transform and classify best_match_content
result_df = transform_and_classify(input_file, output_file)

# Print the first few rows of the final output
print(result_df.head())


Progress: 100.00% (14285/14285 courses)
Processing complete. Processed 14285 content from 14285 courses.
Output saved to course_module_alignment_detailed_minilm.csv
   program_id                      course_title  \
0           0  Systems and Application Security   
1           0  Systems and Application Security   
2           0  Systems and Application Security   
3           0  Systems and Application Security   
4           0  Systems and Application Security   

                                                 url  \
0  https://www.coursera.org/learn/systems-and-app...   
1  https://www.coursera.org/learn/systems-and-app...   
2  https://www.coursera.org/learn/systems-and-app...   
3  https://www.coursera.org/learn/systems-and-app...   
4  https://www.coursera.org/learn/systems-and-app...   

                                  learning_objective  max_similarity  \
0                   Systems and Application Security        0.527121   
1        Course 7 - Systems and Application Sec

In [6]:
# adjust similarity score based on bloom taxonomy tag alignment
import pandas as pd
import numpy as np
import math

# Bloom's Taxonomy complexity mapping
blooms_complexity = {
    'remember': 1,
    'understand': 2,
    'apply': 3,
    'analyze': 4,
    'evaluate': 5,
    'create': 6
}

# Function to safely extract Bloom's complexity
def get_bloom_complexity(bloom_tag):
    if pd.isna(bloom_tag) or not isinstance(bloom_tag, str):
        return 1  # Default to lowest complexity

    # Convert to lowercase and strip whitespace
    bloom_tag = bloom_tag.lower().strip()

    # Print for debugging
    # print(f"Processing bloom tag: '{bloom_tag}'")

    # Direct match first
    if bloom_tag in blooms_complexity:
        match_type = "direct"
        complexity = blooms_complexity[bloom_tag]
    else:
        # Fallback to most specific match
        match_type = "partial"
        complexity = 1  # Default
        matched_level = None

        for level, level_complexity in blooms_complexity.items():
            if level in bloom_tag:
                complexity = level_complexity
                matched_level = level
                break

    # print(f"  Match type: {match_type}, Complexity: {complexity}")
    # if match_type == "partial" and matched_level:
        # print(f"  Matched with: '{matched_level}'")

    return complexity


def adjust_similarity_with_bloom(max_similarity, lo_complexity, module_complexity):
    complexity_diff = abs(lo_complexity - module_complexity)

    # Only apply penalty if there's a complexity difference
    if complexity_diff == 0:
        return max_similarity

    # Adjust logarithm-based penalty with max 5% reduction
    penalty_factor = max(0.95, 1.0 - (math.log1p(complexity_diff) / (math.log1p(6) * 1.5)))

    # Log the penalty factor adjustment
    # print(f"Complexity Difference: {complexity_diff}, Penalty Factor: {penalty_factor:.4f}")

    adjusted_similarity = max_similarity * penalty_factor

    # Log the adjustment amount
    adjustment = max_similarity - adjusted_similarity
    # if adjustment > 0:
    #     print(f"Adjustment applied: {adjustment:.4f} (Original: {max_similarity:.4f}, Adjusted: {adjusted_similarity:.4f})")

    return adjusted_similarity

# Adjust the code to include complexity for the modules as well

def refine_module_alignment(courses_with_blooms_file,
                             minilm_detailed_file,
                             minilm_summary_file,
                             output_detailed_file,
                             output_summary_file):
    # Load input files
    courses_df = pd.read_csv(courses_with_blooms_file)
    detailed_df = pd.read_csv(minilm_detailed_file)
    summary_df = pd.read_csv(minilm_summary_file)

    # Prepare to store updated detailed results
    updated_detailed_results = []

    # Similarity threshold
    similarity_threshold = 0.5

    # Track courses penalized due to Bloom tags
    courses_penalized_by_bloom = set()

    # Process each row in detailed_df
    for _, lo_row in detailed_df.iterrows():
        url = lo_row['url']
        learning_objective = lo_row['learning_objective']

        # Find corresponding bloom tags from courses_df for this specific LO
        lo_match = courses_df[(courses_df['url'] == url) & (courses_df['lo'] == learning_objective)]

        # Extract LO bloom tag
        lo_bloom = lo_match['bloom'].values[0] if len(lo_match) > 0 else None
        lo_complexity = get_bloom_complexity(lo_bloom)

        # Find the module bloom tag for this module
        module_match = courses_df[courses_df['url'] == url]
        # If there are multiple rows for this module, use the first one's bloom tag
        module_bloom = module_match['bloom'].iloc[0] if len(module_match) > 0 else None
        module_complexity = get_bloom_complexity(module_bloom)

        # Calculate original and adjusted similarity
        original_similarity = lo_row['max_similarity']
        adjusted_similarity = adjust_similarity_with_bloom(
            original_similarity,
            lo_complexity,
            module_complexity
        )

        # Check if penalized
        is_penalized = adjusted_similarity < original_similarity
        if is_penalized:
            courses_penalized_by_bloom.add(url)
            penalty_percentage = ((original_similarity - adjusted_similarity) / original_similarity) * 100

        # Determine if covered based on adjusted similarity threshold
        is_covered = adjusted_similarity >= similarity_threshold

        # Prepare updated detailed result - start with original row data but exclude 'bloom' column if it exists
        updated_result = {k: v for k, v in lo_row.to_dict().items() if k != 'bloom'}

        # Add our new columns
        updated_result['lo_bloom'] = lo_bloom
        updated_result['lo_complexity'] = lo_complexity
        updated_result['module_bloom'] = module_bloom
        updated_result['module_complexity'] = module_complexity
        updated_result['max_similarity'] = adjusted_similarity
        updated_result['is_covered'] = is_covered
        updated_result['is_penalized'] = is_penalized
        if is_penalized:
            updated_result['penalty_percentage'] = penalty_percentage

        updated_detailed_results.append(updated_result)

    # Convert to DataFrame
    updated_detailed_df = pd.DataFrame(updated_detailed_results)

    # Recalculate summary metrics per URL
    summary_results = []

    for url in updated_detailed_df['url'].unique():
        url_detailed = updated_detailed_df[updated_detailed_df['url'] == url]

        # Recalculate metrics
        num_los = len(url_detailed)
        num_covered_los = sum(url_detailed['is_covered'])
        coverage_percentage = (num_covered_los / num_los) * 100 if num_los > 0 else 0
        mean_max_similarity = url_detailed['max_similarity'].mean()

        # Find corresponding summary row
        summary_row = summary_df[summary_df['url'] == url].iloc[0].to_dict()

        # Update key metrics
        summary_row['coverage_percentage'] = coverage_percentage
        summary_row['mean_max_similarity'] = mean_max_similarity
        summary_row['num_los'] = num_los
        summary_row['num_covered_los'] = num_covered_los

        summary_results.append(summary_row)

    # Convert to DataFrame
    updated_summary_df = pd.DataFrame(summary_results)

    # Save updated files
    updated_detailed_df.to_csv(output_detailed_file, index=False)
    updated_summary_df.to_csv(output_summary_file, index=False)

    # Print summary statistics in the requested format
    total_courses = len(updated_summary_df)
    avg_coverage_percentage = updated_summary_df['coverage_percentage'].mean()
    avg_mean_max_similarity = updated_summary_df['mean_max_similarity'].mean()

    # Calculate number of courses with good alignment
    courses_with_good_alignment = updated_summary_df[updated_summary_df['coverage_percentage'] >= 60]
    num_courses_with_good_alignment = len(courses_with_good_alignment)

    # Number of courses penalized by Bloom tags
    num_courses_penalized = len(courses_penalized_by_bloom)

    print(f"\nAnalysis complete!")
    print(f"Summary results saved to '{output_summary_file}'")
    print(f"Detailed results saved to '{output_detailed_file}'\n")

    print(f"Summary statistics:")
    print(f"Total courses analyzed: {total_courses}")
    print(f"Courses penalized by Bloom tags: {num_courses_penalized}/{total_courses} ({(num_courses_penalized / total_courses) * 100:.1f}%)")
    print(f"Average coverage percentage: {avg_coverage_percentage:.2f}%")
    print(f"Average mean max similarity: {avg_mean_max_similarity:.4f}")
    print(f"Courses with good alignment: {num_courses_with_good_alignment}/{total_courses} ({(num_courses_with_good_alignment / total_courses) * 100:.1f}%)")


# Example usage
if __name__ == "__main__":
    courses_with_blooms_file = 'courses_with_blooms_per_lo.csv'
    minilm_detailed_file = 'course_module_alignment_detailed_minilm.csv'
    minilm_summary_file = 'course_module_alignment_summary_minilm.csv'

    output_detailed_file = 'course_module_alignment_detailed_minilm_bloom.csv'
    output_summary_file = 'course_module_alignment_summary_minilm_bloom.csv'

    refine_module_alignment(
        courses_with_blooms_file,
        minilm_detailed_file,
        minilm_summary_file,
        output_detailed_file,
        output_summary_file
    )


Analysis complete!
Summary results saved to 'course_module_alignment_summary_minilm_bloom.csv'
Detailed results saved to 'course_module_alignment_detailed_minilm_bloom.csv'

Summary statistics:
Total courses analyzed: 1992
Courses penalized by Bloom tags: 1683/1992 (84.5%)
Average coverage percentage: 58.54%
Average mean max similarity: 0.5422
Courses with good alignment: 1045/1992 (52.5%)
