In [2]:
# USE Universal Sentence Encoder

In [3]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import time
from tqdm import tqdm
import tensorflow_text  # Required for multilingual model

# Load the Universal Sentence Encoder model.
# `embed` is later called with a list of strings and its result is converted
# to a NumPy array via `.numpy()`.
print("Loading Universal Sentence Encoder model...")
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")  # or use multilingual version: "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"

# Load the datasets.
# Columns read from the courses file elsewhere in this script:
#   'program_id', 'course_title', 'url', 'lo' (semicolon-separated text).
# Columns read from the modules file elsewhere in this script:
#   'url' (matched against the course 'url'), 'module_content'
#   (semicolon-separated text).
print("Loading CSV files...")
courses_df = pd.read_csv('course_scraped_with_lo.csv')
modules_df = pd.read_csv('module_scraped_with_content.csv')

# Define similarity threshold: a learning objective counts as "covered" when
# its best cosine similarity against any module content item is >= this value.
similarity_threshold = 0.4

# Function to calculate cosine similarity
def cosine_similarity(v1, v2):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        v1: First vector (anything accepted by ``np.dot`` / ``np.linalg.norm``).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``. When either vector has zero
        norm the similarity is undefined; ``0.0`` is returned instead of the
        NaN (and RuntimeWarning) that a 0/0 division would produce.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        # A zero vector has no direction; treat it as unrelated to everything.
        return 0.0
    return np.dot(v1, v2) / denominator

# Function to calculate alignment metrics
def calculate_alignment_metrics(course_row):
    """Measure how well a course's module contents cover its learning objectives.

    The course's ``'lo'`` field and the ``'module_content'`` field of every
    row in the module-level ``modules_df`` whose ``'url'`` equals the course's
    ``'url'`` are split on ``';'``. All items are embedded with the
    module-level ``embed`` model, and each learning objective is matched to
    the module content item with the highest cosine similarity.

    Args:
        course_row: Mapping-like row (e.g. a ``pandas.Series``) providing at
            least the keys ``'url'`` and ``'lo'``.

    Returns:
        dict with the keys:

        * ``'coverage_percentage'``: percentage (0-100) of learning objectives
          whose best similarity is >= ``similarity_threshold``; ``None`` when
          the course has no learning objectives or no module rows; ``0.0``
          when module rows exist but contain no content items.
        * ``'mean_max_similarity'``: mean of the per-objective best
          similarities, with the same ``None`` / ``0.0`` conventions.
        * ``'num_learning_objectives'``, ``'num_modules'``,
          ``'num_module_contents'``: counts of the items considered.
        * ``'detailed_results'``: one dict per learning objective with
          ``'learning_objective'``, ``'max_similarity'``,
          ``'best_match_content'`` and ``'is_covered'``. It is an empty list
          on every early-return path, so the result schema is uniform.
    """
    course_url = course_row['url']

    learning_objectives = _split_semicolon_items(course_row['lo'])
    if not learning_objectives:
        return _alignment_result(None, None, 0, 0, 0)

    # Get all modules for this course
    course_modules = modules_df[modules_df['url'] == course_url]
    if len(course_modules) == 0:
        return _alignment_result(None, None, len(learning_objectives), 0, 0)

    # Flatten the content items of every module into a single list
    all_module_contents = []
    for raw_content in course_modules['module_content']:
        all_module_contents.extend(_split_semicolon_items(raw_content))

    if not all_module_contents:
        return _alignment_result(
            0.0, 0.0, len(learning_objectives), len(course_modules), 0
        )

    # Encode all learning objectives and module contents using USE.
    # NOTE(review): assumes embed() yields one row vector per input string
    # (2-D array after .numpy()) -- confirm against the loaded model.
    lo_embeddings = embed(learning_objectives).numpy()
    module_embeddings = embed(all_module_contents).numpy()

    # Full (objective x content) cosine-similarity matrix in one shot instead
    # of a Python-level loop over every pair.
    dot_products = lo_embeddings @ module_embeddings.T
    norm_products = np.outer(
        np.linalg.norm(lo_embeddings, axis=1),
        np.linalg.norm(module_embeddings, axis=1),
    )
    # Pairs involving a zero-norm embedding get similarity 0 rather than NaN.
    similarity_matrix = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products),
        where=norm_products > 0,
    )

    # Best match per learning objective; the maximum is read back through the
    # argmax index so the value and the matched content always agree.
    best_indices = similarity_matrix.argmax(axis=1)
    max_similarities = similarity_matrix[
        np.arange(len(learning_objectives)), best_indices
    ]
    is_covered = max_similarities >= similarity_threshold

    detailed_results = [
        {
            'learning_objective': objective,
            'max_similarity': best_similarity,
            'best_match_content': all_module_contents[best_index],
            'is_covered': covered,
        }
        for objective, best_similarity, best_index, covered in zip(
            learning_objectives, max_similarities, best_indices, is_covered
        )
    ]

    covered_los = int(is_covered.sum())
    coverage_percentage = (covered_los / len(learning_objectives)) * 100
    mean_max_similarity = np.mean(max_similarities)

    return _alignment_result(
        coverage_percentage,
        mean_max_similarity,
        len(learning_objectives),
        len(course_modules),
        len(all_module_contents),
        detailed_results,
    )


def _split_semicolon_items(value):
    """Split a ';'-separated cell into stripped, non-empty items ([] if missing)."""
    if pd.isna(value):
        return []
    # str() guards against cells that pandas parsed as a non-string scalar.
    return [item.strip() for item in str(value).split(';') if item.strip()]


def _alignment_result(coverage_percentage, mean_max_similarity,
                      num_learning_objectives, num_modules,
                      num_module_contents, detailed_results=None):
    """Build the metrics dict returned by calculate_alignment_metrics."""
    return {
        'coverage_percentage': coverage_percentage,
        'mean_max_similarity': mean_max_similarity,
        'num_learning_objectives': num_learning_objectives,
        'num_modules': num_modules,
        'num_module_contents': num_module_contents,
        'detailed_results': [] if detailed_results is None else detailed_results,
    }

# Process each course: compute alignment metrics per course, write a summary
# CSV (one row per course) and a detailed CSV (one row per learning
# objective), then print aggregate statistics.
num_courses = len(courses_df)
print(f"Processing {num_courses} courses...")
start_time = time.time()
results = []
detailed_all = []

# Explicit column lists keep the output schema stable (and the CSV headers
# present) even when there are no rows at all.
summary_columns = [
    'program_id', 'course_title', 'url',
    'num_learning_objectives', 'num_modules', 'num_module_contents',
    'coverage_percentage', 'mean_max_similarity',
]
detailed_columns = [
    'program_id', 'course_title', 'url',
    'learning_objective', 'max_similarity', 'best_match_content', 'is_covered',
]

# Courses are handled one at a time; each call embeds only that course's
# texts, so no additional batching of rows is needed here.
with tqdm(total=num_courses, desc="Analyzing courses with USE") as pbar:
    for _, course_row in courses_df.iterrows():
        metrics = calculate_alignment_metrics(course_row)

        # Identifying columns shared by the summary row and the detail rows
        course_info = {
            'program_id': course_row['program_id'],
            'course_title': course_row['course_title'],
            'url': course_row['url'],
        }

        results.append({
            **course_info,
            'num_learning_objectives': metrics['num_learning_objectives'],
            'num_modules': metrics['num_modules'],
            'num_module_contents': metrics['num_module_contents'],
            'coverage_percentage': metrics['coverage_percentage'],
            'mean_max_similarity': metrics['mean_max_similarity'],
        })

        # Per-objective rows; the key is absent when metrics were not computed
        for detail in metrics.get('detailed_results', []):
            detailed_all.append({
                **course_info,
                'learning_objective': detail['learning_objective'],
                'max_similarity': detail['max_similarity'],
                'best_match_content': detail['best_match_content'],
                'is_covered': detail['is_covered'],
            })

        pbar.update(1)

# Create results DataFrames
results_df = pd.DataFrame(results, columns=summary_columns)
detailed_df = pd.DataFrame(detailed_all, columns=detailed_columns)

# Save results
results_df.to_csv('course_module_alignment_summary_use.csv', index=False)
detailed_df.to_csv('course_module_alignment_detailed_use.csv', index=False)

total_time = time.time() - start_time
# Avoid dividing by zero when the input file contains no courses
seconds_per_course = total_time / num_courses if num_courses else 0.0
print("\nAnalysis complete!")
print(f"Total processing time: {total_time/60:.1f} minutes ({seconds_per_course:.2f} seconds per course)")
print("Summary results saved to 'course_module_alignment_summary_use.csv'")
print("Detailed results saved to 'course_module_alignment_detailed_use.csv'")

# Print summary statistics
print("\nSummary statistics:")
print(f"Total courses analyzed: {len(results_df)}")
# Courses without learning objectives or modules have no coverage value
valid_results = results_df[results_df['coverage_percentage'].notna()]
if len(valid_results) > 0:
    print(f"Average coverage percentage: {valid_results['coverage_percentage'].mean():.2f}%")
    print(f"Average mean max similarity: {valid_results['mean_max_similarity'].mean():.4f}")

    # Count courses with good alignment (>=40% coverage and >=0.4 mean similarity)
    good_alignment = valid_results[
        (valid_results['coverage_percentage'] >= 40) &
        (valid_results['mean_max_similarity'] >= 0.4)
    ]
    print(f"Courses with good alignment: {len(good_alignment)}/{len(valid_results)} ({len(good_alignment)/len(valid_results)*100:.1f}%)")


Loading Universal Sentence Encoder model...
Loading CSV files...
Processing 2186 courses...


Analyzing courses with USE: 100%|██████████| 2186/2186 [00:39<00:00, 54.75it/s]



Analysis complete!
Total processing time: 0.7 minutes (0.02 seconds per course)
Summary results saved to 'course_module_alignment_summary_use.csv'
Detailed results saved to 'course_module_alignment_detailed_use.csv'

Summary statistics:
Total courses analyzed: 2186
Average coverage percentage: 56.46%
Average mean max similarity: 0.4445
Courses with good alignment: 1338/2185 (61.2%)
