In [1]:
# Load course_scraped.csv and derive a learning-objectives column (`lo`) for each course.

In [2]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import re

# Fetch the NLTK data used in this notebook (tokenizer models and stopword corpus)
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Read the scraped course table
df = pd.read_csv('course_scraped.csv')

# Extra single-word stopwords that might appear in learning objectives
additional_stopwords = {'who', 'what', 'when', 'where', 'why', 'how', 'this', 'that', 'these', 'those',
                       'here', 'there', 'with', 'from', 'into', 'onto', 'upon', 'about'}

# NLTK's English stopwords merged with the extras above
stop_words = set(stopwords.words('english')) | additional_stopwords

# Normalise one text fragment: drop a leading bullet hyphen, collapse whitespace
def clean_text(text):
    """Return *text* without a leading hyphen and with whitespace normalised.

    Missing values (anything ``pd.isna`` reports as null) become the empty
    string. Otherwise the text is trimmed, a single leading ``-`` (with the
    whitespace around it) is removed, and every run of whitespace is
    collapsed to one space.
    """
    if not pd.isna(text):
        trimmed = text.strip()
        # Only the first leading hyphen is removed; "-- x" becomes "- x"
        without_bullet = re.sub(r'^\s*-\s*', '', trimmed)
        single_spaced = re.sub(r'\s+', ' ', without_bullet)
        return single_spaced.strip()

    return ""

# Function to drop objectives that consist only of stopwords (or are empty)
def filter_meaningless_objectives(objectives, stopword_set=None):
    """Return the objectives that contain at least one non-stopword.

    An objective is discarded when it is empty or whitespace-only, or when
    every whitespace-separated word in it (compared in lower case) is a
    stopword. This also covers the single-stopword case, so no separate
    check is needed. Single-word objectives that are not stopwords are
    kept.

    Args:
        objectives: Iterable of objective strings.
        stopword_set: Optional collection of lower-case stopwords to test
            against. When omitted, the module-level ``stop_words`` set is
            used.

    Returns:
        A new list with the surviving objectives in their original order.
    """
    if stopword_set is None:
        stopword_set = stop_words

    return [
        obj
        for obj in objectives
        # any() over an empty word list is False, so blank objectives drop out
        if any(word.lower() not in stopword_set for word in obj.split())
    ]

# Build the semicolon-joined learning-objective string for one course row
def extract_learning_objectives(row):
    """Collect, filter and de-duplicate the learning objectives of one row.

    Objectives come from ``lo_description`` and ``lo_skills`` (each split on
    semicolons) when either holds non-blank text. Only when both are missing
    or blank does the function fall back to ``course_title`` plus the
    sentences of ``course_description``. Candidates are cleaned with
    ``clean_text``, passed through ``filter_meaningless_objectives``,
    de-duplicated case-insensitively (first occurrence wins) and joined
    with ``'; '``.
    """
    def has_text(column):
        value = row[column]
        return pd.notna(value) and value.strip() != ''

    def split_field(column):
        cleaned_parts = (clean_text(part) for part in row[column].split(';'))
        return [part for part in cleaned_parts if part]

    # Priority 1: the dedicated learning-objective columns
    dedicated_columns = [
        column for column in ('lo_description', 'lo_skills') if has_text(column)
    ]
    candidates = []
    for column in dedicated_columns:
        candidates.extend(split_field(column))

    # Priority 2: title and description, used only when neither dedicated column has text
    if not dedicated_columns:
        if has_text('course_title'):
            candidates.append(clean_text(row['course_title']))

        if has_text('course_description'):
            for sentence in sent_tokenize(row['course_description']):
                sentence = clean_text(sentence)
                if sentence:
                    candidates.append(sentence)

    # Keep the first spelling seen for each lower-cased objective; dicts preserve insertion order
    first_seen = {}
    for objective in filter_meaningless_objectives(candidates):
        first_seen.setdefault(objective.lower(), objective)

    combined = '; '.join(first_seen.values())

    # A fragment may still begin with a hyphen right after a separator; strip it
    return re.sub(r';\s*-\s*', '; ', combined)

# Apply the function to each row
df['lo'] = df.apply(extract_learning_objectives, axis=1)

# Final cleanup pass for the entire column to catch any edge cases
df['lo'] = df['lo'].str.replace(r';\s*-\s*', '; ', regex=True)
df['lo'] = df['lo'].str.replace(r'^\s*-\s*', '', regex=True)

# Save the result to a new CSV file
df.to_csv('course_scraped_with_lo.csv', index=False)

print(f"Processing complete. {len(df)} rows processed.")
print("Sample of the first 3 rows:")
print(df[['course_title', 'lo']].head(3))

# Optional: Check for any remaining leading hyphens in the extracted LOs
has_hyphens = df['lo'].str.contains(r';\s*-').any() or df['lo'].str.startswith('-').any()
print(f"\nLearning objectives with remaining leading hyphens: {'Yes' if has_hyphens else 'No'}")

# Optional: Check how many rows had learning objectives extracted from different sources.
# A column counts as a source only when it is non-null AND non-blank, which is the
# same test extract_learning_objectives uses to decide whether to fall back; a
# plain notna() check would misreport whitespace-only cells as a dedicated source.
lo_from_dedicated = df.apply(
    lambda row: any(
        pd.notna(row[column]) and row[column].strip() != ''
        for column in ('lo_description', 'lo_skills')
    ),
    axis=1,
).sum()
lo_from_fallback = len(df) - lo_from_dedicated

# Count rows with no learning objectives after filtering
rows_with_no_lo = (df['lo'] == '').sum()

print(f"\nStatistics:")
print(f"Rows with learning objectives from lo_description or lo_skills: {lo_from_dedicated}")
print(f"Rows with learning objectives from course_title and course_description: {lo_from_fallback}")
print(f"Rows with no learning objectives after filtering: {rows_with_no_lo}")

# Optional: count the single-word learning objectives left in a joined LO string
def count_words(text):
    """Return how many semicolon-separated objectives in *text* are one word.

    Null or empty input yields 0. Blank segments (for example between two
    adjacent semicolons) are not counted.
    """
    if pd.isna(text) or text == '':
        return 0

    # A segment with exactly one whitespace-separated token is a single word;
    # blank segments split into zero tokens and are skipped.
    return sum(1 for segment in text.split(';') if len(segment.split()) == 1)

# Per-row single-word counts, then the total across the whole table
single_word_counts_per_row = df['lo'].apply(count_words)
single_word_lo_count = single_word_counts_per_row.sum()
print(f"Remaining single-word learning objectives: {single_word_lo_count}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Processing complete. 2186 rows processed.
Sample of the first 3 rows:
                       course_title  \
0  Systems and Application Security   
1   Security Concepts and Practices   
2    Incident Response and Recovery   

                                                  lo  
0  Systems and Application Security; Course 7 - S...  
1  Security Concepts and Practices; Course 1 - Se...  
2  Incident Response and Recovery; Course 4 - Inc...  

Learning objectives with remaining leading hyphens: No

Statistics:
Rows with learning objectives from lo_description or lo_skills: 1902
Rows with learning objectives from course_title and course_description: 284
Rows with no learning objectives after filtering: 0
Remaining single-word learning objectives: 1806


In [3]:
# Load module_scraped.csv and derive a `module_content` column for each module.

In [4]:
# Load the scraped module table.
# NOTE: this rebinds `df`, which until now held the course table; the course
# data is no longer reachable through `df` after this line.
df = pd.read_csv('module_scraped.csv')

# Build the semicolon-joined content string for one module row
def extract_module_content(row):
    """Return the module's content items joined with ``'; '``.

    When ``videos`` holds non-blank text, the result is its
    semicolon-separated entries (trimmed, blanks dropped) and nothing else.
    Otherwise the result is the sentences of ``mod_description`` (if any)
    followed by ``mod_title`` (if any).
    """
    videos = row['videos']

    # Priority 1: the video list, used exclusively when present
    if pd.notna(videos) and videos.strip() != '':
        video_items = []
        for entry in videos.split(';'):
            entry = entry.strip()
            if entry:
                video_items.append(entry)
        return '; '.join(video_items)

    # Priority 2: description sentences first, then the title
    fallback_items = []

    description = row['mod_description']
    if pd.notna(description) and description.strip() != '':
        for sentence in sent_tokenize(description):
            sentence = sentence.strip()
            if sentence:
                fallback_items.append(sentence)

    # The title is appended whether or not a description was found
    title = row['mod_title']
    if pd.notna(title) and title.strip() != '':
        fallback_items.append(title.strip())

    return '; '.join(fallback_items)

# Build the module_content column by running the extractor over every row
df['module_content'] = df.apply(extract_module_content, axis=1)

# Write the augmented table to a new CSV file
df.to_csv('module_scraped_with_content.csv', index=False)

print(f"Processing complete. {len(df)} rows processed.")
print("Sample of the first 3 rows:")
print(df[['mod_title', 'module_content']].head(3))

# Optional: tally which source each row's content came from.
# A row counts as video-sourced when its videos cell is non-null and non-blank.
used_videos = df.apply(
    lambda module_row: pd.notna(module_row['videos']) and module_row['videos'].strip() != '',
    axis=1,
)
content_from_videos = used_videos.sum()
content_from_description_or_title = len(df) - content_from_videos

print(
    f"\nStatistics:",
    f"Modules with content extracted from videos: {content_from_videos}",
    f"Modules with content extracted from mod_description or mod_title: {content_from_description_or_title}",
    sep="\n",
)


Processing complete. 10272 rows processed.
Sample of the first 3 rows:
  mod_title                                     module_content
0  Overview  Malware Attackers; Endpoints; Security Strateg...
1  Overview  Professional Ethics; Confidentiality; Integrit...
2  Overview  The Incident Response Team (IRT); Incident Ana...

Statistics:
Modules with content extracted from videos: 8920
Modules with content extracted from mod_description or mod_title: 1352
