# Write Image Feature Extractor

In [7]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import requests

def extract_image_features(image_paths: list[str], timeout: float = 10.0):
    """Download images and summarise their edge structure.

    Each URL is fetched, decoded with OpenCV, converted to grayscale and run
    through Canny edge detection. Two counts are taken from the edge map:
    the number of Hough lines and the number of external contours.

    Args:
        image_paths: URLs of the images to analyse.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A dict with:
          - "avg_lines_per_image": mean Hough-line count over the images that
            were successfully decoded (NaN when none were).
          - "avg_regions_per_image": mean external-contour count over the same
            images (NaN when none were).
          - "total_images": number of URLs supplied, including any that
            failed to download or decode.
    """
    line_counts = []
    contour_counts = []

    for path in image_paths:
        # A missing timeout lets a stalled server block the run indefinitely,
        # and a single network error should not abort the remaining images.
        try:
            response = requests.get(path, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Failed to load image from URL: {path} ({exc})")
            continue

        # Guard against an empty body before handing the buffer to OpenCV.
        if not response.content:
            print(f"Failed to load image from URL: {path}")
            continue

        img_array = np.asarray(bytearray(response.content), dtype=np.uint8)
        img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
        if img is None:
            print(f"Failed to load image from URL: {path}")
            continue
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        edges = cv2.Canny(gray, threshold1=50, threshold2=150)

        # Count lines
        lines = cv2.HoughLines(edges, 1, np.pi / 180, 100)
        num_lines = len(lines) if lines is not None else 0

        # Count regions (e.g., boxes or labeled areas)
        contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        num_regions = len(contours)

        line_counts.append(num_lines)
        contour_counts.append(num_regions)

    # np.mean([]) yields NaN but also emits a RuntimeWarning; return the same
    # NaN explicitly when no image could be processed.
    no_data = float("nan")
    avg_lines = float(np.mean(line_counts)) if line_counts else no_data
    avg_regions = float(np.mean(contour_counts)) if contour_counts else no_data

    return {
        "avg_lines_per_image": avg_lines,
        "avg_regions_per_image": avg_regions,
        "total_images": len(image_paths),
    }

# Write Text Feature Extractor

In [8]:
from wordfreq import zipf_frequency
import re
import numpy as np

def extract_text_features(text: str):
    """Compute simple lexical features for a piece of text.

    The text is lower-cased and split into word tokens with the regex
    ``\\b\\w+\\b``. Word rarity is derived from ``zipf_frequency(word, 'en')``.

    Args:
        text: The text to analyse. Non-string values (e.g. ``None`` or a
            float NaN standing in for a missing value) are treated as empty
            text instead of raising.

    Returns:
        A dict with:
          - "avg_word_length": mean token length, rounded to 2 decimals.
          - "mean_word_rarity": mean of ``7 - zipf`` over all tokens, rounded
            to 2 decimals (higher means rarer).
          - "num_complex_words": number of tokens with a Zipf score below 5.0.
          - "total_word_count": number of tokens.
        All values are zero when the text contains no word tokens.
    """
    # Anything that is not a string has no words to analyse; calling .lower()
    # on it would raise AttributeError.
    if not isinstance(text, str):
        text = ""

    words = re.findall(r'\b\w+\b', text.lower())
    total_words = len(words)

    if total_words == 0:
        return {
            "avg_word_length": 0.0,
            "mean_word_rarity": 0.0,
            "num_complex_words": 0,
            "total_word_count": 0
        }

    # Compute average word length
    avg_word_length = sum(len(word) for word in words) / total_words

    # Zipf scores: 6+ is common, <5 is rare
    zipf_scores = [zipf_frequency(w, 'en') for w in words]
    # Inverted so higher = rarer; cast to a plain float for a stable return type
    mean_word_rarity = float(np.mean([7 - z for z in zipf_scores]))
    num_complex_words = sum(1 for z in zipf_scores if z < 5.0)  # Empirical threshold

    return {
        "avg_word_length": round(avg_word_length, 2),
        "mean_word_rarity": round(mean_word_rarity, 2),
        "num_complex_words": num_complex_words,
        "total_word_count": total_words
    }

# Extract Features at Chapter Level

In [5]:
import pandas as pd

# Load the chapter-level quiz page content; the CSV's first column becomes the index.
# Later cells read the `text` and `images` columns of this frame.
df_chapter_quiz_page_content_all = pd.read_csv("data/chapter_quiz_page_content_all.csv", index_col=0)

In [10]:
import ast

# Extract text features for each chapter.
# .loc is label-based, so iterate over the frame's own index labels rather
# than positional counters from enumerate(); the two only coincide when the
# index happens to be exactly 0..n-1.
for idx, text in df_chapter_quiz_page_content_all['text'].items():
    features = extract_text_features(text)
    for key, value in features.items():
        df_chapter_quiz_page_content_all.loc[idx, "TEXT_FEATURE_" + key] = value

# Extract image features for each chapter
for idx, image_urls in df_chapter_quiz_page_content_all['images'].items():
    # Non-string cells have no image list to parse.
    if not isinstance(image_urls, str):
        continue
    # The cell is parsed as a Python list literal. ast.literal_eval only
    # accepts literals, so unlike eval() it cannot execute code embedded in
    # the CSV.
    image_list = ast.literal_eval(image_urls)
    if not image_list:
        continue
    # Extract image features using the extract_image_features function
    features = extract_image_features(image_list)
    for key, value in features.items():
        df_chapter_quiz_page_content_all.loc[idx, "IMAGE_FEATURE_" + key] = value

# Display the dataframe with all features
df_chapter_quiz_page_content_all

Unnamed: 0,chapter,urls,text,images,book,TEXT_FEATURE_avg_word_length,TEXT_FEATURE_mean_word_rarity,TEXT_FEATURE_num_complex_words,TEXT_FEATURE_total_word_count,IMAGE_FEATURE_avg_lines_per_image,IMAGE_FEATURE_avg_regions_per_image,IMAGE_FEATURE_total_images
0,1,['https://coursekata.org/preview/book/3cc54c0e...,\nQuestion ID: A1_Review1_01\n1. What will hap...,[],College / Statistics and Data Science (ABC),4.71,1.73,143.0,479.0,,,
1,2,['https://coursekata.org/preview/book/3cc54c0e...,\nChapter Quiz Page 1:\n\nQuestion ID: A2_Revi...,[],College / Statistics and Data Science (ABC),4.13,1.96,610.0,1773.0,,,
2,3,['https://coursekata.org/preview/book/3cc54c0e...,\nChapter Quiz Page 1:\n\nQuestion ID: A3_Revi...,['https://coursekata-course-assets.s3.us-west-...,College / Statistics and Data Science (ABC),4.83,1.88,683.0,1829.0,184.5,32.5,2.0
3,4,['https://coursekata.org/preview/book/3cc54c0e...,\nChapter Quiz Page 1:\n\nQuestion ID: A4_Revi...,['https://coursekata-course-assets.s3.us-west-...,College / Statistics and Data Science (ABC),4.84,1.99,710.0,1866.0,1575.230769,57.538462,13.0
4,5,['https://coursekata.org/preview/book/3cc54c0e...,\nChapter Quiz Page 1:\n\nQuestion ID: B1_Revi...,['https://coursekata-course-assets.s3.us-west-...,College / Statistics and Data Science (ABC),4.51,1.94,620.0,1605.0,192.666667,44.666667,3.0
5,6,['https://coursekata.org/preview/book/3cc54c0e...,\nChapter Quiz Page 1:\n\nQuestion ID: B2_Revi...,['https://coursekata-course-assets.s3.us-west-...,College / Statistics and Data Science (ABC),4.56,1.91,547.0,1467.0,365.222222,56.777778,9.0
6,7,['https://coursekata.org/preview/book/3cc54c0e...,\nQuestion ID: B3_Review2_01\nThe following se...,[],College / Statistics and Data Science (ABC),4.46,2.11,408.0,923.0,,,
7,8,['https://coursekata.org/preview/book/3cc54c0e...,\nQuestion ID: B3_Review1_01\nStudentSurvey is...,['https://coursekata-course-assets.s3.us-west-...,College / Statistics and Data Science (ABC),4.59,2.18,564.0,1301.0,1741.5,57.0,2.0
8,9,['https://coursekata.org/preview/book/3cc54c0e...,\nChapter Quiz Page 1:\n\nQuestion ID: B4_Revi...,['https://coursekata-course-assets.s3.us-west-...,College / Statistics and Data Science (ABC),4.77,2.19,1061.0,2422.0,2227.857143,83.571429,7.0
9,10,['https://coursekata.org/preview/book/3cc54c0e...,\nChapter Quiz Page 1:\n\nQuestion ID: C1_Revi...,['https://coursekata-course-assets.s3.us-west-...,College / Statistics and Data Science (ABC),4.96,2.26,639.0,1548.0,5346.5,112.75,4.0


In [11]:
# Write the frame, including the TEXT_FEATURE_* and IMAGE_FEATURE_* columns
# added by the feature-extraction cell, to a new CSV file.
df_chapter_quiz_page_content_all.to_csv("data/chapter_quiz_with_multimodal_features.csv")