In [None]:
# Install all required libraries (including new openai)
!pip install -q openai pandas spacy sentence-transformers openpyxl

# Download spaCy model
!python -m spacy download en_core_web_sm


In [None]:
import os
import math
import zipfile
import spacy
import pandas as pd
from collections import Counter
from sentence_transformers import SentenceTransformer, util
from openai import OpenAI

# Initialize OpenAI GPT-4o client.
# The key is read from the OPENAI_API_KEY environment variable so that no
# credential is stored in the notebook. Any key that was previously pasted
# here must be treated as leaked and revoked.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it (or set it via os.environ / a "
        "secrets manager) before running this notebook."
    )
client = OpenAI(api_key=_api_key)

# Load models: spaCy pipeline for noun chunks, sentence encoder for embeddings.
nlp = spacy.load("en_core_web_sm")
bert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Define paths: dataset archive, extraction root, and the two input folders.
zip_path = "/content/KQ_FEED_Final_Meaningful_Lectures_Dataset.zip"
extract_dir = "/content/dataset"
LECTURE_PATH = "/content/dataset/final_meaningful_dataset/lectures"
OBJECTIVE_PATH = "/content/dataset/final_meaningful_dataset/objectives"


In [None]:
# Unpack the dataset archive into the extraction root.
with zipfile.ZipFile(zip_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extract_dir)

# List every directory that exists under the extraction root after unpacking.
print("Extracted files and folders:")
for root, dirs, files in os.walk(top=extract_dir):
    print(root)


In [None]:
def get_summary(text, target_words=1000, max_chars=40000, max_tokens=None):
    """Summarize a lecture transcript with GPT-4o.

    Args:
        text: Full lecture text; only the first ``max_chars`` characters are sent.
        target_words: Approximate summary length requested in the prompt.
        max_chars: Character cap applied to ``text`` before it is sent.
        max_tokens: Completion token limit. When None it defaults to
            ``2 * target_words`` so the limit does not cut the summary off
            before the requested length (the previous fixed limit of 600
            tokens was too small for a ~1000-word answer).

    Returns:
        The stripped summary text, or the sentinel string "ERROR" if the
        request or the response handling raises.
    """
    if max_tokens is None:
        # A word is typically more than one token, so leave generous headroom.
        max_tokens = 2 * target_words

    prompt = (
        f"Summarize the following university-level lecture into around {target_words} words:"
        f"\n\n{text[:max_chars]}"
    )
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that summarizes educational lectures."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=max_tokens
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort by design: callers check for the "ERROR" sentinel.
        print("Error:", e)
        return "ERROR"


In [None]:
def semantic_similarity(ref, gen):
    """Return the cosine similarity between the sentence embeddings of two texts.

    Both texts are encoded with the module-level ``bert_model``; the 1x1
    similarity tensor is unwrapped into a plain Python scalar.
    """
    reference_embedding = bert_model.encode(ref, convert_to_tensor=True)
    generated_embedding = bert_model.encode(gen, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(reference_embedding, generated_embedding)
    return similarity.item()

def extract_concepts(text):
    """Return the set of lower-cased noun chunks that the ``nlp`` pipeline finds in ``text``."""
    return {noun_chunk.text.lower() for noun_chunk in nlp(text).noun_chunks}

def jaccard_similarity(set1, set2):
    """Return |set1 ∩ set2| / |set1 ∪ set2|, or 0.0 when either set is empty."""
    if set1 and set2:
        shared = set1 & set2
        combined = set1 | set2
        return len(shared) / len(combined)
    return 0.0

def entropy(tokens):
    """Return the Shannon entropy (in bits) of the token frequency distribution.

    An empty token list yields 0 because there are no frequencies to sum over.
    """
    n_tokens = len(tokens)
    accumulated = 0
    for occurrences in Counter(tokens).values():
        probability = occurrences / n_tokens
        accumulated += probability * math.log2(probability)
    return -accumulated
# First attempt at the KQS: a linear combination of the three component scores.
def compute_kqs(sem_score, kg_score, mi_score, w1=0.3, w2=0.3, w3=0.4):
    """Combine the three component scores into one score clamped to [0, 1].

    The weighted sum is divided by the sum of the weights, i.e. it is a
    weighted average. Dividing by a fixed 3.0 (as before) squeezed the result
    into roughly [0, 0.33] because the default weights already sum to 1.

    Args:
        sem_score: Semantic similarity component.
        kg_score: Concept-overlap component.
        mi_score: Entropy-overlap component.
        w1, w2, w3: Weights for the three components, in that order.

    Returns:
        The weighted average clamped to [0.0, 1.0]; 0.0 when the weights do
        not sum to a positive number.
    """
    weight_total = w1 + w2 + w3
    if weight_total <= 0:
        return 0.0
    raw_score = w1 * sem_score + w2 * kg_score + w3 * mi_score
    return max(0.0, min(raw_score / weight_total, 1.0))  # Clamp to 0–1


In [None]:
# Score every lecture against its objectives; one result row per lecture.
results = []

lecture_files = sorted(os.listdir(LECTURE_PATH))
objective_files = sorted(os.listdir(OBJECTIVE_PATH))

# Files are paired purely by sorted position and zip() stops at the shorter
# list, so make a count mismatch visible instead of silently dropping files.
if len(lecture_files) != len(objective_files):
    print(
        f"Warning: {len(lecture_files)} lecture files but "
        f"{len(objective_files)} objective files; unpaired files are ignored."
    )

for lec_file, obj_file in zip(lecture_files, objective_files):
    lec_path = os.path.join(LECTURE_PATH, lec_file)
    obj_path = os.path.join(OBJECTIVE_PATH, obj_file)

    with open(lec_path, 'r', encoding='utf-8') as f:
        lecture_text = f.read()
    with open(obj_path, 'r', encoding='utf-8') as f:
        objectives = f.read().splitlines()

    gen_summary = get_summary(lecture_text)
    if gen_summary == "ERROR":
        # get_summary returns this sentinel on failure; scoring the literal
        # string "ERROR" would put meaningless rows into the results.
        continue

    # Semantic similarity between the objectives and the generated summary.
    objectives_text = " ".join(objectives)
    sem_score = semantic_similarity(objectives_text, gen_summary)

    # Noun-chunk overlap between objectives and summary.
    kg_ref = extract_concepts(objectives_text)
    kg_gen = extract_concepts(gen_summary)
    kg_score = jaccard_similarity(kg_ref, kg_gen)

    # Entropy-overlap term: H(lecture) + H(summary) minus the entropy of the
    # concatenated token stream, scaled by the larger individual entropy.
    lecture_tokens = lecture_text.lower().split()
    summary_tokens = gen_summary.lower().split()
    H_L = entropy(lecture_tokens)
    H_R = entropy(summary_tokens)
    H_LR = entropy(lecture_tokens + summary_tokens)
    max_entropy = max(H_L, H_R)
    mi_score = (H_L + H_R - H_LR) / max_entropy if max_entropy > 0 else 0.0

    kqs = compute_kqs(sem_score, kg_score, mi_score)

    results.append({
        "Lecture_File": lec_file,
        "Semantic_Score": round(sem_score, 4),
        "KG_Score": round(kg_score, 4),
        "Mutual_Info": round(mi_score, 4),
        "KQS_Score": round(kqs, 4)
    })

# Persist the per-lecture scores and preview the first rows.
df = pd.DataFrame(results)
df.to_excel("KQS_Batch_Results.xlsx", index=False)
df.head()


In [None]:


import zipfile
import os
import openai
import pandas as pd
import math
import spacy
from collections import Counter
from sentence_transformers import SentenceTransformer, util

# ----------------- API KEY -----------------
# The key is read from the OPENAI_API_KEY environment variable so that no
# credential is stored in the notebook. Any key that was previously pasted
# here must be treated as leaked and revoked.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it (or set it via os.environ / a "
        "secrets manager) before running this notebook."
    )
openai.api_key = _api_key
# get_summary() below calls `client`, so create it here instead of relying on
# a client object left over from an earlier cell.
client = openai.OpenAI(api_key=_api_key)

# ----------------- UNZIP DATA -----------------
zip_path = "/content/KQ_FEED_Final_Meaningful_Lectures_Dataset.zip"
extract_dir = "/content/dataset"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

LECTURE_PATH = "/content/dataset/final_meaningful_dataset/lectures"
OBJECTIVE_PATH = "/content/dataset/final_meaningful_dataset/objectives"

# ----------------- MODEL LOADING -----------------
# spaCy pipeline for noun chunks, sentence encoder for embeddings.
nlp = spacy.load("en_core_web_sm")
bert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# ----------------- FUNCTION DEFINITIONS -----------------
def get_summary(text):
    """Ask GPT-4o for a ~150-word summary of the first 3000 characters of ``text``.

    Returns the stripped summary, or the sentinel string "ERROR" if the
    request or the response handling raises.
    """
    prompt = f"Summarize the following university-level lecture into around 150 words:\n\n{text[:3000]}"
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant that summarizes educational lectures."},
        {"role": "user", "content": prompt},
    ]
    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=chat_messages,
            temperature=0.7,
            max_tokens=300,
        )
        summary_text = completion.choices[0].message.content
        return summary_text.strip()
    except Exception as e:
        # Best-effort by design: the batch loop checks for this sentinel.
        print("Error:", e)
        return "ERROR"

def semantic_similarity(ref, gen):
    """Cosine similarity of the ``bert_model`` embeddings of ``ref`` and ``gen``, as a Python scalar."""
    ref_vec = bert_model.encode(ref, convert_to_tensor=True)
    gen_vec = bert_model.encode(gen, convert_to_tensor=True)
    cosine = util.pytorch_cos_sim(ref_vec, gen_vec)
    return cosine.item()

def extract_concepts(text):
    """Collect the distinct lower-cased noun chunks that the ``nlp`` pipeline finds in ``text``."""
    concepts = set()
    for chunk in nlp(text).noun_chunks:
        concepts.add(chunk.text.lower())
    return concepts

def jaccard_similarity(set1, set2):
    """Jaccard index of two sets; defined as 0.0 when either set is empty."""
    both_non_empty = bool(set1) and bool(set2)
    return len(set1 & set2) / len(set1 | set2) if both_non_empty else 0.0

def entropy(tokens):
    """Shannon entropy, in bits, of the empirical token distribution of ``tokens``."""
    size = len(tokens)
    counts = Counter(tokens)
    # Same summation order as the counts' iteration order, one term per distinct token.
    weighted_logs = ((c / size) * math.log2(c / size) for c in counts.values())
    return -sum(weighted_logs)

def compute_kqs(sem_score, kg_score, mi_score, w1=0.33, w2=0.33, w3=0.33):
    """Combine the three component scores into one score clamped to [0, 1].

    The weighted sum is divided by the sum of the weights, i.e. it is a
    weighted average. Dividing by a fixed 3.0 (as before) squeezed the result
    into roughly [0, 0.33] because the default weights already sum to ~1.

    Args:
        sem_score: Semantic similarity component.
        kg_score: Concept-overlap component.
        mi_score: Entropy-overlap component.
        w1, w2, w3: Weights for the three components, in that order.

    Returns:
        The weighted average clamped to [0.0, 1.0]; 0.0 when the weights do
        not sum to a positive number.
    """
    weight_total = w1 + w2 + w3
    if weight_total <= 0:
        return 0.0
    raw_score = w1 * sem_score + w2 * kg_score + w3 * mi_score
    return max(0.0, min(raw_score / weight_total, 1.0))  # Clamp to 0–1

# ----------------- BATCH PROCESSING -----------------
# Score every lecture against its objectives; one result row per lecture.
results = []

lecture_files = sorted(os.listdir(LECTURE_PATH))
objective_files = sorted(os.listdir(OBJECTIVE_PATH))

# Files are paired purely by sorted position and zip() stops at the shorter
# list, so make a count mismatch visible instead of silently dropping files.
if len(lecture_files) != len(objective_files):
    print(
        f"Warning: {len(lecture_files)} lecture files but "
        f"{len(objective_files)} objective files; unpaired files are ignored."
    )

for lec_file, obj_file in zip(lecture_files, objective_files):
    lec_path = os.path.join(LECTURE_PATH, lec_file)
    obj_path = os.path.join(OBJECTIVE_PATH, obj_file)

    with open(lec_path, 'r', encoding='utf-8') as f:
        lecture_text = f.read()
    with open(obj_path, 'r', encoding='utf-8') as f:
        objectives = f.read().splitlines()

    gen_summary = get_summary(lecture_text)
    if gen_summary == "ERROR":
        continue  # Skip in case of API failure

    # Semantic similarity and noun-chunk overlap between objectives and summary.
    objectives_text = " ".join(objectives)
    sem_score = semantic_similarity(objectives_text, gen_summary)
    kg_ref = extract_concepts(objectives_text)
    kg_gen = extract_concepts(gen_summary)
    kg_score = jaccard_similarity(kg_ref, kg_gen)

    # Entropy-overlap term: H(lecture) + H(summary) minus the entropy of the
    # concatenated token stream, scaled by the larger individual entropy.
    # Each text is tokenized once and the token lists are reused.
    lecture_tokens = lecture_text.lower().split()
    summary_tokens = gen_summary.lower().split()
    H_L = entropy(lecture_tokens)
    H_R = entropy(summary_tokens)
    H_LR = entropy(lecture_tokens + summary_tokens)
    max_entropy = max(H_L, H_R)
    mi_score = (H_L + H_R - H_LR) / max_entropy if max_entropy > 0 else 0.0

    kqs = compute_kqs(sem_score, kg_score, mi_score)

    results.append({
        "Lecture_File": lec_file,
        "Semantic_Score": round(sem_score, 4),
        "KG_Score": round(kg_score, 4),
        "Mutual_Info": round(mi_score, 4),
        "KQS_Score": round(kqs, 4)
    })

# ----------------- SAVE RESULTS -----------------
df = pd.DataFrame(results)
df.to_excel("KQS_Batch_Results.xlsx", index=False)
df.head()
