In [110]:
from docx import Document
import pandas as pd
import nltk
import numpy as np
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import re

## Teachers Upload their Syllabus 

In [111]:
# Load the document
doc = Document('Syllabus.docx')

# Initialize lists to store table data
co_numbers = []
course_outcomes = []

# Assuming the data is in a table within the Word document
for table in doc.tables:
    for row in table.rows:
        # Get the text from each cell in the row
        cells = [cell.text.strip() for cell in row.cells]
        
        # Skip rows without the expected structure
        if len(cells) == 2:
            co_numbers.append(cells[0])
            course_outcomes.append(cells[1])

# Create a DataFrame
# Directly create DataFrame from extracted table data if column headings are included
df = pd.DataFrame([co_numbers, course_outcomes]).transpose()

# Rename columns only if needed
df.columns = df.iloc[0]  # Set the first row as the header
df = df[1:].reset_index(drop=True)  # Drop the first row and reset the index


In [112]:
df

Unnamed: 0,No.,Course Outcomes
0,CO1,Understand the fundamental principles of image...
1,CO2,Develop proficiency in image enhancement and s...
2,CO3,Develop skills in object detection and recogni...
3,CO4,Apply the image and video analysis approaches ...


In [113]:
# Load the syllabus Word document
doc = Document('Syllabus.docx')

# Initialize variables to store units and their content
units = []
current_unit = None
current_content = []

# Parse through the document paragraphs
for para in doc.paragraphs:
    text = para.text.strip()

    # Check for "Unit" and start a new unit
    if text.startswith("Unit"):
        # Save the previous unit and its content
        if current_unit:
            units.append((current_unit, " ".join(current_content)))
        
        # Start a new unit
        current_unit = text
        current_content = []
    elif current_unit:
        # Check if the paragraph contains Lab Exercises or Reading sections
        if text.startswith("Lab Exercise") or text.startswith("Essential Reading") or text.startswith("Recommended Reading"):
            continue
        # Accumulate content for the current unit
        current_content.append(text)

# Append the last unit
if current_unit:
    units.append((current_unit, " ".join(current_content)))

# Create a DataFrame with columns "Unit", "Contents"
df_units = pd.DataFrame(units, columns=["Unit", "Contents"])

# Extract teaching hours using the specific pattern "Teaching Hours: X"
def extract_hours(contents):
    match = re.search(r"Teaching Hours:\s*(\d+)", contents)
    return int(match.group(1)) if match else None

# Extract content before "Teaching Hours"
def extract_content_before_hours(contents):
    if "Teaching Hours" in contents:
        return contents.split("Teaching Hours")[0].strip()
    return contents.strip()

# Apply content splitting and teaching hours extraction
df_units['Teaching Hours'] = df_units['Contents'].apply(extract_hours)
df_units['Contents'] = df_units['Contents'].apply(extract_content_before_hours)

# Extract Topic from the unit by assuming it's the part of the string after "Unit X:"
def extract_topic(unit):
    # Match unit topic patterns with different possible delimiters
    match = re.search(r"Unit\s*\d+\s*[:\t\s](.+)", unit)
    return match.group(1).strip() if match else ""  # Return empty string if no match


# Apply topic extraction
df_units['Topic'] = df_units['Unit'].apply(extract_topic)

# Clean up "Unit" column to only contain the unit number (e.g., "Unit 1")
df_units['Unit'] = df_units['Unit'].apply(lambda x: re.match(r"Unit\s*\d+", x).group())

# Reorder columns for better readability
df_units = df_units[['Unit', 'Topic', 'Contents', 'Teaching Hours']]

# Display the DataFrame
print(df_units)


     Unit                                              Topic  \
0  Unit 1  Introduction to Digital Image and Video Proces...   
1  Unit 2        Image and Video Enhancement and Restoration   
2  Unit 3                        Image and Video Compression   
3  Unit 4                  Feature Detection and Description   
4  Unit 5                   Object Detection and Recognition   

                                            Contents  Teaching Hours  
0  Digital image representation, Sampling and Qua...              12  
1  Spatial domain-Linear and Non-linear Filtering...              12  
2  Fundamentals of Image Compression: Huffman Cod...              12  
3  Introduction to feature detectors, Point, line...              12  
4  Descriptors: Boundary descriptors - Fourier de...              12  


In [114]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocess text (lowercase for consistency)
df['Course Outcomes'] = df['Course Outcomes'].str.lower()
df_units['Topic'] = df_units['Topic'].str.lower()

# Combine text data for vectorization
all_text = pd.concat([df['Course Outcomes'], df_units['Topic']])

# Vectorize using TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_text)

# Separate vectors for Course Outcomes and Topics
course_outcome_vectors = tfidf_matrix[:len(df)]
topic_vectors = tfidf_matrix[len(df):]

# Compute cosine similarity
similarity_matrix = cosine_similarity(topic_vectors, course_outcome_vectors)

# Find the most similar course outcome for each unit
best_matches = []
for i, topic in enumerate(df_units['Topic']):
    # Get the index of the most similar course outcome
    best_match_index = similarity_matrix[i].argmax()
    best_match_score = similarity_matrix[i][best_match_index]
    
    # Append the best match and its score
    best_matches.append({
        'Matched Course Outcome': df.iloc[best_match_index]['Course Outcomes'],
        'Similarity Score': best_match_score
    })

# Convert matches to a DataFrame
matches_df = pd.DataFrame(best_matches)

# Add the matched course outcome and score to df_units
df_units['Course Outcomes'] = matches_df['Matched Course Outcome']
df_units['Similarity Score'] = matches_df['Similarity Score']

# Display the updated DataFrame
print(df_units)


df_units.to_excel("Extracted_Units_Updated.xlsx", index=False)

     Unit                                              Topic  \
0  Unit 1  introduction to digital image and video proces...   
1  Unit 2        image and video enhancement and restoration   
2  Unit 3                        image and video compression   
3  Unit 4                  feature detection and description   
4  Unit 5                   object detection and recognition   

                                            Contents  Teaching Hours  \
0  Digital image representation, Sampling and Qua...              12   
1  Spatial domain-Linear and Non-linear Filtering...              12   
2  Fundamentals of Image Compression: Huffman Cod...              12   
3  Introduction to feature detectors, Point, line...              12   
4  Descriptors: Boundary descriptors - Fourier de...              12   

                                     Course Outcomes  Similarity Score  
0  apply the image and video analysis approaches ...          0.232597  
1  develop proficiency in image enha

  df_units.to_excel("Extracted_Units_Updated.xlsx", index=False)


## Extract Verbs from the Course Outcomes

In [115]:
course_outcomes=[]
for i in range(len(df)+1):
    data=df_units['Course Outcomes'].iloc[i]
    course_outcomes.append(data)

In [116]:
course_outcomes

['apply the image and video analysis approaches to solve real world problems',
 'develop proficiency in image enhancement and segmentation',
 'understand the fundamental principles of image and video analysis',
 'develop skills in object detection and recognition',
 'develop skills in object detection and recognition']

In [117]:
verbs=['VB','VBP','VBD','VBG','VBN']

In [118]:
import nltk



course_verbs = []
for i in range(len(course_outcomes)):
    review = course_outcomes[i]
    review = review.split()
    review = nltk.pos_tag(review)
    print(review)
    filtered_verbs = [word for word, tag in review if tag in verbs]
    course_verbs.append(filtered_verbs)

# Assign the collected verbs list to the DataFrame column
df_units['Verbs'] = course_verbs


[('apply', 'VB'), ('the', 'DT'), ('image', 'NN'), ('and', 'CC'), ('video', 'NN'), ('analysis', 'NN'), ('approaches', 'NNS'), ('to', 'TO'), ('solve', 'VB'), ('real', 'JJ'), ('world', 'NN'), ('problems', 'NNS')]
[('develop', 'VB'), ('proficiency', 'NN'), ('in', 'IN'), ('image', 'NN'), ('enhancement', 'NN'), ('and', 'CC'), ('segmentation', 'NN')]
[('understand', 'VB'), ('the', 'DT'), ('fundamental', 'JJ'), ('principles', 'NNS'), ('of', 'IN'), ('image', 'NN'), ('and', 'CC'), ('video', 'NN'), ('analysis', 'NN')]
[('develop', 'VB'), ('skills', 'NNS'), ('in', 'IN'), ('object', 'JJ'), ('detection', 'NN'), ('and', 'CC'), ('recognition', 'NN')]
[('develop', 'VB'), ('skills', 'NNS'), ('in', 'IN'), ('object', 'JJ'), ('detection', 'NN'), ('and', 'CC'), ('recognition', 'NN')]


In [119]:
course_verbs

[['apply', 'solve'], ['develop'], ['understand'], ['develop'], ['develop']]

In [120]:
df_units

Unnamed: 0,Unit,Topic,Contents,Teaching Hours,Course Outcomes,Similarity Score,Verbs
0,Unit 1,introduction to digital image and video proces...,"Digital image representation, Sampling and Qua...",12,apply the image and video analysis approaches ...,0.232597,"[apply, solve]"
1,Unit 2,image and video enhancement and restoration,Spatial domain-Linear and Non-linear Filtering...,12,develop proficiency in image enhancement and s...,0.350995,[develop]
2,Unit 3,image and video compression,Fundamentals of Image Compression: Huffman Cod...,12,understand the fundamental principles of image...,0.228646,[understand]
3,Unit 4,feature detection and description,"Introduction to feature detectors, Point, line...",12,develop skills in object detection and recogni...,0.196493,[develop]
4,Unit 5,object detection and recognition,Descriptors: Boundary descriptors - Fourier de...,12,develop skills in object detection and recogni...,0.682227,[develop]


In [121]:
df_assessments=pd.read_excel("Verbs-Assesments Grouped.xlsx")

In [122]:
df_assessments

Unnamed: 0.1,Unnamed: 0,Verbs,Assessments
0,0,(ann),[]
1,1,(cnn),[]
2,2,(rnn),[]
3,3,according,"['Activities such as problem sets, performance..."
4,4,adequacy,"['Activities such as journals, diaries, critiq..."
...,...,...,...
77,77,using,"['Activities such as research projects, musica..."
78,78,utilizing,"['Activities such as case studies, critiques, ..."
79,79,verify,"['Activities such as journals, diaries, critiq..."
80,80,visualize,"['Activities such as case studies, critiques, ..."


In [123]:
df_assessments['Verbs']

0         (ann)
1         (cnn)
2         (rnn)
3     according
4      adequacy
        ...    
77        using
78    utilizing
79       verify
80    visualize
81      writing
Name: Verbs, Length: 82, dtype: object

In [124]:
flattened_course_verbs = [verb for sublist in course_verbs for verb in sublist]

In [125]:
flattened_course_verbs

['apply', 'solve', 'develop', 'understand', 'develop', 'develop']

In [126]:
df_assessments['Verbs']

0         (ann)
1         (cnn)
2         (rnn)
3     according
4      adequacy
        ...    
77        using
78    utilizing
79       verify
80    visualize
81      writing
Name: Verbs, Length: 82, dtype: object

In [127]:
# Convert the 'verbs' column to lists of individual words, ignoring NaNs
flattened_assessment_verbs = []
for item in df_assessments['Verbs'].dropna():
    flattened_assessment_verbs.extend([verb.strip() for verb in item.split(',')])

flattened_assessment_verbs

['(ann)',
 '(cnn)',
 '(rnn)',
 'according',
 'adequacy',
 'admit',
 'advanced',
 'analyse',
 'analysis.',
 'analyze',
 'apply',
 'applying',
 'are',
 'associated',
 'based',
 'be',
 'classified',
 'classify',
 'computing',
 'constructing',
 'create',
 'deciding',
 'demonstrate',
 'describe',
 'describethemaintechnologiesandmethodscurrentlyusedincreating',
 'designing',
 'develop',
 'developing',
 'differentiate',
 'displayed.',
 'distributed',
 'do',
 'estimate',
 'evaluate',
 'evaluatethechallengesintraining',
 'examine',
 'forecast',
 'formulate',
 'formulated',
 'gain',
 'generated',
 'given',
 'grasp',
 'identify',
 'illustrate',
 'implementing',
 'including',
 'infer',
 'inherent',
 'integrating',
 'interpret',
 'involved',
 'know',
 'knowledgeof',
 'learning',
 'linking',
 'model.',
 'organized',
 'perform',
 'present',
 'processing',
 'programming',
 'reporting',
 'represent',
 'sampling',
 'solve',
 'solved',
 'solving',
 'specified',
 'surrounding',
 'testing',
 'thei',
 'thin

In [128]:

# # Load SpaCy model
# import numpy as np
# import spacy
# from sklearn.metrics.pairwise import cosine_similarity

# # Load SpaCy model
# nlp = spacy.load("en_core_web_md")

# # Function to get the SpaCy vector for a word
# def get_word_vector(word):
#     doc = nlp(word)
#     if doc.has_vector:  # Check if the word has a vector
#         return doc.vector
#     else:
#         return np.zeros(nlp.vocab.vectors_length)  # Return a zero vector if word not in vocabulary


# # Convert course verbs to vectors
# course_verb_vectors = np.array([get_word_vector(verb) for verb in flattened_course_verbs])

# # Convert assessment verbs to vectors
# assessment_verb_vectors = np.array([get_word_vector(verb) for verb in flattened_assessment_verbs])

# # Set similarity threshold
# threshold = 0.5  

# # Dictionary to store matching assessments
# matching_assessments = {}

# for i, course_verb in enumerate(flattened_course_verbs):
#     matching_assessments[course_verb] = []
    
#     # Calculate cosine similarity between the course verb and each assessment verb
#     sim_scores = cosine_similarity([course_verb_vectors[i]], assessment_verb_vectors).flatten()
    
#     # Find assessment verbs with similarity scores above the threshold
#     for j, score in enumerate(sim_scores):
#         if score > threshold:
#             # Append the assessment description from df_assessments for the matching verb
#             matching_assessments[course_verb].append(df_assessments.iloc[j]['Assessments'])

# # Print results
# for verb, assessments in matching_assessments.items():
#     print(f"Verb: {verb}")
#     print(f"Assessments for course verb '{verb}': {assessments}\n")



In [129]:
print(len(flattened_assessment_verbs))
print(len(df_assessments))

82
82


In [130]:
import numpy as np
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity

# Load SpaCy model
nlp = spacy.load("en_core_web_md")

# Function to get the SpaCy vector for a word
def get_word_vector(word):
    doc = nlp(word)
    if doc.has_vector:
        return doc.vector
    else:
        return np.zeros(nlp.vocab.vectors_length)  # Return zero vector if word not in vocabulary

# Expand 'Verbs' column to match a flattened list
expanded_rows = []
for index, row in df_assessments.iterrows():
    verbs = row['Verbs']
    if pd.isna(verbs):  # Skip rows with NaN in 'Verbs'
        continue
    verb_list = [verb.strip() for verb in verbs.split(',')]  # Split multiple verbs into a list
    for verb in verb_list:
        expanded_rows.append({'Verbs': verb, 'Assessments': row['Assessments']})

# Create a new expanded DataFrame
expanded_df_assessments = pd.DataFrame(expanded_rows)


In [131]:
expanded_df_assessments

Unnamed: 0,Verbs,Assessments
0,(ann),[]
1,(cnn),[]
2,(rnn),[]
3,according,"['Activities such as problem sets, performance..."
4,adequacy,"['Activities such as journals, diaries, critiq..."
...,...,...
77,using,"['Activities such as research projects, musica..."
78,utilizing,"['Activities such as case studies, critiques, ..."
79,verify,"['Activities such as journals, diaries, critiq..."
80,visualize,"['Activities such as case studies, critiques, ..."


In [132]:
# Flattened assessment verbs
flattened_assessment_verbs = expanded_df_assessments['Verbs'].tolist()

# Convert course verbs and assessment verbs to vectors
course_verb_vectors = np.array([get_word_vector(verb) for verb in flattened_course_verbs])
assessment_verb_vectors = np.array([get_word_vector(verb) for verb in flattened_assessment_verbs])

# Similarity threshold
threshold = 0.5

# Dictionary to store matching assessments
matching_assessments = {}

# Function to normalize assessments (you can extend this for more complex cases)
def normalize_assessment(assessment):
    # Strip leading/trailing spaces and convert to lowercase
    return assessment.strip().lower()

for i, course_verb in enumerate(flattened_course_verbs):
    matching_assessments[course_verb] = set()  # Use a set to avoid duplicate assessments

    # Calculate cosine similarity between the course verb and each assessment verb
    sim_scores = cosine_similarity([course_verb_vectors[i]], assessment_verb_vectors).flatten()

    # Find assessment verbs with similarity scores above the threshold
    for j, score in enumerate(sim_scores):
        if score > threshold:
            if 0 <= j < len(expanded_df_assessments):  # Ensure index is valid
                normalized_assessment = normalize_assessment(expanded_df_assessments.iloc[j]['Assessments'])
                matching_assessments[course_verb].add(normalized_assessment)  # Store normalized assessment

# Print results
# Create an empty DataFrame with specified columns
df_matchass = pd.DataFrame(columns=["Verbs", "Assessments"])

# Iterate through matching assessments
for verb, assessments in matching_assessments.items():
    print(f"Verb: {verb}")
    if assessments:
        print("Matching Assessments:")
        # Sort and remove duplicates (optional, for better display)
        unique_assessments = sorted(set(assessments))
        print("\n".join(f"- {assessment}" for assessment in unique_assessments))
        
        # Create a new row as a DataFrame
        new_row = pd.DataFrame({
            "Verbs": [verb],
            "Assessments": [", ".join(unique_assessments)]
        })
        
        # Concatenate the new row to the existing DataFrame
        df_matchass = pd.concat([df_matchass, new_row], ignore_index=True)
    else:
        print("No matching assessments found.")
    print("\n")

# Display the resulting DataFrame
print(df_matchass)





Verb: apply
Matching Assessments:
- ['activities such as case studies, critiques, labs, papers, projects, debates, or concept maps that require students to: discriminate or select relevant and irrelevant parts • determine how elements function together • determine bias, values, or underlying intent in presented material •', 'activities such as papers, exams, problem sets, class discussions, or concept maps that require students to: summarize readings, films, or speeches • compare and contrast two or more theories, events, or processes • classify or categorize cases, elements, or events using established criteria • paraphrase documents or speeches • find or identify examples or illustrations of a concept or principle •', 'activities such as problem sets, performances, labs, prototyping, or simulations that require students to: use procedures to solve or complete familiar or unfamiliar tasks • determine which procedure(s) are most appropriate for a given task •', 'objective test items su

In [133]:
#df_matchass.to_excel('match.xlsx')

In [134]:
# def clean_assessments(assessments):
#     # Split assessments by commas, remove spaces, and convert to set for uniqueness
#     assessment_list = [a.strip() for a in assessments.split(',')]
#     unique_assessments = set(assessment_list)  # Using set to ensure uniqueness
#     return ', '.join(sorted(unique_assessments))  # Convert back to string


In [135]:
# df_matchass['unique_assessments'] = df_matchass['Assessments'].apply(clean_assessments)

# # Display the cleaned DataFrame
# #print(df[['verb', 'unique_assessments']])

In [136]:
df_matchass
df_matchass.to_excel('match.xlsx')

  df_matchass.to_excel('match.xlsx')


In [137]:
import pandas as pd
import re  # To escape special characters in the verb

# Sample data
# df_units: Table 1 with a list of verbs in the "Verbs" column
# df_matchass: Table 2 with verbs and corresponding assessments

# Assuming df_units has a column "Verbs" containing lists of verbs
# and df_matchass has a column "Verbs" (string) and "Assessments" (string)

# Create an empty list to store assessments for each row in df_units
assessments = []

# Iterate over each row in df_units
for index, row in df_units.iterrows():
    # Initialize a list to store matched assessments for each verb list
    matched_assessments = []
    
    # Iterate over the list of verbs in the "Verbs" column of df_units
    for verb in row['Verbs']:
        # Escape special characters in the verb for regex matching
        escaped_verb = re.escape(verb)
        
        # Check if the verb exists in df_matchass
        matching_rows = df_matchass[df_matchass['Verbs'].str.contains(escaped_verb, case=False, na=False)]
        
        # If there are matches, collect the corresponding assessments
        if not matching_rows.empty:
            matched_assessments.extend(matching_rows['Assessments'].tolist())
    
    # Remove duplicate assessments if any and add to the list
    matched_assessments = list(set(matched_assessments))
    
    # Append the matched assessments to the assessments list
    assessments.append(matched_assessments)

# Add the new "Assessment" column to df_units
df_units['Assessments'] = assessments

# Now, df_units will have an additional "Assessments" column with the matched assessments


In [138]:
# Assuming df_units is the DataFrame you provided

# Convert the list of characters in the "Verbs" column back to strings
df_units['Verbs'] = df_units['Verbs'].apply(lambda x: ''.join(x) if isinstance(x, list) else x)

# Check the updated DataFrame
df_units


Unnamed: 0,Unit,Topic,Contents,Teaching Hours,Course Outcomes,Similarity Score,Verbs,Assessments
0,Unit 1,introduction to digital image and video proces...,"Digital image representation, Sampling and Qua...",12,apply the image and video analysis approaches ...,0.232597,applysolve,"[['activities such as case studies, critiques,..."
1,Unit 2,image and video enhancement and restoration,Spatial domain-Linear and Non-linear Filtering...,12,develop proficiency in image enhancement and s...,0.350995,develop,"[['activities such as case studies, critiques,..."
2,Unit 3,image and video compression,Fundamentals of Image Compression: Huffman Cod...,12,understand the fundamental principles of image...,0.228646,understand,"[['activities such as case studies, critiques,..."
3,Unit 4,feature detection and description,"Introduction to feature detectors, Point, line...",12,develop skills in object detection and recogni...,0.196493,develop,"[['activities such as case studies, critiques,..."
4,Unit 5,object detection and recognition,Descriptors: Boundary descriptors - Fourier de...,12,develop skills in object detection and recogni...,0.682227,develop,"[['activities such as case studies, critiques,..."


In [149]:
df_units.to_excel('unit_co_ma.xlsx')

  df_units.to_excel('unit_co_ma.xlsx')


In [139]:
# # Function to normalize text for consistency
# def normalize_text(text):
#     return text.strip().lower()

# # Global set to track unique assessments
# global_unique_assessments = set()

# # Dictionary to store unique matching assessments for each verb
# matching_assessments = {}

# for i, course_verb in enumerate(flattened_course_verbs):
#     # Initialize set for the current verb to store its unique assessments
#     matching_assessments[course_verb] = set()
    
#     # Calculate similarity scores between current course verb and assessment verbs
#     sim_scores = cosine_similarity([course_verb_vectors[i]], assessment_verb_vectors).flatten()
    
#     # Find assessments with scores above the threshold
#     for j, score in enumerate(sim_scores):
#         if score > threshold:
#             # Normalize the assessment text for uniformity
#             assessment = normalize_text(expanded_df_assessments.iloc[j]['Assessments'])
            
#             # Check if this assessment is globally unique
#             if assessment not in global_unique_assessments:
#                 # Add to global unique set and the current verb's set
#                 global_unique_assessments.add(assessment)
#                 matching_assessments[course_verb].add(assessment)

# # Display results
# for verb, assessments in matching_assessments.items():
#     print(f"Verb: {verb}")
#     if assessments:
#         print("Matching Assessments:")
#         for assessment in sorted(assessments):  # Sort for better readability
#             print(f"- {assessment}")
#     else:
#         print("No matching assessments found.")
#     print("\n")


In [None]:
df

Simplification shit

In [140]:
from transformers import pipeline
import pandas as pd
import re

# Example DataFrame
# df_matchass = pd.DataFrame({
#     "Verbs": ["apply", "solve", "develop", "understand"],
#     "Assessments": [
#         "['activities such as case studies, critiques, labs, papers, projects, debates, or concept maps that require students to: discriminate or select relevant and irrelevant parts • determine how elements function together • determine bias, values, or underlying intent in presented material •']",
#         "['activities such as papers, exams, problem sets, class discussions, or concept maps that require students to: summarize readings, films, or speeches • compare and contrast two or more theories, events, or processes • classify or categorize cases, elements, or events using established criteria •']",
#         "['activities such as research projects, musical compositions, performances, essays, business plans, website designs, or set designs that require students to: make, build, design or generate something new •']",
#         "['activities such as journals, diaries, critiques, problem sets, product reviews, or studies that require students to: test, monitor, judge, or critique readings, performances, or products against established criteria or standards']"
#     ]
# })

# Load a pre-trained BERT-based pipeline for keyphrase extraction
keyword_extractor = pipeline("ner", model="dslim/bert-base-NER")

# Preprocess the Assessments column
def preprocess_text(text):
    # Retain content but remove unnecessary brackets or special formatting
    text = re.sub(r"[\[\]']", "", text)  # Remove square brackets and single quotes
    return text


# Extract keywords using BERT
def extract_keywords(text):
    # Preprocess the input text
    clean_text = preprocess_text(text)
    # Use the BERT NER pipeline
    ner_results = keyword_extractor(clean_text)
    
    # Debugging: Print NER results
    print(f"Text: {clean_text}")
    print(f"NER Results: {ner_results}")
    
    # Extract unique entities as keywords
    keywords = {result['word'] for result in ner_results if result['entity'].startswith("B")}
    return ", ".join(sorted(keywords)) if keywords else "No keywords found"

# Apply the extraction to the DataFrame
df_matchass["Simplified_Assessments"] = df_matchass["Assessments"].apply(extract_keywords)

# Display the updated DataFrame
df_matchass





Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Text: activities such as case studies, critiques, labs, papers, projects, debates, or concept maps that require students to: discriminate or select relevant and irrelevant parts • determine how elements function together • determine bias, values, or underlying intent in presented material •, activities such as papers, exams, problem sets, class discussions, or concept maps that require students to: summarize readings, films, or speeches • compare and contrast two or more theories, events, or processes • classify or categorize cases, elements, or events using established criteria • paraphrase documents or speeches • find or identify examples or illustrations of a concept or principle •, activities such as problem sets, performances, labs, prototyping, or simulations that require students to: use procedures to solve or complete familiar or unfamiliar tasks • determine which procedure(s) are most appropriate for a given task •, objective test items such as fill-in-the-blank, matching, lab

Unnamed: 0,Verbs,Assessments,Simplified_Assessments
0,apply,"['activities such as case studies, critiques, ...",No keywords found
1,solve,"['activities such as case studies, critiques, ...",No keywords found
2,develop,"['activities such as case studies, critiques, ...",No keywords found
3,understand,"['activities such as case studies, critiques, ...",No keywords found


In [141]:
from sentence_transformers import SentenceTransformer, util

# Initialize a sentence transformer model
model = SentenceTransformer('all-mpnet-base-v2')

def extract_keywords_embeddings(text):
    clean_text = preprocess_text(text)  # Clean the text
    # Break text into sentences for better extraction
    sentences = clean_text.split("•")  # Split based on bullet points
    # Encode all sentences
    embeddings = model.encode(sentences, convert_to_tensor=True)
    
    # Find the most relevant keywords or phrases
    keywords = util.semantic_search(embeddings, embeddings, top_k=3)
    
    # Extract unique sentences as top keywords
    top_keywords = {sentences[item['corpus_id']].strip() for item in keywords[0]}
    return ", ".join(top_keywords) if top_keywords else "No keywords found"

# Apply the function to your DataFrame
df_matchass["Simplified_Assessments"] = df_matchass["Assessments"].apply(extract_keywords_embeddings)




In [142]:
df_matchass

Unnamed: 0,Verbs,Assessments,Simplified_Assessments
0,apply,"['activities such as case studies, critiques, ...",", activities such as case studies, critiques, ..."
1,solve,"['activities such as case studies, critiques, ...",", activities such as case studies, critiques, ..."
2,develop,"['activities such as case studies, critiques, ...",", activities such as case studies, critiques, ..."
3,understand,"['activities such as case studies, critiques, ...",", activities such as case studies, critiques, ..."


In [143]:
def preprocess_text(text):
    # Remove square brackets but retain other formatting
    text = re.sub(r"[\[\]']", "", text)
    # Convert to lowercase for better model compatibility
    text = text.lower()
    return text


In [144]:
from keybert import KeyBERT

# Initialize KeyBERT
kw_model = KeyBERT()

def extract_keyphrases_keybert(text):
    clean_text = preprocess_text(text)
    # Extract keyphrases with a range of 1 to 3 words
    keywords = kw_model.extract_keywords(clean_text, keyphrase_ngram_range=(1, 3), stop_words='english', top_n=5)
    return ", ".join([kw[0] for kw in keywords]) if keywords else "No keywords found"

df_matchass["Simplified_Assessments"] = df_matchass["Assessments"].apply(extract_keyphrases_keybert)


In [145]:
def debug_extraction(text):
    clean_text = preprocess_text(text)
    print(f"Clean Text: {clean_text}")
    return extract_keyphrases_keybert(clean_text)

df_matchass["Simplified_Assessments"] = df_matchass["Assessments"].apply(debug_extraction)


Clean Text: activities such as case studies, critiques, labs, papers, projects, debates, or concept maps that require students to: discriminate or select relevant and irrelevant parts • determine how elements function together • determine bias, values, or underlying intent in presented material •, activities such as papers, exams, problem sets, class discussions, or concept maps that require students to: summarize readings, films, or speeches • compare and contrast two or more theories, events, or processes • classify or categorize cases, elements, or events using established criteria • paraphrase documents or speeches • find or identify examples or illustrations of a concept or principle •, activities such as problem sets, performances, labs, prototyping, or simulations that require students to: use procedures to solve or complete familiar or unfamiliar tasks • determine which procedure(s) are most appropriate for a given task •, objective test items such as fill-in-the-blank, matchin

In [146]:
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_and_extract(text):
    clean_text = preprocess_text(text)
    try:
        summary = summarizer(clean_text, max_length=50, min_length=10, do_sample=False)[0]['summary_text']
        print(f"Summary: {summary}")
        return summary
    except Exception as e:
        print(f"Error summarizing text: {e}")
        return "Error in summarization"

df_matchass["Simplified_Assessments"] = df_matchass["Assessments"].apply(summarize_and_extract)


Summary:  activities such as case studies, critiques, labs, papers, projects, debates, or concept maps that require students to: discriminate or select relevant and irrelevant parts • determine how elements function together • determine bias, values, or underlying intent in
Summary:  activities such as case studies, critiques, labs, papers, projects, debates, or concept maps that require students to: discriminate or select relevant and irrelevant parts • determine how elements function together • determine bias, values, or underlying intent in
Error summarizing text: index out of range in self
Error summarizing text: index out of range in self


Extracted Units and Contents

In [147]:
# import pandas as pd
# import re
# from docx import Document

# # Load the syllabus Word document
# doc = Document('Syllabus.docx')

# # Initialize variables to store units and their content
# units = []
# current_unit = None
# current_content = []

# # Parse through the document paragraphs
# for para in doc.paragraphs:
#     text = para.text.strip()

#     # Check for "Unit" and start a new unit
#     if text.startswith("Unit"):
#         # Save the previous unit and its content
#         if current_unit:
#             units.append((current_unit, " ".join(current_content)))
        
#         # Start a new unit
#         current_unit = text
#         current_content = []
#     elif current_unit:
#         # Check if the paragraph contains Lab Exercises or Reading sections
#         if text.startswith("Lab Exercise") or text.startswith("Essential Reading") or text.startswith("Recommended Reading"):
#             continue
#         # Accumulate content for the current unit
#         current_content.append(text)

# # Append the last unit
# if current_unit:
#     units.append((current_unit, " ".join(current_content)))

# # Create a DataFrame with columns "Unit", "Contents"
# df_units = pd.DataFrame(units, columns=["Unit", "Contents"])

# # Extract teaching hours using the specific pattern "Teaching Hours: X"
# def extract_hours(contents):
#     match = re.search(r"Teaching Hours:\s*(\d+)", contents)
#     return int(match.group(1)) if match else None

# # Extract content before "Teaching Hours"
# def extract_content_before_hours(contents):
#     if "Teaching Hours" in contents:
#         return contents.split("Teaching Hours")[0].strip()
#     return contents.strip()

# # Apply content splitting and teaching hours extraction
# df_units['Teaching Hours'] = df_units['Contents'].apply(extract_hours)
# df_units['Contents'] = df_units['Contents'].apply(extract_content_before_hours)

# # Extract Topic from the unit by assuming it's the part of the string after "Unit X:"
# def extract_topic(unit):
#     # Match unit topic patterns with different possible delimiters
#     match = re.search(r"Unit\s*\d+\s*[:\t\s](.+)", unit)
#     return match.group(1).strip() if match else ""  # Return empty string if no match


# # Apply topic extraction
# df_units['Topic'] = df_units['Unit'].apply(extract_topic)

# # Clean up "Unit" column to only contain the unit number (e.g., "Unit 1")
# df_units['Unit'] = df_units['Unit'].apply(lambda x: re.match(r"Unit\s*\d+", x).group())

# # Reorder columns for better readability
# df_units = df_units[['Unit', 'Topic', 'Contents', 'Teaching Hours']]

# # Display the DataFrame
# print(df_units)

# # Save the DataFrame to an Excel file if needed
# df_units.to_excel("Extracted_Units_Updated3.xlsx", index=False)


In [148]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# # Preprocess text (lowercase for consistency)
# df['Course Outcomes'] = df['Course Outcomes'].str.lower()
# df_units['Topic'] = df_units['Topic'].str.lower()

# # Combine text data for vectorization
# all_text = pd.concat([df['Course Outcomes'], df_units['Topic']])

# # Vectorize using TF-IDF
# vectorizer = TfidfVectorizer()
# tfidf_matrix = vectorizer.fit_transform(all_text)

# # Separate vectors for Course Outcomes and Topics
# course_outcome_vectors = tfidf_matrix[:len(df)]
# topic_vectors = tfidf_matrix[len(df):]

# # Compute cosine similarity
# similarity_matrix = cosine_similarity(topic_vectors, course_outcome_vectors)

# # Find the most similar course outcome for each unit
# best_matches = []
# for i, topic in enumerate(df_units['Topic']):
#     # Get the index of the most similar course outcome
#     best_match_index = similarity_matrix[i].argmax()
#     best_match_score = similarity_matrix[i][best_match_index]
    
#     # Append the best match and its score
#     best_matches.append({
#         'Matched Course Outcome': df.iloc[best_match_index]['Course Outcomes'],
#         'Similarity Score': best_match_score
#     })

# # Convert matches to a DataFrame
# matches_df = pd.DataFrame(best_matches)

# # Add the matched course outcome and score to df_units
# df_units['Matched Course Outcome'] = matches_df['Matched Course Outcome']
# df_units['Similarity Score'] = matches_df['Similarity Score']

# # Display the updated DataFrame
# print(df_units)


