In [23]:
from docx import Document
import pandas as pd
import nltk
import numpy as np
import spacy
from sklearn.metrics.pairwise import cosine_similarity

## Teachers Upload their Syllabus 

In [24]:
# Load the document
doc = Document('Syllabus.docx')

# Initialize lists to store table data
co_numbers = []
course_outcomes = []

# Assuming the data is in a table within the Word document
for table in doc.tables:
    for row in table.rows:
        # Get the text from each cell in the row
        cells = [cell.text.strip() for cell in row.cells]
        
        # Skip rows without the expected structure
        if len(cells) == 2:
            co_numbers.append(cells[0])
            course_outcomes.append(cells[1])

# Create a DataFrame
# Directly create DataFrame from extracted table data if column headings are included
df = pd.DataFrame([co_numbers, course_outcomes]).transpose()

# Rename columns only if needed
df.columns = df.iloc[0]  # Set the first row as the header
df = df[1:].reset_index(drop=True)  # Drop the first row and reset the index


In [25]:
df

Unnamed: 0,No.,Course Outcomes
0,CO1,Understand the fundamental principles of image...
1,CO2,Develop proficiency in image enhancement and s...
2,CO3,Develop skills in object detection and recogni...
3,CO4,Apply the image and video analysis approaches ...


## Extract Verbs from the Course Outcomes

In [26]:
course_outcomes=[]
for i in range(len(df)):
    data=df['Course Outcomes'].iloc[i]
    course_outcomes.append(data)

In [27]:
course_outcomes

['Understand the fundamental principles of image and video analysis',
 'Develop proficiency in image enhancement and segmentation',
 'Develop skills in object detection and recognition',
 'Apply the image and video analysis approaches to solve real world problems']

In [28]:
verbs=['VB','VBP','VBD','VBG','VBN']

In [29]:
course_verbs=[]
for i in range(len(course_outcomes)):
    review=course_outcomes[i]
    review=review.split()
    review=nltk.pos_tag(review)
    print(review)
    review=[word for word,tag in review if tag in verbs]
    course_verbs.append(review)


[('Understand', 'IN'), ('the', 'DT'), ('fundamental', 'JJ'), ('principles', 'NNS'), ('of', 'IN'), ('image', 'NN'), ('and', 'CC'), ('video', 'NN'), ('analysis', 'NN')]
[('Develop', 'NNP'), ('proficiency', 'NN'), ('in', 'IN'), ('image', 'NN'), ('enhancement', 'NN'), ('and', 'CC'), ('segmentation', 'NN')]
[('Develop', 'NNP'), ('skills', 'NNS'), ('in', 'IN'), ('object', 'JJ'), ('detection', 'NN'), ('and', 'CC'), ('recognition', 'NN')]
[('Apply', 'VB'), ('the', 'DT'), ('image', 'NN'), ('and', 'CC'), ('video', 'NN'), ('analysis', 'NN'), ('approaches', 'NNS'), ('to', 'TO'), ('solve', 'VB'), ('real', 'JJ'), ('world', 'NN'), ('problems', 'NNS')]


In [30]:
course_verbs

[[], [], [], ['Apply', 'solve']]

In [31]:
df_assessments=pd.read_excel("Course Assessments.xlsx")

In [32]:
df_assessments

Unnamed: 0.1,Unnamed: 0,No,Course Outcomes,Verbs,Assessments
0,0,CO1,Understand the essence of research and the_x00...,understand,Objective test items such as fill-in-the-blank...
1,1,CO2,Explore the fundamental concepts of data science,,
2,2,CO3,Understand various machine learning algorithms...,"learning, used",Objective test items such as fill-in-the-blank...
3,3,CO4,Learn to think through the ethics surrounding ...,"think, surrounding",Objective test items such as fill-in-the-blank...
4,4,CO5,Create scientific reports according to specifi...,"according, specified",Objective test items such as fill-in-the-blank...
...,...,...,...,...,...
128,128,CO4,Designcomputationalexperimentsfortrainingandev...,"learning, solving",Objective test items such as fill-in-the-blank...
129,129,CO1,Understand the fundamental principles of image...,understand,Objective test items such as fill-in-the-blank...
130,130,CO2,Develop proficiency in image enhancement and s...,develop,Objective test items such as fill-in-the-blank...
131,131,CO3,Develop skills in object detection and recogni...,develop,Objective test items such as fill-in-the-blank...


In [33]:
df_assessments['Verbs']

0                understand
1                       NaN
2            learning, used
3        think, surrounding
4      according, specified
               ...         
128       learning, solving
129              understand
130                 develop
131                 develop
132            apply, solve
Name: Verbs, Length: 133, dtype: object

In [34]:
flattened_course_verbs = [verb for sublist in course_verbs for verb in sublist]

In [35]:
flattened_course_verbs

['Apply', 'solve']

In [36]:
df_assessments['Verbs']

0                understand
1                       NaN
2            learning, used
3        think, surrounding
4      according, specified
               ...         
128       learning, solving
129              understand
130                 develop
131                 develop
132            apply, solve
Name: Verbs, Length: 133, dtype: object

In [37]:
# Convert the 'verbs' column to lists of individual words, ignoring NaNs
flattened_assessment_verbs = []
for item in df_assessments['Verbs'].dropna():
    flattened_assessment_verbs.extend([verb.strip() for verb in item.split(',')])

flattened_assessment_verbs

['understand',
 'learning',
 'used',
 'think',
 'surrounding',
 'according',
 'specified',
 'understand',
 'gain',
 'identify',
 'verify',
 'understand',
 'programming',
 'understand',
 'programming',
 'examine',
 'visualize',
 'apply',
 'demonstrate',
 'understand',
 'programming',
 'solve',
 'given',
 'demonstrate',
 'demonstrate',
 'using',
 'given',
 'solve',
 'using',
 'infer',
 'analyze',
 'designing',
 'including',
 'understand',
 'deciding',
 'know',
 'do',
 'admit',
 'understanding',
 'develop',
 'using',
 'demonstrate',
 'apply',
 'sampling',
 'testing',
 'estimate',
 'using',
 'using',
 'using',
 'using',
 'create',
 'describethemaintechnologiesandmethodscurrentlyusedincreating',
 'inherent',
 'using',
 'develop',
 'writing',
 'programming',
 'solve',
 'develop',
 'understand',
 'understand',
 'are',
 'formulated',
 'solved',
 'learning',
 'solve',
 'formulate',
 'identify',
 'identify',
 'using',
 'use',
 'understand',
 'describe',
 'apply',
 'using',
 'using',
 'classified

In [38]:

# # Load SpaCy model
# import numpy as np
# import spacy
# from sklearn.metrics.pairwise import cosine_similarity

# # Load SpaCy model
# nlp = spacy.load("en_core_web_md")

# # Function to get the SpaCy vector for a word
# def get_word_vector(word):
#     doc = nlp(word)
#     if doc.has_vector:  # Check if the word has a vector
#         return doc.vector
#     else:
#         return np.zeros(nlp.vocab.vectors_length)  # Return a zero vector if word not in vocabulary


# # Convert course verbs to vectors
# course_verb_vectors = np.array([get_word_vector(verb) for verb in flattened_course_verbs])

# # Convert assessment verbs to vectors
# assessment_verb_vectors = np.array([get_word_vector(verb) for verb in flattened_assessment_verbs])

# # Set similarity threshold
# threshold = 0.5  

# # Dictionary to store matching assessments
# matching_assessments = {}

# for i, course_verb in enumerate(flattened_course_verbs):
#     matching_assessments[course_verb] = []
    
#     # Calculate cosine similarity between the course verb and each assessment verb
#     sim_scores = cosine_similarity([course_verb_vectors[i]], assessment_verb_vectors).flatten()
    
#     # Find assessment verbs with similarity scores above the threshold
#     for j, score in enumerate(sim_scores):
#         if score > threshold:
#             # Append the assessment description from df_assessments for the matching verb
#             matching_assessments[course_verb].append(df_assessments.iloc[j]['Assessments'])

# # Print results
# for verb, assessments in matching_assessments.items():
#     print(f"Verb: {verb}")
#     print(f"Assessments for course verb '{verb}': {assessments}\n")



In [39]:
print(len(flattened_assessment_verbs))
print(len(df_assessments))

180
133


In [40]:
import numpy as np
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity

# Load SpaCy model
nlp = spacy.load("en_core_web_md")

# Function to get the SpaCy vector for a word
def get_word_vector(word):
    doc = nlp(word)
    if doc.has_vector:
        return doc.vector
    else:
        return np.zeros(nlp.vocab.vectors_length)  # Return zero vector if word not in vocabulary

# Expand 'Verbs' column to match a flattened list
expanded_rows = []
for index, row in df_assessments.iterrows():
    verbs = row['Verbs']
    if pd.isna(verbs):  # Skip rows with NaN in 'Verbs'
        continue
    verb_list = [verb.strip() for verb in verbs.split(',')]  # Split multiple verbs into a list
    for verb in verb_list:
        expanded_rows.append({'Verbs': verb, 'Assessments': row['Assessments']})

# Create a new expanded DataFrame
expanded_df_assessments = pd.DataFrame(expanded_rows)


In [41]:
expanded_df_assessments

Unnamed: 0,Verbs,Assessments
0,understand,Objective test items such as fill-in-the-blank...
1,learning,Objective test items such as fill-in-the-blank...
2,used,Objective test items such as fill-in-the-blank...
3,think,Objective test items such as fill-in-the-blank...
4,surrounding,Objective test items such as fill-in-the-blank...
...,...,...
175,understand,Objective test items such as fill-in-the-blank...
176,develop,Objective test items such as fill-in-the-blank...
177,develop,Objective test items such as fill-in-the-blank...
178,apply,Objective test items such as fill-in-the-blank...


In [44]:
# Flattened assessment verbs
flattened_assessment_verbs = expanded_df_assessments['Verbs'].tolist()

# Convert course verbs and assessment verbs to vectors
course_verb_vectors = np.array([get_word_vector(verb) for verb in flattened_course_verbs])
assessment_verb_vectors = np.array([get_word_vector(verb) for verb in flattened_assessment_verbs])

# Similarity threshold
threshold = 0.5

# Dictionary to store matching assessments
matching_assessments = {}

# Function to normalize assessments (you can extend this for more complex cases)
def normalize_assessment(assessment):
    # Strip leading/trailing spaces and convert to lowercase
    return assessment.strip().lower()

for i, course_verb in enumerate(flattened_course_verbs):
    matching_assessments[course_verb] = set()  # Use a set to avoid duplicate assessments

    # Calculate cosine similarity between the course verb and each assessment verb
    sim_scores = cosine_similarity([course_verb_vectors[i]], assessment_verb_vectors).flatten()

    # Find assessment verbs with similarity scores above the threshold
    for j, score in enumerate(sim_scores):
        if score > threshold:
            if 0 <= j < len(expanded_df_assessments):  # Ensure index is valid
                normalized_assessment = normalize_assessment(expanded_df_assessments.iloc[j]['Assessments'])
                matching_assessments[course_verb].add(normalized_assessment)  # Store normalized assessment

# Print results
for verb, assessments in matching_assessments.items():
    print(f"Verb: {verb}")
    if assessments:
        print("Matching Assessments:")
        # Sort assessments (optional, for better display)
        print("\n".join(f"- {assessment}" for assessment in sorted(assessments)))
    else:
        print("No matching assessments found.")
    print("\n")



Verb: Apply
Matching Assessments:
- objective test items such as fill-in-the-blank, matching, labeling, or multiple-choice questions that require students to: recall or recognize terms, facts, and concepts •, activities such as papers, exams, problem sets, class discussions, or concept maps that require students to: summarize readings, films, or speeches • compare and contrast two or more theories, events, or processes • classify or categorize cases, elements, or events using established criteria • paraphrase documents or speeches • find or identify examples or illustrations of a concept or principle •, activities such as problem sets, performances, labs, prototyping, or simulations that require students to: use procedures to solve or complete familiar or unfamiliar tasks • determine which procedure(s) are most appropriate for a given task •, activities such as case studies, critiques, labs, papers, projects, debates, or concept maps that require students to: discriminate or select rel

Extracted Units and Contents

In [45]:
import pandas as pd
import re
from docx import Document

# Load the syllabus Word document
doc = Document('Syllabus.docx')

# Initialize variables to store units and their content
units = []
current_unit = None
current_content = []

# Parse through the document paragraphs
for para in doc.paragraphs:
    text = para.text.strip()

    # Check for "Unit" and start a new unit
    if text.startswith("Unit"):
        # Save the previous unit and its content
        if current_unit:
            units.append((current_unit, " ".join(current_content)))
        
        # Start a new unit
        current_unit = text
        current_content = []
    elif current_unit:
        # Check if the paragraph contains Lab Exercises or Reading sections
        if text.startswith("Lab Exercise") or text.startswith("Essential Reading") or text.startswith("Recommended Reading"):
            continue
        # Accumulate content for the current unit
        current_content.append(text)

# Append the last unit
if current_unit:
    units.append((current_unit, " ".join(current_content)))

# Create a DataFrame with columns "Unit", "Contents"
df_units = pd.DataFrame(units, columns=["Unit", "Contents"])

# Extract teaching hours using the specific pattern "Teaching Hours: X"
def extract_hours(contents):
    match = re.search(r"Teaching Hours:\s*(\d+)", contents)
    return int(match.group(1)) if match else None

# Extract content before "Teaching Hours"
def extract_content_before_hours(contents):
    if "Teaching Hours" in contents:
        return contents.split("Teaching Hours")[0].strip()
    return contents.strip()

# Apply content splitting and teaching hours extraction
df_units['Teaching Hours'] = df_units['Contents'].apply(extract_hours)
df_units['Contents'] = df_units['Contents'].apply(extract_content_before_hours)

# Extract Topic from the unit by assuming it's the part of the string after "Unit X:"
def extract_topic(unit):
    # Match unit topic patterns with different possible delimiters
    match = re.search(r"Unit\s*\d+\s*[:\t\s](.+)", unit)
    return match.group(1).strip() if match else ""  # Return empty string if no match


# Apply topic extraction
df_units['Topic'] = df_units['Unit'].apply(extract_topic)

# Clean up "Unit" column to only contain the unit number (e.g., "Unit 1")
df_units['Unit'] = df_units['Unit'].apply(lambda x: re.match(r"Unit\s*\d+", x).group())

# Reorder columns for better readability
df_units = df_units[['Unit', 'Topic', 'Contents', 'Teaching Hours']]

# Display the DataFrame
print(df_units)

# Save the DataFrame to an Excel file if needed
df_units.to_excel("Extracted_Units_Updated3.xlsx", index=False)


     Unit                                              Topic  \
0  Unit 1  Introduction to Digital Image and Video Proces...   
1  Unit 2        Image and Video Enhancement and Restoration   
2  Unit 3                        Image and Video Compression   
3  Unit 4                  Feature Detection and Description   
4  Unit 5                   Object Detection and Recognition   

                                            Contents  Teaching Hours  
0  Digital image representation, Sampling and Qua...              12  
1  Spatial domain-Linear and Non-linear Filtering...              12  
2  Fundamentals of Image Compression: Huffman Cod...              12  
3  Introduction to feature detectors, Point, line...              12  
4  Descriptors: Boundary descriptors - Fourier de...              12  


In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocess text (lowercase for consistency)
df['Course Outcomes'] = df['Course Outcomes'].str.lower()
df_units['Topic'] = df_units['Topic'].str.lower()

# Combine text data for vectorization
all_text = pd.concat([df['Course Outcomes'], df_units['Topic']])

# Vectorize using TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_text)

# Separate vectors for Course Outcomes and Topics
course_outcome_vectors = tfidf_matrix[:len(df)]
topic_vectors = tfidf_matrix[len(df):]

# Compute cosine similarity
similarity_matrix = cosine_similarity(topic_vectors, course_outcome_vectors)

# Find the most similar course outcome for each unit
best_matches = []
for i, topic in enumerate(df_units['Topic']):
    # Get the index of the most similar course outcome
    best_match_index = similarity_matrix[i].argmax()
    best_match_score = similarity_matrix[i][best_match_index]
    
    # Append the best match and its score
    best_matches.append({
        'Matched Course Outcome': df.iloc[best_match_index]['Course Outcomes'],
        'Similarity Score': best_match_score
    })

# Convert matches to a DataFrame
matches_df = pd.DataFrame(best_matches)

# Add the matched course outcome and score to df_units
df_units['Matched Course Outcome'] = matches_df['Matched Course Outcome']
df_units['Similarity Score'] = matches_df['Similarity Score']

# Display the updated DataFrame
print(df_units)




     Unit                                              Topic  \
0  Unit 1  introduction to digital image and video proces...   
1  Unit 2        image and video enhancement and restoration   
2  Unit 3                        image and video compression   
3  Unit 4                  feature detection and description   
4  Unit 5                   object detection and recognition   

                                            Contents  Teaching Hours  \
0  Digital image representation, Sampling and Qua...              12   
1  Spatial domain-Linear and Non-linear Filtering...              12   
2  Fundamentals of Image Compression: Huffman Cod...              12   
3  Introduction to feature detectors, Point, line...              12   
4  Descriptors: Boundary descriptors - Fourier de...              12   

                              Matched Course Outcome  Similarity Score  
0  apply the image and video analysis approaches ...          0.232597  
1  develop proficiency in image enha