### Bloom's Taxonomy Extractor

In [1]:
# Required Libraries
import tabula
import pandas as pd
import camelot


  from pandas.core import (


In [2]:
# Path to your PDF file
pdf_path = 'Blooms-Taxonomy-Handout.pdf'

# Extract tables from the PDF
tables = camelot.read_pdf(pdf_path, pages='all')

# Convert the first table to a DataFrame and export it to Excel
if len(tables) > 0:
    df = tables[0].df  # Extract the first table as a DataFrame
    columns=df.iloc[0]
    column_names=columns[0].split('\n')
    df.columns=column_names
    df = df.drop(0) 
    df = df.replace('\n', '', regex=True)
    df.to_excel('Blooms_index.xlsx', index=False)
    print(f"Table extracted and saved to 'output_table.xlsx'")
else:
    print("No table found in the PDF.")


Table extracted and saved to 'output_table.xlsx'


### Course Outcomes Extractor

In [3]:
# Required Libraries
import re

In [4]:
import tabula
import pandas as pd
import re

pdf_path = 'MDS2024_25.pdf'

# Extract all tables from the PDF with the lattice option to handle table borders better
try:
    tables_json = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True, output_format="json", encoding='ISO-8859-1', lattice=True)
except Exception as e:
    print(f"Error reading PDF: {e}")
    tables_json = []  # Initialize as empty if reading fails

# Regular expression pattern to identify "Course Outcomes" table
pattern = re.compile(r"Course Outcomes", re.IGNORECASE)

# Filter JSON data for tables with "Course Outcomes"
course_outcomes_tables = []
for table_json in tables_json:
    try:
        # Convert JSON to DataFrame
        table_data = [[cell['text'] for cell in row] for row in table_json['data']]
        table = pd.DataFrame(table_data)
        
        # Check for the pattern in the DataFrame content
        if table.apply(lambda row: row.astype(str).str.contains(pattern).any(), axis=1).any():
            # Set the first row as header if it matches the expected format
            if 'No.' in table.iloc[0].values[0] and 'Course Outcomes' in table.iloc[0].values[1]:
                table.columns = table.iloc[0]  # Set first row as header
                table = table.drop(0).reset_index(drop=True)  # Drop the header row from data
            
            # Standardize column names
            table.columns = ['No', 'Course Outcomes', 'LRNG Needs', 'Unused_1', 'Unused_2'][:table.shape[1]]
            
            # Drop unnecessary columns by name
            table = table.drop(columns=['LRNG Needs', 'Unused_1', 'Unused_2'], errors='ignore')
            
            # Fix merged words in 'Course Outcomes' by adding spaces before capital letters or numbers
            def fix_merged_words(text):
                # Add spaces between lowercase-uppercase or letter-number transitions
                return re.sub(r'(?<=[a-z])(?=[A-Z0-9])', ' ', text)
            
            table['Course Outcomes'] = table['Course Outcomes'].apply(lambda x: fix_merged_words(x))

            # Add the table to the list
            course_outcomes_tables.append(table)
    except Exception as e:
        print(f"Error processing table JSON data: {e}")

# Concatenate all "Course Outcomes" tables into a single DataFrame
if course_outcomes_tables:
    combined_df = pd.concat(course_outcomes_tables, ignore_index=True)
    
    # Drop duplicate rows
    combined_df = combined_df.drop_duplicates()
    
    # Add "Verbs" and "Assessments" columns with empty values
    combined_df['Verbs'] = ""       # Or provide a default value if needed
    combined_df['Assessments'] = ""  # Or provide a default value if needed
    
    # Save to Excel file
    combined_df.to_excel("course_outcomes.xlsx", index=False)
    print("Filtered course outcomes tables have been saved to 'course_outcomes_combined_single_sheet.xlsx'")
else:
    print("No tables containing 'Course Outcomes' found.")



Filtered course outcomes tables have been saved to 'course_outcomes_combined_single_sheet.xlsx'


### Verb Assessments Grouping

In [5]:
# Required Libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
import numpy as np
import spacy 

In [6]:
#Loading the Course Outcomes Dataframe in df_co
df_co=combined_df

In [7]:
# Printing the Data Frame for checking the result
df_co

Unnamed: 0,No,Course Outcomes,Verbs,Assessments
0,CO1,Understand the essence of research and the\rim...,,
1,CO2,Explore the fundamental concepts of data science,,
2,CO3,Understand various machine learning algorithms...,,
3,CO4,Learn to think through the ethics surrounding ...,,
4,CO5,Create scientific reports according to specifi...,,
...,...,...,...,...
128,CO4,Designcomputationalexperimentsfortrainingandev...,,
129,CO1,Understand the fundamental principles of image...,,
130,CO2,Develop proficiency in image enhancement and s...,,
131,CO3,Develop skills in object detection and recogni...,,


#### Indentify the Verbs from the Course Outcome

In [8]:
course_outcomes=[]
for i in range(len(df_co)):
    data=df_co['Course Outcomes'].iloc[i]
    course_outcomes.append(data)

In [9]:
course_outcomes

['Understand the essence of research and the\rimportance of research methods and methodology',
 'Explore the fundamental concepts of data science',
 'Understand various machine learning algorithms used in data science process',
 'Learn to think through the ethics surrounding privacy, data sharing and algorithmi\rdecision- making',
 'Create scientific reports according to specified standards',
 'able to understand the concept of the random variable and expectation for discrete\rand continuous data',
 'evaluate condition probabilities and conditional expectations',
 'gain the knowledge of applications of discrete distributions in Data Science',
 'identify the applications of continuous distributions in Data Science',
 'apply Chebychevs inequality to verify the convergence of sequence in probability',
 'Understand the fundamentals of programming languages.',
 'Understand the design paradigms of programming languages.',
 'To examine expressions, subprograms and their parameters.',
 'Demons

In [10]:
verbs=['VB','VBP','VBD','VBG','VBN']

In [11]:
course_verbs=[]
for i in range(0,len(course_outcomes)):
    review=course_outcomes[i]
    review=review.lower()
    review=review.split()
    review=nltk.pos_tag(review)
    review=[word for word,tag in review if tag in verbs]
    course_verbs.append(review)

In [12]:
course_verbs

[['understand'],
 [],
 ['learning', 'used'],
 ['think', 'surrounding'],
 ['according', 'specified'],
 ['understand'],
 [],
 ['gain'],
 ['identify'],
 ['verify'],
 ['understand', 'programming'],
 ['understand', 'programming'],
 ['examine'],
 ['visualize'],
 ['apply'],
 [],
 ['demonstrate'],
 ['understand'],
 ['programming', 'solve', 'given'],
 ['demonstrate'],
 [],
 [],
 ['demonstrate', 'using'],
 ['given', 'solve', 'using'],
 ['infer'],
 ['analyze'],
 [],
 ['designing', 'including'],
 ['understand', 'deciding', 'know', 'do', 'admit'],
 [],
 ['understanding'],
 ['develop', 'using'],
 ['demonstrate'],
 ['apply', 'sampling', 'testing'],
 ['estimate', 'using', 'using'],
 ['using', 'using'],
 ['create'],
 ['describethemaintechnologiesandmethodscurrentlyusedincreating'],
 ['inherent'],
 ['using'],
 ['develop', 'writing'],
 ['programming', 'solve'],
 ['develop'],
 ['understand'],
 ['understand', 'are', 'formulated', 'solved'],
 ['learning', 'solve'],
 ['formulate'],
 ['identify'],
 ['identify

#### Load Blooms Taxonomy Pdf

In [13]:
df_blooms=df

In [14]:
df_blooms

Unnamed: 0,Level:,Verb,Examples of Appropriate Assessments
1,Remembering: can the student recall or remembe...,Recall Recognize Identify,Objective test items such as fill-in-the-blank...
2,Understanding: can the student explain ideas o...,Interpret Exemplify Classify Summarize Infer C...,"Activities such as papers, exams, problem sets..."
3,Applying: can the student use the information ...,Apply Execute Implement,"Activities such as problem sets, performances,..."
4,Analyzing: can the student distinguish between...,Analyze Differentiate Organize Attribute,"Activities such as case studies, critiques, la..."
5,Evaluating: can the student justify a stand or...,Evaluate Check Critique Assess,"Activities such as journals, diaries, critique..."
6,Creating: can the student create new product o...,Create Generate Plan Produce Design,"Activities such as research projects, musical ..."


In [15]:
df_blooms['Verb '] = df_blooms['Verb '].str.split()

In [16]:
df_blooms['Verb ']

1                        [Recall, Recognize, Identify]
2    [Interpret, Exemplify, Classify, Summarize, In...
3                          [Apply, Execute, Implement]
4        [Analyze, Differentiate, Organize, Attribute]
5                  [Evaluate, Check, Critique, Assess]
6            [Create, Generate, Plan, Produce, Design]
Name: Verb , dtype: object

In [17]:
flattened_course_verbs = [verb[0] for verb in course_verbs if verb]

In [18]:
flattened_course_verbs

['understand',
 'learning',
 'think',
 'according',
 'understand',
 'gain',
 'identify',
 'verify',
 'understand',
 'understand',
 'examine',
 'visualize',
 'apply',
 'demonstrate',
 'understand',
 'programming',
 'demonstrate',
 'demonstrate',
 'given',
 'infer',
 'analyze',
 'designing',
 'understand',
 'understanding',
 'develop',
 'demonstrate',
 'apply',
 'estimate',
 'using',
 'create',
 'describethemaintechnologiesandmethodscurrentlyusedincreating',
 'inherent',
 'using',
 'develop',
 'programming',
 'develop',
 'understand',
 'understand',
 'learning',
 'formulate',
 'identify',
 'identify',
 'use',
 'describe',
 'apply',
 'using',
 'using',
 'understand',
 'apply',
 'identify',
 'present',
 'using',
 'analyse',
 'differentiate',
 'forecast',
 'understand',
 'understand',
 '(cnn)',
 'evaluatethechallengesintraining',
 'understand',
 'identify',
 'understand',
 'illustrate',
 'illustrate',
 'utilizing',
 'describe',
 'analyze',
 'based',
 'demonstrate',
 'be',
 'apply',
 'knowle

In [19]:
flattened_df_verbs = [' '.join(verbs) for verbs in df_blooms['Verb ']]

In [20]:
flattened_df_verbs

['Recall Recognize Identify',
 'Interpret Exemplify Classify Summarize Infer Compare Explain',
 'Apply Execute Implement',
 'Analyze Differentiate Organize Attribute',
 'Evaluate Check Critique Assess',
 'Create Generate Plan Produce Design']

In [21]:
blooms_verbs=[]
for i in range(len(flattened_df_verbs)):
    review=flattened_df_verbs[i]
    review=review.split()
    blooms_verbs.append(review)

In [22]:
blooms_verbs

[['Recall', 'Recognize', 'Identify'],
 ['Interpret',
  'Exemplify',
  'Classify',
  'Summarize',
  'Infer',
  'Compare',
  'Explain'],
 ['Apply', 'Execute', 'Implement'],
 ['Analyze', 'Differentiate', 'Organize', 'Attribute'],
 ['Evaluate', 'Check', 'Critique', 'Assess'],
 ['Create', 'Generate', 'Plan', 'Produce', 'Design']]

#### Grouping

In [23]:
df_va = pd.DataFrame(columns=["Verbs", "Assessments"])

In [25]:
nlp = spacy.load("en_core_web_md")
verbs = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']  # POS tags for verbs

# Function to extract verbs from the course outcomes
def extract_verbs(course_outcomes):
    course_verbs = []
    for outcome in course_outcomes:
        words = outcome.split()
        tagged_words = nltk.pos_tag(words)
        verbs_in_outcome = [word for word, tag in tagged_words if tag in verbs]
        course_verbs.append(verbs_in_outcome)
    return course_verbs

# Function to get SpaCy word vectors for words
def get_word_vector(word):
    doc = nlp(word)
    if doc.has_vector:
        return doc.vector
    else:
        return np.zeros(nlp.vocab.vectors_length)  # Return zero vector if word not in vocab

# Convert course verbs to vectors (assuming course_verbs is a list of verb lists)
flattened_course_verbs = [verb for sublist in course_verbs for verb in sublist]
course_verb_vectors = np.array([get_word_vector(verb) for verb in flattened_course_verbs])

# Set a similarity threshold
threshold = 0.5

# Assuming 'blooms_verbs' is your list of Bloom's verbs categories (from your example)
matching_assessments = {}

# Iterate over course verbs and find matching Bloom's verbs & assessments


# Iterate over course verbs and find matching Bloom's verbs & assessments
for i, course_verb in enumerate(flattened_course_verbs):
    matching_assessments[course_verb] = []
    
    # Compare with each Bloom's category
    for cat_index, blooms_category in enumerate(blooms_verbs):
        blooms_verb_vectors = np.array([get_word_vector(verb) for verb in blooms_category])
        
        # Calculate cosine similarity between the course verb and each Bloom's verb
        sim_scores = cosine_similarity([course_verb_vectors[i]], blooms_verb_vectors).flatten()
        
        # Check if any similarity score in this category is above the threshold
        if any(score > threshold for score in sim_scores):
            matching_assessments[course_verb].append(df_blooms.iloc[cat_index]['Examples of Appropriate Assessments'])

    # Create a new DataFrame for the current mapping
    new_row = pd.DataFrame({"Verbs": [course_verb], "Assessments": [matching_assessments[course_verb]]})
    
    # Concatenate the new row with the existing DataFrame
    df_va = pd.concat([df_va, new_row], ignore_index=True)

# Define a function to handle nested lists and ensure uniqueness
def aggregate_unique_assessments(assessments):
    # Flatten and deduplicate all lists in the column
    unique_items = set(item for sublist in assessments for item in sublist)
    return list(unique_items)

# Group by "Verbs" and apply the aggregation function
grouped_df = df_va.groupby("Verbs", as_index=False).agg({
    "Assessments": aggregate_unique_assessments
})

# Display the grouped DataFrame
grouped_df
grouped_df.to_excel('Verbs-Assesments Grouped.xlsx')
