In [1]:
from docx import Document
import pandas as pd
import nltk
import numpy as np
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import re

## Teachers Upload their Syllabus 

In [2]:
# Load the document
doc = Document('BD_Syllabus.docx')

# Initialize lists to store table data
co_numbers = []
course_outcomes = []

# Assuming the data is in a table within the Word document
for table in doc.tables:
    for row in table.rows:
        # Get the text from each cell in the row
        cells = [cell.text.strip() for cell in row.cells]
        
        # Skip rows without the expected structure
        if len(cells) == 2:
            co_numbers.append(cells[0])
            course_outcomes.append(cells[1])

# Create a DataFrame
# Directly create DataFrame from extracted table data if column headings are included
df = pd.DataFrame([co_numbers, course_outcomes]).transpose()

# Rename columns only if needed
df.columns = df.iloc[0]  # Set the first row as the header
df = df[1:].reset_index(drop=True)  # Drop the first row and reset the index


### Extracted the syllabus - CO

In [3]:
df

Unnamed: 0,No.,Course Outcomes
0,CO1,Understand the Big Data concepts in real time ...
1,CO2,Identify different types of Hadoop architecture
2,CO3,Demonstrate an ability to use Hadoop framework...
3,CO4,Analyze the Big data under Spark architecture
4,CO5,Demonstrate the programming of Big data using ...


### Extracting the Units - Topics - Teaching Hours from syllabus

In [4]:
# Load the syllabus Word document
doc = Document('BD_Syllabus.docx')

# Initialize variables to store units and their content
units = []
current_unit = None
current_content = []

# Parse through the document paragraphs
for para in doc.paragraphs:
    text = para.text.strip()

    # Check for "Unit" and start a new unit
    if text.startswith("Unit"):
        # Save the previous unit and its content
        if current_unit:
            units.append((current_unit, " ".join(current_content)))
        
        # Start a new unit
        current_unit = text
        current_content = []
    elif current_unit:
        # Check if the paragraph contains Lab Exercises or Reading sections
        if text.startswith("Lab Exercise") or text.startswith("Essential Reading") or text.startswith("Recommended Reading"):
            continue
        # Accumulate content for the current unit
        current_content.append(text)

# Append the last unit
if current_unit:
    units.append((current_unit, " ".join(current_content)))

# Create a DataFrame with columns "Unit", "Contents"
df_units = pd.DataFrame(units, columns=["Unit", "Contents"])

# Extract teaching hours using the specific pattern "Teaching Hours: X"
def extract_hours(contents):
    match = re.search(r"Teaching Hours:\s*(\d+)", contents)
    return int(match.group(1)) if match else None

# Extract content before "Teaching Hours"
def extract_content_before_hours(contents):
    if "Teaching Hours" in contents:
        return contents.split("Teaching Hours")[0].strip()
    return contents.strip()

# Apply content splitting and teaching hours extraction
df_units['Teaching Hours'] = df_units['Contents'].apply(extract_hours)
df_units['Contents'] = df_units['Contents'].apply(extract_content_before_hours)

# Extract Topic from the unit by assuming it's the part of the string after "Unit X:"
def extract_topic(unit):
    # Match unit topic patterns with different possible delimiters
    match = re.search(r"Unit\s*\d+\s*[:\t\s](.+)", unit)
    return match.group(1).strip() if match else ""  # Return empty string if no match


# Apply topic extraction
df_units['Topic'] = df_units['Unit'].apply(extract_topic)

# Clean up "Unit" column to only contain the unit number (e.g., "Unit 1")
df_units['Unit'] = df_units['Unit'].apply(lambda x: re.match(r"Unit\s*\d+", x).group())

# Reorder columns for better readability
df_units = df_units[['Unit', 'Topic', 'Contents', 'Teaching Hours']]

# Display the DataFrame
print(df_units)


     Unit                               Topic  \
0  Unit 1                        Introduction   
1  Unit 2               Big Data Architecture   
2  Unit 3  Parallel Processing with MapReduce   
3  Unit 4                        Hive and Pig   
4  Unit 5        Stream Processing with Spark   

                                            Contents  Teaching Hours  
0  Concepts of Data Analytics: Descriptive, Diagn...              15  
1  Standard Big data architecture - Big data appl...              15  
2  Introduction to MapReduce - Sample MapReduce a...              15  
3  Hive Architecture - Components - Data Definiti...              15  
4  Stream processing Models and Tools - Apache Sp...              15  


### Matching the units with their respective course outcomes based on the similarity score

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocess text (lowercase for consistency)
df['Course Outcomes'] = df['Course Outcomes'].str.lower()
df_units['Topic'] = df_units['Topic'].str.lower()

# Combine text data for vectorization
all_text = pd.concat([df['Course Outcomes'], df_units['Topic']])

# Vectorize using TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_text)

# Separate vectors for Course Outcomes and Topics
course_outcome_vectors = tfidf_matrix[:len(df)]
topic_vectors = tfidf_matrix[len(df):]

# Compute cosine similarity
similarity_matrix = cosine_similarity(topic_vectors, course_outcome_vectors)

# Find the most similar course outcome for each unit
best_matches = []
for i, topic in enumerate(df_units['Topic']):
    # Get the index of the most similar course outcome
    best_match_index = similarity_matrix[i].argmax()
    best_match_score = similarity_matrix[i][best_match_index]
    
    # Append the best match and its score
    best_matches.append({
        'Matched Course Outcome': df.iloc[best_match_index]['Course Outcomes'],
        'Similarity Score': best_match_score
    })

# Convert matches to a DataFrame
matches_df = pd.DataFrame(best_matches)

# Add the matched course outcome and score to df_units
df_units['Course Outcomes'] = matches_df['Matched Course Outcome']
df_units['Similarity Score'] = matches_df['Similarity Score']

# Display the updated DataFrame
print(df_units)


df_units.to_excel("Extracted_Units_Updated.xlsx", index=False)

     Unit                               Topic  \
0  Unit 1                        introduction   
1  Unit 2               big data architecture   
2  Unit 3  parallel processing with mapreduce   
3  Unit 4                        hive and pig   
4  Unit 5        stream processing with spark   

                                            Contents  Teaching Hours  \
0  Concepts of Data Analytics: Descriptive, Diagn...              15   
1  Standard Big data architecture - Big data appl...              15   
2  Introduction to MapReduce - Sample MapReduce a...              15   
3  Hive Architecture - Components - Data Definiti...              15   
4  Stream processing Models and Tools - Apache Sp...              15   

                                     Course Outcomes  Similarity Score  
0  understand the big data concepts in real time ...          0.000000  
1      analyze the big data under spark architecture          0.526814  
2  demonstrate an ability to use hadoop framework... 

## Extract Verbs from the Course Outcomes

In [6]:
course_outcomes=[]
for i in range(len(df)):
    data=df_units['Course Outcomes'].iloc[i]
    course_outcomes.append(data)

In [7]:
course_outcomes

['understand the big data concepts in real time scenario',
 'analyze the big data under spark architecture',
 'demonstrate an ability to use hadoop framework for processing big data for analytics',
 'demonstrate the programming of big data using hive and pig environments',
 'analyze the big data under spark architecture']

In [8]:
verbs=['VB','VBP','VBD','VBG','VBN']

In [9]:
import nltk


course_verbs = []
for i in range(len(course_outcomes)):
    review = course_outcomes[i]
    review = review.split()
    review = nltk.pos_tag(review)
    print(review)
    filtered_verbs = [word for word, tag in review if tag in verbs]
    course_verbs.append(filtered_verbs)

# Assign the collected verbs list to the DataFrame column
df_units['Verbs'] = course_verbs


[('understand', 'VB'), ('the', 'DT'), ('big', 'JJ'), ('data', 'NNS'), ('concepts', 'NNS'), ('in', 'IN'), ('real', 'JJ'), ('time', 'NN'), ('scenario', 'NN')]
[('analyze', 'IN'), ('the', 'DT'), ('big', 'JJ'), ('data', 'NNS'), ('under', 'IN'), ('spark', 'NN'), ('architecture', 'NN')]
[('demonstrate', 'VB'), ('an', 'DT'), ('ability', 'NN'), ('to', 'TO'), ('use', 'VB'), ('hadoop', 'NN'), ('framework', 'NN'), ('for', 'IN'), ('processing', 'VBG'), ('big', 'JJ'), ('data', 'NNS'), ('for', 'IN'), ('analytics', 'NNS')]
[('demonstrate', 'VB'), ('the', 'DT'), ('programming', 'NN'), ('of', 'IN'), ('big', 'JJ'), ('data', 'NNS'), ('using', 'VBG'), ('hive', 'JJ'), ('and', 'CC'), ('pig', 'JJ'), ('environments', 'NNS')]
[('analyze', 'IN'), ('the', 'DT'), ('big', 'JJ'), ('data', 'NNS'), ('under', 'IN'), ('spark', 'NN'), ('architecture', 'NN')]


In [10]:
course_verbs

[['understand'],
 [],
 ['demonstrate', 'use', 'processing'],
 ['demonstrate', 'using'],
 []]

### Dataframe of the units wit their CO and extracted verbs

In [11]:
df_units

Unnamed: 0,Unit,Topic,Contents,Teaching Hours,Course Outcomes,Similarity Score,Verbs
0,Unit 1,introduction,"Concepts of Data Analytics: Descriptive, Diagn...",15,understand the big data concepts in real time ...,0.0,[understand]
1,Unit 2,big data architecture,Standard Big data architecture - Big data appl...,15,analyze the big data under spark architecture,0.526814,[]
2,Unit 3,parallel processing with mapreduce,Introduction to MapReduce - Sample MapReduce a...,15,demonstrate an ability to use hadoop framework...,0.085745,"[demonstrate, use, processing]"
3,Unit 4,hive and pig,Hive Architecture - Components - Data Definiti...,15,demonstrate the programming of big data using ...,0.524799,"[demonstrate, using]"
4,Unit 5,stream processing with spark,Stream processing Models and Tools - Apache Sp...,15,analyze the big data under spark architecture,0.19599,[]


### Load the defined verb-assesment dataset (grouped)

In [12]:
df_assessments=pd.read_excel("Verbs-Assesments Grouped.xlsx")

In [13]:
df_assessments

Unnamed: 0.1,Unnamed: 0,Verbs,Assessments
0,0,(ann),[]
1,1,(cnn),[]
2,2,(rnn),[]
3,3,according,"['Activities such as problem sets, performance..."
4,4,adequacy,"['Activities such as journals, diaries, critiq..."
...,...,...,...
77,77,using,"['Activities such as research projects, musica..."
78,78,utilizing,"['Activities such as case studies, critiques, ..."
79,79,verify,"['Activities such as journals, diaries, critiq..."
80,80,visualize,"['Activities such as case studies, critiques, ..."


In [14]:
df_assessments['Verbs']

0         (ann)
1         (cnn)
2         (rnn)
3     according
4      adequacy
        ...    
77        using
78    utilizing
79       verify
80    visualize
81      writing
Name: Verbs, Length: 82, dtype: object

In [15]:
# Convert the 'verbs' column to lists of individual words, ignoring NaNs
flattened_assessment_verbs = []
for item in df_assessments['Verbs'].dropna():
    flattened_assessment_verbs.extend([verb.strip() for verb in item.split(',')])

flattened_assessment_verbs

['(ann)',
 '(cnn)',
 '(rnn)',
 'according',
 'adequacy',
 'admit',
 'advanced',
 'analyse',
 'analysis.',
 'analyze',
 'apply',
 'applying',
 'are',
 'associated',
 'based',
 'be',
 'classified',
 'classify',
 'computing',
 'constructing',
 'create',
 'deciding',
 'demonstrate',
 'describe',
 'describethemaintechnologiesandmethodscurrentlyusedincreating',
 'designing',
 'develop',
 'developing',
 'differentiate',
 'displayed.',
 'distributed',
 'do',
 'estimate',
 'evaluate',
 'evaluatethechallengesintraining',
 'examine',
 'forecast',
 'formulate',
 'formulated',
 'gain',
 'generated',
 'given',
 'grasp',
 'identify',
 'illustrate',
 'implementing',
 'including',
 'infer',
 'inherent',
 'integrating',
 'interpret',
 'involved',
 'know',
 'knowledgeof',
 'learning',
 'linking',
 'model.',
 'organized',
 'perform',
 'present',
 'processing',
 'programming',
 'reporting',
 'represent',
 'sampling',
 'solve',
 'solved',
 'solving',
 'specified',
 'surrounding',
 'testing',
 'thei',
 'thin

In [16]:
print(len(flattened_assessment_verbs))
print(len(df_assessments))

82
82


In [17]:
import numpy as np
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity

# Load SpaCy model
nlp = spacy.load("en_core_web_md")

# Function to get the SpaCy vector for a word
def get_word_vector(word):
    doc = nlp(word)
    if doc.has_vector:
        return doc.vector
    else:
        return np.zeros(nlp.vocab.vectors_length)  # Return zero vector if word not in vocabulary

# Expand 'Verbs' column to match a flattened list
expanded_rows = []
for index, row in df_assessments.iterrows():
    verbs = row['Verbs']
    if pd.isna(verbs):  # Skip rows with NaN in 'Verbs'
        continue
    verb_list = [verb.strip() for verb in verbs.split(',')]  # Split multiple verbs into a list
    for verb in verb_list:
        expanded_rows.append({'Verbs': verb, 'Assessments': row['Assessments']})

# Create a new expanded DataFrame
expanded_df_assessments = pd.DataFrame(expanded_rows)


In [18]:
expanded_df_assessments

Unnamed: 0,Verbs,Assessments
0,(ann),[]
1,(cnn),[]
2,(rnn),[]
3,according,"['Activities such as problem sets, performance..."
4,adequacy,"['Activities such as journals, diaries, critiq..."
...,...,...
77,using,"['Activities such as research projects, musica..."
78,utilizing,"['Activities such as case studies, critiques, ..."
79,verify,"['Activities such as journals, diaries, critiq..."
80,visualize,"['Activities such as case studies, critiques, ..."


### Match and extract the assesments for the verbs extracted from the syllabus CO

In [19]:
flattened_course_verbs = [verb for sublist in course_verbs for verb in sublist]

In [20]:
# Flattened assessment verbs
flattened_assessment_verbs = expanded_df_assessments['Verbs'].tolist()

# Convert course verbs and assessment verbs to vectors
course_verb_vectors = np.array([get_word_vector(verb) for verb in flattened_course_verbs])
assessment_verb_vectors = np.array([get_word_vector(verb) for verb in flattened_assessment_verbs])

# Similarity threshold
threshold = 0.5

# Dictionary to store matching assessments
matching_assessments = {}

# Function to normalize assessments (you can extend this for more complex cases)
def normalize_assessment(assessment):
    # Strip leading/trailing spaces and convert to lowercase
    return assessment.strip().lower()

for i, course_verb in enumerate(flattened_course_verbs):
    matching_assessments[course_verb] = set()  # Use a set to avoid duplicate assessments

    # Calculate cosine similarity between the course verb and each assessment verb
    sim_scores = cosine_similarity([course_verb_vectors[i]], assessment_verb_vectors).flatten()

    # Find assessment verbs with similarity scores above the threshold
    for j, score in enumerate(sim_scores):
        if score > threshold:
            if 0 <= j < len(expanded_df_assessments):  # Ensure index is valid
                normalized_assessment = normalize_assessment(expanded_df_assessments.iloc[j]['Assessments'])
                matching_assessments[course_verb].add(normalized_assessment)  # Store normalized assessment

# Print results
# Create an empty DataFrame with specified columns
df_matchass = pd.DataFrame(columns=["Verbs", "Assessments"])

# Iterate through matching assessments
for verb, assessments in matching_assessments.items():
    print(f"Verb: {verb}")
    if assessments:
        print("Matching Assessments:")
        # Sort and remove duplicates (optional, for better display)
        unique_assessments = sorted(set(assessments))
        print("\n".join(f"- {assessment}" for assessment in unique_assessments))
        
        # Create a new row as a DataFrame
        new_row = pd.DataFrame({
            "Verbs": [verb],
            "Assessments": [", ".join(unique_assessments)]
        })
        
        # Concatenate the new row to the existing DataFrame
        df_matchass = pd.concat([df_matchass, new_row], ignore_index=True)
    else:
        print("No matching assessments found.")
    print("\n")

# Display the resulting DataFrame
print(df_matchass)





Verb: understand
Matching Assessments:
- ['activities such as case studies, critiques, labs, papers, projects, debates, or concept maps that require students to: discriminate or select relevant and irrelevant parts • determine how elements function together • determine bias, values, or underlying intent in presented material •', 'activities such as papers, exams, problem sets, class discussions, or concept maps that require students to: summarize readings, films, or speeches • compare and contrast two or more theories, events, or processes • classify or categorize cases, elements, or events using established criteria • paraphrase documents or speeches • find or identify examples or illustrations of a concept or principle •', 'activities such as problem sets, performances, labs, prototyping, or simulations that require students to: use procedures to solve or complete familiar or unfamiliar tasks • determine which procedure(s) are most appropriate for a given task •', 'objective test ite

In [21]:
df_matchass
df_matchass.to_excel('match.xlsx')

In [22]:
import pandas as pd
import re  # To escape special characters in the verb

# Sample data
# df_units: Table 1 with a list of verbs in the "Verbs" column
# df_matchass: Table 2 with verbs and corresponding assessments

# Assuming df_units has a column "Verbs" containing lists of verbs
# and df_matchass has a column "Verbs" (string) and "Assessments" (string)

# Create an empty list to store assessments for each row in df_units
assessments = []

# Iterate over each row in df_units
for index, row in df_units.iterrows():
    # Initialize a list to store matched assessments for each verb list
    matched_assessments = []
    
    # Iterate over the list of verbs in the "Verbs" column of df_units
    for verb in row['Verbs']:
        # Escape special characters in the verb for regex matching
        escaped_verb = re.escape(verb)
        
        # Check if the verb exists in df_matchass
        matching_rows = df_matchass[df_matchass['Verbs'].str.contains(escaped_verb, case=False, na=False)]
        
        # If there are matches, collect the corresponding assessments
        if not matching_rows.empty:
            matched_assessments.extend(matching_rows['Assessments'].tolist())
    
    # Remove duplicate assessments if any and add to the list
    matched_assessments = list(set(matched_assessments))
    
    # Append the matched assessments to the assessments list
    assessments.append(matched_assessments)

# Add the new "Assessment" column to df_units
df_units['Assessments'] = assessments

# Now, df_units will have an additional "Assessments" column with the matched assessments


In [23]:
# Assuming df_units is the DataFrame you provided

# Convert the list of characters in the "Verbs" column back to strings
df_units['Verbs'] = df_units['Verbs'].apply(lambda x: ''.join(x) if isinstance(x, list) else x)

# Check the updated DataFrame
df_units


Unnamed: 0,Unit,Topic,Contents,Teaching Hours,Course Outcomes,Similarity Score,Verbs,Assessments
0,Unit 1,introduction,"Concepts of Data Analytics: Descriptive, Diagn...",15,understand the big data concepts in real time ...,0.0,understand,"[['activities such as case studies, critiques,..."
1,Unit 2,big data architecture,Standard Big data architecture - Big data appl...,15,analyze the big data under spark architecture,0.526814,,[]
2,Unit 3,parallel processing with mapreduce,Introduction to MapReduce - Sample MapReduce a...,15,demonstrate an ability to use hadoop framework...,0.085745,demonstrateuseprocessing,"[['activities such as case studies, critiques,..."
3,Unit 4,hive and pig,Hive Architecture - Components - Data Definiti...,15,demonstrate the programming of big data using ...,0.524799,demonstrateusing,"[['activities such as case studies, critiques,..."
4,Unit 5,stream processing with spark,Stream processing Models and Tools - Apache Sp...,15,analyze the big data under spark architecture,0.19599,,[]


In [24]:
df_units.to_excel('unit_co_ma.xlsx')

### Getting only unique assessments - Filtering

In [25]:
def clean_assessments(assessments):
    if isinstance(assessments, list):  # Check if it's already a list
        assessments = " ".join(assessments)  # Convert the list to a single string
    assessments = assessments.strip("[]").replace("'", "").replace('"', "")
    return [x.strip() for x in assessments.split("•") if x.strip()]

df_units["Assessments_cleaned"] = df_units["Assessments"].apply(clean_assessments)


In [26]:
df_units["Assessments_cleaned"]

0    [activities such as case studies, critiques, l...
1                                                   []
2    [activities such as case studies, critiques, l...
3    [activities such as case studies, critiques, l...
4                                                   []
Name: Assessments_cleaned, dtype: object

In [27]:
# Step 2: Flatten the list of all assessments and count occurrences
from collections import Counter

all_assessments = [item for sublist in df_units["Assessments_cleaned"] for item in sublist]
assessment_counts = Counter(all_assessments)



In [28]:
# Step 4: Apply the filtering logic iteratively
def filter_assessments(row, assessment_counts, threshold=3):
    unique_assessments = []
    
    for assessment in row:
        # If the assessment appears more than once and the unit has more than threshold assessments, remove it
        if assessment_counts[assessment] > 1 and len(row) > threshold:
            continue  # Remove this assessment from the unit
        # If the assessment appears more than once and the unit has less than threshold assessments, keep it
        elif assessment_counts[assessment] > 1 and len(row) < threshold:
            unique_assessments.append(assessment)
        # Otherwise, always add the assessment
        else:
            unique_assessments.append(assessment)
    
    return unique_assessments

# Step 5: Apply the filtering function to each row (unit)
df_units["Filtered Assessments"] = df_units["Assessments_cleaned"].apply(
    lambda row: filter_assessments(row, assessment_counts, threshold=3)
)




In [29]:
df_units.to_excel("unique_assessments.xlsx")

In [30]:
# Step 4: Apply the filtering logic iteratively
def filter_assessments(row, assessment_counts, threshold=3):
    # Create a list to hold filtered assessments
    filtered_assessments = []
    # Iterate over the assessments of each unit
    for assessment in row:
        # If the assessment appears more than once and the unit has more than threshold assessments, remove it
        if assessment_counts[assessment] > 1 and len(row) > threshold:
            continue  # Remove this assessment from the unit
        # If the assessment appears more than once and the unit has less than threshold assessments, keep it
        elif assessment_counts[assessment] > 1 and len(row) < threshold:
            filtered_assessments.append(assessment)
        # Otherwise, always add the assessment
        else:
            filtered_assessments.append(assessment)
    
    return filtered_assessments

# Step 5: Apply the filtering function to each row (unit)
df_units["Filtered Assessments"] = df_units["Assessments_cleaned"].apply(
    lambda row: filter_assessments(row, assessment_counts, threshold=3)
)

# Step 6: Ensure no unit is left empty but limit to top 3 assessments if necessary
def recheck_and_limit_empty_units(df, limit=3):
    for index, row in df.iterrows():
        # If a unit's filtered assessments are empty, we restore the top 3 from its original assessments
        if not row["Filtered Assessments"]:
            original_assessments = row["Assessments_cleaned"]
            # Ensure that the filtered assessments are limited to the top 3
            df.at[index, "Filtered Assessments"] = original_assessments[:limit]
    return df

# Step 7: Apply the recheck to limit to top 3 assessments
df_units = recheck_and_limit_empty_units(df_units)

# Check the filtered DataFrame
print(df_units)

     Unit                               Topic  \
0  Unit 1                        introduction   
1  Unit 2               big data architecture   
2  Unit 3  parallel processing with mapreduce   
3  Unit 4                        hive and pig   
4  Unit 5        stream processing with spark   

                                            Contents  Teaching Hours  \
0  Concepts of Data Analytics: Descriptive, Diagn...              15   
1  Standard Big data architecture - Big data appl...              15   
2  Introduction to MapReduce - Sample MapReduce a...              15   
3  Hive Architecture - Components - Data Definiti...              15   
4  Stream processing Models and Tools - Apache Sp...              15   

                                     Course Outcomes  Similarity Score  \
0  understand the big data concepts in real time ...          0.000000   
1      analyze the big data under spark architecture          0.526814   
2  demonstrate an ability to use hadoop framework.

In [31]:
df_units.to_excel("unique_assessments1.xlsx")

In [32]:
df_units

Unnamed: 0,Unit,Topic,Contents,Teaching Hours,Course Outcomes,Similarity Score,Verbs,Assessments,Assessments_cleaned,Filtered Assessments
0,Unit 1,introduction,"Concepts of Data Analytics: Descriptive, Diagn...",15,understand the big data concepts in real time ...,0.0,understand,"[['activities such as case studies, critiques,...","[activities such as case studies, critiques, l...","[criteria or standards], [activities such as j..."
1,Unit 2,big data architecture,Standard Big data architecture - Big data appl...,15,analyze the big data under spark architecture,0.526814,,[],[],[]
2,Unit 3,parallel processing with mapreduce,Introduction to MapReduce - Sample MapReduce a...,15,demonstrate an ability to use hadoop framework...,0.085745,demonstrateuseprocessing,"[['activities such as case studies, critiques,...","[activities such as case studies, critiques, l...","[] [activities such as case studies, critiques..."
3,Unit 4,hive and pig,Hive Architecture - Components - Data Definiti...,15,demonstrate the programming of big data using ...,0.524799,demonstrateusing,"[['activities such as case studies, critiques,...","[activities such as case studies, critiques, l...","[criteria or standards], [activities such as p..."
4,Unit 5,stream processing with spark,Stream processing Models and Tools - Apache Sp...,15,analyze the big data under spark architecture,0.19599,,[],[],[]


In [33]:
import pandas as pd
from io import StringIO
import tempfile

def create_temp_csv_for_langchain(df):
    """
    Creates a temporary CSV file from a DataFrame and returns its path.

    Parameters:
        df (pd.DataFrame): The DataFrame to convert.

    Returns:
        str: The file path of the temporary CSV file.
    """
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode="w")  # Create a temp file
    df.to_csv(temp_file.name, index=False)  # Save the DataFrame as a CSV to the temp file
    temp_file.close()  # Close the file to allow external access
    return temp_file.name

# Example usage:


In [34]:
from langchain_community.document_loaders.csv_loader import CSVLoader


In [35]:

temp_csv_path = create_temp_csv_for_langchain(df_units)

loader = CSVLoader(file_path=temp_csv_path, csv_args={
    'delimiter': ',',
    'quotechar': '"',
    'fieldnames': ['','Unit','Topic','Contents','Teaching Hours','Course Outcomes','Similarity Score','Verbs','Assessments','Assessments_cleaned','Filtered Assessments']
})

documents = loader.load()


In [41]:
documents[3].page_content

': Unit 3\nUnit: parallel processing with mapreduce\nTopic: Introduction to MapReduce - Sample MapReduce application: Wordcount - MapReduce Data types and Formats - Writing MapReduce Programming - Testing MapReduce Programs - MapReduce Job Execution - Shuffle and Sort - Managing Failures - Progress and Status Updates. MapReduce Programs: Using languages other than Java with Hadoop, Analyzing a large dataset.\nContents: 15\nTeaching Hours: demonstrate an ability to use hadoop framework for processing big data for analytics\nCourse Outcomes: 0.0857446626448432\nSimilarity Score: demonstrateuseprocessing\nVerbs: ["[\'activities such as case studies, critiques, labs, papers, projects, debates, or concept maps that require students to: discriminate or select relevant and irrelevant parts â€¢ determine how elements function together â€¢ determine bias, values, or underlying intent in presented material â€¢\', \'activities such as papers, exams, problem sets, class discussions, or concept map

In [42]:
data = []
for doc in documents:
    # Extract content from each document
    content = doc.page_content.strip()
    
    # Split the content by newline characters to separate columns
    content_lines = content.split('\n')
    
    # Ensure there are enough parts in the split content
    if len(content_lines) >= 9:
        entry = {
            "Unit": content_lines[0].strip(),          # First line: Unit
            "Topic": content_lines[2].strip(),         # Second line: Topic
            "Contents": content_lines[3].strip(),      # Third line: Contents
            "Assessments": set(content_lines[9].strip().split(';'))  # Ninth line: Assessments (handling repeated assessments)
        }
        # Join the assessments back into a single string (if they were split into a set)
        entry["Assessments"] = ', '.join(entry["Assessments"])
        
        data.append(entry)


In [43]:
data

[{'Unit': ': Unit',
  'Topic': 'Topic: Contents',
  'Contents': 'Contents: Teaching Hours',
  'Assessments': 'F, A, c,  , r, :, i, t, m, d, l, n, a, _, e, s'},
 {'Unit': ': Unit 1',
  'Topic': 'Topic: Concepts of Data Analytics: Descriptive, Diagnostic, Predictive, Prescriptive analytics - Big Data characteristics: Volume, Velocity, Variety, Veracity of data - Types of data: Structured, Unstructured, Semi-Structured, Metadata - Introduction to Hadoop Scaling - Distributed Framework -Hadoop v/s RDBMS-Brief history of Hadoop.',
  'Contents': 'Contents: 15',
  'Assessments': "r, :, i, ], t, [, a, p, j, b, e, s, u, c,  , v, ,, g, A, h, ', w, m, -, n, k, z, o, d, l, f, q, _"},
 {'Unit': ': Unit 2',
  'Topic': 'Topic: Standard Big data architecture - Big data application - Hadoop framework - HDFS Design goal - Master Slave architecture - Block System - Read-write Process for data - Installing HDFS - Executing in HDFS: Reading and writing Local files and Data streams into HDFS - Types of file