In [2]:

import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
import nltk

# Install NLTK stopwords if needed
nltk.download('stopwords')

# Load the data
df = pd.read_excel("/content/What changes, if any, would you suggest for the academic preparation of this student.xlsx")

# Preprocess text
def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'_x[0-9a-fA-F]{4}_', '', text)  # Remove patterns like _x000D_
    tokens = text.split()  # Tokenize text
    tokens = [word for word in tokens if word not in stop_words]  # Remove stop words
    return ' '.join(tokens)




# Predefined stop words and additional domain-specific stop words
stop_words = set(stopwords.words('english'))
custom_stop_words = {
    "student", "work", "prepared", "suggestions", "would", "changes", "academic", "preparation",
    "to", "be", "for", "the", "of", "this", "position", "from", "a", "it", "could", "make", "sure",
    "that", "real", "world", "are", "in", "with", "drexel", "is", "able", "experiences", "co", "ops",
    "help", "only", "but", "also", "have", "been", "and", "will", "on", "right",
    "great", "him", "well", "into", "future", "maybe", "more", "oriented", "food", "i", "think",
    "based", "students", "better", "at", "really", "these", "how", "use", "areas", "or", "courses", "other", "like", "any", "some", "where", "you",
    "learn", "about", "different", "etc", "has", "very", "good", "can", "continue", "if", "she",
    "job", "such", "as", "when", "he", "academic", "much", "role", "learned", "hard", "overall",
    "an", "take", "improve", "her", "handle", "focus", "tools", "they", "their", "prior",
    "op", "most", "important", "our", "get", "so", "many", "people", "professional", "what", "we",
    "no", "strong", "here", "had", "all", "tasks", "changes", "helpful",
    "given", "up", "beneficial", "out", "email", "was", "don", "t", "there", "his", "were", "s",
    "exposure", "academically", "yes", "understanding", "not", "school", "which", "further",
    "needs", "training", "path", "my", "specific", "needed", "them", "industry",
    "understand", "related", "however", "prepare", "interest", "suggest", "major", "than", "office",
    "benefit", "every", "team", "recommend", "studies", "vs", "nothing", "within", "n", "_x000d_",
    "believe", "during", "need", "social", "media", "should",
    "opportunities", "working", "career", "suggestion", "background",
    "taking", "did", "always", "new", "something", "those", "development", "even", "d", "skill",
    "over", "see", "workplace", "your", "do", "already", "e", "day", "by", "executive", "just",
    "who", "group", "may", "level", "technologies", "none", "no", "nan", "nothing", "excellent",
}
stop_words.update(custom_stop_words)

# Synonym library
synonym_library = {
    "accounting": ["accounting", "basic accounting", "accounting basics", "financial accounting", "managerial accounting",
                   "bookkeeping", "cash flows", "invoice", "accounting introduction", "internal audit"],
    "finance": ["finance", "financial", "intro to finance", "basics of finance", "financial planning", "investment management",
                "corporate finance", "insurance", "financial reporting", "financial analysis",
                "portfolio rebalancing", "portfolio management", "financial markets"],
    "statistics": ["statistics", "basic statistics", "intro to stats", "statistical methods", "data statistics", "applied statistics"],
    "python": ["python", "python programming", "python coding", "python scripting"],
    "sql": ["sql", "structured query language", "sql programming", "database querying"],
    "data_analysis": ["data analysis", "analyzing data", "data analytics", "data insights"],
    "business_analytics": ["business analytics", "analytics for business", "business data analytics", "business intelligence", "projections", "projection", "kpis", "key performance indicators"],
    "automation": ["automation", "intelligent automation tools", "workflow automation", "process automation"],
    "leadership": ["leadership", "executive leadership", "project leadership", "team leadership", "organizational leadership"],
    "tax": ["tax", "tax accounting", "tax planning", "tax strategies"],
    "public_speaking": ["public speaking", "presentation skills", "oral communication", "speech delivery", "storytelling"],
    "excel": ["excel", "microsoft excel", "advanced excel skills", "pivot tables in excel", "vlookup", "data modeling", "spreadsheet"],
    "powerpoint": ["powerpoint", "microsoft powerpoint", "presentation design", "slides creation"],
    "marketing": ["marketing", "digital marketing", "marketing strategies", "social media marketing", "advertising", "seo", "content marketing"],
    "project_management": ["project management", "agile project management", "waterfall project management", "pmp", "scrum"],
    "critical_thinking": ["critical thinking", "problem solving", "analytical thinking", "decision making"],
    "data_visualization": ["data visualization", "chart creation", "dashboard creation", "data storytelling", "tableau", "power bi"],
    "programming": ["programming", "coding", "software development", "scripting"],
    "time_management": ["time management", "productivity skills", "scheduling", "prioritization"],
    "crm": ["crm", "customer relationship management", "salesforce", "hubspot", "zoho crm"],
    "ms_office": ["ms office", "microsoft office", "word", "outlook"],
    "networking": ["networking", "professional networking", "relationship building", "linkedin"],
    "entrepreneurship": ["entrepreneurship", "startup skills", "venture creation", "business innovation"],
    "customer_service": ["customer service", "client relations", "service management"],
    "supply_chain": ["supply chain", "logistics", "inventory management", "procurement", "supply chain analytics"],
    "hr_management": ["hr management", "human resources", "recruitment", "talent management", "employee relations"],
    "cybersecurity": ["cybersecurity", "information security", "network security", "data security", "ethical hacking"],
    "soft_skills": ["speaking", "speak", "technical communication", "communication", "business communication", "written communication", "verbal communication", "busi-comm", "business communications"],
    "presentation": ["present",  "presentation", "presentations", "presenting"],
    "sales": ["sales", "selling", "sale"],
    "technical_skills": ["technical skills", "reporting", "tracking", "trading", "planning", "forecasting", "graphic design"],
    "syst_anal": ["systems analysis", "system analysis"],
    "business": ["business"],
    "hospitality": ["beverage", "beverages"],
    "writing_skills": ["email", "emails", "e-mail", "e mail", "e-mails", "writing", "written"],
    "prioritization": ["prioritize", "prioritization", "priority"],
    "partcipation": ["involved", "involve", "engagement", "engage", "engaging"],
    "busi-manag": ["business management"],
    "org-manag": ["organizational management"],
    "data science": ["data science"],
    "data_domain": ["database knoweldge", "cloud based software"],
    "data_manipulation": ["data manipulations", "data manipulation"],
    "software": ["software"],
    "computer_skills": ["computer skills"],

    }

# Match keywords from the synonym library
def match_keywords(text, synonym_lib):
    matched_courses = []
    for course, keywords in synonym_lib.items():
        for keyword in keywords:
            if keyword in text:
                matched_courses.append(course)
    return matched_courses

# Function to remove repetitive words while preserving the order
def remove_repeated_words(text_list):
    """
    Removes repeated words from a list of words while preserving the order.

    Args:
        text_list (list): A list of words (strings).

    Returns:
        str: A string containing the distinct words separated by spaces.
    """
    seen = set()
    distinct_words = []
    for word in text_list:  # Iterate through the list of words directly
        if word not in seen:
            seen.add(word)
            distinct_words.append(word)
    return ' '.join(distinct_words)

# Analyze text for relevant courses
def analyze_text(df, synonym_lib):
    df['Cleaned_Text'] = df['What changes, if any, would you suggest for the academic preparation of this student?'].apply(preprocess_text)
    df['Matched_Courses'] = df['Cleaned_Text'].apply(lambda x: match_keywords(x, synonym_lib))
    return df

# Process the data
df = analyze_text(df, synonym_library)

# Apply the function to remove repeated words in the 'Matched_Courses' column
df['Distinct_Response'] = df['Matched_Courses'].apply(remove_repeated_words)

# Explode matched courses into separate rows for Power BI
df_exploded = df.explode('Matched_Courses')

# Match keywords from the synonym library
def match_keywords(text, synonym_lib):
    matched_courses = []
    for course, keywords in synonym_lib.items():
        for keyword in keywords:
            if keyword in text:
                matched_courses.append(course)
    return matched_courses

# Create binary columns for synonym library matches
def add_binary_columns(df, synonym_lib):
    for course, keywords in synonym_lib.items():
        df[course] = df['Cleaned_Text'].apply(lambda text: 1 if any(keyword in text for keyword in keywords) else 0)
    return df

# Add "Others" column
def add_others_column(df, synonym_lib):
    keyword_columns = list(synonym_lib.keys())
    # Add the "Others" column
    df['Others'] = df[keyword_columns].sum(axis=1).apply(lambda x: 1 if x == 0 else 0)
    return df

# Analyze text for relevant courses and add binary columns
def analyze_text_and_add_binaries(df, synonym_lib):
    # Preprocess and find matches
    df['Cleaned_Text'] = df['What changes, if any, would you suggest for the academic preparation of this student?'].apply(preprocess_text)
    df['Matched_Courses'] = df['Cleaned_Text'].apply(lambda x: match_keywords(x, synonym_lib))
    df
    # Add binary columns for all synonym library categories
    df = add_binary_columns(df, synonym_lib)

    # Add "Others" column
    df = add_others_column(df, synonym_lib)

    return df

# Process the data
df = analyze_text_and_add_binaries(df, synonym_library)

# Save the results to an Excel file
output_path = "processed_suggestions_with_binary_columns_and_others.xlsx"
df.to_excel(output_path, index=False)

print("Processing completed and results saved with binary columns and 'Others' column!")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Processing completed and results saved with binary columns and 'Others' column!


In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from collections import Counter
import nltk

# Install NLTK stopwords if needed
nltk.download('stopwords')

# Load the data
df = pd.read_excel("/content/How did your classroom activities prepare you for co-op If they didn’t, how were you prepared for co-op.xlsx")

# Preprocess text
def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'_x[0-9a-fA-F]{4}_', '', text)  # Remove patterns like _x000D_
    tokens = text.split()  # Tokenize text
    tokens = [word for word in tokens if word not in stop_words]  # Remove stop words
    return ' '.join(tokens)

# Define negative keywords for classification
negative_keywords = [
    "not prepared", "not really prepared", "wasn’t prepared", "did not prepare",
    "wasn't prepped", "didn't prep", "did not prep", "not prepped", "nothing",
    "no preparation", "not ready", "didn’t prepare", "didn't prepare", "did not", "didn't", "not really",
]

# funtion to classify responses
def classify_response(response):
    """
    Classifies a response as "nothing" if it contains any negative keywords,
    otherwise classifies as "prepared".

    Args:
        response (str): The text response to classify.

    Returns:
        str: "nothing" if negative keywords are found, "prepared" otherwise.
    """
    # Check if the response is a string before converting to lowercase
    if isinstance(response, str):
        response_lower = response.lower()  # Convert to lowercase for case-insensitive matching
        for keyword in negative_keywords:
            if keyword in response_lower:
                return "not prepared"  # Match found, classify as "nothing"
    #     return "prepared"  # No match found, classify as "prepared"
    # else:
    #     # Handle non-string responses (e.g., NaN) - return "prepared" or other appropriate value
    #     return "prepared"

# Predefined stop words and additional domain-specific stop words
stop_words = set(stopwords.words('english'))
custom_stop_words = {
    'i', 'me', 'my', 'we', 'our', 'ours', 'you', 'your', 'yours',
    'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was',
    'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and',
    'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
    'into', 'through', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on',
    'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all',
    'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'same',
    'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'
}
stop_words.update(custom_stop_words)

# Synonym library (add the full library here)
synonym_library = {
    "acct116": ["acct 116", "acct-116", "accounting 116", "managerial accounting"],
    "acct115": ["acct 115", "acct-115", "accounting 115", "financial accounting"],
    "acct110": ["acct 110", "acct-110", "accounting 110", "accounting for professionals"],
    "acct321": ["acct 321", "acct-321", "accounting321", "accounting 321", "financial reporting i"],
    "acct323": ["acct 323", "acct-323", "accounting323", "accounting 323", "financial reporting iii"],
    "acct329": ["acct 329", "acct-329", "accounting329", "accounting 329", "advanced accounting"],
    "acct341": ["acct 341", "acct-341", "accounting341", "accounting 341", "principles auditing"],
    "acct510": ["acct 510", "acct-510", "accounting510", "accounting 510", "essentials financial reporting"],
    "acct600": ["acct 600", "acct-600", "accounting600", "accounting 600", "accounting analysis & theory"],
    "acct606": ["acct 606", "acct-606", "accounting606", "accounting 606", "current issues accounting profession"],
    "acct622": ["acct 622", "acct-622", "accounting622", "accounting 622", "advanced financial accounting"],
    "acct912": ["acct 912", "acct-912", "accounting912", "accounting 912", "applied research methods accounting"],
    "acct998": ["acct 998", "acct-998", "accounting998", "accounting 998", "dissertation research accounting"],
    "stat201": ["stat 201", "stat-201", "statistics201", "statistics 201", "introduction business statistics", "business statistics i"],
    "stat202": ["stat 202", "stat-202", "statistics202", "statistics 202", "business statistics ii"],
    "stat205": ["stat 205", "stat-205", "statistics205", "statistics 205", "statistical inference i"],
    "stat331": ["stat 331", "stat-331", "statistics331", "statistics 331", "introduction data mining business"],
    "stat610": ["stat 610", "stat-610", "statistics610", "statistics 610", "statistics business analytics"],
    "stat628": ["stat 628", "stat-628", "statistics628", "statistics 628", "applied regression analysis"],
    "stat931": ["stat 931", "stat-931", "statistics931", "statistics 931", "statistics economics"],
    "stat932": ["stat 932", "stat-932", "statistics932", "statistics 932", "statistics behavioral science"],
    "fin301": ["fin 301", "fin-301", "finance301", "finance 301", "introduction finance"],
    "fin302": ["fin 302", "fin-302", "finance302", "finance 302", "intermediate corporate finance"],
    "fin321": ["fin 321", "fin-321", "finance321", "finance 321", "investment securities & markets"],
    "fin323": ["fin 323", "fin-323", "finance323", "finance 323", "risk management"],
    "fin325": ["fin 325", "fin-325", "finance325", "finance 325", "financial institutions and markets"],
    "fin332": ["fin 332", "fin-332", "finance332", "finance 332", "investment analysis"],
    "fin335": ["fin 335", "fin-335", "finance335", "finance 335", "entrepreneurial finance"],
    "fin341": ["fin 341", "fin-341", "finance341", "finance 341", "applied portfolio management"],
    "fin345": ["fin 345", "fin-345", "finance345", "finance 345", "mergers & acquisitions"],
    "fin346": ["fin 346", "fin-346", "finance346", "finance 346", "global financial management"],
    "fin601": ["fin 601", "fin-601", "finance601", "finance 601", "corporate financial management"],
    "fin605": ["fin 605", "fin-605", "finance605", "finance 605", "business valuation"],
    "fin624": ["fin 624", "fin-624", "finance624", "finance 624", "risk management financial professionals"],
    "blaw": ["business law"],
    "blaw101": ["blaw 101", "blaw-101", "businesslaw101", "business law 101"],
    "blaw201": ["blaw 201", "blaw-201", "businesslaw201", "business law 201", "business law i"],
    "blaw202": ["blaw 202", "blaw-202", "businesslaw202", "business law 202", "business law ii"],
    "blaw330": ["blaw 330", "blaw-330", "businesslaw330", "business law 330", "real estate law"],
    "blaw346": ["blaw 346", "blaw-346", "businesslaw346", "business law 346", "entrepreneurial law"],
    "blaw358": ["blaw 358", "blaw-358", "businesslaw358", "business law 358", "employment law"],
    "blaw621": ["blaw 621", "blaw-621", "businesslaw621", "business law 621", "legal issues business"],
    "mis": ["management systems", "management system", "business analytics", "introduction mis", "intro mis"],
    "mis200": ["mis 200", "mis-200", "management information systems"],
    "mis342": ["mis 342", "mis-342", "systems analysis and design"],
    "mis343": ["mis 343", "mis-343", "database design and implementation"],
    "mis346": ["mis 346", "mis-346", "management information systems strategy"],
    "mis361": ["mis 361", "mis-361", "information system project management"],
    "mis364": ["mis 364", "mis-364", "information security systems management"],
    "mis612": ["mis 612", "mis-612", "aligning information systems and business strategies"],
    "mis625": ["mis 625", "mis-625", "management information technology operations"],
    "mis642": ["mis 642", "mis-642", "emerging information technologies business"],
    "opm200": ["opm 200", "opm-200", "operations management"],
    "opm324": ["opm 324", "opm-324", "operations planning"],
    "opm342": ["opm 342", "opm-342", "sustainable supply chain management and logistics"],
    "mktg201": ["mktg 201", "mktg-201", "introduction marketing management"],
    "mktg322": ["mktg 322", "mktg-322", "advertising & integrated marketing communications"],
    "mktg326": ["mktg 326", "mktg-326", "marketing insights"],
    "mktg348": ["mktg 348", "mktg-348", "services marketing"],
    "mktg356": ["mktg 356", "mktg-356", "consumer behavior"],
    "mktg367": ["mktg 367", "mktg-367", "data-driven digital marketing", "digital marketing"],
    "mktg368": ["mktg 368", "mktg-368", "corporate responsibility management"],
    "mktg380": ["mktg 380", "mktg-380", "seminar marketing strategy"],
    "mktg510": ["mktg 510", "mktg-510", "marketing strategy"],
    "mktg601": ["mktg 601", "mktg-601", "marketing strategy & planning"],
    "mktg607": ["mktg 607", "mktg-607", "marketing experiments"],
    "mktg652": ["mktg 652", "mktg-652", "marketing information management and research"],
    "mktg654": ["mktg 654", "mktg-654", "corporate brand & reputation management"],
    "mktgt980": ["mktg t980", "mktg-t980", "perceptual processes consumer behavior"],
    "mktg998": ["mktg 998", "mktg-998", "dissertation research marketing"],
    "mgmt201": ["mgmt 201", "mgmt-201", "introduction technology innovation management"],
    "mgmt260": ["mgmt 260", "mgmt-260", "introduction entrepreneurship"],
    "mgmt240": ["mgmt 240", "mgmt-240", "mgmt 240"],
    "mgmt301": ["mgmt 301", "mgmt-301", "designing innovative organizations"],
    "mgmt302": ["mgmt 302", "mgmt-302", "competing technology industries"],
    "mgmt364": ["mgmt 364", "mgmt-364", "technology management"],
    "mgmt450": ["mgmt 450", "mgmt-450", "strategy and competitive advantage"],
    "mgmt600": ["mgmt 600", "mgmt-600", "introduction change management"],
    "mgmt603": ["mgmt 603", "mgmt-603", "technology strategy"],
    "mgmt640": ["mgmt 640", "mgmt-640", "strategic human resource management"],
    "mgmt770": ["mgmt 770", "mgmt-770", "mba capstone"],
    "mgmt906": ["mgmt 906", "mgmt-906", "foundations research behavioral science"],
    "mgmt935": ["mgmt 935", "mgmt-935", "seminar organization theory"],
    "mgmt998": ["mgmt 998", "mgmt-998", "dissertation research management"],
    "remd": ["real estate", "estate"],
    "remd110": ["remd 110", "remd-110", "real estate management", "estate 110", "estate 110"],
    "remd320": ["remd 320", "remd-320", "sustainability built environment"],
    "remd375": ["remd 375", "remd-375", "real estate finance"],
    "remd410": ["remd 410", "remd-410", "real estate investment and asset management"],
    "cs171": ["cs 171", "cs-171"],
    "cs150": ["cs 150", "cs-150"],
    "engr": ["engineering cad"],
    "engr113": ["engr 113", "engr-113"],
    "com270": ["com 270", "com-270"],
    "opr": ["operations"],
    "opr320": ["opr 320", "opr-320", "linear models decision making"],
    "opr601": ["opr 601", "opr-601", "managerial decision models and simulation"],
    "opr998": ["opr 998", "opr-998", "dissertation research operations research"],
    "busn101": ["busn 101", "busn-101", "foundations business i"],
    "busn102": ["busn 102", "busn-102", "foundations business ii"],
    "busn105": ["busn 105", "busn-105", "applied business analysis"],
    "busn111": ["busn 111", "busn-111", "foundations business"],
    "busn501": ["busn 501", "busn-501", "measuring and maximizing financial performance"],
    "busn614": ["busn 614", "busn-614", "foundations career & professional development"],
    "busn997": ["busn 997", "busn-997", "research activity phd students lebow college business"],
    "bsan160": ["bsan 160", "bsan-160", "business analytics and data visualization"],
    "bsan": ["business analytics", "analytic", "analytics"],
    "bsan360": ["bsan 360", "bsan-360", "programming data analytics"],
    "bsan460": ["bsan 460", "bsan-460", "business analytics senior project"],
    "bsan601": ["bsan 601", "bsan-601", "business analytics managers"],
    "bsan710": ["bsan 710", "bsan-710", "business analytics capstone project"],
    "co-op": ["coop"],
    "co-op101": ["co-op 101", "co-op-101"],
    "orgb300": ["orgb 300", "orgb-300", "organizational behavior"],
    "orgb430": ["orgb 430", "orgb-430"],
    "orgb320": ["orgb 320", "orgb-320", "leadership: theory and practice"],
    "orgb511": ["orgb 511", "orgb-511", "leading dynamic environments: personal, relational, and strategic approach"],
    "intb200": ["intb 200", "intb-200"],
    "econ202": ["econ 202", "econ-202", "introduction economics"],
    "econ270": ["econ 270", "econ-270", "introduction macroeconomics"],
    "econ321": ["econ 321", "econ-321", "microeconomics"],
    "econ350": ["econ 350", "econ-350", "macroeconomics"],
    "econ351": ["econ 351", "econ-351", "international economics"],
    "econ352": ["econ 352", "econ-352", "financial economics"],
    "econ353": ["econ 353", "econ-353", "international trade economics"],
    "tax360": ["tax 360", "tax-360"],
    "tax341": ["tax 341", "tax-341"],
    "se181": ["se 181", "se-181"],
    "accounting": ["accounting", "basic accounting", "financial accounting", "managerial accounting",
                   "bookkeeping", "cash flows", "invoice", "accounting introduction", "internal audit"],
    "finance": ["finance", "intro to finance", "financial planning", "investment management",
                "corporate finance", "insurance", "financial reporting", "portfolio rebalancing",
                "portfolio management", "financial markets"],
    "economics": ["economics", "macroeconomics", "microeconomics"],
    "mis200": ["mis200", "mis 200", "mis-200"],
    "statistics": ["statistics", "basic statistics", "intro to stats", "statistical methods", "data statistics", "applied statistics"],
    "python": ["python", "python programming", "python coding", "python scripting"],
    "sql": ["sql", "structured query language", "sql programming", "database querying"],
    "data_analysis": ["data analysis", "analyzing data", "data analytics", "data insights", "data visualization"],
    "business_analytics": ["business analytics", "analytics for business", "business data analytics", "business intelligence",
                           "projections", "kpis", "key performance indicators"],
    "automation": ["automation", "intelligent automation tools", "workflow automation", "process automation"],
    "leadership": ["leadership", "executive leadership", "project leadership", "team leadership", "organizational leadership"],
    "tax": ["tax", "tax accounting", "tax planning", "tax strategies"],
    "public_speaking": ["public speaking", "presentation skills", "oral communication", "speech delivery", "storytelling"],
    "excel": ["excel", "microsoft excel", "advanced excel skills", "pivot tables", "vlookup", "data modeling", "spreadsheet"],
    "powerpoint": ["powerpoint", "microsoft powerpoint", "presentation design", "slides creation"],
    "marketing": ["marketing", "marketing strategies", "social media marketing", "advertising", "seo", "content marketing"],
    "project_management": ["project management", "agile project management", "waterfall project management", "pmp", "scrum"],
    "critical_thinking": ["critical thinking", "problem solving", "analytical thinking", "decision making"],
    "data_visualization": ["data visualization", "chart creation", "dashboard creation", "data storytelling", "tableau", "power bi"],
    "communication": ["communicate", "communication", "orator", "business communication", "written communication", "verbal communication", "busi-comm"],
    "programming": ["programming", "coding", "software development", "scripting"],
    "time_management": ["time management", "productivity skills", "scheduling", "prioritization"],
    "crm": ["crm", "customer relationship management", "salesforce", "hubspot", "zoho crm"],
    "ms_office": ["ms office", "microsoft office", "word", "outlook"],
    "networking": ["networking", "professional networking", "relationship building", "linkedin"],
    "entrepreneurship": ["entrepreneurship", "startup skills", "venture creation", "business innovation"],
    "customer_service": ["customer service", "client relations", "service management"],
    "supply_chain": ["supply chain", "logistics", "inventory management", "procurement", "supply chain analytics"],
    "hr_management": ["hr management", "human resources", "recruitment", "talent management", "employee relations"],
    "cybersecurity": ["cybersecurity", "information security", "network security", "data security", "ethical hacking"],
    "soft_skills": ["soft skills", "speaking", "technical communication", "presentation"],
    "sales": ["sales", "selling"],
    "technical_skills": ["technical skills", "reporting", "tracking", "trading", "planning", "forecasting", "graphic design"],
    "syst_anal": ["systems analysis", "system analysis"],
    "business": ["business", "organizational", "elementary business"],
    "hospitality": ["hospitality", "beverages"],
    "preliminary": ["preliminary"],
    "advanced": ["advanced"],
    "digital marketing": ["digital marketing", "social media marketing"],
    "writing_skills": ["writing skills", "email", "e-mails", "writing", "written", "writer", "write"],
    "prioritization": ["prioritization", "priority"],
    "projects": ["projects", "team project", "project", "team projects", "teamwork", "group projects", "collaboration", "team"],
    "MIS": ["MIS", "management information systems", "mis"],
    "data_analytics": ["data analytics", "data analysis", "analytics"],
    "regression": ["regression", "linear regression", "logistic regression"],
    "operations_management": ["operations management", "supply chain", "operations"],
    "self_taught": ["personal", "self", "taught", 'yourself', 'ourselves', "yourselves", "self-taught", "own", "myself", "learn", "ask", "asked"],
    "prior_work_experience": ["prior", "previous"],
    "during_coop": ["during", "co-op", "job", "role"],
    "problem_solving": ["problem", "solving", "problem-solving"],
    "classroom_activities": ["classroom activities", "class", "classroom", "involved", "interactive", "participation", "interactivity"]

}

# Match keywords from the synonym library
def match_keywords(text, synonym_lib):
    matched_courses = []
    for course, keywords in synonym_lib.items():
        for keyword in keywords:
            if keyword in text:
                matched_courses.append(course)
    return matched_courses

# Function to remove repetitive words while preserving the order
def remove_repeated_words(text_list):
    seen = set()
    distinct_words = []
    for word in text_list:  # Iterate through the list of words directly
        if word not in seen:
            seen.add(word)
            distinct_words.append(word)
    return ' '.join(distinct_words)

# Analyze text for relevant courses and classify responses
def analyze_text(df, synonym_lib):
    df['Cleaned_Text'] = df['How did your classroom activities prepare you for co-op? If they didn’t, how were you prepared for co-op?'].apply(preprocess_text)
    df['Matched_Courses'] = df['Cleaned_Text'].apply(lambda x: match_keywords(x, synonym_lib))
    df['Classification'] = df['How did your classroom activities prepare you for co-op? If they didn’t, how were you prepared for co-op?'].apply(classify_response)
    return df

# Process the data
df = analyze_text(df, synonym_library)

# Apply the function to remove repeated words in the 'Matched_Courses' column
df['Distinct_Response'] = df['Matched_Courses'].apply(remove_repeated_words)

# Create binary columns for synonym library matches
def add_binary_columns(df, synonym_lib):
    for course, keywords in synonym_lib.items():
        df[course] = df['Cleaned_Text'].apply(lambda text: 1 if any(keyword in text for keyword in keywords) else 0)
    return df

# Add "Others" column
def add_others_column(df, synonym_lib):
    keyword_columns = list(synonym_lib.keys())
    # Add the "Others" column
    df['Others'] = df[keyword_columns].sum(axis=1).apply(lambda x: 1 if x == 0 else 0)
    return df

# Process the data
df = analyze_text(df, synonym_library)

# Add binary columns for matched courses
df = add_binary_columns(df, synonym_library)

# Add the "Others" column
df = add_others_column(df, synonym_library)

# Explode matched courses into separate rows for Power BI
df_exploded = df.explode('Matched_Courses')

# Save the results to an Excel file
df.to_excel("processed_responses_with_classification_and_others_column.xlsx", index=False)

print("Processing completed and results saved with binary columns and 'Others' column!")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
  df[course] = df['Cleaned_Text'].apply(lambda text: 1 if any(keyword in text for keyword in keywords) else 0)
  df[course] = df['Cleaned_Text'].apply(lambda text: 1 if any(keyword in text for keyword in keywords) else 0)
  df[course] = df['Cleaned_Text'].apply(lambda text: 1 if any(keyword in text for keyword in keywords) else 0)
  df[course] = df['Cleaned_Text'].apply(lambda text: 1 if any(keyword in text for keyword in keywords) else 0)
  df[course] = df['Cleaned_Text'].apply(lambda text: 1 if any(keyword in text for keyword in keywords) else 0)
  df[course] = df['Cleaned_Text'].apply(lambda text: 1 if any(keyword in text for keyword in keywords) else 0)
  df[course] = df['Cleaned_Text'].apply(lambda text: 1 if any(keyword in text for keyword in keywords) else 0)
  df[course] = df['Cleaned_Text'].apply(lambda text: 1 if any(keyword in text for keyword in keywords) else 0)
 

Processing completed and results saved with binary columns and 'Others' column!


In [3]:

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Download the stopwords resource (only needs to be run once)
nltk.download('stopwords')

# Load the stopwords
stop_words_list = set(stopwords.words('english'))

# Download and initialize NLTK resources if needed
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load data
df = pd.read_excel("/content/what coursework did you apply most during your coop.xlsx")  # Update the file path as needed

# Text cleaning function to remove punctuation and stop words
def clean_text(text):
    # Convert to lowercase and handle non-string values
    text = str(text).lower()
    # Remove parentheses but keep their content
    text = re.sub(r'\((.*?)\)', r'\1', text)
    # Remove punctuation except hyphens
    text = re.sub(r'(?<![a-zA-Z0-9-])[^\w\s,.-](?![a-zA-Z0-9-])', '', text)
    text = re.sub(r'[.,]', '', text)
    # Remove 'x000D' and similar artifacts
    text = re.sub(r'_x[0-9a-fA-F]{4}_', '', text)  # Matches patterns like 'x000D'
    # Remove stop words
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Define stop words, including custom additions
stop_words = set(stopwords.words('english'))
custom_stop_words = {"i", "my", "course", "lebow", "would", "say", "class", "course", "courses", "class", "classes", "univ", "university",
    "college", "lebow", "drexel", "major", "school", "student", "involved", "group --", "important", "everyday", "basis",
    "topic", "topics", "assignment", "assignments", "lesson", "material", "program",
    "prepare", "prepared", "apply", "applied", "using", "use", "learn", "learned",
    "knowledge", "skills", "help", "helped", "helpful", "benefit", "benefited",
    "during", "most", "best", "thing", "things", "one", "lot", "lots", "much", "many", "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
    "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers",
    "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "complaint", "letters", "city", "philadelphia", "customer's", "insurance", "claims", "surprisingly",
    "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are",
    "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "played", "large", "role", "expected", "type", "concise", "e-mails", "co-workers", "generate", "reports", "progress", "duties", "dealt", "erps", "cigna", "utilized", "oracle", "database", "ariba", "invoicing", "payment", "software",
    "did", "doing", "a", "an", "concept", "staking", "general", "venue",  "prior", "utilize", "search", "servicing", "provided", "great", "amount", "the", "and", "but", "if", "or", "because", "as", "until",
    "while", "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "given", "perspective", "above", "below", "from", "up", "down",
    "in", "out", "on", "probably", "part", "contain", "aspects", "may", "principles", "applicable", "yet", "skill", "problem", "solving", "become", "illustrator", "successful", "roles", "required", "team", "setting", "often", "able", "working", "attorneys", "promotional", "products", "events", "also", "effectively", "communicate", "adhere", "corporate", "successfully", "according", "multiple", "supervisors", "hope", "future",
    "captions", "social", "media", "accounts", "interaction", "whether", "finances", "going", "logistics", "reflected", "well", "although", "day-to-day", "understanding", "hit", "ground", "running", "didn’t",
     "off", "over", "relied", "past", "under", "again", "further", "then", "once",
    "consulting", "peer", "leader", "practicum", "development", "leadership", "people", "honors", "projects", "timezones", "background", "tech", "processes", "created", "familiarity", "technological", "responsibilities", "knowing", "verbiage", "formatting", "professional", "conversations", "helping", "accomplish",
    "here", "there", "when", "where", "critical", "thinking", "think", "why", "how", "all", "any", "both", "each",
    "referenced", "various", "knew", "like", "improve", "early", "guess", "adobe", "suite", "extra", "cirrculars", "professionalism", "emails", "meetings", "etc", "academic", "random", "number", "generators", "sampling",
    "few", "more", "most", "took", "event", "facilities", "other", "some", "such", "no", "nor", "not", "only",
    "own", "same", "so", "than", "projects", "people", "different", "timezones", "too", "very", "understand", "language", "trust", "s", "t", "can", "will", "just", "intermediate",
    "don", "should", "now", "used", "mostly", "since", "files", ",", "terms", "super", "familiar", "base", "expanded", "manager", "coworkers",
    "work", "i've", "coursework", "relevant", "people", "nan", "coworkers", "carried", "bulk", "information", "needed", "felt", "properly", "believe", "useful",
    "far", "career", "path", "everything", "ii", "loan", "literally", "least", "every", "better", "company", "job",
    "sure", "prep", "world", "ideas", "create", "proficient", "generally", "specifically", "certain", "prof", "cohen", "huge",
    "ticketing", "department", "taught", "crm", "made", "easier", "handle", "score", "next", "daily", "lastly", "another", "comes", "don’t", "realize", "go", "executing", "moving", "parts", "got", "overwhelmed", "appreciated", "leaned", "put", "game", "day", "weeks", "end", "benefitted", "obviously", "grateful", "i’ve", "thus",
    "interesting", "see", "real", "world", "directly", "none", "related", "come", "mind", "developed", "throughout", "previous", "taken",
    "intro", "honestly", "really", "teach", "*honors", "imperative", "minor", "cib", "teach", "write", "reading", "statements", "writing", "data", "goes", "formulas", "behind", "desks", "interesting", "nothing", "even", "specific", "mainly", "came", "handy", "position", "quite",
    "salesperson", "get", "license", "photoshop", "reasoning", "teaches", "ethical", "rules", "abide", "order", "basic", "honors", "shoutout", "dana", "d'angelo", "consisted", "artificial", "technology", "metaverse", "absolutely", "opinion", "take", "experience", "lectures", "learning", "unnecessary", "good", "exam", "forget", "right", "finished",

                     }  # Custom stop words to remove

stop_words.update(custom_stop_words)


# Clean text column
df['Cleaned_Response'] = df['What coursework did you apply most during your co-op?'].apply(clean_text)

# Define a synonym library to standardize course names
synonym_library = {
    "introduction": ["intro"],
    "acct116": ["acct 116", "acct-116", "accounting 116", "managerial accounting"],
    "acct115": ["acct 115", "acct-115", "accounting 115", "financial accounting"],
    "acct110": ["acct 110", "acct-110", "accounting 110", "accounting for professionals"],
    "acct321": ["acct 321", "acct-321", "accounting321", "accounting 321", "financial reporting i"],
    "acct323": ["acct 323", "acct-323", "accounting323", "accounting 323", "financial reporting iii"],
    "acct329": ["acct 329", "acct-329", "accounting329", "accounting 329", "advanced accounting"],
    "acct341": ["acct 341", "acct-341", "accounting341", "accounting 341", "principles auditing"],
    "acct510": ["acct 510", "acct-510", "accounting510", "accounting 510", "essentials financial reporting"],
    "acct600": ["acct 600", "acct-600", "accounting600", "accounting 600", "accounting analysis & theory"],
    "acct606": ["acct 606", "acct-606", "accounting606", "accounting 606", "current issues accounting profession"],
    "acct622": ["acct 622", "acct-622", "accounting622", "accounting 622", "advanced financial accounting"],
    "acct912": ["acct 912", "acct-912", "accounting912", "accounting 912", "applied research methods accounting"],
    "acct998": ["acct 998", "acct-998", "accounting998", "accounting 998", "dissertation research accounting"],
    "stat201": ["stat 201", "stat-201", "statistics201", "statistics 201", "introduction business statistics", "business statistics i"],
    "stat202": ["stat 202", "stat-202", "statistics202", "statistics 202", "business statistics ii"],
    "stat205": ["stat 205", "stat-205", "statistics205", "statistics 205", "statistical inference i"],
    "stat331": ["stat 331", "stat-331", "statistics331", "statistics 331", "introduction data mining business"],
    "stat610": ["stat 610", "stat-610", "statistics610", "statistics 610", "statistics business analytics"],
    "stat628": ["stat 628", "stat-628", "statistics628", "statistics 628", "applied regression analysis"],
    "stat931": ["stat 931", "stat-931", "statistics931", "statistics 931", "statistics economics"],
    "stat932": ["stat 932", "stat-932", "statistics932", "statistics 932", "statistics behavioral science"],
    "fin301": ["fin 301", "fin-301", "finance301", "finance 301", "introduction finance"],
    "fin302": ["fin 302", "fin-302", "finance302", "finance 302", "intermediate corporate finance"],
    "fin321": ["fin 321", "fin-321", "finance321", "finance 321", "investment securities & markets"],
    "fin323": ["fin 323", "fin-323", "finance323", "finance 323", "risk management"],
    "fin325": ["fin 325", "fin-325", "finance325", "finance 325", "financial institutions and markets"],
    "fin332": ["fin 332", "fin-332", "finance332", "finance 332", "investment analysis"],
    "fin335": ["fin 335", "fin-335", "finance335", "finance 335", "entrepreneurial finance"],
    "fin341": ["fin 341", "fin-341", "finance341", "finance 341", "applied portfolio management"],
    "fin345": ["fin 345", "fin-345", "finance345", "finance 345", "mergers & acquisitions"],
    "fin346": ["fin 346", "fin-346", "finance346", "finance 346", "global financial management"],
    "fin601": ["fin 601", "fin-601", "finance601", "finance 601", "corporate financial management"],
    "fin605": ["fin 605", "fin-605", "finance605", "finance 605", "business valuation"],
    "fin624": ["fin 624", "fin-624", "finance624", "finance 624", "risk management financial professionals"],
    "blaw": ["business law"],
    "blaw101": ["blaw 101", "blaw-101", "businesslaw101", "business law 101"],
    "blaw201": ["blaw 201", "blaw-201", "businesslaw201", "business law 201", "business law i"],
    "blaw202": ["blaw 202", "blaw-202", "businesslaw202", "business law 202", "business law ii"],
    "blaw330": ["blaw 330", "blaw-330", "businesslaw330", "business law 330", "real estate law"],
    "blaw346": ["blaw 346", "blaw-346", "businesslaw346", "business law 346", "entrepreneurial law"],
    "blaw358": ["blaw 358", "blaw-358", "businesslaw358", "business law 358", "employment law"],
    "blaw621": ["blaw 621", "blaw-621", "businesslaw621", "business law 621", "legal issues business"],
    "mis": ["management systems", "management system", "business analytics"],
    "mis200": ["mis 200", "mis-200", "management information systems"],
    "mis342": ["mis 342", "mis-342", "systems analysis and design"],
    "mis343": ["mis 343", "mis-343", "database design and implementation"],
    "mis346": ["mis 346", "mis-346", "management information systems strategy"],
    "mis361": ["mis 361", "mis-361", "information system project management"],
    "mis364": ["mis 364", "mis-364", "information security systems management"],
    "mis612": ["mis 612", "mis-612", "aligning information systems and business strategies"],
    "mis625": ["mis 625", "mis-625", "management information technology operations"],
    "mis642": ["mis 642", "mis-642", "emerging information technologies business"],
    "opm200": ["opm 200", "opm-200", "operations management"],
    "opm324": ["opm 324", "opm-324", "operations planning"],
    "opm342": ["opm 342", "opm-342", "sustainable supply chain management and logistics"],
    "mktg201": ["mktg 201", "mktg-201", "introduction marketing management"],
    "mktg322": ["mktg 322", "mktg-322", "advertising & integrated marketing communications"],
    "mktg326": ["mktg 326", "mktg-326", "marketing insights"],
    "mktg348": ["mktg 348", "mktg-348", "services marketing"],
    "mktg356": ["mktg 356", "mktg-356", "consumer behavior"],
    "mktg367": ["mktg 367", "mktg-367", "data-driven digital marketing", "digital marketing"],
    "mktg368": ["mktg 368", "mktg-368", "corporate responsibility management"],
    "mktg380": ["mktg 380", "mktg-380", "seminar marketing strategy"],
    "mktg510": ["mktg 510", "mktg-510", "marketing strategy"],
    "mktg601": ["mktg 601", "mktg-601", "marketing strategy & planning"],
    "mktg607": ["mktg 607", "mktg-607", "marketing experiments"],
    "mktg652": ["mktg 652", "mktg-652", "marketing information management and research"],
    "mktg654": ["mktg 654", "mktg-654", "corporate brand & reputation management"],
    "mktgt980": ["mktg t980", "mktg-t980", "perceptual processes consumer behavior"],
    "mktg998": ["mktg 998", "mktg-998", "dissertation research marketing"],
    "mgmt201": ["mgmt 201", "mgmt-201", "introduction technology innovation management"],
    "mgmt260": ["mgmt 260", "mgmt-260", "introduction entrepreneurship"],
    "mgmt240": ["mgmt 240", "mgmt-240", "mgmt 240"],
    "mgmt301": ["mgmt 301", "mgmt-301", "designing innovative organizations"],
    "mgmt302": ["mgmt 302", "mgmt-302", "competing technology industries"],
    "mgmt364": ["mgmt 364", "mgmt-364", "technology management"],
    "mgmt450": ["mgmt 450", "mgmt-450", "strategy and competitive advantage"],
    "mgmt600": ["mgmt 600", "mgmt-600", "introduction change management"],
    "mgmt603": ["mgmt 603", "mgmt-603", "technology strategy"],
    "mgmt640": ["mgmt 640", "mgmt-640", "strategic human resource management"],
    "mgmt770": ["mgmt 770", "mgmt-770", "mba capstone"],
    "mgmt906": ["mgmt 906", "mgmt-906", "foundations research behavioral science"],
    "mgmt935": ["mgmt 935", "mgmt-935", "seminar organization theory"],
    "mgmt998": ["mgmt 998", "mgmt-998", "dissertation research management"],
    "remd": ["real estate", "estate"],
    "remd110": ["remd 110", "remd-110", "real estate management", "estate 110", "estate 110"],
    "remd320": ["remd 320", "remd-320", "sustainability built environment"],
    "remd375": ["remd 375", "remd-375", "real estate finance"],
    "remd410": ["remd 410", "remd-410", "real estate investment and asset management"],
    "cs171": ["cs 171", "cs-171"],
    "cs150": ["cs 150", "cs-150"],
    "engr": ["engineering cad"],
    "engr113": ["engr 113", "engr-113"],
    "com270": ["com 270", "com-270"],
    "opr": ["operations"],
    "opr320": ["opr 320", "opr-320", "linear models decision making"],
    "opr601": ["opr 601", "opr-601", "managerial decision models and simulation"],
    "opr998": ["opr 998", "opr-998", "dissertation research operations research"],
    "busn101": ["busn 101", "busn-101", "foundations business i"],
    "busn102": ["busn 102", "busn-102", "foundations business ii"],
    "busn105": ["busn 105", "busn-105", "applied business analysis"],
    "busn111": ["busn 111", "busn-111", "foundations business"],
    "busn501": ["busn 501", "busn-501", "measuring and maximizing financial performance"],
    "busn614": ["busn 614", "busn-614", "foundations career & professional development"],
    "busn997": ["busn 997", "busn-997", "research activity phd students lebow college business"],
    "bsan160": ["bsan 160", "bsan-160", "business analytics and data visualization"],
    "bsan360": ["bsan 360", "bsan-360", "programming data analytics"],
    "bsan460": ["bsan 460", "bsan-460", "business analytics senior project"],
    "bsan601": ["bsan 601", "bsan-601", "business analytics managers"],
    "bsan710": ["bsan 710", "bsan-710", "business analytics capstone project"],
    "co-op": ["coop"],
    "co-op101": ["co-op 101", "co-op-101"],
    "orgb300": ["orgb 300", "orgb-300", "organizational behavior"],
    "orgb320": ["orgb 320", "orgb-320", "leadership: theory and practice"],
    "orgb511": ["orgb 511", "orgb-511", "leading dynamic environments: personal, relational, and strategic approach"],
    "intb200": ["intb 200", "intb-200"],
    "econ202": ["econ 202", "econ-202", "introduction economics"],
    "econ270": ["econ 270", "econ-270", "introduction macroeconomics"],
    "econ321": ["econ 321", "econ-321", "microeconomics"],
    "econ350": ["econ 350", "econ-350", "macroeconomics"],
    "econ351": ["econ 351", "econ-351", "international economics"],
    "econ352": ["econ 352", "econ-352", "financial economics"],
    "econ353": ["econ 353", "econ-353", "international trade economics"],
    "tax360": ["tax 360", "tax-360"],
    "tax341": ["tax 341", "tax-341"],
    "se181": ["se 181", "se-181"],
    "accounting": ["bookkeeping", "cash flows", "invoice", "invoices"],
    "business101": ["business 101", "business-101"],
    "business102": ["business 102", "business-102"],
    "engl": ["english"],
    "finance": ["financial", "financial reporting", "financial analysis", "portfolio rebalancing", "portfolio management", "financial markets"],
    "soft-skills": ["speaking", "speak", "technical communication", "time management", "presentation", "presentations"],
    "sales": ["sales", "selling", "sale", "selling"],
    "technical-skills": ["excel", "spreadsheet", "email", "reporting", "powerpoint", "trading", "planning", "forecasting", "graphic design"],
    "busi-comm": ["business communication", "business communications"]
}


# Function to standardize keywords based on the synonym library
def standardize_keywords(text, synonym_lib):
    for standard_term, synonyms in synonym_lib.items():
        for synonym in synonyms:
            text = re.sub(rf"\b{re.escape(synonym)}\b", standard_term, text, flags=re.IGNORECASE)
    return text

# Function to expand course codes like "business 101 102" to "business 101 business 102"
def expand_course_codes(text):
    words = text.split()  # Split the input string into words
    if len(words) > 2:  # Ensure there's something to expand
        prefix = words[0]
        numbers = words[1:]
        expanded = [f"{prefix} {number}" for number in numbers]
        return " ".join(expanded)
    return text  # Return unchanged if there's nothing to expand

# # Apply keyword standardization
df['Standardized_Response'] = df['Cleaned_Response'].apply(lambda x: standardize_keywords(x, synonym_library))

# Function to remove repetitive words while preserving the order
def remove_repeated_words(text):
    words = text.split()
    seen = set()
    distinct_words = []
    for word in words:
        if word not in seen:
            seen.add(word)
            distinct_words.append(word)
    return ' '.join(distinct_words)

# Apply the function to remove repeated words in the 'Standardized_Response' column
df['Distinct_Response'] = df['Standardized_Response'].apply(remove_repeated_words)

# Function to add binary columns for synonym library matches
def add_binary_columns(df, synonym_lib):
    for standard_term, synonyms in synonym_lib.items():
        df[standard_term] = df['Standardized_Response'].apply(
            lambda text: 1 if any(keyword in text for keyword in synonyms) else 0
        )
    return df

# Add an "Others" column
def add_others_column(df, synonym_lib):
    binary_columns = list(synonym_lib.keys())
    # Add "Others" column, where no other binary columns are 1
    df['Others'] = df[binary_columns].sum(axis=1).apply(lambda x: 1 if x == 0 else 0)
    return df

# Apply keyword standardization
df['Standardized_Response'] = df['Cleaned_Response'].apply(lambda x: standardize_keywords(x, synonym_library))

# Add binary columns based on the synonym library
df = add_binary_columns(df, synonym_library)

# Add the "Others" column
df = add_others_column(df, synonym_library)

# Save the processed DataFrame to an Excel file
df.to_excel("relevant_courses_with_binary_and_others.xlsx", index=False)

print("Processing completed and results saved with binary columns and 'Others' column!")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df[standard_term] = df['Standardized_Response'].apply(
  df[standard_term] = df['Standardized_Response'].apply(
  df[standard_term] = df['Standardized_Response'].apply(
  df[standard_term] = df['Standardized_Response'].apply(
  df[standard_term] = df['Standardized_Response'].apply(
  df[standard_term] = df['Standardized_Response'].apply(
  df[standard_term] = df['Standardized_Response'].apply(
  df[standard_term] = df['Standardized_Response'].apply(
  df[standard_term] = df['Standardized_Response'].apply(
  df[standard_term] = df['Standardized_Response'].apply(
  df[standard_term] = df['Standardized_Response'].apply(
  df[standard_term] = df['Standardized_Response'].apply(
  df[standard_term] = df['Standardized_Response'].apply(
  df[standard_term] =

Processing completed and results saved with binary columns and 'Others' column!
