In [3]:
import pandas as pd
import os
from tqdm import tqdm
from fuzzywuzzy import process

# 🔹 Set directory where the CSV files are stored
directory_path = '/Users/sreeharsha/Documents/TGH Data Management Cleaning/Translated ES'
output_file = os.path.join(directory_path, "merged_ES_data.csv")

# 🔹 List of desired column names
desired_columns = [
    "state", "creation date", "modified date", "completion time", "submitted date", 
    "first name", "last name", "date of birth", 
    "have you received a tgh device chromebook or ipad in the last two years",
    "primary language", "educational status", "employment status", "household income level",
    "number of people in household", "city", "do you have home internet access i e wifi",
    "home street address including apartment number", "course id", "course type",
    "phone number", "email address", "form name", "course name",
    "i agree to the learner agreement as listed above", "do you receive housing assistance",
    "name of siteschool", "childs date of birth", "primary language.1", "childs last name",
    "childs last name.1", "if you selected other what language", "do you have an existing business",
    "what industry is your business in", "if you selected other what language.1",
    "which if any social media tools do you use for your business", "if you selected other please explain",
    "zip code", "please describe your confidence level using the internet to find information you need",
    "how many tablets do you have at home e g ipad kindle fire samsung galaxy tab",
    "creating and sending emails", "opening and replying to emails",
    "downloading attachments i receive in an email documents or pictures",
    "adding an attachment to an email i am sending like documents or pictures",
    "how many working computers do you have at home",
    "is this the first activity youve participated in at your program site",
    "turning the tablet on and off", "installing applications from the chromebook store",
    "adjusting the tablets settings like the size of the text or the background picture",
    "connecting the tablet to wifi", "saving files to the tablet and finding them later",
    "using a mouse headset andor other accessories", "what is your goal for taking this course",
    "are you aware of the affordable connectivity program acp an fcc federal communications commission program",
    "are you currently enrolled in the acp program", "turning the tablet on and off.1",
    "connecting the tablet to wifi.1", "saving files to the tablet and finding them later.1",
    "adjusting the tablets settings like the size of the text or the background picture.1",
    "installing applications apps", "do you have a website for your business", "name of business",
    "your gender identity", "what is your gender", "are you self employed",
    "how would you describe your raceethnicity select all that apply", "do you have a smartphone e g iphone google pixel samsung galaxy",
    "is your internet reliable", "is your internet reliable.1",
    "how would you describe your raceethnicity select all that apply.1", "what is your gender.1",
    "what are your pronouns.1", "how would you describe your raceethnicity.1",
    "were you born in the united states", "have you ever spent time in a jail prison or juvenile detention centre",
    "because of my money situation i feel like i will never have the things i want in life",
    "i am aware of what my resources are to be employed in a good job",
    "i am able to utilize my skills to move toward career goals", "i am just getting by financially",
    "i am concerned that the money i have or will save wont last",
    "even if i am not able to achieve my financial goals right away i will find a way to get there",
    "what other boston neighborhoods are you connected to",
    "what is the zip code where you work", "i hereby give the unqualified right to the tgh program",
    "race ethnicity", "to which gender identity do you most identify", "childs grade",
    "your race ethnicity", "i am aware of what my skills are to be employed in a good job",
    "how do you describe your sexual orientation select all that apply", "zip",
    "do you have school age children in your household",
    "is this your first activity program at the school or site where you are taking tgh",
    "what is the main reason you decided to participate in tgh",
    "have you ever had to change or cancel your internet subscription because it is too expensive",
    "number of working computers at home", "what do you and other household members use the internet for",
    "which devices do you use at home", "how satisfied are you with your internet service at home",
    "how do you or anyone living in your household access the internet in your home",
    "is your internet access through a discounted program eg internet essentials pcs for people",
    "what financial resources do you use", "occupation", "place of work", "are you a veteran",
    "childs grade.1", "how do you describe your sexual orientation", "do you have a smartphone",
    "what best describes the organization you are starting or running",
    "what are the biggest challenges you are currently facing in running your business",
    "if you selected other what type", "if you selected other please explain.2",
    "i am aware of what my skills are to achieve a good employment",
    "how old is your business", "annual revenue of business", "number of employees",
    "do you accept credit debit cards", "do you know of organizations that help small business owners",
    "what does your business use the internet for", "how often are your business social media pages updated"
]

# 🔹 Automatically find all CSV files in the directory
csv_files = [os.path.join(directory_path, f) for f in os.listdir(directory_path) if f.endswith(".csv")]

if not csv_files:
    print("⚠️ No CSV files found in the specified directory.")
    exit()

print(f"✅ Found {len(csv_files)} CSV files to process.")

# Initialize empty merged dataframe with the desired column names
merged_df = pd.DataFrame(columns=desired_columns)

for file in tqdm(csv_files, desc="Merging files"):
    try:
        df = pd.read_csv(file, encoding="utf-8", dtype=str)

        # Select only columns that exist in both the file and the desired list
        available_cols = [col for col in desired_columns if col in df.columns]
        df = df[available_cols]

        # Add missing columns with NaN values
        for col in desired_columns:
            if col not in df.columns:
                df[col] = pd.NA

        # Reorder columns to match desired order
        df = df[desired_columns]

        # Append to final DataFrame
        merged_df = pd.concat([merged_df, df], ignore_index=True)

    except Exception as e:
        print(f"⚠️ Error processing {file}: {e}")

# Step 3: Save the merged file
merged_df.to_csv(output_file, index=False, encoding="utf-8")

print(f"\n✅ Merged data saved: {output_file}")
print("\n🎉 Merging complete! You can now review the final dataset.")


✅ Found 6 CSV files to process.


Merging files: 100%|██████████████████████████████| 6/6 [00:01<00:00,  4.18it/s]



✅ Merged data saved: /Users/sreeharsha/Documents/TGH Data Management Cleaning/Translated ES/merged_ES_data.csv

🎉 Merging complete! You can now review the final dataset.


In [3]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [5]:
import os
import pandas as pd

# Set the path to your files
files_path = '/Users/sreeharsha/Documents/TGH Data Management Cleaning/Translated ES'  # Update this path accordingly

# List all CSV files in the directory
all_files = [os.path.join(files_path, f) for f in os.listdir(files_path) if f.endswith('.csv')]

# Step 1: Print the original column names for each file for review
print("=== Original Column Names in Each File ===")
for file in all_files:
    df = pd.read_csv(file)
    print(f"File: {os.path.basename(file)}")
    print("Columns:", df.columns.tolist())
    print("-" * 40)

# Step 2: Define a function to clean the column names
def clean_column_names(columns):
    """
    Cleans column names by stripping whitespace, converting to lowercase,
    and replacing spaces with underscores.
    """
    cleaned_columns = []
    for col in columns:
        # You can add more cleaning steps as needed
        cleaned = col.strip().lower().replace(" ", "_")
        cleaned_columns.append(cleaned)
    return cleaned_columns

# Optional: Apply cleaning and print the cleaned column names for review
print("=== Cleaned Column Names in Each File ===")
for file in all_files:
    df = pd.read_csv(file)
    cleaned_cols = clean_column_names(df.columns.tolist())
    print(f"File: {os.path.basename(file)}")
    print("Cleaned Columns:", cleaned_cols)
    print("-" * 40)

# Next steps (once you're satisfied with the cleaned names):
# - You can re-read each file, assign the cleaned column names,
# - And then merge the data based on your required columns,
# - Ensuring no duplicate data is introduced.


=== Original Column Names in Each File ===
File: cleaned_translated_ES_spanish.csv
Columns: ['course name', 'name of siteschool', 'course type', 'first name', 'last name', 'what are your pronouns', 'have you received a tgh device chromebook or ipad in the last two years', 'date of birth', 'what is your gender', 'your gender identity', 'how would you describe your raceethnicity', 'how would you describe your raceethnicity select all that apply', 'primary language', 'if you selected other what language', 'home street address including apartment number', 'city', 'state', 'zip code', 'phone number', 'email address', 'number of people in household', 'educational status', 'employment status', 'what is the zip code where you work', 'are you self employed', 'household income level', 'do you receive housing assistance', 'what other boston neighborhoods are you connected to for example do you receive services visit family or friends go to school or places of worship in any of the following neigh

  df = pd.read_csv(file)
  df = pd.read_csv(file)


File: (OLD) TGH Learner Enrollment Form (6.2.2022).csv
Columns: ['course name', 'name of siteschool', 'course type', 'first name', 'last name', 'have you received a tgh device chromebook or ipad in the last two years', 'date of birth', 'to which gender identity do you most identify', 'your gender identity', 'race ethnicity', 'your race ethnicity', 'primary language', 'if you selected other what language', 'home street address including apartment number', 'city', 'state', 'zip code', 'phone number', 'email address', 'number of people in household', 'educational status', 'employment status', 'household income level', 'do you receive housing assistance', 'childs last name', 'childs last name.1', 'childs date of birth', 'childs grade', 'your gender identity.1', 'your race ethnicity.1', 'your race ethnicity.2', 'primary language.1', 'if you selected other what language.1', 'what is your goal for taking this course', 'is this the first activity youve participated in at your program site', 'h

  df = pd.read_csv(file)
  df = pd.read_csv(file)


File: (OLD)+TGH+Learner+Enrollment+Form+2025-03-06-11-24-39+457499.csv.csv
Cleaned Columns: ['submitted_date', 'name_of_siteschool', 'course_type', 'first_name', 'middle_initial', 'last_name', 'do_you_have_school_age_children_in_your_household', 'childs_last_name', 'childs_last_name.1', 'is_this_your_first_activity_program_at_the_school_or_site_where_you_are_taking_tgh', 'employment_status', 'race_ethnicity', 'household_income_level', 'primary_language', 'course_name', 'have_you_received_a_tgh_device_chromebook_or_ipad_in_the_last_two_years', 'date_of_birth', 'to_which_gender_identity_do_you_most_identify', 'if_you_selected_other_please_explain', 'if_you_selected_other_what_type', 'if_you_selected_other_what_language', 'home_street_address_including_apartment_number', 'city', 'state', 'zip', 'phone_number', 'email_address', 'number_of_people_in_household', 'educational_status', 'occupation', 'place_of_work', 'do_you_receive_housing_assistance', 'childs_grade', 'childs_date_of_birth', '

In [11]:
#cleaning the column names

import os
import pandas as pd

# Specify the directory containing your CSV and XLSM files
files_path = '/Users/sreeharsha/Documents/TGH Data Management Cleaning/Merge testing/Enrollment Survey'  # Update this path

# Gather all CSV and XLSM file paths
all_files = []
for file in os.listdir(files_path):
    if file.endswith('.csv') or file.endswith('.xlsm'):
        all_files.append(os.path.join(files_path, file))

# Function to clean and standardize column names:
def clean_column_names(columns):
    """
    Cleans column names by stripping leading/trailing whitespace and converting to lowercase.
    Additional cleaning (like replacing spaces with underscores) can be added as needed.
    """
    return [col.strip().lower() for col in columns]

# Dictionary to store file names and their cleaned columns as sets
file_columns = {}

for file in all_files:
    # Read file based on type
    if file.endswith('.csv'):
        df = pd.read_csv(file)
    else:  # for .xlsm files, read the first sheet by default
        df = pd.read_excel(file)
    
    # Clean the column names
    cleaned_cols = clean_column_names(df.columns.tolist())
    
    # Save the unique cleaned columns as a set
    file_columns[os.path.basename(file)] = set(cleaned_cols)

# Compute the union and intersection of all columns across the files
all_union = set()
all_intersection = None

for file, cols in file_columns.items():
    all_union |= cols  # union of all columns
    if all_intersection is None:
        all_intersection = cols.copy()
    else:
        all_intersection &= cols  # intersection of columns

# Determine counts
matching_count = len(all_intersection)
non_matching_columns = all_union - all_intersection
non_matching_count = len(non_matching_columns)

# Print the results
print("=== Columns per File ===")
for file, cols in file_columns.items():
    print(f"{file}: {sorted(cols)}")
    print("-" * 40)

print("\n=== Summary ===")
print("Matching columns (present in all files):")
print(sorted(all_intersection))
print("Count of matching columns:", matching_count)
print("\nNon-matching columns (not present in all files):")
print(sorted(non_matching_columns))
print("Count of non-matching columns:", non_matching_count)


  df = pd.read_csv(file)
  df = pd.read_csv(file)


=== Columns per File ===
Learner+Enrollment+ENGLISH.csv: ['8. how would you describe your race/ethnicity?', 'adding an attachment to an email i am sending (like documents or pictures)', "adjusting the computer's settings (like the size of the text or the background picture)", "adjusting the tablet's settings (like the size of the text or the background picture)", 'are you a veteran?', 'are you aware of the affordable connectivity program (acp), an fcc (federal communications commission) program to help families and households afford internet service? (more info here: https://www.fcc.gov/acp)', 'are you currently enrolled in the acp program?', 'are you self-employed?', 'because of my money situation, i feel like i will never have thethings i want in life.', "child's date of birth", "child's first name", "child's last name", "child's primary language", 'city', 'completion time', 'connecting the computer to wifi', 'connecting the tablet to wifi', 'course id', 'course name', 'course type',

In [13]:
#standardizing the column names


import os
import pandas as pd
import re
import unicodedata

# Directory containing your CSV and XLSM files
files_path = "/Users/sreeharsha/Documents/TGH Data Management Cleaning/Merge testing/Enrollment Survey"  # Update this path

# Gather all CSV and XLSM file paths
all_files = [os.path.join(files_path, f) for f in os.listdir(files_path) if f.endswith('.csv') or f.endswith('.xlsm')]

def normalize_column_name(col):
    """
    Normalize a column name by stripping whitespace, converting to lowercase,
    normalizing accented characters, removing punctuation, and replacing spaces with underscores.
    """
    # Remove leading/trailing spaces and convert to lowercase
    col = col.strip().lower()
    # Normalize unicode (e.g., remove accents)
    col = unicodedata.normalize('NFKD', col).encode('ASCII', 'ignore').decode('utf-8')
    # Remove punctuation (anything that is not a word character or whitespace)
    col = re.sub(r'[^\w\s]', '', col)
    # Replace one or more whitespace with a single underscore
    col = re.sub(r'\s+', '_', col)
    return col

# Dictionary to store file names and their normalized column sets
file_columns = {}

for file in all_files:
    # Read file depending on its type
    if file.endswith('.csv'):
        df = pd.read_csv(file)
    else:  # for .xlsm files, read the first sheet by default
        df = pd.read_excel(file)
    
    # Normalize the column names
    normalized_cols = [normalize_column_name(col) for col in df.columns.tolist()]
    
    # Save the unique normalized columns as a set
    file_columns[os.path.basename(file)] = set(normalized_cols)

# Compute the union (all unique columns) and intersection (common to every file)
all_union = set()
all_intersection = None

for cols in file_columns.values():
    all_union |= cols  # accumulate union of columns
    if all_intersection is None:
        all_intersection = cols.copy()
    else:
        all_intersection &= cols  # intersect with columns from the next file

# Determine counts
matching_count = len(all_intersection)
non_matching_columns = all_union - all_intersection
non_matching_count = len(non_matching_columns)

# Print the results
print("=== Normalized Columns per File ===")
for file, cols in file_columns.items():
    print(f"{file}: {sorted(cols)}")
    print("-" * 40)

print("\n=== Summary ===")
print("Matching columns (present in all files):")
print(sorted(all_intersection))
print("Count of matching columns:", matching_count)
print("\nNon-matching columns (not present in all files):")
print(sorted(non_matching_columns))
print("Count of non-matching columns:", non_matching_count)


  df = pd.read_csv(file)
  df = pd.read_csv(file)


=== Normalized Columns per File ===
Learner+Enrollment+ENGLISH.csv: ['8_how_would_you_describe_your_raceethnicity', 'adding_an_attachment_to_an_email_i_am_sending_like_documents_or_pictures', 'adjusting_the_computers_settings_like_the_size_of_the_text_or_the_background_picture', 'adjusting_the_tablets_settings_like_the_size_of_the_text_or_the_background_picture', 'are_you_a_veteran', 'are_you_aware_of_the_affordable_connectivity_program_acp_an_fcc_federal_communications_commission_program_to_help_families_and_households_afford_internet_service_more_info_here_httpswwwfccgovacp', 'are_you_currently_enrolled_in_the_acp_program', 'are_you_selfemployed', 'because_of_my_money_situation_i_feel_like_i_will_never_have_thethings_i_want_in_life', 'childs_date_of_birth', 'childs_first_name', 'childs_last_name', 'childs_primary_language', 'city', 'completion_time', 'connecting_the_computer_to_wifi', 'connecting_the_tablet_to_wifi', 'course_id', 'course_name', 'course_type', 'creating_and_sending_em

In [19]:
#advanced column names cleaning

import os
import pandas as pd
import re
import unicodedata

# Directory containing your CSV and XLSM files
files_path = "/Users/sreeharsha/Documents/TGH Data Management Cleaning/Merge testing/Enrollment Survey"  # Update this path accordingly

# Collect all CSV and XLSM files in the directory
all_files = [os.path.join(files_path, f) for f in os.listdir(files_path)
             if f.endswith('.csv') or f.endswith('.xlsm')]

# Optional mapping dictionary to standardize known variants
# For example, if "childs_first_name" should be "child_first_name"
mapping_dict = {
    "childs_first_name": "child_first_name",
    "childs_last_name": "child_last_name",
    "childs_date_of_birth": "child_date_of_birth",
    # add other mappings as needed
}

def advanced_normalize_column_name(col, mapping=None):
    """
    Normalize a column name by:
      - Stripping leading/trailing whitespace and converting to lowercase
      - Normalizing accented characters
      - Removing content within parentheses
      - Removing punctuation
      - Replacing one or more whitespace with an underscore
      - Removing numeric prefixes (e.g., "8_")
      - Applying a manual mapping for known variants if provided
    """
    # Step 1: Basic cleaning
    col = col.strip().lower()
    col = unicodedata.normalize('NFKD', col).encode('ASCII', 'ignore').decode('utf-8')
    # Remove text within parentheses
    col = re.sub(r'\(.*?\)', '', col)
    # Remove punctuation (keep alphanumerics and whitespace)
    col = re.sub(r'[^\w\s]', '', col)
    # Replace any whitespace with an underscore
    col = re.sub(r'\s+', '_', col)
    # Remove numeric prefixes followed by an underscore
    col = re.sub(r'^\d+_', '', col)
    
    # Step 2: Apply manual mapping if provided
    if mapping and col in mapping:
        col = mapping[col]
    
    return col

# Dictionary to store each file's normalized columns as a set
file_columns = {}

for file in all_files:
    # Read file based on its extension
    if file.endswith('.csv'):
        df = pd.read_csv(file)
    else:  # For .xlsm files (reads first sheet by default)
        df = pd.read_excel(file)
    
    # Normalize all column names in the DataFrame
    normalized_cols = [advanced_normalize_column_name(col, mapping=mapping_dict)
                       for col in df.columns.tolist()]
    
    file_columns[os.path.basename(file)] = set(normalized_cols)

# Compute the union and intersection of all normalized columns
all_union = set()
all_intersection = None

for cols in file_columns.values():
    all_union |= cols  # accumulate union
    if all_intersection is None:
        all_intersection = cols.copy()
    else:
        all_intersection &= cols  # intersect columns across files

matching_count = len(all_intersection)
non_matching_columns = all_union - all_intersection
non_matching_count = len(non_matching_columns)

# Print normalized columns per file
print("=== Normalized Columns per File ===")
for file, cols in file_columns.items():
    print(f"{file}:")
    print(sorted(cols))
    print("-" * 40)

# Print summary of matching vs. non-matching columns
print("\n=== Summary ===")
print("Matching columns (present in all files):")
print(sorted(all_intersection))
print("Count of matching columns:", matching_count)
print("\nNon-matching columns (not present in all files):")
print(sorted(non_matching_columns))
print("Count of non-matching columns:", non_matching_count)


  df = pd.read_csv(file)
  df = pd.read_csv(file)


=== Normalized Columns per File ===
Learner+Enrollment+ENGLISH.csv:
['adding_an_attachment_to_an_email_i_am_sending_', 'adjusting_the_computers_settings_', 'adjusting_the_tablets_settings_', 'are_you_a_veteran', 'are_you_aware_of_the_affordable_connectivity_program_an_fcc_program_to_help_families_and_households_afford_internet_service_', 'are_you_currently_enrolled_in_the_acp_program', 'are_you_selfemployed', 'because_of_my_money_situation_i_feel_like_i_will_never_have_thethings_i_want_in_life', 'child_date_of_birth', 'child_first_name', 'child_last_name', 'childs_primary_language', 'city', 'completion_time', 'connecting_the_computer_to_wifi', 'connecting_the_tablet_to_wifi', 'course_id', 'course_name', 'course_type', 'creating_and_sending_emails', 'creation_date', 'date_of_birth', 'do_you_have_a_smartphone_', 'do_you_have_a_website_for_your_business', 'do_you_have_an_existing_business', 'do_you_have_home_internet_access_', 'do_you_receive_housing_assistance', 'downloading_attachments_

In [17]:
import re
import unicodedata

def enhanced_normalize_column_name(col, mapping=None):
    """
    Normalize a column name by:
      - Stripping leading/trailing whitespace and converting to lowercase
      - Normalizing accented characters
      - Removing content within parentheses
      - Removing punctuation
      - Replacing one or more whitespace with an underscore
      - Removing numeric prefixes (e.g., "8_")
      - Removing redundant/trailing underscores
      - Applying a manual mapping for known variants if provided
    """
    # Basic cleaning: strip, lowercase, normalize accents
    col = col.strip().lower()
    col = unicodedata.normalize('NFKD', col).encode('ASCII', 'ignore').decode('utf-8')
    
    # Remove text within parentheses
    col = re.sub(r'\(.*?\)', '', col)
    
    # Remove punctuation (retain alphanumerics and whitespace)
    col = re.sub(r'[^\w\s]', '', col)
    
    # Replace any whitespace with an underscore
    col = re.sub(r'\s+', '_', col)
    
    # Remove numeric prefixes followed by an underscore
    col = re.sub(r'^\d+_', '', col)
    
    # Remove redundant or trailing underscores
    col = re.sub(r'_+', '_', col)  # collapse multiple underscores
    col = col.strip('_')
    
    # Apply manual mapping if provided
    if mapping and col in mapping:
        col = mapping[col]
    
    return col

# Example mapping dictionary for known variants
mapping_dict = {
    "are_you_selfemployed": "are_you_self_employed",
    "do_you_have_a_smartphone_": "do_you_have_a_smartphone",
    # Add more mappings as you identify them...
}

# Example usage with a list of test column names
test_columns = [
    "are_you_self_employed", 
    "are_you_selfemployed", 
    "do_you_have_a_smartphone_", 
    "adding_an_attachment_to_an_email_i_am_sending_",
    "adding_an_attachment_to_an_email_i_am_sending_like_documents_or_pictures"
]

normalized = [enhanced_normalize_column_name(col, mapping=mapping_dict) for col in test_columns]
print(normalized)
# Expected output:
# ['are_you_self_employed', 'are_you_self_employed', 'do_you_have_a_smartphone', 'adding_an_attachment_to_an_email_i_am_sending', 'adding_an_attachment_to_an_email_i_am_sending_like_documents_or_pictures']


['are_you_self_employed', 'are_you_self_employed', 'do_you_have_a_smartphone', 'adding_an_attachment_to_an_email_i_am_sending', 'adding_an_attachment_to_an_email_i_am_sending_like_documents_or_pictures']


In [31]:
import os
import pandas as pd
import ipywidgets as widgets
from IPython.display import display

# -------------------------------
# Step 1: Gather Files and Columns
# -------------------------------

# Set the directory containing your CSV and XLSM files (update the path)
files_path = '/Users/sreeharsha/Documents/TGH Data Management Cleaning/Merge testing/Enrollment Survey'  # <-- UPDATE THIS PATH

# Gather all CSV and XLSM files
all_files = [os.path.join(files_path, f) for f in os.listdir(files_path) 
             if f.endswith('.csv') or f.endswith('.xlsm')]

# Extract all unique column names across files
unique_cols = set()
for file in all_files:
    try:
        if file.endswith('.csv'):
            df = pd.read_csv(file)
        else:
            df = pd.read_excel(file)
        unique_cols.update(df.columns.tolist())
    except Exception as e:
        print(f"Error reading {file}: {e}")

unique_cols = sorted(unique_cols)
print("Total unique columns found:", len(unique_cols))

# -------------------------------
# Step 2: Create Interactive Widgets for Mapping
# -------------------------------

# A simple initial cleaning function to pre-populate the text boxes
def initial_clean(col):
    # Lowercase, strip extra spaces, and replace spaces with underscores
    return col.strip().lower().replace(" ", "_")

# Create a dictionary to hold a Text widget for each unique column name
col_widgets = {}
for col in unique_cols:
    default_val = initial_clean(col)
    # The description shows the original column (truncated if needed)
    widget_desc = (col[:30] + '...') if len(col) > 30 else col
    text_widget = widgets.Text(
        value=default_val,
        description=widget_desc,
        layout=widgets.Layout(width='100%')
    )
    col_widgets[col] = text_widget

# Group all widgets in a VBox and put it inside a scrollable container (useful if there are many columns)
vbox = widgets.VBox(list(col_widgets.values()))
scrollable_vbox = widgets.Box([vbox], layout=widgets.Layout(overflow='scroll', border='1px solid gray', height='500px'))
display(scrollable_vbox)

# -------------------------------
# Step 3: Create a Submit Button to Finalize the Mapping
# -------------------------------

# Create a button and output area to show the final mapping dictionary
submit_button = widgets.Button(description="Submit Mapping", button_style='success')
output_area = widgets.Output()

# Global variable to store final mapping
final_mapping = {}

def on_submit(b):
    global final_mapping
    mapping_dict = {}
    for orig, widget in col_widgets.items():
        # Use the edited value as the standardized column name
        mapping_dict[orig] = widget.value.strip()
    final_mapping = mapping_dict
    with output_area:
        output_area.clear_output()
        print("Final Mapping Dictionary:")
        for k, v in mapping_dict.items():
            print(f"'{k}'  ->  '{v}'")

submit_button.on_click(on_submit)
display(submit_button, output_area)

# -------------------------------
# Step 4 (Optional): Apply the Mapping to Your Files
# -------------------------------
# Once you have finalized the mapping (stored in final_mapping), you can re-read your files,
# update the column names, and optionally save the updated DataFrames.
#
# Example: Renaming columns in each file and saving them to a new folder.

# Uncomment and update the following code if you want to save the files with standardized columns:
#
# output_dir = "path/to/save/standardized_files"  # Update this path
# os.makedirs(output_dir, exist_ok=True)
#
# for file in all_files:
#     try:
#         if file.endswith('.csv'):
#             df = pd.read_csv(file)
#         else:
#             df = pd.read_excel(file)
#         # Rename columns using the mapping dictionary (only update if the original column exists)
#         new_cols = {col: final_mapping.get(col, col) for col in df.columns}
#         df.rename(columns=new_cols, inplace=True)
#         # Save the updated DataFrame (preserving the file type)
#         base_name = os.path.basename(file)
#         out_path = os.path.join(output_dir, base_name)
#         if file.endswith('.csv'):
#             df.to_csv(out_path, index=False)
#         else:
#             df.to_excel(out_path, index=False)
#         print(f"Processed and saved: {out_path}")
#     except Exception as e:
#         print(f"Error processing {file}: {e}")


  df = pd.read_csv(file)
  df = pd.read_csv(file)


Total unique columns found: 271


Box(children=(VBox(children=(Text(value="child's_date_of_birth", description=" Child's Date of Birth", layout=…

Button(button_style='success', description='Submit Mapping', style=ButtonStyle())

Output()

In [33]:
pip install ipywidgets


Collecting fqdn (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.6->ipywidgets)
  Downloading fqdn-1.5.1-py3-none-any.whl.metadata (1.4 kB)
Collecting isoduration (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.6->ipywidgets)
  Downloading isoduration-20.11.0-py3-none-any.whl.metadata (5.7 kB)
Collecting uri-template (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.6->ipywidgets)
  Downloading uri_template-1.3.0-py3-none-any.whl.metadata (8.8 kB)
Collecting webcolors>=1.11 (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.6->ipywidgets)
  Downloading webcolors-24.11.1-py3-none-any.whl.metadata (2.2 kB)
Downloading webcolors-24.11.1-py3-none-any.whl (14 kB)
Downloa

In [41]:
pip install ipywidgets


Note: you may need to restart the kernel to use updated packages.


In [43]:
import ipywidgets as widgets
widgets.IntSlider()


IntSlider(value=0)

In [47]:
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension


zsh:1: command not found: pip
zsh:1: command not found: jupyter


In [49]:
import os
import pandas as pd
import re
import unicodedata

# Define the directory with your CSV and XLSM files
files_path = "/Users/sreeharsha/Documents/TGH Data Management Cleaning/Merge testing/Enrollment Survey"  # <-- Update this path

# Function to apply basic cleaning to column names
def basic_clean(col):
    # Remove leading/trailing whitespace and convert to lowercase
    col = col.strip().lower()
    # Normalize accented characters (e.g., convert "é" to "e")
    col = unicodedata.normalize('NFKD', col).encode('ascii', 'ignore').decode('utf-8')
    # Remove punctuation and symbols (keeping alphanumerics and whitespace)
    col = re.sub(r'[^\w\s]', '', col)
    # Replace one or more whitespace with a single underscore
    col = re.sub(r'\s+', '_', col)
    # Remove any trailing underscores
    col = col.strip('_')
    return col

# Gather all CSV and XLSM file paths
all_files = [os.path.join(files_path, f) for f in os.listdir(files_path) 
             if f.endswith('.csv') or f.endswith('.xlsm')]

# Process each file and display the original and cleaned column names
for file in all_files:
    try:
        # Read the file based on its extension
        if file.endswith('.csv'):
            df = pd.read_csv(file)
        else:
            df = pd.read_excel(file)
        
        # Get original column names and apply basic cleaning
        original_cols = df.columns.tolist()
        cleaned_cols = [basic_clean(col) for col in original_cols]
        
        print(f"File: {os.path.basename(file)}")
        print("Original Columns:")
        print(original_cols)
        print("Cleaned Columns:")
        print(cleaned_cols)
        print("-" * 60)
        
        # Optionally, if you want to update the DataFrame in memory:
        # df.columns = cleaned_cols
        # And then, if you wish, save the cleaned DataFrame to a new file:
        # out_path = os.path.join("path/to/save", os.path.basename(file))
        # if file.endswith('.csv'):
        #     df.to_csv(out_path, index=False)
        # else:
        #     df.to_excel(out_path, index=False)
    except Exception as e:
        print(f"Error processing {file}: {e}")


  df = pd.read_csv(file)


File: Learner+Enrollment+ENGLISH.csv
Original Columns:
['Submitted Date', 'Course Name', 'Name of Site/School', 'Course Type', 'First Name', 'Last Name', 'Do you have home internet access (i.e. wifi)?', 'What are your pronouns?', 'Have you received a TGH device (Chromebook or iPad) in the last two years?', 'Date of Birth', 'What is your gender?', 'Your gender identity:', '8. How would you describe your race/ethnicity?', 'How would you describe your race/ethnicity? (select all that apply)', 'Primary Language', 'other_language', 'Are you a veteran?', 'Home Street Address (including apartment number)', 'City', 'State', 'Zip Code', 'Phone Number', 'Email Address', 'Number of people in Household', 'Educational Status', 'Employment Status', 'Are you self-employed?', 'What is the Zip Code where you work?', 'Household Income Level', 'Do you receive housing assistance?', 'What other Boston neighborhoods are you connected to? For example, do you receive services, visit family or friends, go to s

  df = pd.read_csv(file)


File: (OLD)+TGH+Learner+Enrollment+Form+2025-03-06-11-24-39+457499.csv.csv
Original Columns:
['Submitted Date', 'Name of Site/School', 'Course Type', 'First Name', 'Middle Initial', 'Last Name', 'Do you have school age children in your household?', "Child's First Name", "Child's Last Name", 'Is this your first activity/program at the school or site where you are taking TGH?', 'Employment Status', 'Race/Ethnicity', 'Household Income Level', 'Primary Language', 'Course Name', 'Have you received a TGH device (Chromebook or iPad) in the last two years?', 'Date of Birth', 'To which gender identity do you most identify?', 'If you selected "Other", please specify:', 'other_race/ethnicity', 'other_language', 'Home Street Address (including apartment number)', 'City', 'State', 'Zip', 'Phone Number', 'Email Address', 'Number of Members in Household', 'Educational Status', 'Occupation', 'Place of Work', 'Do you receive housing assistance?', "Child's Grade", "Child's Date of Birth", "Child's Gende

In [51]:
import os
import pandas as pd

# Set the directory containing your CSV and XLSM files
files_path = '/Users/sreeharsha/Documents/TGH Data Management Cleaning/Merge testing/Enrollment Survey'  # <-- Update this path

# Gather all CSV and XLSM file paths
all_files = [os.path.join(files_path, f) for f in os.listdir(files_path)
             if f.endswith('.csv') or f.endswith('.xlsm')]

# Dictionaries to store stats:
# file_columns: maps filename -> list of columns in that file
# column_files: maps column name -> set of filenames where it appears
file_columns = {}
column_files = {}

# Process each file
for file in all_files:
    try:
        if file.endswith('.csv'):
            df = pd.read_csv(file)
        else:
            df = pd.read_excel(file)
        
        # Get list of columns in the current file
        cols = df.columns.tolist()
        filename = os.path.basename(file)
        file_columns[filename] = cols
        
        # Update column_files dictionary
        for col in cols:
            if col not in column_files:
                column_files[col] = set()
            column_files[col].add(filename)
    except Exception as e:
        print(f"Error processing {file}: {e}")

# Calculate frequency of each column (i.e., in how many files each column appears)
column_stats = {col: len(files) for col, files in column_files.items()}

# Sort columns by frequency (highest first)
sorted_columns = sorted(column_stats.items(), key=lambda x: x[1], reverse=True)

# Summary: print out column frequency stats
print("=== Column Frequency Across Files ===")
for col, freq in sorted_columns:
    files_list = ", ".join(sorted(column_files[col]))
    print(f"{col}: present in {freq} file(s) -> {files_list}")

# Summary: show number of fields in each file
print("\n=== Number of Fields per File ===")
for filename, cols in file_columns.items():
    print(f"{filename}: {len(cols)} fields")

# Optionally, create a DataFrame to visualize the stats
stats_df = pd.DataFrame(sorted_columns, columns=['Column', 'FileCount'])
stats_df.head(20)


  df = pd.read_csv(file)
  df = pd.read_csv(file)


=== Column Frequency Across Files ===
Submitted Date: present in 5 file(s) -> (OLD) TGH Learner Enrollment Form (6.2.2022).csv, (OLD)+TGH+Learner+Enrollment+Form+2025-03-06-11-24-39+457499.csv.csv, Learner+Enrollment+ENGLISH.csv, translated_haitian_creole.xlsm, translated_spanish.xlsm
Course Name: present in 5 file(s) -> (OLD) TGH Learner Enrollment Form (6.2.2022).csv, (OLD)+TGH+Learner+Enrollment+Form+2025-03-06-11-24-39+457499.csv.csv, Learner+Enrollment+ENGLISH.csv, translated_haitian_creole.xlsm, translated_spanish.xlsm
Name of Site/School: present in 5 file(s) -> (OLD) TGH Learner Enrollment Form (6.2.2022).csv, (OLD)+TGH+Learner+Enrollment+Form+2025-03-06-11-24-39+457499.csv.csv, Learner+Enrollment+ENGLISH.csv, translated_haitian_creole.xlsm, translated_spanish.xlsm
Course Type: present in 5 file(s) -> (OLD) TGH Learner Enrollment Form (6.2.2022).csv, (OLD)+TGH+Learner+Enrollment+Form+2025-03-06-11-24-39+457499.csv.csv, Learner+Enrollment+ENGLISH.csv, translated_haitian_creole.x

Unnamed: 0,Column,FileCount
0,Submitted Date,5
1,Course Name,5
2,Name of Site/School,5
3,Course Type,5
4,First Name,5
5,Last Name,5
6,Have you received a TGH device (Chromebook or ...,5
7,Date of Birth,5
8,Primary Language,5
9,Home Street Address (including apartment number),5


In [55]:
import pandas as pd
import re
import unicodedata

def standardize_column(col):
    """
    Standardizes a column name by:
      - Removing leading/trailing whitespace
      - Converting to lowercase
      - Normalizing accented characters
      - Removing special characters (punctuation, etc.)
      - Replacing internal spaces with a single underscore
      - Removing any underscores at the beginning or end
    """
    # Trim extra spaces and convert to lowercase
    col = col.strip().lower()
    # Normalize accented characters (e.g., é -> e)
    col = unicodedata.normalize('NFKD', col).encode('ascii', 'ignore').decode('utf-8')
    # Remove punctuation and special characters (keeping alphanumerics and whitespace)
    col = re.sub(r'[^\w\s]', '', col)
    # Replace one or more whitespace with a single underscore
    col = re.sub(r'\s+', '_', col)
    # Ensure no leading or trailing underscores remain
    col = col.strip('_')
    return col

# Update the file path with your file to standardize
file_path = '/Users/sreeharsha/Documents/TGH Data Management Cleaning/Merge testing/Enrollment Survey/Learner+Enrollment+ENGLISH.csv'  # <-- Update this path

# Load the file (assuming CSV; use pd.read_excel for xlsm files)
df = pd.read_csv(file_path)

# Display original columns
print("Original columns:")
print(df.columns.tolist())

# Apply standardization to each column name
new_columns = [standardize_column(col) for col in df.columns]
df.columns = new_columns

# Display standardized columns
print("Standardized columns:")
print(df.columns.tolist())

# Save the updated DataFrame to a new file (update the output path as needed)
output_path = "/Users/sreeharsha/Documents/TGH Data Management Cleaning/Merge testing/Enrollment Survey/ES_English_standardized.csv"  # <-- Update this path
df.to_csv(output_path, index=False)
print("Standardized file saved to:", output_path)


Original columns:
['Submitted Date', 'Course Name', 'Name of Site/School', 'Course Type', 'First Name', 'Last Name', 'Do you have home internet access (i.e. wifi)?', 'What are your pronouns?', 'Have you received a TGH device (Chromebook or iPad) in the last two years?', 'Date of Birth', 'What is your gender?', 'Your gender identity:', '8. How would you describe your race/ethnicity?', 'How would you describe your race/ethnicity? (select all that apply)', 'Primary Language', 'other_language', 'Are you a veteran?', 'Home Street Address (including apartment number)', 'City', 'State', 'Zip Code', 'Phone Number', 'Email Address', 'Number of people in Household', 'Educational Status', 'Employment Status', 'Are you self-employed?', 'What is the Zip Code where you work?', 'Household Income Level', 'Do you receive housing assistance?', 'What other Boston neighborhoods are you connected to? For example, do you receive services, visit family or friends, go to school or places of worship, in any of

  df = pd.read_csv(file_path)
