In [1]:
! pip install chardet 




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import os
import pandas as pd
import re
import chardet
from pathlib import Path

# Define input and output directories
input_folder = Path("linked_data")
output_folder = Path("clean")
# parents=True lets output_folder be pointed at a nested path (e.g. "out/clean");
# exist_ok=True keeps re-running this cell harmless.
output_folder.mkdir(parents=True, exist_ok=True)

# Despite its name, this pattern matches the characters to REMOVE: anything that
# is not an ASCII letter or a whitespace character. Substituting its matches with
# "" therefore leaves only letters and whitespace behind.
allowed_characters = re.compile(r"[^a-zA-Z\s]")

# Run bookkeeping, reset every time this cell is executed
processed = 0  # count of files written to output_folder
skipped = []   # (index, file name, reason) tuples for files that were not written

# Function to clean and capitalize each word
def clean_and_format_name(name):
    """Return ``name`` reduced to ASCII letters, single-spaced and title-cased.

    Steps:
      1. Remove every character matched by the module-level ``allowed_characters``
         pattern (i.e. everything that is not an ASCII letter or whitespace).
      2. Collapse each run of whitespace (spaces, tabs, newlines) into one space
         and drop leading/trailing whitespace.
      3. Capitalize every word with ``str.title()``.

    Parameters
    ----------
    name : str
        Raw name value. Any non-string value (e.g. ``None`` or a float NaN)
        is treated as "no name".

    Returns
    -------
    str
        The cleaned name, or ``""`` when nothing usable remains.
    """
    if not isinstance(name, str):
        return ""

    letters_only = allowed_characters.sub("", name)  # Remove unwanted characters

    # The pattern keeps ALL whitespace, so tabs/newlines/double spaces can survive
    # inside the name. Normalize them to single spaces; split() with no argument
    # also discards leading and trailing whitespace.
    normalized = " ".join(letters_only.split())

    return normalized.title()  # "".title() is "", so empty input stays empty

# Function to detect encoding of file
def detect_encoding(file_path, sample_size=10000):
    """Guess the text encoding of a file from its leading bytes using chardet.

    Parameters
    ----------
    file_path : str or path-like
        File to inspect; it is opened in binary mode.
    sample_size : int, optional
        Number of bytes read from the start of the file and handed to
        ``chardet.detect`` (default 10000, the value previously hard-coded).
        Only this prefix is examined, so bytes further into the file do not
        influence the guess.

    Returns
    -------
    The value stored under the ``'encoding'`` key of chardet's result.
    NOTE(review): chardet is documented to report ``None`` here when it cannot
    make a guess -- callers should be prepared for that.
    """
    with open(file_path, 'rb') as f:
        sample = f.read(sample_size)
    result = chardet.detect(sample)
    return result['encoding']

# Loop through all CSV files in input_folder.
# sorted() fixes the processing order so the 1-based index `i` reported for
# skipped files is the same on every run (glob promises no particular order).
for i, file in enumerate(sorted(input_folder.glob("*.csv")), 1):
    try:
        # Try reading using UTF-8
        try:
            df = pd.read_csv(file)
        except UnicodeDecodeError:
            # Fallback: detect encoding if UTF-8 fails
            encoding = detect_encoding(file)
            try:
                df = pd.read_csv(file, encoding=encoding)
            except Exception as e:
                skipped.append((i, file.name, f"Fallback read failed with encoding '{encoding}': {e}"))
                continue

        # Check necessary columns
        if "First Name" not in df.columns or "Last Name" not in df.columns:
            skipped.append((i, file.name, "Missing 'First Name' or 'Last Name' columns"))
            continue

        # Drop rows with missing names. .copy() yields an independent frame so the
        # column assignments below never write into a view of the original.
        df = df.dropna(subset=["First Name", "Last Name"]).copy()

        # Apply cleaning function to names (clean_and_format_name strips the
        # surrounding whitespace itself, so no separate .str.strip() is needed)
        df["First Name"] = df["First Name"].astype(str).apply(clean_and_format_name)
        df["Last Name"] = df["Last Name"].astype(str).apply(clean_and_format_name)

        # Combine first and last name into full name; the strip removes the
        # joining space when one of the two parts was cleaned down to "".
        df["Full Name"] = (df["First Name"] + " " + df["Last Name"]).str.strip()

        # Keep only non-empty names made purely of letters and spaces.
        # The mask is cast to bool and applied through .loc on purpose: when no
        # rows are left, apply() returns an empty object-dtype Series, which plain
        # df[...] does not treat as a boolean mask (it selects zero columns and the
        # "Full Name" lookup below then fails). With an explicit bool mask an
        # empty input simply produces an empty, header-only output file.
        is_alphabetic = (
            df["Full Name"]
            .apply(lambda x: bool(x) and x.replace(" ", "").isalpha())
            .astype(bool)
        )
        df = df.loc[is_alphabetic]

        # Prepare output DataFrame: the full name plus the company
        # (an empty string when the input has no "Company" column)
        output_df = df[["Full Name"]].copy()
        output_df["Company"] = df["Company"] if "Company" in df.columns else ""

        # Save cleaned file under the same file name in output_folder
        output_df.to_csv(output_folder / file.name, index=False)
        processed += 1

    except Exception as e:
        # Best-effort batch run: record the failure and move on to the next file
        skipped.append((i, file.name, f"Error processing file: {e}"))

# Summary log: totals first, then one numbered line per skipped file
print(f"\n Total files processed: {processed}",'files saved to clean folder')
print(f" Total files skipped: {len(skipped)}")
if len(skipped) > 0:
    print("\nSkipped Files:")
    # One "index. file name - reason" line per entry, emitted in a single print
    print("\n".join(f"{idx}. {name} - {reason}" for idx, name, reason in skipped))



 Total files processed: 126 files saved to clean folder
 Total files skipped: 0
