In [12]:
def clean_survey_data(file_path):
    """Clean the raw survey CSV and save an encoded copy next to it.

    Steps, in order:
      1. Load the CSV and drop the first data row (the question text).
      2. Replace missing values with the string "Unknown".
      3. Encode Q1 (ward) one-hot, Q2 (age) ordinal, Q3 (gender) one-hot,
         Q4 (marital status) binary, Q5 (ethnicity) multi-hot and
         Q6 (religion) one-hot.
      4. Write the result as CSV without the index.

    Args:
        file_path: Path (string) of the raw survey CSV. It must contain the
            columns Q1 through Q6.

    Returns:
        None. Progress is reported with ``print``. If the file is missing or
        cannot be read, a message is printed and nothing is written.

    Raises:
        KeyError: If one of the expected Q1..Q6 columns is absent.
    """
    # Function-scope import so this block does not depend on the file's
    # import section.
    import os

    # Load dataset. Only the read is guarded so that failures in the later
    # encoding steps are not misreported as load errors.
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return
    except Exception as e:
        # Deliberate best-effort: report any other read/parse failure.
        print(f"Error loading data: {e}")
        return
    print("Data loaded successfully.")

    # Remove the header row that contains question text
    df = df.iloc[1:]
    print("Header row removed.")

    # Handling missing values (non-inplace: df is a slice at this point)
    df = df.fillna("Unknown")
    print("Missing values handled.")

    # Q1: Ward (One-Hot Encoding)
    ward_columns = ["Ward_1", "Ward_3", "Ward_5", "Ward_6", "Ward_7", "Ward_8", "Ward_9", "Ward_11", "Ward_12", "Ward_14", "Ward_15"]
    # Capture the columns to keep *without* Q1; otherwise the reindex below
    # would re-create Q1 as an all-zero column after get_dummies removed it.
    kept_columns = [name for name in df.columns if name != "Q1"]
    df = pd.get_dummies(df, columns=["Q1"], prefix="Ward", dtype=int)
    # Fixed ward schema: listed wards absent from the data become all-zero
    # columns; Ward_* columns not in the list (e.g. Ward_Unknown) are dropped.
    df = df.reindex(columns=kept_columns + ward_columns, fill_value=0)
    print("Ward encoding complete.")

    # Q2: Age (Ordinal Encoding); unmapped answers (incl. "Unknown") become 0
    age_mapping = {"18-24": 1, "25-29": 2, "30-39": 3, "40-49": 4, "50-64": 5, "65 or older": 6}
    df["Q2"] = df["Q2"].map(age_mapping).fillna(0)
    print("Age encoding complete.")

    # Q3: Gender ID (One-Hot Encoding)
    # dtype=int keeps the 0/1 representation used by the other encodings
    # (newer pandas would otherwise emit True/False).
    df = pd.get_dummies(df, columns=["Q3"], prefix="Gender", dtype=int)
    print("Gender encoding complete.")

    # Q4: Marital Status (Binary Encoding); any other answer becomes 0
    df["Q4"] = df["Q4"].map({"Married": 1, "Not married": 0}).fillna(0)
    print("Marital Status encoding complete.")

    # Q5: Ethnicity (Multi-Hot Encoding)
    ethnicity_columns = ["White", "Black or African American", "Hispanic or Latino/a", "Asian", "American Indian or Alaska Native", "Native Hawaiian or Pacific Islander", "Other"]
    answers = df["Q5"].astype(str)
    for col in ethnicity_columns:
        # Literal substring match: a respondent may have selected several.
        df[col] = answers.str.contains(col, regex=False).astype(int)
    df = df.drop(columns=["Q5"])
    print("Ethnicity encoding complete.")

    # Q6: Religion (One-Hot Encoding)
    df = pd.get_dummies(df, columns=["Q6"], prefix="Religion", dtype=int)
    print("Religion encoding complete.")

    # Save cleaned data
    output_path = file_path.replace("survey_data.csv", "cleaned_survey_data.csv")
    if output_path == file_path:
        # The input is not named "...survey_data.csv"; writing to the same
        # path would overwrite the raw file, so prefix the file name instead.
        directory, name = os.path.split(file_path)
        output_path = os.path.join(directory, f"cleaned_{name}")
    df.to_csv(output_path, index=False)
    print(f"Data cleaning complete. Saved as '{output_path}'")

# Example usage: run the cleaning step on the raw survey export
# (hard-coded local path).
clean_survey_data(
    file_path=r"C:\Users\headl\Documents\EVC\RI-Voting-Models\raw_data\survey_data.csv"
)

Data loaded successfully.
Header row removed.
Missing values handled.
Ward encoding complete.
Age encoding complete.
Gender encoding complete.
Marital Status encoding complete.
Ethnicity encoding complete.
Religion encoding complete.
Data cleaning complete. Saved as 'C:\Users\headl\Documents\EVC\RI-Voting-Models\raw_data\cleaned_survey_data.csv'
