## Import Libraries

In [37]:
import pandas as pd
import numpy as np

## Load Data

In [38]:
# Read the CSV file
df = pd.read_csv('candidates_svars_long_refined.csv')

print(f"Original dataset shape: {df.shape}")
print(f"\nColumn names:")
print(df.columns.tolist())
print(f"\nFirst few rows:")
display(df.head())

Original dataset shape: (488, 7)

Column names:
['candidate_id', 'name', 'party', 'municipality', 'question_number', 'question_text', 'answer_text']

First few rows:


Unnamed: 0,candidate_id,name,party,municipality,question_number,question_text,answer_text
0,5721,Michael Vindfeldt,A,Frederiksberg Kommune,6,,Se Michaels svar1/19| √∏konomi| Frederiksberg K...
1,5721,Michael Vindfeldt,A,Frederiksberg Kommune,2,,UenigMichaelssvarEnig
2,5721,Michael Vindfeldt,A,Frederiksberg Kommune,8,,UenigMichaelssvarEnig
3,5721,Michael Vindfeldt,A,Frederiksberg Kommune,1,,
4,5725,Sine Heltberg,A,Frederiksberg Kommune,6,,Se Sines svar1/19| √∏konomi| Frederiksberg Komm...


## Inspect Data Quality

In [39]:
# Check for missing values in answer_text
print("Data Quality Overview:")
print("="*60)
print(f"Total rows: {len(df)}")
print(f"\nBlank answer_text (NaN): {df['answer_text'].isna().sum()}")
print(f"Empty string answer_text: {(df['answer_text'] == '').sum()}")


# Check for rows starting with "Uenig"
starts_with_uenig = df['answer_text'].fillna('').str.match(r'^Uenig', na=False)
print(f"Starts with 'Uenig': {starts_with_uenig.sum()}")


# Check for rows containing "UenigMichaelssvarEnig"
contains_uenig = df['answer_text'].fillna('').str.contains('UenigMichaelssvarEnig', case=False, na=False)
print(f"Contains 'UenigMichaelssvarEnig': {contains_uenig.sum()}")

# Sample of problematic rows
print(f"\nSample of rows containing 'UenigMichaelssvarEnig':")
if contains_uenig.sum() > 0:
    display(df[contains_uenig][['name', 'party', 'question_number', 'answer_text']].head())

print(f"\nSample of rows starting with 'Uenig':")
if starts_with_uenig.sum() > 0:
    display(df[starts_with_uenig][['name', 'party', 'question_number', 'answer_text']].head())

Data Quality Overview:
Total rows: 488

Blank answer_text (NaN): 122
Empty string answer_text: 0
Starts with 'Uenig': 244
Contains 'UenigMichaelssvarEnig': 9

Sample of rows containing 'UenigMichaelssvarEnig':


Unnamed: 0,name,party,question_number,answer_text
0,Michael Vindfeldt,A,6,Se Michaels svar1/19| √∏konomi| Frederiksberg K...
1,Michael Vindfeldt,A,2,UenigMichaelssvarEnig
2,Michael Vindfeldt,A,8,UenigMichaelssvarEnig
120,Michael Brautsch,C,6,Se Michaels svar1/19| √∏konomi| Frederiksberg K...
121,Michael Brautsch,C,2,UenigMichaelssvarEnig



Sample of rows starting with 'Uenig':


Unnamed: 0,name,party,question_number,answer_text
1,Michael Vindfeldt,A,2,UenigMichaelssvarEnig
2,Michael Vindfeldt,A,8,UenigMichaelssvarEnig
5,Sine Heltberg,A,2,UenigSinessvarEnig
6,Sine Heltberg,A,8,UenigSinessvarEnig
9,Malte Mathies L√∏cke,A,2,UenigMalte Mathies'svarEnig


## Clean Data

In [40]:
# Create a copy for cleaning
df_cleaned = df.copy()

# Filter out rows where answer_text is blank (NaN or empty string)
df_cleaned = df_cleaned[df_cleaned['answer_text'].notna()]
df_cleaned = df_cleaned[df_cleaned['answer_text'].str.strip() != '']

# Filter out rows containing "UenigMichaelssvarEnig"
df_cleaned = df_cleaned[~df_cleaned['answer_text'].str.contains('UenigMichaelssvarEnig', case=False, na=False)]

# Filter out rows starting with "Uenig"
df_cleaned = df_cleaned[~df_cleaned['answer_text'].str.match(r'^Uenig', na=False)]

print(f"Cleaning Results:")
print("="*60)
print(f"Original rows: {len(df)}")
print(f"Cleaned rows: {len(df_cleaned)}")
print(f"Rows removed: {len(df) - len(df_cleaned)}")
print(f"Percentage retained: {(len(df_cleaned)/len(df)*100):.1f}%")

Cleaning Results:
Original rows: 488
Cleaned rows: 119
Rows removed: 369
Percentage retained: 24.4%


## Verify Cleaned Data

In [41]:
# Verify no blank or problematic values remain
print("Verification:")
print("="*60)
print(f"Blank answer_text (NaN): {df_cleaned['answer_text'].isna().sum()}")
print(f"Empty string answer_text: {(df_cleaned['answer_text'].str.strip() == '').sum()}")
print(f"Contains 'UenigMichaelssvarEnig': {df_cleaned['answer_text'].str.contains('UenigMichaelssvarEnig', case=False, na=False).sum()}")
print(f"Starts with 'Uenig': {df_cleaned['answer_text'].str.match(r'^Uenig', na=False).sum()}")

print(f"\nSample of cleaned data:")
display(df_cleaned.head(10))

print(f"\nData types:")
print(df_cleaned.dtypes)

Verification:
Blank answer_text (NaN): 0
Empty string answer_text: 0
Contains 'UenigMichaelssvarEnig': 0
Starts with 'Uenig': 0

Sample of cleaned data:


Unnamed: 0,candidate_id,name,party,municipality,question_number,question_text,answer_text
4,5725,Sine Heltberg,A,Frederiksberg Kommune,6,,Se Sines svar1/19| √∏konomi| Frederiksberg Komm...
8,5719,Malte Mathies L√∏cke,A,Frederiksberg Kommune,6,,Se Malte Mathies' svar1/19| √∏konomi| Frederiks...
12,5710,Christina Sylvest-Noer,A,Frederiksberg Kommune,6,,Se Christinas svar1/19| √∏konomi| Frederiksberg...
16,5729,Thomas Frank,A,Frederiksberg Kommune,6,,Se Thomas' svar1/19| √∏konomi| Frederiksberg Ko...
20,5726,Sofie K√ºmpel,A,Frederiksberg Kommune,6,,Se Sofies svar1/19| √∏konomi| Frederiksberg Kom...
24,5716,Kristoffer Appel,A,Frederiksberg Kommune,6,,Se Kristoffers svar1/19| √∏konomi| Frederiksber...
28,5714,Kim Christiansen,A,Frederiksberg Kommune,6,,Se Kims svar1/19| √∏konomi| Frederiksberg Kommu...
32,5723,Rasmus Edelberg,A,Frederiksberg Kommune,6,,Se Rasmus' svar1/19| √∏konomi| Frederiksberg Ko...
36,5709,Britt Dam,A,Frederiksberg Kommune,6,,Se Britts svar1/19| √∏konomi| Frederiksberg Kom...
40,5708,Bjarne Henriksen,A,Frederiksberg Kommune,6,,Se Bjarnes svar1/19| √∏konomi| Frederiksberg Ko...



Data types:
candidate_id         int64
name                object
party               object
municipality        object
question_number      int64
question_text      float64
answer_text         object
dtype: object


## Summary Statistics

In [42]:
# Summary by candidate
print("Cleaned Data Summary:")
print("="*60)
print(f"\nUnique candidates: {df_cleaned['name'].nunique()}")
print(f"Unique parties: {df_cleaned['party'].nunique()}")
print(f"Unique municipalities: {df_cleaned['municipality'].nunique()}")
print(f"Unique questions: {df_cleaned['question_number'].nunique()}")

print(f"\nAnswers per candidate:")
answers_per_candidate = df_cleaned.groupby('name').size()
print(f"  Mean: {answers_per_candidate.mean():.1f}")
print(f"  Median: {answers_per_candidate.median():.1f}")
print(f"  Min: {answers_per_candidate.min()}")
print(f"  Max: {answers_per_candidate.max()}")

print(f"\nTop 5 parties by number of answers:")
print(df_cleaned['party'].value_counts().head())

Cleaned Data Summary:

Unique candidates: 119
Unique parties: 14
Unique municipalities: 1
Unique questions: 1

Answers per candidate:
  Mean: 1.0
  Median: 1.0
  Min: 1
  Max: 1

Top 5 parties by number of answers:
party
C    28
A    20
√ò    17
F    12
V    11
Name: count, dtype: int64


## Save Cleaned Data

In [43]:
# Save to CSV
output_file = 'cleaned_01.csv'
df_cleaned.to_csv(output_file, index=False, encoding='utf-8')

print(f"‚úÖ Cleaned data saved to: {output_file}")
print(f"   Rows: {len(df_cleaned)}")
print(f"   Columns: {len(df_cleaned.columns)}")
print(f"   File size: {pd.read_csv(output_file).memory_usage(deep=True).sum() / 1024:.1f} KB")

‚úÖ Cleaned data saved to: cleaned_01.csv
   Rows: 119
   Columns: 7
   File size: 1289.6 KB
