In [36]:
import pandas as pd

In [37]:
# Location of the filtered wave-7 extract, relative to the notebook's working directory.
wave7_csv = "data/preprocessed/filtered_wave_7.csv"

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
df = pd.read_csv(filepath_or_buffer=wave7_csv, low_memory=False)

In [38]:
# Preview the first five rows to check column names and value coding.
df.head(5)

Unnamed: 0,Country,Year,C Armed forces,C Television,C Police,C Courts,C Government,C Political parties,C Civil services,C Elections,Importance of democracy,Highest educational level,Age,Scale of incomes,Sex,Strong Leader,Expert Non Govt Person,Signing a petition,Joining unofficial strikes
0,20,2018,-4,1,1,1,1,1,1,1,8,3,60,5,2,4,4,2,3
1,20,2018,-4,3,3,3,4,4,3,3,10,7,47,9,1,4,4,1,2
2,20,2018,-4,4,2,2,3,3,3,3,10,7,48,5,1,4,2,1,2
3,20,2018,-4,3,3,3,3,3,3,3,7,2,62,4,2,3,2,2,2
4,20,2018,-4,3,2,2,2,3,3,3,8,2,49,4,1,3,3,1,1


In [39]:
# Discard the armed-forces confidence column when it holds no data at all,
# i.e. when not a single entry is non-null.
# NOTE(review): the preview rows show -4 in this column rather than NaN, so for
# those rows this test is False and nothing is dropped here. The following cell
# drops the column unconditionally — confirm whether this check is still needed.
if not df["C Armed forces"].notna().any():
    df.drop(columns="C Armed forces", inplace=True)


In [40]:
# Clean the loaded survey frame `df` and write it to
# data/preprocessed/cleaned_wave_7.csv.

# Remove irrelevant columns.
# errors="ignore" keeps the cell re-runnable and avoids a KeyError when
# "C Armed forces" has already been removed by the preceding cell.
df = df.drop(columns=["Country", "Year", "C Armed forces"], errors="ignore")

# Handle negative values (-1, -2, -4, -5) as missing.
# NaN is passed explicitly rather than None: per the DataFrame.replace docs,
# pandas releases before 1.4 ignore an explicit value=None and forward-fill the
# matches from the previous row instead (method="pad"), which would silently
# invent answers; newer releases insert Python None and turn the columns into
# object dtype. An explicit NaN behaves the same on every version.
# NOTE(review): code -3 is not in this list — confirm whether it can occur in
# "Age", the only answer column that is not range-checked below.
missing_codes = [-1, -2, -4, -5]
df = df.replace(missing_codes, float("nan"))

# Define ordinal categorical levels
confidence_levels = [1, 2, 3, 4]  # 1: A great deal, 2: Quite a lot, 3: Not very much, 4: None at all
democracy_levels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # Scale for Importance of Democracy
income_levels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # Scale for Scale of incomes
strong_leader_levels = [1, 2, 3, 4]  # 1: Very good, 2: Fairly good, 3: Fairly bad, 4: Very bad
expert_levels = [1, 2, 3, 4]  # Same as Strong Leader
petition_levels = [1, 2, 3]  # 1: Have done, 2: Might do, 3: Would never do
strike_levels = [1, 2, 3]  # Same as Signing a petition

# Convert ordinal variables to categorical types with ordered levels.
# pd.Categorical turns any value that is not one of the listed levels into NaN,
# so this step also acts as a range check on each column.
ordinal_columns = {
    "C Television": confidence_levels,
    "C Police": confidence_levels,
    "C Courts": confidence_levels,
    "C Government": confidence_levels,
    "C Political parties": confidence_levels,
    "C Civil services": confidence_levels,
    "C Elections": confidence_levels,
    "Importance of democracy": democracy_levels,
    "Scale of incomes": income_levels,
    "Strong Leader": strong_leader_levels,
    "Expert Non Govt Person": expert_levels,
}

for col, levels in ordinal_columns.items():
    df[col] = pd.Categorical(df[col], categories=levels, ordered=True)

# Convert categorical variables (stored without an ordering).
categorical_columns = {
    "Highest educational level": [0, 1, 2, 3, 4, 5, 6, 7, 8],  # ISCED levels 0-8
    "Sex": [1, 2],  # 1: Male, 2: Female
    "Signing a petition": petition_levels,
    "Joining unofficial strikes": strike_levels,
}

for col, levels in categorical_columns.items():
    df[col] = pd.Categorical(df[col], categories=levels, ordered=False)

# Convert Age to numeric; entries that cannot be parsed become NaN.
df["Age"] = pd.to_numeric(df["Age"], errors="coerce")

# Reset index after cleaning (no rows are removed above, so this only
# guarantees a clean 0..n-1 index).
df = df.reset_index(drop=True)

# Save the cleaned dataset. index=False keeps the DataFrame index out of the
# file; the categorical dtypes are not preserved by CSV, only the cleaned values.
cleaned_csv = "data/preprocessed/cleaned_wave_7.csv"
df.to_csv(cleaned_csv, index=False)

print("Data cleaning and preprocessing completed. Cleaned data saved to:", cleaned_csv)


Data cleaning and preprocessing completed. Cleaned data saved to: data/preprocessed/cleaned_wave_7.csv
