In [158]:
import pandas as pd

In [159]:
# Load the preprocessed survey time-series extract into a DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
time_series_csv = "data/preprocessed/filtered_time_series_1981_2022.csv"
df_time_series = pd.read_csv(filepath_or_buffer=time_series_csv, low_memory=False)

In [160]:
# Preview the first five rows to sanity-check the loaded data.
df_time_series.head()

Unnamed: 0,Study,WVS wave,Country,Country name,Survey year,C Armed Forces,C Police,C Parliament,C Civil Services,C Television,...,C Political Parties,C Courts,C NGOs,C TV News,C Elections,C International Criminal Court,Government order vs. freedom,Highest educational level,Age,Scale of incomes
0,2,3,8,ALB,1998,3,3,3,2,3,...,3,3,-4,-4,-4,-4,1,3,18,5
1,2,3,8,ALB,1998,3,3,3,3,3,...,3,3,-4,-4,-4,-4,1,4,18,4
2,2,3,8,ALB,1998,3,3,3,3,3,...,3,3,-4,-4,-4,-4,1,2,24,3
3,2,3,8,ALB,1998,2,2,2,2,3,...,3,3,-4,-4,-4,-4,1,3,26,6
4,2,3,8,ALB,1998,3,3,3,3,3,...,3,3,-4,-4,-4,-4,1,2,30,3


In [161]:
# List every column label on its own line to inspect the loaded schema.
for column_label in df_time_series.columns.tolist():
    print(column_label)

Study
WVS wave
Country
Country name
Survey year
C Armed Forces 
C Police
C Parliament
C Civil Services
C Television
C Government
C Political Parties
C Courts
C NGOs
C TV News
C Elections
C International Criminal Court
Government order vs. freedom
Highest educational level
Age
Scale of incomes


In [162]:
# The line-by-line listing hides whitespace. Printing the labels as a Python
# list shows each one quoted, which exposes the trailing space in
# 'C Armed Forces '.
print(list(df_time_series.columns))

['Study', 'WVS wave', 'Country', 'Country name', 'Survey year', 'C Armed Forces ', 'C Police', 'C Parliament', 'C Civil Services', 'C Television', 'C Government', 'C Political Parties', 'C Courts', 'C NGOs', 'C TV News', 'C Elections', 'C International Criminal Court', 'Government order vs. freedom', 'Highest educational level', 'Age', 'Scale of incomes']


In [163]:
# Trim leading/trailing whitespace from every column label so columns can be
# addressed by their clean names (e.g. "C Armed Forces").
df_time_series.columns = df_time_series.columns.map(str.strip)

In [164]:
# Columns removed from the analysis. The original note gives ">50% missing
# values" as the reason; the missingness check itself is not part of this
# section.
# NOTE(review): confirm that reason also applies to "Study" and "WVS wave".
# drop() raises KeyError for unknown labels, so the stripped name
# "C Armed Forces" only matches once the column labels have been
# whitespace-trimmed.
columns_to_drop = ["Study", "WVS wave", "C Armed Forces"]
df_time_series.drop(columns=columns_to_drop, inplace=True)

In [165]:
# List the remaining column labels, one per line, to confirm the drop.
for column_label in df_time_series.columns.tolist():
    print(column_label)

Country
Country name
Survey year
C Police
C Parliament
C Civil Services
C Television
C Government
C Political Parties
C Courts
C NGOs
C TV News
C Elections
C International Criminal Court
Government order vs. freedom
Highest educational level
Age
Scale of incomes


In [166]:
# Preview the first five rows of the frame with the reduced column set.
df_time_series.head()

Unnamed: 0,Country,Country name,Survey year,C Police,C Parliament,C Civil Services,C Television,C Government,C Political Parties,C Courts,C NGOs,C TV News,C Elections,C International Criminal Court,Government order vs. freedom,Highest educational level,Age,Scale of incomes
0,8,ALB,1998,3,3,2,3,2,3,3,-4,-4,-4,-4,1,3,18,5
1,8,ALB,1998,3,3,3,3,3,3,3,-4,-4,-4,-4,1,4,18,4
2,8,ALB,1998,3,3,3,3,3,3,3,-4,-4,-4,-4,1,2,24,3
3,8,ALB,1998,2,2,2,3,2,3,3,-4,-4,-4,-4,1,3,26,6
4,8,ALB,1998,3,3,3,3,3,3,3,-4,-4,-4,-4,1,2,30,3


In [167]:
# Step 1: Treat the negative survey codes (-1, -2, -4, -5) as missing values.
# mask() writes NaN wherever the condition holds and behaves the same on every
# pandas version. The previous `replace([...], None)` call did not: older
# pandas releases interpret an explicit `value=None` with a list `to_replace`
# as "pad-fill from the previous row", while newer ones insert Python None and
# leave the affected columns with object dtype.
# NOTE(review): other negative codes (e.g. -3) are not covered here. In the
# categorical columns they still become NaN in Steps 2-3 (values outside the
# declared categories are coerced to NaN), but in "Age" they would be kept as
# real values. Confirm against the survey codebook.
MISSING_CODES = [-1, -2, -4, -5]
df_time_series = df_time_series.mask(df_time_series.isin(MISSING_CODES))

# Step 2: Confidence-in-institution items share one ordered 4-point scale.
confidence_levels = [1, 2, 3, 4]  # Confidence in institutions (1: High, 4: Low)
ordinal_columns = [
    "C Police", "C Parliament", "C Civil Services", "C Television",
    "C Government", "C Political Parties", "C Courts", "C NGOs",
    "C TV News", "C Elections", "C International Criminal Court"
]

# Convert each confidence item to an ordered categorical. Any value that is
# not one of `confidence_levels` becomes NaN.
for col in ordinal_columns:
    df_time_series[col] = pd.Categorical(
        df_time_series[col], categories=confidence_levels, ordered=True
    )

# Step 3: Remaining ordinal variables, each with its own set of valid levels.
other_ordinal_levels = {
    "Government order vs. freedom": [1, 2],
    "Highest educational level": list(range(1, 9)),
    "Scale of incomes": list(range(1, 11)),
}
for col, levels in other_ordinal_levels.items():
    df_time_series[col] = pd.Categorical(
        df_time_series[col], categories=levels, ordered=True
    )

# Step 4: Fix the dtypes of the non-categorical columns.
df_time_series["Survey year"] = df_time_series["Survey year"].astype(int)
# "Country" holds a numeric code; it is stored as text.
df_time_series["Country"] = df_time_series["Country"].astype(str)
# errors="coerce" turns anything non-numeric in "Age" into NaN.
df_time_series["Age"] = pd.to_numeric(df_time_series["Age"], errors="coerce")


# Step 5: Impute missing values.
def _fill_with_column_mode(frame, columns):
    """Fill NaNs in `columns` of `frame` with each column's most frequent value.

    The frame is modified through column assignment. Returns a Series
    (indexed by column name) holding the fill value used for each column;
    when several values tie, the first row of DataFrame.mode() is used.
    """
    modes = frame[columns].mode()
    if modes.empty:
        # No observed values at all (e.g. zero rows): nothing to impute with.
        return pd.Series(index=columns, dtype="float64")
    fill_values = modes.iloc[0]
    frame[columns] = frame[columns].fillna(fill_values)
    return fill_values


# NOTE(review): mode imputation also fills items that were coded missing for
# an entire survey (the preview shows e.g. "C NGOs" = -4 on every displayed
# 1998 row), so those imputed values do not reflect any respondent's answer.
# Consider leaving such cells as NaN.
mode_impute = _fill_with_column_mode(df_time_series, ordinal_columns)

ordinal_vars = ["Scale of incomes", "Government order vs. freedom", "Highest educational level"]
mode_impute_ordinal = _fill_with_column_mode(df_time_series, ordinal_vars)

# Age is continuous, so use the median rather than the mode.
df_time_series["Age"] = df_time_series["Age"].fillna(df_time_series["Age"].median())

# Step 6: Verify final data types.
print(df_time_series.dtypes)

# Step 7: Confirm missing values are handled (prints only columns that still
# contain NaN; an empty Series means none remain).
missing_values = df_time_series.isnull().sum()
print("✅ Missing Values After Imputation:\n", missing_values[missing_values > 0])


Country                             object
Country name                        object
Survey year                          int64
C Police                          category
C Parliament                      category
C Civil Services                  category
C Television                      category
C Government                      category
C Political Parties               category
C Courts                          category
C NGOs                            category
C TV News                         category
C Elections                       category
C International Criminal Court    category
Government order vs. freedom      category
Highest educational level         category
Age                                float64
Scale of incomes                  category
dtype: object
✅ Missing Values After Imputation:
 Series([], dtype: int64)


In [168]:
# Preview the first five rows after cleaning and imputation.
df_time_series.head()

Unnamed: 0,Country,Country name,Survey year,C Police,C Parliament,C Civil Services,C Television,C Government,C Political Parties,C Courts,C NGOs,C TV News,C Elections,C International Criminal Court,Government order vs. freedom,Highest educational level,Age,Scale of incomes
0,8,ALB,1998,3,3,2,3,2,3,3,2,3,2,2,1,3,18.0,5
1,8,ALB,1998,3,3,3,3,3,3,3,2,3,2,2,1,4,18.0,4
2,8,ALB,1998,3,3,3,3,3,3,3,2,3,2,2,1,2,24.0,3
3,8,ALB,1998,2,2,2,3,2,3,3,2,3,2,2,1,3,26.0,6
4,8,ALB,1998,3,3,3,3,3,3,3,2,3,2,2,1,2,30.0,3


In [169]:
# Persist the cleaned frame as CSV, leaving out the row index.
# NOTE(review): the file name says "wave_7" although the input is the
# 1981-2022 time series. Confirm the name is intended.
# CSV stores plain text, so the ordered-categorical dtypes set during cleaning
# have to be re-applied by whoever reads this file back.
cleaned_csv = "data/preprocessed/cleaned_wave_7_rq3.csv"
df_time_series.to_csv(path_or_buf=cleaned_csv, index=False)

print("Data cleaning and preprocessing completed. Cleaned data saved to:", cleaned_csv)


Data cleaning and preprocessing completed. Cleaned data saved to: data/preprocessed/cleaned_wave_7_rq3.csv
