In [11]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from statsmodels.miscmodels.ordinal_model import OrderedModel
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [12]:
# Location of the preprocessed wave-7 extract (relative to the working directory).
wave7_csv = "data/preprocessed/filtered_wave_7.csv"

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
df = pd.read_csv(filepath_or_buffer=wave7_csv, low_memory=False)

In [13]:
# Reuse `df`, which the previous cell already loaded from `wave7_csv`;
# reading the same CSV a second time here only repeated the file I/O.

# G8 members as numeric country codes.
# NOTE(review): these look like ISO 3166-1 numeric codes (USA, Canada, France,
# Germany, Italy, Japan, Russia, UK in list order) — confirm against the
# survey codebook.
g8_codes = [840, 124, 250, 276, 380, 392, 643, 826]

# Keep only respondents from G8 countries. The .copy() gives df_g8 its own
# data so later modifications never write through to (or warn about) `df`.
df_g8 = df[df["Country"].isin(g8_codes)].copy()

print("Shape of G8 data:", df_g8.shape)
df_g8.head()


Shape of G8 data: (13914, 14)


Unnamed: 0,Country,Year,C Armed forces,C Television,C Police,C Courts,C Government,C Political parties,C Civil services,C Elections,Importance of democracy,Highest educational level,Age,Scale of incomes
10072,124,2020,2,3,2,3,2,3,2,2,10,7,76,9
10073,124,2020,1,2,2,2,1,3,2,1,10,6,69,3
10074,124,2020,3,3,3,3,3,3,3,3,5,3,35,3
10075,124,2020,3,3,2,3,3,3,3,2,8,7,45,7
10076,124,2020,3,3,2,4,4,4,3,3,10,6,32,5


In [14]:
# Narrow the G8 frame (built in an earlier cell) to the rows whose Country
# code is 840, the code this notebook uses for the USA. The copy keeps
# df_usa independent of df_g8.
df_usa = df_g8.loc[df_g8["Country"].eq(840)].copy()
print("Shape of data for USA:", df_usa.shape)
df_usa.head()


Shape of data for USA: (2596, 14)


Unnamed: 0,Country,Year,C Armed forces,C Television,C Police,C Courts,C Government,C Political parties,C Civil services,C Elections,Importance of democracy,Highest educational level,Age,Scale of incomes
89769,840,2017,3,2,4,4,4,4,4,4,2,4,43,3
89770,840,2017,2,3,2,2,3,4,4,4,5,2,35,5
89771,840,2017,2,3,2,2,2,3,3,3,1,3,48,1
89772,840,2017,2,3,2,2,4,4,3,4,10,3,49,1
89773,840,2017,2,3,2,2,4,4,4,3,5,3,20,5


In [15]:
# Accepted response codes per question type; anything else is treated as
# missing / invalid and the whole row is dropped.
valid_conf = [1, 2, 3, 4]            # confidence items
valid_demo = list(range(1, 11))      # Importance of democracy: 1..10
valid_educ = list(range(0, 9))       # education: 0..8, if there are 9 levels
valid_income = list(range(1, 11))    # Scale of incomes: 1..10

# Which set of accepted codes applies to which column.
allowed_codes = {
    "C Government": valid_conf,
    "C Elections": valid_conf,
    "C Political parties": valid_conf,
    "C Police": valid_conf,
    "C Courts": valid_conf,
    "C Armed forces": valid_conf,
    "C Civil services": valid_conf,
    "Importance of democracy": valid_demo,
    "Highest educational level": valid_educ,
    "Scale of incomes": valid_income,
}

# A row survives only if Age is positive AND every listed column holds one of
# its accepted codes.
row_is_valid = df_usa["Age"] > 0
for column, accepted in allowed_codes.items():
    row_is_valid = row_is_valid & df_usa[column].isin(accepted)

df_clean = df_usa[row_is_valid].copy()

print("Shape after filtering invalid codes for USA:", df_clean.shape)
df_clean.head()


Shape after filtering invalid codes for USA: (2428, 14)


Unnamed: 0,Country,Year,C Armed forces,C Television,C Police,C Courts,C Government,C Political parties,C Civil services,C Elections,Importance of democracy,Highest educational level,Age,Scale of incomes
89769,840,2017,3,2,4,4,4,4,4,4,2,4,43,3
89770,840,2017,2,3,2,2,3,4,4,4,5,2,35,5
89771,840,2017,2,3,2,2,2,3,3,3,1,3,48,1
89772,840,2017,2,3,2,2,4,4,3,4,10,3,49,1
89773,840,2017,2,3,2,2,4,4,4,3,5,3,20,5


In [16]:
# Short snake_case aliases for the survey columns that feed the model.
# Columns not listed here (e.g. "C Television") keep their original names.
column_aliases = {
    "C Armed forces": "armed_conf",
    "C Police": "police_conf",
    "C Courts": "courts_conf",
    "C Government": "gov_conf",
    "C Political parties": "party_conf",
    "C Civil services": "civil_conf",
    "C Elections": "elect_conf",
    "Importance of democracy": "demo_import",
    "Highest educational level": "educ_level",
    "Age": "age",
    "Scale of incomes": "income_scale",
}
df_clean = df_clean.rename(columns=column_aliases)

df_clean.head()


Unnamed: 0,Country,Year,armed_conf,C Television,police_conf,courts_conf,gov_conf,party_conf,civil_conf,elect_conf,demo_import,educ_level,age,income_scale
89769,840,2017,3,2,4,4,4,4,4,4,2,4,43,3
89770,840,2017,2,3,2,2,3,4,4,4,5,2,35,5
89771,840,2017,2,3,2,2,2,3,3,3,1,3,48,1
89772,840,2017,2,3,2,2,4,4,3,4,10,3,49,1
89773,840,2017,2,3,2,2,4,4,4,3,5,3,20,5


In [17]:
# Dependent variable is gov_conf (coded 1..4); it is extracted below, AFTER
# the NaN drop, so that y and X are always taken from the same set of rows.

# List of predictors (feel free to remove or add as needed)
predictors = [
    "elect_conf", "party_conf", "police_conf", "courts_conf",
    "armed_conf", "civil_conf", "demo_import", "educ_level",
    "income_scale", "age"
]

# Drop any row with a NaN in the outcome or in one of the predictors.
df_model = df_clean.dropna(subset=["gov_conf"] + predictors)

# Outcome vector and design matrix, both drawn from the cleaned frame.
y = df_model["gov_conf"]
X = df_model[predictors]

print("Shape of df_model after dropping NaNs:", df_model.shape)
X.head()


Shape of df_model after dropping NaNs: (2428, 14)


Unnamed: 0,elect_conf,party_conf,police_conf,courts_conf,armed_conf,civil_conf,demo_import,educ_level,income_scale,age
89769,4,4,4,4,3,4,2,4,3,43
89770,4,4,2,2,2,4,5,2,5,35
89771,3,3,2,2,2,3,1,3,1,48
89772,4,4,2,2,2,3,10,3,1,49
89773,3,4,2,2,2,4,5,3,5,20


In [18]:
# Ordinal regression of y on X with a logistic link
# (pass distr="probit" for the probit alternative).
mod = OrderedModel(y, X, distr="logit")

# Maximum-likelihood fit using the BFGS optimizer.
res = mod.fit(method="bfgs")

print("\nOrdered Logistic Regression Results for USA with multiple predictors:\n")
print(res.summary())


Optimization terminated successfully.
         Current function value: 1.041141
         Iterations: 41
         Function evaluations: 45
         Gradient evaluations: 45

Ordered Logistic Regression Results for USA with multiple predictors:

                             OrderedModel Results                             
Dep. Variable:               gov_conf   Log-Likelihood:                -2527.9
Model:                   OrderedModel   AIC:                             5082.
Method:            Maximum Likelihood   BIC:                             5157.
Date:                Fri, 24 Jan 2025                                         
Time:                        16:13:12                                         
No. Observations:                2428                                         
Df Residuals:                    2415                                         
Df Model:                          10                                         
                   coef    std err          z