In [55]:
import pandas as pd
import numpy as np

# For ordinal regression
from statsmodels.miscmodels.ordinal_model import OrderedModel

# For data splitting and preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# For evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [56]:
# Path to your CSV
wave7_csv = "data/preprocessed/filtered_wave_7.csv"

# Load the dataset
df = pd.read_csv(wave7_csv, low_memory=False)

# Filter for USA data
df_usa = df[df["Country"] == 840].copy()


In [57]:
# Replace coded missing values with NaN
missing_values = [-1, -2, -4, -5]
df_usa.replace(missing_values, np.nan, inplace=True)

# Columns relevant to your analysis
selected_columns = [
    "C Government", "C Political parties", "C Courts", "C Elections",
    "Age", "Sex", "Scale of incomes", "Highest educational level",
    "Importance of democracy", "Strong Leader", "Expert Non Govt Person"
]

# Create a new DataFrame for modeling
df_model = df_usa[selected_columns].copy()


In [59]:
# Identify categorical vs. numeric columns in your selected set
categorical_cols = [
    "Sex", "Scale of incomes", "Highest educational level",
    "Strong Leader", "Expert Non Govt Person"
]
numeric_cols = [
    "C Government", "C Political parties", "C Courts", "C Elections",
    "Age", "Importance of democracy"
]

# Fill categorical columns with mode
for col in categorical_cols:
    mode_value = df_model[col].mode(dropna=True)
    if not mode_value.empty:
        df_model.loc[:, col] = df_model[col].fillna(mode_value[0])

# Fill numeric columns with median
for col in numeric_cols:
    median_value = df_model[col].median()
    df_model.loc[:, col] = df_model[col].fillna(median_value)


In [60]:
encoder = LabelEncoder()
for col in categorical_cols:
    df_model.loc[:, col] = encoder.fit_transform(df_model[col].astype(str))


In [61]:
scaler = MinMaxScaler()
df_model.loc[:, "Age"] = scaler.fit_transform(df_model[["Age"]])
df_model.loc[:, "Importance of democracy"] = scaler.fit_transform(df_model[["Importance of democracy"]])


  df_model.loc[:, "Age"] = scaler.fit_transform(df_model[["Age"]])


In [48]:
# Example of collapsing classes 1 and 2 into a single category:
df_model["C Government Collapsed"] = df_model["C Government"].replace({1: 2})
# Then you have categories: 2, 3, 4


In [62]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.3, 
    random_state=42
)


In [64]:
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))


C Government
3    0.391855
4    0.301046
2    0.237204
1    0.069895
Name: proportion, dtype: float64
C Government
3    0.373556
4    0.315789
2    0.229782
1    0.080873
Name: proportion, dtype: float64


In [65]:
# Initialize the OrderedModel
model = OrderedModel(
    endog=y_train,  # target
    exog=X_train,   # predictors
    distr='logit'   # logistic link for ordinal data
)

# Fit the model
results = model.fit(method='bfgs', maxiter=100)
print(results.summary())


MissingDataError: exog contains inf or nans

In [53]:
# 11. Predict and Evaluate
# This returns a DataFrame of probabilities for each ordinal category
pred_probs = results.predict(exog=X_test, which="prob")

# Convert to numpy array, then get argmax
pred_probs_array = pred_probs.to_numpy()
y_pred = pred_probs_array.argmax(axis=1) + 1  # +1 if categories are 1-based

# Evaluate
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))



Confusion Matrix:
[[  3  60   0   0]
 [  2 177   0   0]
 [  0   0 291   0]
 [  0   0   0 246]]

Classification Report:
              precision    recall  f1-score   support

           1       0.60      0.05      0.09        63
           2       0.75      0.99      0.85       179
           3       1.00      1.00      1.00       291
           4       1.00      1.00      1.00       246

    accuracy                           0.92       779
   macro avg       0.84      0.76      0.73       779
weighted avg       0.91      0.92      0.89       779


Accuracy Score:
0.920410783055199


In [54]:
print("\nModel Parameters:")
print(results.params)

# If you want them in a DataFrame:
coeffs_df = pd.DataFrame({
    "Parameter": results.params.index,
    "Estimate": results.params.values
})
print(coeffs_df)



Model Parameters:
C Political parties           0.448031
C Courts                      0.273756
C Elections                   0.376187
Age                          -0.939643
Sex                           0.467452
Scale of incomes              0.084848
Highest educational level     0.128332
Importance of democracy      -0.586095
Strong Leader                 0.299891
Expert Non Govt Person       -0.165389
C Government Collapsed       25.212913
1/2                          52.313600
2/3                           2.669405
3/4                           3.267311
dtype: float64
                    Parameter   Estimate
0         C Political parties   0.448031
1                    C Courts   0.273756
2                 C Elections   0.376187
3                         Age  -0.939643
4                         Sex   0.467452
5            Scale of incomes   0.084848
6   Highest educational level   0.128332
7     Importance of democracy  -0.586095
8               Strong Leader   0.299891
9      Exp