In [13]:
import pandas as pd
import numpy as np

# For ordinal regression in statsmodels
from statsmodels.miscmodels.ordinal_model import OrderedModel

# For data splitting and preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# For evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. Load the Dataset
wave7_csv = "data/preprocessed/filtered_wave_7.csv"
df = pd.read_csv(wave7_csv, low_memory=False)

# 2. Filter for USA
df_usa = df[df["Country"] == 840].copy()

# 3. Replace coded missing values with NaN
missing_values = [-1, -2, -4, -5]
df_usa.replace(missing_values, np.nan, inplace=True)

# 4. Select Relevant Columns
selected_columns = [
    "C Government", "C Political parties", "C Courts", "C Elections",
    "Age", "Sex", "Scale of incomes", "Highest educational level",
    "Importance of democracy", "Strong Leader", "Expert Non Govt Person"
]
df_model = df_usa[selected_columns].copy()

# 5. Identify Categorical vs. Numeric Columns
categorical_cols = [
    "Sex", "Scale of incomes", "Highest educational level",
    "Strong Leader", "Expert Non Govt Person"
]
numeric_cols = [
    "C Government", "C Political parties", "C Courts", "C Elections",
    "Age", "Importance of democracy"
]

# 6. Handle Missing Values
#    - Fill categorical with mode
#    - Fill numeric with median
for col in categorical_cols:
    mode_val = df_model[col].mode(dropna=True)
    if not mode_val.empty:
        df_model.loc[:, col] = df_model[col].fillna(mode_val[0])

for col in numeric_cols:
    median_val = df_model[col].median()
    df_model.loc[:, col] = df_model[col].fillna(median_val)

# 7. Encode Categorical Variables
encoder = LabelEncoder()
for col in categorical_cols:
    df_model.loc[:, col] = encoder.fit_transform(df_model[col].astype(str))

# 8. Scale Continuous Variables (Age, Importance of democracy)
scaler = MinMaxScaler()
df_model["Age"] = df_model["Age"].astype(float)
df_model["Importance of democracy"] = df_model["Importance of democracy"].astype(float)
df_model.loc[:, "Age"] = scaler.fit_transform(df_model[["Age"]])
df_model.loc[:, "Importance of democracy"] = scaler.fit_transform(df_model[["Importance of democracy"]])

# 9. Merge Classes in the Target
#    Class 1 -> Class 2
#    So final classes: 2, 3, 4
df_model["C_Gov_3class"] = df_model["C Government"].replace({1: 2})

# 10. Define Features X and New Target y
#     Remove the old "C Government" from features to avoid leakage
X_all = df_model.drop(["C Government", "C_Gov_3class"], axis=1)
y_all = df_model["C_Gov_3class"].astype(int)

# 11. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.3,
    random_state=42
)

print("Training set distribution:\n", y_train.value_counts(normalize=True))
print("\nTest set distribution:\n", y_test.value_counts(normalize=True))

# 12. Ordinal Logistic Regression (Logit by default)
model = OrderedModel(
    endog=y_train,
    exog=X_train,
    distr='logit'  # or 'probit'
)

results = model.fit(method='bfgs', maxiter=100)
print("\nModel Summary:")
print(results.summary())

# 13. Predict on the Test Set
#     which="prob" returns a DataFrame of probabilities for each class
pred_probs = results.predict(exog=X_test, which="prob")
pred_probs_array = pred_probs.to_numpy()  # convert to numpy

# 14. Convert Probabilities to Class Labels
#     Our classes (after merging) are {2, 3, 4}.
#     Statsmodels sorts them internally, so let's find the unique classes in ascending order.
unique_classes = sorted(y_train.unique())  # e.g. [2, 3, 4]
argmax_indices = pred_probs_array.argmax(axis=1)
y_pred = [unique_classes[idx] for idx in argmax_indices]

# 15. Evaluate
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))


Training set distribution:
 C_Gov_3class
3    0.391855
2    0.307100
4    0.301046
Name: proportion, dtype: float64

Test set distribution:
 C_Gov_3class
3    0.373556
4    0.315789
2    0.310655
Name: proportion, dtype: float64
Optimization terminated successfully.
         Current function value: 0.950074
         Iterations: 48
         Function evaluations: 50
         Gradient evaluations: 50

Model Summary:
                             OrderedModel Results                             
Dep. Variable:           C_Gov_3class   Log-Likelihood:                -1726.3
Model:                   OrderedModel   AIC:                             3477.
Method:            Maximum Likelihood   BIC:                             3543.
Date:                Mon, 27 Jan 2025                                         
Time:                        15:25:17                                         
No. Observations:                1817                                         
Df Residuals:                 