In [1]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the cleaned survey responses that every later step reads from.
df_cleaned = pd.read_csv('/Users/yzc/Desktop/EVC project/RI-Voting-Models/raw_data/survey_data_cleaned.csv')

# Predictor groups, split by how each survey question is encoded.
ordinal_predictors = ['Q2', 'Q7', 'Q8', 'Q10', 'Q11', 'Q13', 'Q14', 'Q15', 'Q16', 'Q20']
binary_predictors = ['Q4']

# One-hot columns are discovered by prefix; str.startswith accepts a tuple,
# so a single call covers every expanded question.
one_hot_prefixes = ('Q1_', 'Q3_', 'Q6_', 'Q9_', 'Q12_')
one_hot_predictors = [col for col in df_cleaned.columns if col.startswith(one_hot_prefixes)]

# Standardize the ordinal predictors (zero mean, unit variance per column).
# NOTE(review): the scaler is fit on every row of df_cleaned, i.e. before the
# train/test split performed later in the notebook, so held-out rows contribute
# to the scaling statistics -- confirm this is acceptable.
scaler = StandardScaler()
X_ordinal = df_cleaned[ordinal_predictors]
X_ordinal_scaled = scaler.fit_transform(X_ordinal)

# fit_transform returns a bare array; re-attach the column labels and the
# original row index so the concat below aligns row-for-row.
X_ordinal_scaled_df = pd.DataFrame(
    X_ordinal_scaled,
    index=df_cleaned.index,
    columns=ordinal_predictors,
)

# Binary and one-hot predictors are taken as-is (per the original author's
# note they are already 0/1, so no scaling is applied).
X_binary = df_cleaned[binary_predictors]
X_one_hot = df_cleaned[one_hot_predictors]

# Final feature matrix: scaled ordinal columns, then binary, then one-hot.
X_combined = pd.concat(
    [X_ordinal_scaled_df, X_binary, X_one_hot],
    axis=1,
)

# X_combined is the feature matrix consumed by the modelling cell.


In [5]:
import pandas as pd
import numpy as np
import mord
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Compatibility shim: the `np.int` alias was removed in NumPy 1.24. It is
# restored here before any model is fit -- presumably because the installed
# `mord` release still references it (TODO confirm against the mord version).
np.int = int

# Features come from the preprocessing cell; the target is survey question Q19.
X = X_combined
y = df_cleaned['Q19']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified although the recorded test-set
# supports are very uneven (3 to 69 rows per class) -- consider `stratify=y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Ordinal logistic regression (all-threshold variant) from mord.
model = mord.LogisticAT()

# Candidate values for the regularization strength `alpha`.
# NOTE(review): the recorded run selected alpha=1000, the largest candidate;
# the grid may need to extend further to bracket the optimum.
param_grid = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100, 1000]}

# 5-fold cross-validation on the training rows only, selecting on accuracy.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best parameters from GridSearchCV:")
print(grid_search.best_params_)
print("Best cross-validation accuracy: {:.4f}".format(grid_search.best_score_))

# The best estimator has already been refit on the full training set
# (GridSearchCV's default refit=True).
best_model = grid_search.best_estimator_

# Predict the held-out rows with the tuned model.
y_pred = best_model.predict(X_test)

# Evaluate on the test set. Some classes are never predicted, which makes their
# precision (and F1) undefined; `zero_division=0` reports those as 0 explicitly.
# The numbers are the same as under the default "warn" setting, but without the
# repeated undefined-metric warnings.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nTest Set Accuracy: {:.4f}".format(accuracy))
print("Test Set Weighted F1 Score: {:.4f}".format(f1))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))


Best parameters from GridSearchCV:
{'alpha': 1000}
Best cross-validation accuracy: 0.5762

Test Set Accuracy: 0.6071
Test Set Weighted F1 Score: 0.5137

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00         7
           3       0.00      0.00      0.00        12
           4       0.14      0.14      0.14        14
           5       0.69      0.96      0.80        69

    accuracy                           0.61       112
   macro avg       0.14      0.18      0.16       112
weighted avg       0.45      0.61      0.51       112



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
