In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

In [8]:
df = pd.read_csv(r"C:\Users\headl\Documents\EVC\RI-Voting-Models\raw_data\cleaned_survey_data.csv")
df.head()

Unnamed: 0,Unnamed: 22,Ward_1,Ward_3,Ward_5,Ward_6,Ward_7,Ward_8,Ward_9,Ward_11,Ward_12,...,Info_The Voter Information Handbook sent to my house,Info_211 Voter Hotline,Reason_Today was the best option for my schedule,Reason_I wanted to wait to learn more about the candidates,Reason_I did not feel like I had all the information necessary to vote early or by mail,Reason_I prefer voting on election day,Reason_I forgot to vote early or by mail,Reason_I do not trust mail-in voting,Registration_Support,RCV_Support
0,,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,5.0,3.0
1,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5.0,5.0
2,,0,0,0,0,0,0,1,0,0,...,0,0,1,1,1,0,0,0,5.0,5.0
3,,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,5.0,2.0
4,,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,5.0,4.0


In [None]:
def prepare_data_for_ordinal_regression():
    # Read cleaned data
    df = pd.read_csv(r"C:\Users\headl\Documents\EVC\RI-Voting-Models\raw_data\cleaned_survey_data.csv")
    
    # Separate features and target
    y = df['RCV_Support']
    
    # Select features grouped by category
    feature_columns = [
        # Ward indicators
        'Ward_1', 'Ward_3', 'Ward_5', 'Ward_6', 'Ward_7', 'Ward_8', 
        'Ward_9', 'Ward_11', 'Ward_12', 'Ward_14', 'Ward_15',
        
        # Demographics
        'Age',
        'Gender_Female', 'Gender_Male', 'Gender_Non-binary / Other', 'Gender_Prefer not to say',
        'Marital_Status',
        'Education', 
        'Income',
        
        # Ethnicity
        'Ethnicity_White', 'Ethnicity_Black or African American', 
        'Ethnicity_Hispanic or Latino/a', 'Ethnicity_Asian',
        'Ethnicity_American Indian or Alaska Native', 
        'Ethnicity_Native Hawaiian or Pacific Islander', 'Ethnicity_Other',
        
        # Religion
        'Religion_Atheist', 'Religion_Buddhism', 'Religion_Catholic',
        'Religion_Hinduism', 'Religion_Islam', 'Religion_Judaism',
        'Religion_Other', 'Religion_Protestant / Other Christian Denomination',
        
        # Political variables
        'Political_Democrat', 'Political_Independent', 
        'Political_Other third party', 'Political_Republican',
        'US_Direction', 'RI_Direction',
        
        # Voting behavior
        'Vote_Donald Trump', 'Vote_Kamala Harris', 'Vote_Other',
        'Vote_Excitement', 'Politician_Confidence', 
        'Officials_Confidence', 'Results_Confidence',
        
        # Information sources
        'Info_Rhode Island Secretary of State\'s Website',
        'Info_Social Media', 'Info_Friends/Family',
        'Info_Religious/Community Organizations',
        'Info_The Voter Information Handbook sent to my house',
        'Info_211 Voter Hotline',
        
        # Voting reasons
        'Reason_Today was the best option for my schedule',
        'Reason_I wanted to wait to learn more about the candidates',
        'Reason_I did not feel like I had all the information necessary to vote early or by mail',
        'Reason_I prefer voting on election day',
        'Reason_I forgot to vote early or by mail',
        'Reason_I do not trust mail-in voting',
        
        # Registration support
        'Registration_Support'
    ]
    
    X = df[feature_columns]
    
    # Scale numerical features
    scaler = StandardScaler()
    numerical_features = ['Age', 'Education', 'Income', 'Vote_Excitement', 
                         'Politician_Confidence', 'Officials_Confidence', 
                         'Results_Confidence', 'US_Direction', 'RI_Direction',
                         'Registration_Support']
    
    X[numerical_features] = scaler.fit_transform(X[numerical_features])
    
    return X, y

In [3]:
def fit_ordinal_logistic_regression(X, y):
    # Add constant and convert to numpy arrays
    X = np.asarray(sm.add_constant(X))
    y = np.asarray(y)
    
    # Fit ordinal logistic regression
    model = sm.MNLogit(y, X)
    try:
        results = model.fit(method='lbfgs', maxiter=1000)
        return results
    except np.linalg.LinAlgError:
        print("Warning: Convergence issues. Trying alternative solver...")
        return model.fit(method='newton', maxiter=1000)

In [4]:
def analyze_results(results):
    # Print summary
    print("\nModel Summary:")
    print(results.summary())
    
    # Print key metrics
    print("\nModel Statistics:")
    print(f"Pseudo R-squared: {results.prsquared:.4f}")
    print(f"Log-Likelihood: {results.llf:.4f}")
    print(f"AIC: {results.aic:.4f}")
    print(f"BIC: {results.bic:.4f}")
    
    # Get feature importance
    print("\nFeature Importance (based on p-values):")
    pvalues = results.pvalues
    significant_features = pvalues[pvalues < 0.05].sort_values()
    print("\nSignificant features (p < 0.05):")
    for feature, pvalue in significant_features.items():
        print(f"{feature}: {pvalue:.4f}")

In [5]:
# Prepare data
print("Preparing data...")
X, y = prepare_data_for_ordinal_regression()

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit model
print("Fitting ordinal logistic regression model...")
results = fit_ordinal_logistic_regression(X_train, y_train)

# Analyze results
print("Analyzing results...")
analyze_results(results)

# Make predictions
y_pred = results.predict(sm.add_constant(X_test))
y_pred_classes = np.argmax(y_pred.values, axis=1) + 1

# Calculate accuracy
accuracy = (y_pred_classes == y_test).mean()
print(f"\nTest Set Accuracy: {accuracy:.4f}")

Preparing data...
Fitting ordinal logistic regression model...


TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''