In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from sklearn import metrics

import statsmodels.api as sm

# Import data and create separate dataframes

In [57]:
# NOTE(review): absolute, machine-specific path; consider a path relative to
# a configurable data directory so the notebook runs on other machines.
file = r"C:\Users\PcLaptop\Documents\GitHub\APlanet\Pilot_2\Rearrange_datasets\Part2_long_format.csv"

# Load the long-format data, keep only rows with a non-missing, non-zero
# policy choice, replace spaces in the treatment names with underscores and
# renumber the rows from 0.
df = (
    pd.read_csv(file)
    .dropna(subset=['policy_choice'])
    .loc[lambda frame: frame['policy_choice'] != 0]
    .assign(treatment=lambda frame: frame['treatment'].str.replace(' ', '_'))
    .reset_index(drop=True)
)

In [59]:
# Recode the numeric policy codes (1, 2, 3) into their policy labels.
# NOTE: running this cell a second time maps the already-recoded labels to NaN,
# because the labels are not keys of the dictionary.
policy_dict = dict(zip((1, 2, 3), ('A', 'B', 'Other')))
recoded_choice = df['policy_choice'].map(policy_dict)
df['policy_choice'] = recoded_choice

In [60]:
categorical_columns = ['Revenue_A', 'Revenue_B']
categorical_data = df[categorical_columns]

# Apply one-hot encoding to the categorical data, dropping the first level of
# each column as the reference category.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4; try the new name first
# and fall back so the cell runs on both old and new versions.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)
encoded_categorical_data = encoder.fit_transform(categorical_data)

encoded_columns = list(encoder.get_feature_names_out(categorical_columns))
encoded_frame = pd.DataFrame(encoded_categorical_data, columns=encoded_columns)

# Remove dummy columns left over from a previous run of this cell before
# appending the fresh ones, so re-running the cell does not duplicate columns.
df = pd.concat(
    [df.drop(columns=encoded_columns, errors='ignore').reset_index(drop=True), encoded_frame],
    axis=1,
)



In [61]:
def _treatment_subset(name):
    """Return the rows of df whose treatment equals `name`, re-indexed from 0."""
    is_treatment = df['treatment'] == name
    return df[is_treatment].reset_index(drop=True)


# One dataframe per experimental treatment
Baseline = _treatment_subset('Baseline')
Pollution = _treatment_subset('Pollution')
Public_Services = _treatment_subset('Public_Services')
Road_Pricing = _treatment_subset('Road_Pricing')
Social_Norm = _treatment_subset('Social_Norm')

# Multinomial logistic regression

## Regression with all treatments together

In [142]:
# Predictors: the numerical attributes of options A and B plus the one-hot
# encoded revenue dummies; target: the chosen policy. All treatments pooled.
feature_columns = [
    'Price_UR_A', 'Price_UN_A', 'Price_O_A', 'EV_A',
    'Price_UR_B', 'Price_UN_B', 'Price_O_B', 'EV_B',
    'Revenue_A_2.0', 'Revenue_A_3.0', 'Revenue_A_4.0', 'Revenue_A_5.0',
    'Revenue_B_2.0', 'Revenue_B_3.0', 'Revenue_B_4.0', 'Revenue_B_5.0',
]
X = df[feature_columns]
y = df['policy_choice']

# Hold out 25% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

### Sklearn

In [143]:
# Fit a multinomial logistic regression on the training split.
# `multi_class` is intentionally not passed: the argument is deprecated since
# scikit-learn 1.5, and with more than two classes the 'newton-cg' solver
# already fits the multinomial model by default.
model = LogisticRegression(solver='newton-cg', random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model. zero_division=0 reports 0 (without a warning) for a
# class that is never predicted.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report: \n', classification_report(y_test, y_pred, zero_division=0))

# Coefficients: one row per class, labelled with the class name
# (row order follows model.classes_).
coefficients = pd.DataFrame(model.coef_, index=model.classes_, columns=X.columns.tolist())

print('Intercept: \n', model.intercept_)

# Exponentiated coefficients, labelled the same way.
# NOTE(review): LogisticRegression applies an L2 penalty by default, so these
# values are not directly comparable with the unpenalised statsmodels fit.
exp_coefficients = pd.DataFrame(np.exp(model.coef_), index=model.classes_, columns=X.columns.tolist())
print('Exponential coefficients: \n', exp_coefficients)

# Confusion matrix with rows (actual) and columns (predicted) pinned to
# model.classes_, so the labels always match the matrix layout.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=model.classes_)
pd.DataFrame(cnf_matrix, index=model.classes_, columns=[f'predicted_{label}' for label in model.classes_])

Accuracy: 0.48491155046826223
Classification Report: 
               precision    recall  f1-score   support

           A       0.45      0.61      0.52       320
           B       0.52      0.65      0.58       414
       Other       0.33      0.00      0.01       227

    accuracy                           0.48       961
   macro avg       0.43      0.42      0.37       961
weighted avg       0.45      0.48      0.42       961

Intercept: 
 [ 0.45184296  0.12651061 -0.57835357]
Exponential coefficients: 
    Price_UR_A  Price_UN_A  Price_O_A      EV_A  Price_UR_B  Price_UN_B  \
0    0.862318    0.953639   0.956702  1.067082    1.038130    1.041525   
1    1.082497    1.076767   1.014274  0.943450    0.918117    0.934189   
2    1.071287    0.973854   1.030547  0.993307    1.049180    1.027769   

   Price_O_B      EV_B  Revenue_A_2.0  Revenue_A_3.0  Revenue_A_4.0  \
0   1.024830  0.942609       0.935206       0.958863       1.015143   
1   0.948229  1.093447       1.010064       0.

Unnamed: 0,predicted_A,predicted_B,predicted_Other
A,196,123,1
B,144,269,1
Other,97,129,1


In [106]:
# Fit the statsmodels multinomial logit on the training split inside this cell.
# The original cell used `log_reg`, which is only created by a later cell (so
# it failed on a fresh kernel) and which is fitted on all rows, test rows
# included. A separate name is used so the later full-sample `log_reg` is
# not affected.
train_fit = sm.MNLogit(y_train, X_train).fit()
yhat = train_fit.predict(X_test)

# yhat holds one column of predicted probabilities per class; idxmax picks the
# column label of the most likely class for each row.
# Assumes the columns are labelled 0, 1, 2 in the order A, B, Other, as the
# original mapping did (TODO confirm against the summary table).
# A dedicated name is used so the 1-based `policy_dict` that recodes the raw
# data is not overwritten.
index_to_policy = {0: 'A', 1: 'B', 2: 'Other'}
prediction = [index_to_policy.get(int(class_index)) for class_index in yhat.idxmax(axis=1)]

# comparing original and predicted values of y
print('Actual values', list(y_test.values))
print('Predictions :', prediction)

Actual values ['B', 'A', 'B', 'A', 'B', 'A', 'A', 'Other', 'Other', 'B', 'A', 'Other', 'A', 'B', 'A', 'Other', 'Other', 'A', 'Other', 'Other', 'Other', 'B', 'Other', 'B', 'A', 'B', 'B', 'B', 'B', 'Other', 'B', 'Other', 'A', 'Other', 'B', 'A', 'B', 'B', 'B', 'A', 'B', 'A', 'Other', 'A', 'B', 'B', 'Other', 'B', 'Other', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'A', 'A', 'A', 'Other', 'B', 'A', 'A', 'B', 'A', 'B', 'Other', 'B', 'B', 'Other', 'A', 'A', 'A', 'A', 'B', 'A', 'B', 'A', 'B', 'B', 'A', 'B', 'A', 'Other', 'B', 'B', 'Other', 'A', 'Other', 'A', 'B', 'A', 'B', 'Other', 'B', 'B', 'A', 'A', 'B', 'B', 'A', 'A', 'B', 'A', 'B', 'Other', 'B', 'Other', 'B', 'A', 'A', 'A', 'B', 'B', 'A', 'Other', 'A', 'Other', 'B', 'B', 'B', 'Other', 'B', 'A', 'A', 'Other', 'A', 'B', 'Other', 'A', 'B', 'A', 'A', 'A', 'Other', 'B', 'B', 'A', 'A', 'Other', 'A', 'A', 'B', 'A', 'B', 'B', 'B', 'B', 'A', 'B', 'A', 'B', 'A', 'A', 'B', 'A', 'B', 'A', 'Other', 'B', 'Other', 'A', 'A', 'B', 'A', 'Other', 'Other', 'A', 'Othe

### Statsmodels

In [107]:
# Build the multinomial logit on all rows of X and y (not only the training
# split) and fit it. X is passed as-is: no constant column is added here.
# NOTE(review): statsmodels does not add an intercept automatically; confirm
# that fitting without one is intended.
mnlogit_spec = sm.MNLogit(y, X)
log_reg = mnlogit_spec.fit()

# Show the regression summary table
summary_table = log_reg.summary()
print(summary_table)

Optimization terminated successfully.
         Current function value: 1.026251
         Iterations 5
                          MNLogit Regression Results                          
Dep. Variable:          policy_choice   No. Observations:                 3844
Model:                        MNLogit   Df Residuals:                     3812
Method:                           MLE   Df Model:                           30
Date:                mer, 13 nov 2024   Pseudo R-squ.:                 0.04571
Time:                        13:44:36   Log-Likelihood:                -3944.9
converged:                       True   LL-Null:                       -4133.9
Covariance Type:            nonrobust   LLR p-value:                 8.014e-62
    policy_choice=B       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Price_UR_A              0.2110      0.044      4.830      0.000       0.125       0.297
Pr

In [96]:
# Export the summary table of the current `log_reg` fit to a text file.
# The encoding is given explicitly so the written file does not depend on the
# platform's default locale encoding.
with open('results_all_treatments.txt', 'w', encoding='utf-8') as fh:
    fh.write(log_reg.summary().as_text())

## Regression for treatment: Baseline

In [124]:
# Predictors and target restricted to the Baseline treatment
feature_columns = [
    'Price_UR_A', 'Price_UN_A', 'Price_O_A', 'EV_A',
    'Price_UR_B', 'Price_UN_B', 'Price_O_B', 'EV_B',
    'Revenue_A_2.0', 'Revenue_A_3.0', 'Revenue_A_4.0', 'Revenue_A_5.0',
    'Revenue_B_2.0', 'Revenue_B_3.0', 'Revenue_B_4.0', 'Revenue_B_5.0',
]
X = Baseline[feature_columns]
y = Baseline['policy_choice']

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Sklearn

In [125]:
# Fit a multinomial logistic regression on the training split.
# `multi_class` is intentionally not passed: the argument is deprecated since
# scikit-learn 1.5, and with more than two classes the 'newton-cg' solver
# already fits the multinomial model by default.
model = LogisticRegression(solver='newton-cg', random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model. zero_division=0 reports 0 (without a warning) for a
# class that is never predicted.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report: \n', classification_report(y_test, y_pred, zero_division=0))

# Coefficients: one row per class, labelled with the class name
# (row order follows model.classes_).
coefficients = pd.DataFrame(model.coef_, index=model.classes_, columns=X.columns.tolist())

print('Intercept: \n', model.intercept_)

# Exponentiated coefficients, labelled the same way.
# NOTE(review): LogisticRegression applies an L2 penalty by default, so these
# values are not directly comparable with the unpenalised statsmodels fit.
exp_coefficients = pd.DataFrame(np.exp(model.coef_), index=model.classes_, columns=X.columns.tolist())
print('Exponential coefficients: \n', exp_coefficients)

# Confusion matrix with rows (actual) and columns (predicted) pinned to
# model.classes_, so the labels always match the matrix layout.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=model.classes_)
pd.DataFrame(cnf_matrix, index=model.classes_, columns=[f'predicted_{label}' for label in model.classes_])

Accuracy: 0.5231788079470199
Classification Report: 
               precision    recall  f1-score   support

           A       0.52      0.60      0.55        55
           B       0.56      0.68      0.61        65
       Other       0.25      0.06      0.10        31

    accuracy                           0.52       151
   macro avg       0.44      0.45      0.42       151
weighted avg       0.48      0.52      0.49       151

Intercept: 
 [ 0.55531176  0.75500853 -1.3103203 ]
Exponential coefficients: 
    Price_UR_A  Price_UN_A  Price_O_A      EV_A  Price_UR_B  Price_UN_B  \
0    0.872455    0.870624   0.895125  1.070266    1.051443    1.029465   
1    1.046422    1.102738   1.000109  0.901450    0.856582    0.983183   
2    1.095343    1.041590   1.117041  1.036494    1.110312    0.987994   

   Price_O_B      EV_B  Revenue_A_2.0  Revenue_A_3.0  Revenue_A_4.0  \
0   1.036151  1.035445       0.782693       0.700687       0.982844   
1   0.901967  0.989720       1.065974       0.9

Unnamed: 0,predicted_A,predicted_B,predicted_Other
A,33,21,1
B,16,44,5
Other,15,14,2


In [126]:
# Build the multinomial logit on all Baseline rows of X and y (not only the
# training split) and fit it. X is passed as-is: no constant column is added.
# NOTE(review): statsmodels does not add an intercept automatically; confirm
# that fitting without one is intended.
mnlogit_spec = sm.MNLogit(y, X)
log_reg = mnlogit_spec.fit()

# Show the regression summary table
summary_table = log_reg.summary()
print(summary_table)

Optimization terminated successfully.
         Current function value: 0.994428
         Iterations 5
                          MNLogit Regression Results                          
Dep. Variable:          policy_choice   No. Observations:                  752
Model:                        MNLogit   Df Residuals:                      720
Method:                           MLE   Df Model:                           30
Date:                mer, 13 nov 2024   Pseudo R-squ.:                 0.06726
Time:                        15:13:21   Log-Likelihood:                -747.81
converged:                       True   LL-Null:                       -801.74
Covariance Type:            nonrobust   LLR p-value:                 1.028e-10
    policy_choice=B       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Price_UR_A              0.1994      0.101      1.979      0.048       0.002       0.397
Pr

In [127]:
# Export the summary table of the current `log_reg` fit to a text file.
# The encoding is given explicitly so the written file does not depend on the
# platform's default locale encoding.
with open('results_Baseline.txt', 'w', encoding='utf-8') as fh:
    fh.write(log_reg.summary().as_text())

## Regression for treatment: Pollution

In [128]:
# Predictors and target restricted to the Pollution treatment
feature_columns = [
    'Price_UR_A', 'Price_UN_A', 'Price_O_A', 'EV_A',
    'Price_UR_B', 'Price_UN_B', 'Price_O_B', 'EV_B',
    'Revenue_A_2.0', 'Revenue_A_3.0', 'Revenue_A_4.0', 'Revenue_A_5.0',
    'Revenue_B_2.0', 'Revenue_B_3.0', 'Revenue_B_4.0', 'Revenue_B_5.0',
]
X = Pollution[feature_columns]
y = Pollution['policy_choice']

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the multinomial logit on all Pollution rows (not only the training
# split) and fit it. X is passed as-is: no constant column is added.
# NOTE(review): statsmodels does not add an intercept automatically; confirm
# that fitting without one is intended.
mnlogit_spec = sm.MNLogit(y, X)
log_reg = mnlogit_spec.fit()

# Show the regression summary table
summary_table = log_reg.summary()
print(summary_table)

Optimization terminated successfully.
         Current function value: 0.992568
         Iterations 5
                          MNLogit Regression Results                          
Dep. Variable:          policy_choice   No. Observations:                  740
Model:                        MNLogit   Df Residuals:                      708
Method:                           MLE   Df Model:                           30
Date:                mer, 13 nov 2024   Pseudo R-squ.:                 0.07562
Time:                        15:13:29   Log-Likelihood:                -734.50
converged:                       True   LL-Null:                       -794.58
Covariance Type:            nonrobust   LLR p-value:                 9.559e-13
    policy_choice=B       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Price_UR_A              0.2287      0.101      2.256      0.024       0.030       0.427
Pr

In [129]:
# Fit a multinomial logistic regression on the training split.
# `multi_class` is intentionally not passed: the argument is deprecated since
# scikit-learn 1.5, and with more than two classes the 'newton-cg' solver
# already fits the multinomial model by default.
model = LogisticRegression(solver='newton-cg', random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model. zero_division=0 reports 0 (without a warning) for a
# class that is never predicted.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report: \n', classification_report(y_test, y_pred, zero_division=0))

# Coefficients: one row per class, labelled with the class name
# (row order follows model.classes_).
coefficients = pd.DataFrame(model.coef_, index=model.classes_, columns=X.columns.tolist())

print('Intercept: \n', model.intercept_)

# Exponentiated coefficients, labelled the same way.
# NOTE(review): LogisticRegression applies an L2 penalty by default, so these
# values are not directly comparable with the unpenalised statsmodels fit.
exp_coefficients = pd.DataFrame(np.exp(model.coef_), index=model.classes_, columns=X.columns.tolist())
print('Exponential coefficients: \n', exp_coefficients)

# Confusion matrix with rows (actual) and columns (predicted) pinned to
# model.classes_, so the labels always match the matrix layout.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=model.classes_)
pd.DataFrame(cnf_matrix, index=model.classes_, columns=[f'predicted_{label}' for label in model.classes_])

Accuracy: 0.47297297297297297
Classification Report: 
               precision    recall  f1-score   support

           A       0.50      0.54      0.52        54
           B       0.44      0.70      0.54        53
       Other       0.67      0.10      0.17        41

    accuracy                           0.47       148
   macro avg       0.54      0.44      0.41       148
weighted avg       0.52      0.47      0.43       148

Intercept: 
 [ 0.37220601  0.48796353 -0.86016953]
Exponential coefficients: 
    Price_UR_A  Price_UN_A  Price_O_A      EV_A  Price_UR_B  Price_UN_B  \
0    0.923136    0.909059   0.913592  1.139615    0.962341    1.147454   
1    1.134777    1.002980   1.020125  0.952103    1.012829    0.753329   
2    0.954606    1.096771   1.072987  0.921632    1.025971    1.156858   

   Price_O_B      EV_B  Revenue_A_2.0  Revenue_A_3.0  Revenue_A_4.0  \
0   1.058758  0.947148       0.920902       1.037158       0.902177   
1   0.882696  1.101671       1.030350       0.

Unnamed: 0,predicted_A,predicted_B,predicted_Other
A,29,25,0
B,14,37,2
Other,15,22,4


In [None]:
# Export the summary table of the current `log_reg` fit to a text file.
# The encoding is given explicitly so the written file does not depend on the
# platform's default locale encoding.
with open('results_Pollution.txt', 'w', encoding='utf-8') as fh:
    fh.write(log_reg.summary().as_text())

## Regression for treatment: Public_Services 

In [131]:
# Predictors and target restricted to the Public_Services treatment
feature_columns = [
    'Price_UR_A', 'Price_UN_A', 'Price_O_A', 'EV_A',
    'Price_UR_B', 'Price_UN_B', 'Price_O_B', 'EV_B',
    'Revenue_A_2.0', 'Revenue_A_3.0', 'Revenue_A_4.0', 'Revenue_A_5.0',
    'Revenue_B_2.0', 'Revenue_B_3.0', 'Revenue_B_4.0', 'Revenue_B_5.0',
]
X = Public_Services[feature_columns]
y = Public_Services['policy_choice']

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the multinomial logit on all Public_Services rows (not only the
# training split) and fit it. X is passed as-is: no constant column is added.
# NOTE(review): statsmodels does not add an intercept automatically; confirm
# that fitting without one is intended.
mnlogit_spec = sm.MNLogit(y, X)
log_reg = mnlogit_spec.fit()

# Show the regression summary table
summary_table = log_reg.summary()
print(summary_table)

Optimization terminated successfully.
         Current function value: 1.042718
         Iterations 5
                          MNLogit Regression Results                          
Dep. Variable:          policy_choice   No. Observations:                  832
Model:                        MNLogit   Df Residuals:                      800
Method:                           MLE   Df Model:                           30
Date:                mer, 13 nov 2024   Pseudo R-squ.:                 0.04708
Time:                        15:15:49   Log-Likelihood:                -867.54
converged:                       True   LL-Null:                       -910.40
Covariance Type:            nonrobust   LLR p-value:                 2.885e-07
    policy_choice=B       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Price_UR_A              0.2128      0.097      2.185      0.029       0.022       0.404
Pr

In [132]:
# Export the summary table of the current `log_reg` fit to a text file.
# The encoding is given explicitly so the written file does not depend on the
# platform's default locale encoding.
with open('results_Public_Services.txt', 'w', encoding='utf-8') as fh:
    fh.write(log_reg.summary().as_text())

In [133]:
# Fit a multinomial logistic regression on the training split.
# `multi_class` is intentionally not passed: the argument is deprecated since
# scikit-learn 1.5, and with more than two classes the 'newton-cg' solver
# already fits the multinomial model by default.
model = LogisticRegression(solver='newton-cg', random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model. zero_division=0 reports 0 (without a warning) for a
# class that is never predicted.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report: \n', classification_report(y_test, y_pred, zero_division=0))

# Coefficients: one row per class, labelled with the class name
# (row order follows model.classes_).
coefficients = pd.DataFrame(model.coef_, index=model.classes_, columns=X.columns.tolist())

print('Intercept: \n', model.intercept_)

# Exponentiated coefficients, labelled the same way.
# NOTE(review): LogisticRegression applies an L2 penalty by default, so these
# values are not directly comparable with the unpenalised statsmodels fit.
exp_coefficients = pd.DataFrame(np.exp(model.coef_), index=model.classes_, columns=X.columns.tolist())
print('Exponential coefficients: \n', exp_coefficients)

# Confusion matrix with rows (actual) and columns (predicted) pinned to
# model.classes_, so the labels always match the matrix layout.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=model.classes_)
pd.DataFrame(cnf_matrix, index=model.classes_, columns=[f'predicted_{label}' for label in model.classes_])

Accuracy: 0.3772455089820359
Classification Report: 
               precision    recall  f1-score   support

           A       0.48      0.41      0.44        68
           B       0.31      0.64      0.42        42
       Other       0.38      0.14      0.21        57

    accuracy                           0.38       167
   macro avg       0.39      0.40      0.35       167
weighted avg       0.40      0.38      0.36       167

Intercept: 
 [ 0.31351391  0.01326034 -0.32677425]
Exponential coefficients: 
    Price_UR_A  Price_UN_A  Price_O_A      EV_A  Price_UR_B  Price_UN_B  \
0    0.837781    0.972777   0.993273  0.983009    1.042281    1.036548   
1    1.098419    1.069541   1.012043  0.999771    0.882423    0.999788   
2    1.086680    0.961146   0.994792  1.017518    1.087273    0.964945   

   Price_O_B      EV_B  Revenue_A_2.0  Revenue_A_3.0  Revenue_A_4.0  \
0   1.037179  0.923961       0.872558       0.932642       0.917038   
1   0.971465  1.117910       0.836837       1.0

Unnamed: 0,predicted_A,predicted_B,predicted_Other
A,28,33,7
B,9,27,6
Other,21,28,8


## Regression for treatment: Road_Pricing

In [134]:
# Predictors and target restricted to the Road_Pricing treatment
feature_columns = [
    'Price_UR_A', 'Price_UN_A', 'Price_O_A', 'EV_A',
    'Price_UR_B', 'Price_UN_B', 'Price_O_B', 'EV_B',
    'Revenue_A_2.0', 'Revenue_A_3.0', 'Revenue_A_4.0', 'Revenue_A_5.0',
    'Revenue_B_2.0', 'Revenue_B_3.0', 'Revenue_B_4.0', 'Revenue_B_5.0',
]
X = Road_Pricing[feature_columns]
y = Road_Pricing['policy_choice']

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the multinomial logit on all Road_Pricing rows (not only the training
# split) and fit it. X is passed as-is: no constant column is added.
# NOTE(review): statsmodels does not add an intercept automatically; confirm
# that fitting without one is intended.
mnlogit_spec = sm.MNLogit(y, X)
log_reg = mnlogit_spec.fit()

# Show the regression summary table
summary_table = log_reg.summary()
print(summary_table)

Optimization terminated successfully.
         Current function value: 1.017821
         Iterations 5
                          MNLogit Regression Results                          
Dep. Variable:          policy_choice   No. Observations:                  752
Model:                        MNLogit   Df Residuals:                      720
Method:                           MLE   Df Model:                           30
Date:                mer, 13 nov 2024   Pseudo R-squ.:                 0.05093
Time:                        15:16:37   Log-Likelihood:                -765.40
converged:                       True   LL-Null:                       -806.47
Covariance Type:            nonrobust   LLR p-value:                 9.673e-07
    policy_choice=B       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Price_UR_A              0.2374      0.101      2.353      0.019       0.040       0.435
Pr

In [135]:
# Export the summary table of the current `log_reg` fit to a text file.
# The encoding is given explicitly so the written file does not depend on the
# platform's default locale encoding.
with open('results_Road_Pricing.txt', 'w', encoding='utf-8') as fh:
    fh.write(log_reg.summary().as_text())

In [136]:
# Fit a multinomial logistic regression on the training split.
# `multi_class` is intentionally not passed: the argument is deprecated since
# scikit-learn 1.5, and with more than two classes the 'newton-cg' solver
# already fits the multinomial model by default.
model = LogisticRegression(solver='newton-cg', random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model. zero_division=0 reports 0 (without a warning) for a
# class that is never predicted.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report: \n', classification_report(y_test, y_pred, zero_division=0))

# Coefficients: one row per class, labelled with the class name
# (row order follows model.classes_).
coefficients = pd.DataFrame(model.coef_, index=model.classes_, columns=X.columns.tolist())

print('Intercept: \n', model.intercept_)

# Exponentiated coefficients, labelled the same way.
# NOTE(review): LogisticRegression applies an L2 penalty by default, so these
# values are not directly comparable with the unpenalised statsmodels fit.
exp_coefficients = pd.DataFrame(np.exp(model.coef_), index=model.classes_, columns=X.columns.tolist())
print('Exponential coefficients: \n', exp_coefficients)

# Confusion matrix with rows (actual) and columns (predicted) pinned to
# model.classes_, so the labels always match the matrix layout.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=model.classes_)
pd.DataFrame(cnf_matrix, index=model.classes_, columns=[f'predicted_{label}' for label in model.classes_])

Accuracy: 0.4370860927152318
Classification Report: 
               precision    recall  f1-score   support

           A       0.46      0.47      0.46        55
           B       0.44      0.67      0.53        58
       Other       0.20      0.03      0.05        38

    accuracy                           0.44       151
   macro avg       0.36      0.39      0.35       151
weighted avg       0.38      0.44      0.38       151

Intercept: 
 [-0.49332981  0.56612626 -0.07279645]
Exponential coefficients: 
    Price_UR_A  Price_UN_A  Price_O_A      EV_A  Price_UR_B  Price_UN_B  \
0    0.958414    0.908153   1.014473  1.192311    0.986161    1.048692   
1    1.001219    1.119234   1.036142  0.918205    0.877413    0.935889   
2    1.042120    0.983830   0.951349  0.913421    1.155708    1.018891   

   Price_O_B      EV_B  Revenue_A_2.0  Revenue_A_3.0  Revenue_A_4.0  \
0   1.024840  1.043787       1.018588       1.079923       1.047921   
1   0.991432  1.091632       1.135808       0.9

Unnamed: 0,predicted_A,predicted_B,predicted_Other
A,26,27,2
B,17,39,2
Other,14,23,1


## Regression for treatment: Social_Norm 

In [140]:
# Predictors and target restricted to the Social_Norm treatment
feature_columns = [
    'Price_UR_A', 'Price_UN_A', 'Price_O_A', 'EV_A',
    'Price_UR_B', 'Price_UN_B', 'Price_O_B', 'EV_B',
    'Revenue_A_2.0', 'Revenue_A_3.0', 'Revenue_A_4.0', 'Revenue_A_5.0',
    'Revenue_B_2.0', 'Revenue_B_3.0', 'Revenue_B_4.0', 'Revenue_B_5.0',
]
X = Social_Norm[feature_columns]
y = Social_Norm['policy_choice']

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the multinomial logit on all Social_Norm rows (not only the training
# split) and fit it. X is passed as-is: no constant column is added.
# NOTE(review): statsmodels does not add an intercept automatically; confirm
# that fitting without one is intended.
mnlogit_spec = sm.MNLogit(y, X)
log_reg = mnlogit_spec.fit()

# Show the regression summary table
summary_table = log_reg.summary()
print(summary_table)

Optimization terminated successfully.
         Current function value: 0.992233
         Iterations 5
                          MNLogit Regression Results                          
Dep. Variable:          policy_choice   No. Observations:                  768
Model:                        MNLogit   Df Residuals:                      736
Method:                           MLE   Df Model:                           30
Date:                mer, 13 nov 2024   Pseudo R-squ.:                 0.05942
Time:                        15:18:35   Log-Likelihood:                -762.03
converged:                       True   LL-Null:                       -810.18
Covariance Type:            nonrobust   LLR p-value:                 7.105e-09
    policy_choice=B       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Price_UR_A              0.1505      0.097      1.544      0.123      -0.041       0.342
Pr

In [138]:
# Export the summary table of the current `log_reg` fit to a text file.
# The encoding is given explicitly so the written file does not depend on the
# platform's default locale encoding.
with open('results_Social_Norm.txt', 'w', encoding='utf-8') as fh:
    fh.write(log_reg.summary().as_text())

In [141]:
# Fit a multinomial logistic regression on the training split.
# `multi_class` is intentionally not passed: the argument is deprecated since
# scikit-learn 1.5, and with more than two classes the 'newton-cg' solver
# already fits the multinomial model by default.
model = LogisticRegression(solver='newton-cg', random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model. zero_division=0 reports 0 (without a warning) for a
# class that is never predicted.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report: \n', classification_report(y_test, y_pred, zero_division=0))

# Coefficients: one row per class, labelled with the class name
# (row order follows model.classes_).
coefficients = pd.DataFrame(model.coef_, index=model.classes_, columns=X.columns.tolist())

print('Intercept: \n', model.intercept_)

# Exponentiated coefficients, labelled the same way.
# NOTE(review): LogisticRegression applies an L2 penalty by default, so these
# values are not directly comparable with the unpenalised statsmodels fit.
exp_coefficients = pd.DataFrame(np.exp(model.coef_), index=model.classes_, columns=X.columns.tolist())
print('Exponential coefficients: \n', exp_coefficients)

# Confusion matrix with rows (actual) and columns (predicted) pinned to
# model.classes_, so the labels always match the matrix layout.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=model.classes_)
pd.DataFrame(cnf_matrix, index=model.classes_, columns=[f'predicted_{label}' for label in model.classes_])

Accuracy: 0.5194805194805194
Classification Report: 
               precision    recall  f1-score   support

           A       0.48      0.50      0.49        58
           B       0.54      0.74      0.63        69
       Other       0.00      0.00      0.00        27

    accuracy                           0.52       154
   macro avg       0.34      0.41      0.37       154
weighted avg       0.43      0.52      0.47       154

Intercept: 
 [ 0.19512362  0.44296781 -0.63809144]
Exponential coefficients: 
    Price_UR_A  Price_UN_A  Price_O_A      EV_A  Price_UR_B  Price_UN_B  \
0    0.866423    0.979997   1.009515  1.090654    1.043076    1.012410   
1    1.026255    1.014577   0.922282  0.934871    0.834881    0.986188   
2    1.124643    1.005751   1.074048  0.980756    1.148310    1.001576   

   Price_O_B      EV_B  Revenue_A_2.0  Revenue_A_3.0  Revenue_A_4.0  \
0   1.006678  0.898423       1.438464       1.177051       1.441470   
1   1.025390  1.224482       0.973271       0.8

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,predicted_A,predicted_B,predicted_Other
A,29,29,0
B,18,51,0
Other,13,14,0


In [None]:
# (rows, columns) of the full dataframe
df.shape

(3844, 35)

In [None]:
numerical_columns = ['Price_UR_A', 'Price_UN_A', 'Price_O_A', 'EV_A',
                     'Price_UR_B', 'Price_UN_B', 'Price_O_B', 'EV_B']
categorical_columns = ['Revenue_A', 'Revenue_B']

# Separate the numerical and categorical data
numerical_data = df[numerical_columns]
categorical_data = df[categorical_columns]

# Apply one-hot encoding to the categorical data, dropping the first level of
# each column as the reference category.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4; try the new name first
# and fall back so the cell runs on both old and new versions.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)
encoded_categorical_data = encoder.fit_transform(categorical_data)

# Combine the numerical and encoded categorical data
X = pd.concat([numerical_data.reset_index(drop=True), pd.DataFrame(encoded_categorical_data, columns=encoder.get_feature_names_out(categorical_columns))], axis=1)
# NOTE(review): y is a one-column DataFrame here, whereas the regression cells
# use the Series df['policy_choice']; confirm which form is wanted.
y = df[['policy_choice']]






In [None]:
# Display the current feature matrix X
X

Unnamed: 0,Price_UR_A,Price_UN_A,Price_O_A,EV_A,Price_UR_B,Price_UN_B,Price_O_B,EV_B,Revenue_A_2.0,Revenue_A_3.0,Revenue_A_4.0,Revenue_A_5.0,Revenue_B_2.0,Revenue_B_3.0,Revenue_B_4.0,Revenue_B_5.0
0,3.0,3.0,3.0,4.0,4.0,2.0,4.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,1.0,1.0,4.0,2.0,2.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,4.0,1.0,2.0,3.0,3.0,3.0,4.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3.0,1.0,2.0,4.0,2.0,2.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,2.0,1.0,2.0,1.0,3.0,2.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3839,3.0,4.0,4.0,4.0,2.0,1.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3840,3.0,1.0,4.0,1.0,4.0,4.0,3.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3841,3.0,4.0,1.0,2.0,4.0,3.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3842,2.0,2.0,2.0,1.0,4.0,1.0,1.0,4.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# Display the target column as a one-column dataframe
df.loc[:, ['policy_choice']]

Unnamed: 0,policy_choice
0,Other
1,A
2,B
3,A
4,B
...,...
3839,B
3840,B
3841,A
3842,Other
