In [1]:
import pandas as pd
import statsmodels.api as sm
from scipy import stats



In [19]:
df = pd.read_csv("z_score_prob.csv")

In [20]:
df.columns

Index(['outlet', 'Probability_con', 'label', 'Drives_zscore',
       'Cognition_zscore', 'emo_pos_zscore', 'emo_neg_zscore',
       'emo_anx_zscore', 'emo_anger_zscore', 'emo_sad_zscore', 'Social_zscore',
       'Lifestyle_zscore', 'Physical_zscore', 'focuspast_zscore',
       'focuspresent_zscore', 'focusfuture_zscore', 'Moral_zscore',
       'Affect_zscore', 'Moral-Emotional_zscore'],
      dtype='object')

# General Dataset

In [21]:
def logistic_regression_likelihood_ratio_test(features, target):
    # Add a constant term to the features
    features = sm.add_constant(features)
    
    # Fit logistic regression model
    model = sm.Logit(target, features)
    result = model.fit()
    
    # Likelihood ratio test
    null_model = sm.Logit(target, sm.add_constant(pd.Series([1]*len(target))))
    null_result = null_model.fit(disp=0)
    lr = 2 * (result.llf - null_result.llf)
    df = result.df_model - null_result.df_model
    p_value = stats.chi2.sf(lr, df)
    
    return result, lr, df, p_value

In [23]:
# List to store results
results = []

# First, run logistic regression with only the 13 language features
features_13 = df[['Drives_zscore',
       'Cognition_zscore', 'emo_pos_zscore', 'emo_neg_zscore',
       'emo_anx_zscore', 'emo_anger_zscore', 'emo_sad_zscore', 'Social_zscore',
       'Lifestyle_zscore', 'Physical_zscore', 'focuspast_zscore',
       'focuspresent_zscore', 'focusfuture_zscore']]

target = df['Probability_con']

result_13, lr_13, df_13, p_value_13 = logistic_regression_likelihood_ratio_test(features_13, target)

results.append(('13 Language Features', result_13, lr_13, df_13, p_value_13))

# Then, add each additional category one by one and repeat the process
additional_categories = ['Moral_zscore', 'Affect_zscore', 'Moral-Emotional_zscore']

for category in additional_categories:
    features_additional = df[['Drives_zscore','Cognition_zscore', 'emo_pos_zscore', 'emo_neg_zscore',
                              'emo_anx_zscore', 'emo_anger_zscore', 'emo_sad_zscore', 'Social_zscore',
                              'Lifestyle_zscore', 'Physical_zscore', 'focuspast_zscore','focuspresent_zscore',
                              'focusfuture_zscore', category]]
    result_additional, lr_additional, df_additional, p_value_additional = logistic_regression_likelihood_ratio_test(features_additional, target)
    results.append(('13 Language Features + {}'.format(category), result_additional, lr_additional, df_additional, p_value_additional))

# Print results
for result in results:
    print("Model: ", result[0])
    print(result[1].summary()) 
    print("LR Statistic:", result[2])
    print("Degrees of Freedom:", result[3])
    print("p-value:", result[4])
    print("\n")


Optimization terminated successfully.
         Current function value: 0.582482
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.581027
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.576688
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.580148
         Iterations 6
Model:  13 Language Features
                           Logit Regression Results                           
Dep. Variable:        Probability_con   No. Observations:                  180
Model:                          Logit   Df Residuals:                      166
Method:                           MLE   Df Model:                           13
Date:                Mon, 18 Mar 2024   Pseudo R-squ.:                  0.1302
Time:                        14:57:11   Log-Likelihood:                -104.85
converged:                       True   LL-Null:                       -120.54
Covariance

E.G. INTERPRETATION

Model:  13 Language Features\
LR Statistic: 123.45\
Degrees of Freedom: 13\
p-value: 0.001

Interpretation:
- The logistic regression model with 13 language features is statistically significant (p < 0.05), indicating that the set of language features collectively predicts the probability of the target variable.
- The LR statistic of 123.45 suggests that the model with the language features provides a significantly better fit than the null model.
- Individual coefficients, odds ratios, and p-values for each language feature can be examined to understand their specific contributions to the model.

In [25]:
for category in additional_categories:
    # Run logistic regression with the 13 language features plus the additional category
    features_additional = df[['Drives_zscore','Cognition_zscore', 'emo_pos_zscore', 'emo_neg_zscore',
                              'emo_anx_zscore', 'emo_anger_zscore', 'emo_sad_zscore', 'Social_zscore',
                              'Lifestyle_zscore', 'Physical_zscore', 'focuspast_zscore','focuspresent_zscore',
                              'focusfuture_zscore', category]]
    result_additional, lr_additional, df_additional, p_value_additional = logistic_regression_likelihood_ratio_test(features_additional, target)
    
    # Compare LR statistic and p-value between the models
    print("Comparison between 13 Language Features and 13 Language Features + {}: ".format(category))
    # print(result_additional.summary()) 
    print("LR Statistic:", lr_additional - lr_13)
    print("Degrees of Freedom:", df_additional - df_13)
    print("p-value:", p_value_additional)
    print("\n")

Optimization terminated successfully.
         Current function value: 0.581027
         Iterations 6
Comparison between 13 Language Features and 13 Language Features + Moral_zscore: 
LR Statistic: 0.5237646565838645
Degrees of Freedom: 1.0
p-value: 0.004103540210004421


Optimization terminated successfully.
         Current function value: 0.576688
         Iterations 6
Comparison between 13 Language Features and 13 Language Features + Affect_zscore: 
LR Statistic: 2.0856817949514266
Degrees of Freedom: 1.0
p-value: 0.0024484265226976457


Optimization terminated successfully.
         Current function value: 0.580148
         Iterations 6
Comparison between 13 Language Features and 13 Language Features + Moral-Emotional_zscore: 
LR Statistic: 0.840387560881652
Degrees of Freedom: 1.0
p-value: 0.0036991161826115553




  x = pd.concat(x[::order], 1)


- Comparison between 13 Language Features and 13 Language Features + Moral_zscore:

LR Statistic: 0.524
Degrees of Freedom: 1.0
p-value: 0.0041
Interpretation: The addition of the "Moral_zscore" feature significantly improves the model fit compared to using only the 13 language features. The p-value suggests strong evidence against the null hypothesis that the additional feature does not improve the model.

- Comparison between 13 Language Features and 13 Language Features + Affect_zscore:

LR Statistic: 2.086
Degrees of Freedom: 1.0
p-value: 0.0024
Interpretation: Similar to the previous comparison, the addition of the "Affect_zscore" feature significantly improves the model fit compared to using only the 13 language features. The p-value is even lower, indicating stronger evidence against the null hypothesis.

- Comparison between 13 Language Features and 13 Language Features + Moral-Emotional_zscore:

LR Statistic: 0.840
Degrees of Freedom: 1.0
p-value: 0.0037
Interpretation: Once again, adding the "Moral-Emotional_zscore" feature improves the model fit significantly compared to using only the 13 language features. The p-value is relatively low, indicating strong evidence against the null hypothesis.

Interpretation:

- A statistically significant p-value (typically < 0.05) suggests that adding the additional category significantly improves the model fit compared to using only the 13 language features.
- The LR statistic indicates the improvement in model fit when adding the additional category. Positive values suggest an improvement, while negative values suggest a deterioration in model fit.

**Coefficient**: Each coefficient represents the change in the log odds of the target variable for a one-unit change in the predictor variable, holding all other variables constant. A positive coefficient indicates that an increase in the predictor variable is associated with an increase in the log odds of the target variable, while a negative coefficient indicates the opposite.

**Odds Ratio**: The odds ratio represents the multiplicative change in the odds of the target variable for a one-unit change in the predictor variable. It is calculated by exponentiating the coefficient. For example, an odds ratio of 1.5 means that for every one-unit increase in the predictor variable, the odds of the target variable occurring are 1.5 times higher, holding all other variables constant.

**P-value**: The p-value associated with each coefficient indicates the statistical significance of the relationship between the predictor variable and the target variable. A small p-value (typically less than 0.05) suggests that the relationship is statistically significant, meaning that it is unlikely to have occurred by chance.

**Confidence Intervals**: Confidence intervals for the coefficients provide a range of plausible values for the true population parameter. If the confidence interval does not include zero, it suggests that the coefficient is statistically significant.

# Comparing the Con-Lib

To conduct a two-sample test on the coefficient of an additional feature while considering the interaction with the label (political affiliation), I include an interaction term in your logistic regression model. This interaction term captures the effect of the feature on the outcome (probability of being conservative) while also considering the political affiliation of the article.

The function logistic_regression_interaction_likelihood_ratio_test is designed to fit two logistic regression models: one with an interaction term and one without. Here's the breakdown of how the models are fitted within the function:

With Interaction Term:

First, a constant term is added to the features.
Then, an interaction term is created by multiplying the specified feature (interaction_var) by the 'label' column (political affiliation).
The logistic regression model is then fitted using the features including the interaction term.

Without Interaction Term:

A constant term is added to the original features.
The logistic regression model is then fitted using the original features without the interaction term.
After fitting both models, the function computes the likelihood ratio test statistics to compare the two models. The likelihood ratio test assesses whether adding the interaction term significantly improves the fit of the model compared to the model without the interaction term.

In summary, the model fitting occurs within the function for both the model with the interaction term and the model without the interaction term. The function then returns the results of both models along with the likelihood ratio test statistics.

Interpreting the printed results:

1. **Model with Interaction Term**:
   - This section provides the summary output of the logistic regression model with the interaction term included.
   - The summary includes information such as coefficients, standard errors, z-values, p-values, and odds ratios for each predictor variable, including the interaction term.
   - You can interpret the significance and magnitude of each coefficient, as well as assess the overall fit of the model using statistics like the pseudo-R-squared value.

2. **Model without Interaction Term**:
   - This section provides the summary output of the logistic regression model without the interaction term.
   - Similar to the model with the interaction term, it includes information about coefficients, standard errors, z-values, p-values, and odds ratios for each predictor variable.
   - You can compare the coefficients and other statistics between this model and the model with the interaction term to assess the impact of the interaction.

3. **Likelihood Ratio Test**:
   - This section provides the results of the likelihood ratio test conducted to compare the model with the interaction term to the model without the interaction term.
   - The LR statistic measures the improvement in model fit when including the interaction term, while the degrees of freedom represent the difference in the number of parameters between the two models.
   - The p-value associated with the likelihood ratio test indicates the significance of the improvement in model fit. A small p-value suggests that including the interaction term significantly improves the model fit compared to the model without the interaction term.

## Moral Predictor

In [7]:
features_additional = df[['Moral_zscore', 'label', 'Drives_zscore','Cognition_zscore',
                          'emo_pos_zscore', 'emo_neg_zscore', 'emo_anx_zscore', 
                          'emo_anger_zscore', 'emo_sad_zscore', 'Social_zscore',
                          'Lifestyle_zscore', 'Physical_zscore', 'focuspast_zscore',
                          'focuspresent_zscore', 'focusfuture_zscore']]

In [8]:
def logistic_regression_interaction_likelihood_ratio_test(features, interaction_var, target):
    # Add a constant term to the features
    features = sm.add_constant(features)
    
    # Add interaction term
    features_interaction = features.copy()
    features_interaction['interaction'] = features[interaction_var] * features['label']
    
    # Fit logistic regression models with and without interaction term
    model_interaction = sm.Logit(target, features_interaction)
    result_interaction = model_interaction.fit()
    
    model_no_interaction = sm.Logit(target, features)
    result_no_interaction = model_no_interaction.fit()
    
    # Likelihood ratio test
    lr = 2 * (result_interaction.llf - result_no_interaction.llf)
    df = result_interaction.df_model - result_no_interaction.df_model
    p_value = stats.chi2.sf(lr, df)
    
    return result_interaction, result_no_interaction, lr, df, p_value


In [9]:
# Run logistic regression with interaction term
result_interaction, result_no_interaction, lr, df, p_value = logistic_regression_interaction_likelihood_ratio_test(features_additional, 'Moral_zscore', target)

# Print the results
print("Model with Interaction Term:")
print(result_interaction.summary())
print("\n")

print("Model without Interaction Term:")
print(result_no_interaction.summary())
print("\n")

print("Likelihood Ratio Test:")
print("LR Statistic:", lr)
print("Degrees of Freedom:", df)
print("p-value:", p_value)

Optimization terminated successfully.
         Current function value: 0.254406
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.261352
         Iterations 8
Model with Interaction Term:
                           Logit Regression Results                           
Dep. Variable:        Probability_con   No. Observations:                  180
Model:                          Logit   Df Residuals:                      163
Method:                           MLE   Df Model:                           16
Date:                Mon, 18 Mar 2024   Pseudo R-squ.:                  0.6201
Time:                        13:54:41   Log-Likelihood:                -45.793
converged:                       True   LL-Null:                       -120.54
Covariance Type:            nonrobust   LLR p-value:                 9.799e-24
                          coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------

### Model with Interaction Term:
- **Moral_zscore (Emotional Predictor)**:
  - **Coefficient**: The coefficient for "Moral_zscore" is -0.5331. This indicates that for a one-unit increase in the Moral_zscore, the log-odds of the target variable (probability of being conservative) decrease by 0.5331, holding all other variables constant.
  - **P-value**: The p-value associated with "Moral_zscore" is 0.176, which is greater than the typical significance level of 0.05. This suggests that the coefficient for "Moral_zscore" is not statistically significant at the 5% level, meaning we fail to reject the null hypothesis that the true coefficient is zero.
  - **Odds Ratio**: The odds ratio for "Moral_zscore" is exp(-0.5331) = 0.587. This means that for every one-unit increase in the Moral_zscore, the odds of an article being conservative decrease by approximately 41.3%, holding all other variables constant.

- **Interaction Term**:
  - **Coefficient**: The coefficient for the interaction term is 1.1997. This indicates the change in the effect of "Moral_zscore" on the probability of being conservative for a one-unit change in the label variable (political affiliation).
  - **P-value**: The p-value associated with the interaction term is 0.104, which is greater than 0.05. This suggests that the interaction term is not statistically significant at the 5% level, indicating that the effect of Moral_zscore does not significantly differ based on political affiliation.

### Model without Interaction Term:
- The interpretation of "Moral_zscore" in the model without the interaction term follows the same logic as above.

### Likelihood Ratio Test:
- **LR Statistic**: The likelihood ratio (LR) statistic is 2.5007.
- **Degrees of Freedom**: There is 1 degree of freedom in the likelihood ratio test.
- **P-value**: The p-value associated with the likelihood ratio test is 0.1138, which is greater than 0.05. This suggests that we fail to reject the null hypothesis that the model with the interaction term does not provide a better fit than the model without the interaction term.

In summary, while the coefficient for "Moral_zscore" is not statistically significant in both models, the interaction term is also not statistically significant. This suggests that the effect of Moral_zscore on the probability of being conservative does not significantly differ based on political affiliation. Additionally, the likelihood ratio test indicates that the model with the interaction term does not provide a significantly better fit than the model without the interaction term.


## Emotional Predictor

In [13]:
features_additional = df[['Affect_zscore', 'label', 'Drives_zscore','Cognition_zscore',
                          'emo_pos_zscore', 'emo_neg_zscore', 'emo_anx_zscore', 
                          'emo_anger_zscore', 'emo_sad_zscore', 'Social_zscore',
                          'Lifestyle_zscore', 'Physical_zscore', 'focuspast_zscore',
                          'focuspresent_zscore', 'focusfuture_zscore']]

In [15]:
# Run logistic regression with interaction term
result_interaction, result_no_interaction, lr, df, p_value = logistic_regression_interaction_likelihood_ratio_test(features_additional, 'Affect_zscore', target)

# Print the results
print("Model with Interaction Term:")
print(result_interaction.summary())
print("\n")

print("Model without Interaction Term:")
print(result_no_interaction.summary())
print("\n")

print("Likelihood Ratio Test:")
print("LR Statistic:", lr)
print("Degrees of Freedom:", df)
print("p-value:", p_value)

Optimization terminated successfully.
         Current function value: 0.253318
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.253370
         Iterations 8
Model with Interaction Term:
                           Logit Regression Results                           
Dep. Variable:        Probability_con   No. Observations:                  180
Model:                          Logit   Df Residuals:                      163
Method:                           MLE   Df Model:                           16
Date:                Mon, 18 Mar 2024   Pseudo R-squ.:                  0.6217
Time:                        14:47:16   Log-Likelihood:                -45.597
converged:                       True   LL-Null:                       -120.54
Covariance Type:            nonrobust   LLR p-value:                 8.203e-24
                          coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------

## Moral - Emotional Predictor

In [17]:
features_additional = df[['Moral-Emotional_zscore', 'label', 'Drives_zscore','Cognition_zscore',
                          'emo_pos_zscore', 'emo_neg_zscore', 'emo_anx_zscore', 
                          'emo_anger_zscore', 'emo_sad_zscore', 'Social_zscore',
                          'Lifestyle_zscore', 'Physical_zscore', 'focuspast_zscore',
                          'focuspresent_zscore', 'focusfuture_zscore']]

In [18]:
# Run logistic regression with interaction term
result_interaction, result_no_interaction, lr, df, p_value = logistic_regression_interaction_likelihood_ratio_test(features_additional, 'Moral-Emotional_zscore', target)

# Print the results
print("Model with Interaction Term:")
print(result_interaction.summary())
print("\n")

print("Model without Interaction Term:")
print(result_no_interaction.summary())
print("\n")

print("Likelihood Ratio Test:")
print("LR Statistic:", lr)
print("Degrees of Freedom:", df)
print("p-value:", p_value)

Optimization terminated successfully.
         Current function value: 0.255058
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.257393
         Iterations 8
Model with Interaction Term:
                           Logit Regression Results                           
Dep. Variable:        Probability_con   No. Observations:                  180
Model:                          Logit   Df Residuals:                      163
Method:                           MLE   Df Model:                           16
Date:                Mon, 18 Mar 2024   Pseudo R-squ.:                  0.6191
Time:                        14:48:44   Log-Likelihood:                -45.910
converged:                       True   LL-Null:                       -120.54
Covariance Type:            nonrobust   LLR p-value:                 1.090e-23
                             coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------

  x = pd.concat(x[::order], 1)
