# Finding Feature Interactions

In [1]:
import statsmodels.api as sm
import itertools
import pandas as pd
import numpy as np
import os
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import sys

sys.path.append("../")
import utils.features

In [2]:
PROJECT_DIR = os.path.dirname(os.getcwd())
STATS_DIR = os.path.join(PROJECT_DIR, 'classification/stats')
FEATURES_DIR = os.path.join(PROJECT_DIR, 'feature_extraction/featureExtraction/output/')
RESPONSES_DIR = os.path.join(PROJECT_DIR, 'responses/')

def load_x_y(features_filename, responses_filename):
    # load features (x)
    features_df = utils.features.load_features(features_filename)
    # load labels (y)
    labels_df = utils.features.load_labels(responses_filename)
    # merge
    data_df = utils.features.merge_and_filter(features_df, labels_df)
    # scale
    data_df = utils.features.scale_features(
        data_df, 
        scaler=MinMaxScaler(
            feature_range=(0, 1)
        )
    )
    # split
    X = data_df.drop(columns=['outcome'], axis=1)
    y = data_df['outcome']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('X_train shape:', X_train.shape)
    print('y_train shape:', y_train.shape)
    print('X_test shape:', X_test.shape)
    print('y_test shape:', y_test.shape)

    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = load_x_y(
    features_filename=os.path.join(FEATURES_DIR, '12091031_features.csv.gz'),
    responses_filename=os.path.join(RESPONSES_DIR, '12091031_parsed_turbo_10000_eval.jsonl')
)

# Load feature set
with open(os.path.join(STATS_DIR, '12091031_col-rfe_10000.json'), 'r') as f:
    stats = json.load(f)
    feature_set = [f for f in stats['coefficients'].keys()]

X_train shape: (8000, 2339)
y_train shape: (8000,)
X_test shape: (2000, 2339)
y_test shape: (2000,)


In [3]:
# Only use features in feature set
X_train = X_train[feature_set]
X_test = X_test[feature_set]

print('X_train shape:', X_train.shape, 'X_test shape:', X_test.shape)

X_train shape: (8000, 300) X_test shape: (2000, 300)


In [5]:
# all possible pairs of features
feature_pairs = list(itertools.combinations(X_train.columns, 2)) # list of tuples
len(feature_pairs)

44850

### VIF
- 1 = not correlated.
- Between 1 and 5 = moderately correlated.
- Greater than 5 = highly correlated.

### P-value
- Small is better as it indicates that a result did not take place by chance
- Smaller means the null hypothesis (that there is no relationship exists between two variables)
- smalle than 0.05 indicates significance

In [6]:
tmp_data = False

if tmp_data:
    small_feature_set = feature_set[:100]
    X_train = X_train[small_feature_set][:2000]
    X_test = X_test[small_feature_set][:1000]
    y_train = y_train[:2000]
    y_test = y_test[:1000]

    print('X_train tmp shape:', X_train.shape, 'X_test tmp shape:', X_test.shape)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from statsmodels.stats.outliers_influence import variance_inflation_factor


# get interactions
poly = PolynomialFeatures(interaction_only=True, include_bias=False)
X_train_interactions = poly.fit_transform(X_train)

interaction_features = poly.get_feature_names_out(input_features=X_train.columns) # individual features + interactions
X_train_interactions = pd.DataFrame(X_train_interactions, columns=interaction_features) # avoid index error

model = LogisticRegression(
    penalty='elasticnet',  # l1 and l2 regularisation mix
    solver='saga',
    l1_ratio=0.5, 
    C=0.1,
    random_state=42,
    max_iter=1000)
model.fit(X_train_interactions, y_train)

# for full data took around 8 mins

In [9]:
# coefficients: all, not just for interactions (individual features and interactions)
coefficients = model.coef_[0]

# create dict with feature names and coefficients
# sorted by absolute value (magnitude of coefficient)
coefficients_dict = dict(zip(interaction_features, coefficients))
coefficients_dict = sorted(coefficients_dict.items(), key=lambda item: abs(item[1]), reverse=True)

In [10]:
threshold = 0.1 # absolute value of coefficient - if higher, we consider it a large coefficient (i.e., important feature)
large_coefficients = {feature: coef for feature, coef in coefficients_dict if abs(coef) > threshold}
large_coefficients

{'If_Lasswell': -0.6415153542091685,
 'Know_GI_verbs': 0.6378020040172478,
 'basic_nfunction_types': -0.6350752197126596,
 'Quan_GI_neg_3': -0.5875039726220488,
 'Virtue_GI_adverbs_neg_3': 0.5820481102150072,
 'Eval_GI_adverbs_neg_3': 0.5715620256480863,
 'Know_GI': -0.5675698535449349,
 'vader_neutral If_Lasswell': -0.5173566602027507,
 'Wlttot_Lasswell_neg_3': 0.5112627346156715,
 'Need_GI_nouns': -0.4920857720562387,
 'Causal_GI_adverbs ttr': 0.4827778036378788,
 'Powoth_Lasswell': -0.48009862342431914,
 'Compare_GI_neg_3': 0.47915026626152407,
 'Enlgain_Lasswell_nouns': 0.4701801611177458,
 'vader_neutral VERB': 0.45767877306809035,
 'Know_GI_nouns': 0.43415179675120985,
 'Sv_GI_verbs_neg_3': 0.4245213747709195,
 'Ngtv_GI': 0.415445902111641,
 'Submit_GI': -0.40890926814062534,
 'vader_neutral Hu_GI': 0.4042140404856982,
 'ttr': 0.3962646769784597,
 'Undrst_GI_verbs': 0.39598272015905994,
 'Tool_GI': 0.3944635722258295,
 'Undrst_GI_neg_3': 0.39322335845158546,
 'Sv_GI_verbs_neg_3 V

In [11]:
# calculate VIF and p-value only for large coefficients
# instead of for every feature (takes too long)

# a feature is robust if it has a low p-value and low VIF (and a large coefficient)
robust_features = {} # individual features and interactions

for feature, coef in large_coefficients.items():
    X = sm.add_constant(X_train_interactions[[feature]]) # stats models allows us to get the p-values
    X.reset_index(drop=True, inplace=True) # avoid index mismatch error
    y = y_train.reset_index(drop=True) # avoid index mismatch error
    
    sm_model = sm.Logit(y, X)
    result = sm_model.fit()

    p_value = result.pvalues[1]
    vif = variance_inflation_factor(X.values, 1)

    if p_value < 0.05 and vif < 5: # robust and significant
        robust_features[feature] = {'coefficient': coef, 'p_value': p_value, 'vif': vif}

Optimization terminated successfully.
         Current function value: 0.653968
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.661184
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.647149
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.664321
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.665142
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.665025
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.664075
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.656204
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.664484
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.663904
  

In [14]:
# isolate interactions
robust_interactions = {} # interactions only

for feature in robust_features.keys():
    if ' ' in feature: # interaction
        robust_interactions[feature] = robust_features[feature]

robust_interactions

{'vader_neutral If_Lasswell': {'coefficient': -0.5173566602027507,
  'p_value': 5.861063380037753e-34,
  'vif': 0.9999999999999984},
 'Causal_GI_adverbs ttr': {'coefficient': 0.4827778036378788,
  'p_value': 4.2600369603705036e-38,
  'vif': 1.0000000000000007},
 'vader_neutral VERB': {'coefficient': 0.45767877306809035,
  'p_value': 2.2652984873324555e-12,
  'vif': 1.0000000000000002},
 'vader_neutral Hu_GI': {'coefficient': 0.4042140404856982,
  'p_value': 1.848354224529018e-07,
  'vif': 1.0000000000000018},
 'Sv_GI_verbs_neg_3 VERB': {'coefficient': 0.38545062795855267,
  'p_value': 1.0263268678024725e-19,
  'vif': 1.0},
 'vader_neutral Undrst_GI_neg_3': {'coefficient': 0.3336909997740311,
  'p_value': 0.0005361143500052381,
  'vif': 1.0000000000000018},
 'attention_neg_3 basic_nfunction_types': {'coefficient': -0.32189707685104296,
  'p_value': 6.923381174553515e-64,
  'vif': 0.9999999999999996},
 'vader_neutral ttr': {'coefficient': 0.31281945206323597,
  'p_value': 6.9356358696794

In [17]:
len(robust_features.keys()), len(robust_interactions.keys())
# robust_features includes robust_interactions (but also has individual features)

(89, 46)

In [15]:
write_findings = False

if write_findings == True:
    with open(os.path.join(STATS_DIR, '12091031_robust_interactions.json'), 'w') as f:
        json.dump(robust_interactions, f, indent=4)

    with open(os.path.join(STATS_DIR, '12091031_robust_features.json'), 'w') as f:
        json.dump(robust_features, f, indent=4)

In [19]:
# average VIF for robust features
# roughly can be interpreted as the VIF of a model that uses robust features as predictors
vifs = [robust_features[feature]['vif'] for feature in robust_features.keys()]
print('Avg VIF after selection:', np.mean(vifs))

Avg VIF after selection:  1.0000000000000002


In [22]:
# average VIF for model that uses all the features resulting from col-rfe (before adding interactions) - X_train

from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_df = pd.DataFrame()
vif_df["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
vif_df["features"] = X_train.columns

# if NaN: constant or perfectly correlated with another feature
print("Number of NaN values in df:", vif_df.isna().sum()) # should be just the constant (added by statsmodels)
vif_df = vif_df.dropna()

avg_vif = vif_df["VIF"].mean()

print(f"Average VIF: {avg_vif}")


  return 1 - self.ssr/self.uncentered_tss


Number of NaN values in df: VIF         1
features    0
dtype: int64
Average VIF: 9.649755581236533


### VIF comparison before / after

If the average Variance Inflation Factor (VIF) is higher before feature selection and lower after feature selection, it suggests that the feature selection process was successful in removing features that were causing multicollinearity.

Multicollinearity refers to a situation in which two or more explanatory variables in a multiple regression model are highly linearly related. Multicollinearity can inflate the variance of the regression coefficients, making them unstable and difficult to interpret. VIF provides a measure of multicollinearity among the features, and a high VIF value for a feature suggests that it is highly correlated with the other features.

A reduction in the average VIF after feature selection indicates that the model is likely to be more stable and reliable as a result of reducing multicollinearity.