In [79]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

import warnings
warnings.filterwarnings('ignore')
# Applying a more readable float format
pd.options.display.float_format = '{:.8f}'.format

In [80]:
data = pd.read_csv("C:/Users/hongz/Downloads/choices13k-main/reg_data_main2.csv")
# Modify graph_id as specified
data['graph_id'] = data['graph_id'] + 100 * data['reshuffle_ind']
data.head()

Unnamed: 0,subject_id,graph_id,time,click,risk,forecast,confidence,investment,reason_risky,reason_confidence,...,gender_ind,employment_ind,education_ind,income_ind,statistics_ind,risk-taking_ind,stock knowledge_ind,frequency _ind,history_ind,technical_ind
0,5fbb4426e47b46c3e2eeb544,156,11.55,6,6,112,62,59,,,...,0,0,1,1,0,0,0,0,0,0
1,65981b2c1df3be0020afa351,154,57.601,10,8,95,14,9,The ones that had big dips in them.,A lot of them. Only because I'm slowly trying ...,...,0,1,0,0,1,1,0,1,0,0
2,655791684bb1c5db02826d17,192,48.958,16,7,91,100,51,If it fluctuates more than 20% within 12 months.,With a $3 investment I do not feel unconfident...,...,0,1,1,1,0,1,0,1,1,0
3,62ddbd7eb3e9431e49b46ec1,182,49.95,11,8,115,79,60,,,...,1,1,1,0,1,1,0,0,0,0
4,5fb13091b87dfd5888f73e05,180,45.07,8,5,93,20,0,Those that have big drops.,When they are inconsistent and up and down on ...,...,0,0,0,0,0,1,0,1,1,0


In [81]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
le = LabelEncoder()

# Fit and transform 'subject_id' to numeric
data['subject_id_encoded'] = le.fit_transform(data['subject_id'])

data.head()

Unnamed: 0,subject_id,graph_id,time,click,risk,forecast,confidence,investment,reason_risky,reason_confidence,...,employment_ind,education_ind,income_ind,statistics_ind,risk-taking_ind,stock knowledge_ind,frequency _ind,history_ind,technical_ind,subject_id_encoded
0,5fbb4426e47b46c3e2eeb544,156,11.55,6,6,112,62,59,,,...,0,1,1,0,0,0,0,0,0,185
1,65981b2c1df3be0020afa351,154,57.601,10,8,95,14,9,The ones that had big dips in them.,A lot of them. Only because I'm slowly trying ...,...,1,0,0,1,1,0,1,0,0,461
2,655791684bb1c5db02826d17,192,48.958,16,7,91,100,51,If it fluctuates more than 20% within 12 months.,With a $3 investment I do not feel unconfident...,...,1,1,1,0,1,0,1,1,0,438
3,62ddbd7eb3e9431e49b46ec1,182,49.95,11,8,115,79,60,,,...,1,1,0,1,1,0,0,0,0,307
4,5fb13091b87dfd5888f73e05,180,45.07,8,5,93,20,0,Those that have big drops.,When they are inconsistent and up and down on ...,...,0,0,0,0,1,0,1,1,0,183


Baseline Regression

In [82]:
# Define independent variables for the baseline regression
X_base = data[['Obj_Std', 'Skewness', 'Kurtosis', 'time', 'click', 'order', 'subject_id_encoded']]

# Split the data
X_train_base, X_test_base, y_train, y_test = train_test_split(X_base, data['confidence'], test_size=0.2, random_state=42)

# Baseline regression
model_base = LinearRegression()
model_base.fit(X_train_base, y_train)
y_pred_base = model_base.predict(X_test_base)
r2_base = r2_score(y_test, y_pred_base)
r2_base

0.006575680627034064

3 Feature Regression

In [83]:
X_3_feature = data[['Obj_Std', 'Skewness', 'Kurtosis', 'time', 'click', 'order', 'subject_id_encoded', 'Recency_Factor', 'Rep_Factor', 'Sign_Factor']]
X_train_3, X_test_3 = train_test_split(X_3_feature, test_size=0.2, random_state=42)

model_3_feature = LinearRegression()
model_3_feature.fit(X_train_3, y_train)
y_pred_3 = model_3_feature.predict(X_test_3)
r2_3_feature = r2_score(y_test, y_pred_3)
r2_3_feature

0.006906556308739598

8 Feature Regression

In [84]:
X_8_feature = data[['Obj_Std', 'Skewness', 'Kurtosis', 'time', 'click', 'order', 'subject_id_encoded', 'Recency_Factor', 'Rep_Factor', 'Sign_Factor', 'SH_Rep_Factor', 'SH_Sign_Factor', 'SH_Obj_Std', 'SH_Skewness', 'SH_Kurtosis']]
X_train_8, X_test_8 = train_test_split(X_8_feature, test_size=0.2, random_state=42)

model_8_feature = LinearRegression()
model_8_feature.fit(X_train_8, y_train)
y_pred_8 = model_8_feature.predict(X_test_8)
r2_8_feature = r2_score(y_test, y_pred_8)
r2_8_feature

0.005858440493053729

Graph FE Regression

In [85]:
# Assuming 'data' is your DataFrame and 'graph_id' is the categorical variable
graph_id_dummies = pd.get_dummies(data['graph_id'], prefix='graph_id')
data_with_fe = pd.concat([data, graph_id_dummies], axis=1)

# Create a list of features including the dummy variables
features = ['time', 'click', 'order', 'subject_id_encoded'] + list(graph_id_dummies.columns)

# Select these features for X
X_graph_id_fe = data_with_fe[features]

# Assuming y_train and y_test are defined or create them if not mentioned
X_train_graph_id_fe, X_test_graph_id_fe = train_test_split(X_graph_id_fe, test_size=0.2, random_state=42)
#X_train_graph_id_fe, X_test_graph_id_fe, y_train, y_test = train_test_split(X_graph_id_fe, data['confidence'], test_size=0.2, random_state=42)

# Fit the baseline regression model
model_graph_id_fe = LinearRegression()
model_graph_id_fe.fit(X_train_graph_id_fe, y_train)

# Predict and calculate the R-squared value
y_pred_graph_id_fe = model_graph_id_fe.predict(X_test_graph_id_fe)
r2_graph_id_fe = r2_score(y_test, y_pred_graph_id_fe)

r2_graph_id_fe

-0.005168410370678611

## With Fixed Effects

In [86]:
# Create dummy variables for 'subject_id_encoded' and 'graph_id'
subject_id_dummies = pd.get_dummies(data['subject_id_encoded'], prefix='subject_id')
#graph_id_dummies = pd.get_dummies(data['graph_id'], prefix='graph_id')

# Add these dummy variables to the data
data_with_dummies = pd.concat([data, subject_id_dummies], axis=1)
data_with_dummies = pd.concat([data_with_dummies, graph_id_dummies], axis=1)
# Define the baseline features including the subject_id dummies
baseline_features_with_dummies = ['Obj_Std', 'Skewness', 'Kurtosis', 'time', 'click', 'order'] + list(subject_id_dummies.columns)
#baseline_features_with_dummies

Baseline Regression

In [87]:
# Define the independent and dependent variables
X_baseline_with_dummies = data_with_dummies[baseline_features_with_dummies]

# Split the data into training and testing sets
X_train_baseline, X_test_baseline = train_test_split(X_baseline_with_dummies, test_size=0.2, random_state=42)

# Fit the baseline regression model
model_baseline_with_dummies = LinearRegression()
model_baseline_with_dummies.fit(X_train_baseline, y_train)

# Predict and calculate the R-squared value
y_pred_baseline = model_baseline_with_dummies.predict(X_test_baseline)
r2_baseline_fe = r2_score(y_test, y_pred_baseline)

r2_baseline_fe

0.6457058745388641

3 Feature Regression

In [88]:
# Define 3-feature model variables
features_3_feature = baseline_features_with_dummies + ['Recency_Factor', 'Rep_Factor', 'Sign_Factor']

# Split the data for 3-feature model
X_3_feature = data_with_dummies[features_3_feature]
X_train_3_feature, X_test_3_feature = train_test_split(X_3_feature, test_size=0.2, random_state=42)

# Fit the 3-feature model
model_3_feature = LinearRegression()
model_3_feature.fit(X_train_3_feature, y_train)

# Predict and calculate R-squared
y_pred_3_feature = model_3_feature.predict(X_test_3_feature)
r2_3_feature_fe = r2_score(y_test, y_pred_3_feature)
r2_3_feature_fe

0.6477475104690575

8 Feature Regressions

In [89]:
# Define 8-feature model variables including 'subject_id_encoded' dummies
features_8_feature = baseline_features_with_dummies + ['Recency_Factor', 'Rep_Factor', 'Sign_Factor', 'SH_Rep_Factor', 'SH_Sign_Factor', 'SH_Obj_Std', 'SH_Skewness', 'SH_Kurtosis']

# Split the data for 8-feature model
X_8_feature = data_with_dummies[features_8_feature]
X_train_8_feature, X_test_8_feature = train_test_split(X_8_feature, test_size=0.2, random_state=42)

# Fit the 8-feature model
model_8_feature = LinearRegression()
model_8_feature.fit(X_train_8_feature, y_train)

# Predict and calculate R-squared
y_pred_8_feature = model_8_feature.predict(X_test_8_feature)
r2_8_feature_fe = r2_score(y_test, y_pred_8_feature)
r2_8_feature_fe

0.6466063488226277

Graph FE Regression

In [90]:
# Assuming data_with_dummies already includes the necessary dummy variables
graph_fe_features = ['Obj_Std', 'Skewness', 'Kurtosis', 'time', 'click', 'order'] + list(subject_id_dummies.columns) + list(graph_id_dummies.columns)
X_graph_with_dummies = data_with_dummies[graph_fe_features]

# Ensure that the order and columns used in the model are consistent
X_train_graph_id, X_test_graph_id = train_test_split(X_graph_with_dummies,test_size=0.2, random_state=42)

# Fit the baseline regression model with dummies
model_graph_id_with_dummies = LinearRegression()
model_graph_id_with_dummies.fit(X_train_graph_id, y_train)

# Predict and calculate the R-squared value
y_pred_graph_id = model_graph_id_with_dummies.predict(X_test_graph_id)
r2_graph_fe = r2_score(y_test, y_pred_graph_id)
r2_graph_fe

0.6412374352475405

# Lasso

In [91]:
# Identify all 'ret_' and 'price_' columns
ret_columns = [col for col in data.columns if col.startswith('ret_')]
price_columns = [col for col in data.columns if col.startswith('price_')]

# Identify all 'ret_', 'price_', and their differences columns
ret_1st_diff_columns = [col for col in data.columns if '1st_diff' in col and col.startswith('ret_')]
ret_2nd_diff_columns = [col for col in data.columns if '2nd_diff' in col and col.startswith('ret_')]
price_1st_diff_columns = [col for col in data.columns if '1st_diff' in col and col.startswith('price_')]
price_2nd_diff_columns = [col for col in data.columns if '2nd_diff' in col and col.startswith('price_')]

# Combine these columns with your other features for the Huge Lasso model
features_huge_lasso = ['Obj_Std', 'Skewness', 'Kurtosis', 'time', 'click', 'order', 'subject_id_encoded', 'Recency_Factor', 'Rep_Factor', 'Sign_Factor', 'SH_Rep_Factor', 'SH_Sign_Factor', 'SH_Obj_Std', 'SH_Skewness', 'SH_Kurtosis'] + ret_columns + price_columns + ret_1st_diff_columns + ret_2nd_diff_columns + price_1st_diff_columns + price_2nd_diff_columns
X_huge_lasso = data[features_huge_lasso]
X_train_huge, X_test_huge = train_test_split(X_huge_lasso, test_size=0.2, random_state=42)

In [92]:
def lasso_kfold_cv(data, features, target):
    X = data[features]
    y = data[target]
    
    # Standardizing the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Setting up 10-fold cross-validation
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    
    # Using LassoCV to find the best alpha during cross-validation
    lasso_cv = LassoCV(alphas=np.logspace(-6, 0, 100), cv=kf, random_state=42)
    lasso_cv.fit(X_scaled, y)

    # Evaluating the model using cross-validation
    cv_scores = cross_val_score(lasso_cv, X_scaled, y, cv=kf, scoring='r2')
    mean_r2 = np.mean(cv_scores)
    std_r2 = np.std(cv_scores)

    return mean_r2, std_r2

# Using the model
features_base = ['Obj_Std', 'Skewness', 'Kurtosis', 'time', 'click', 'order', 'subject_id_encoded', 'Recency_Factor', 'Rep_Factor', 'Sign_Factor', 'SH_Rep_Factor', 'SH_Sign_Factor', 'SH_Obj_Std', 'SH_Skewness', 'SH_Kurtosis']
r2_8_feature_lasso, std_8_feature_lasso = lasso_kfold_cv(data, features_base, 'confidence')
print(f"Mean R-squared for Lasso with 10-Fold CV: {r2_8_feature_lasso:.4f} ± {std_8_feature_lasso:.4f}")

Mean R-squared for Lasso with 10-Fold CV: 0.0063 ± 0.0055


In [93]:
# Extracting ret and price features
ret_features = [col for col in data.columns if col.startswith('ret_') and 'diff' not in col]
price_features = [col for col in data.columns if col.startswith('price_') and 'diff' not in col]

# Large Lasso features include base features plus all ret_ and price_ values
features_large = features_base + ret_features + price_features

# Adding first and second differences for Huge Lasso features
ret_diff_features = [col for col in data.columns if 'ret_' in col and 'diff' in col]
price_diff_features = [col for col in data.columns if 'price_' in col and 'diff' in col]

features_huge = features_large + ret_diff_features + price_diff_features

# Calculate R-squared for Large Lasso
r2_large_lasso, std_r2_large = lasso_kfold_cv(data, features_large, 'confidence')
print(f"Mean R-squared for Large Lasso with 10-Fold CV: {r2_large_lasso:.4f} ± {std_r2_large:.4f}")

# Calculate R-squared for Huge Lasso
r2_huge_lasso, std_r2_huge = lasso_kfold_cv(data, features_huge, 'confidence')
print(f"Mean R-squared for Huge Lasso with 10-Fold CV: {r2_huge_lasso:.4f} ± {std_r2_huge:.4f}")

Mean R-squared for Large Lasso with 10-Fold CV: 0.0058 ± 0.0055
Mean R-squared for Huge Lasso with 10-Fold CV: 0.0056 ± 0.0053


lasso within subjected_id

In [94]:
from sklearn import set_config
set_config(enable_metadata_routing=False)  # Explicitly disable metadata routing

def lasso_group_kfold_cv(data, features, target):
    X = data[features]
    y = data[target]
    groups = data['subject_id_encoded']  # This assumes subject_id_encoded is a column in the DataFrame

    # Setting up 10-fold cross-validation that respects groups
    gkf = GroupKFold(n_splits=10)

    # Create a pipeline to scale data and apply Lasso
    lasso_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('lasso_cv', LassoCV(alphas=np.logspace(-6, 0, 100), max_iter=10000, random_state=42))
    ])

    # Compute cross-validated R^2 scores
    cv_scores = cross_val_score(lasso_pipeline, X, y, groups=groups, cv=gkf, scoring='r2')
    mean_r2 = np.mean(cv_scores)
    std_r2 = np.std(cv_scores)

    return mean_r2, std_r2

# Usage example
features_base = ['Obj_Std', 'Skewness', 'Kurtosis', 'time', 'click', 'order', 'subject_id_encoded', 'Recency_Factor', 'Rep_Factor', 'Sign_Factor', 'SH_Rep_Factor', 'SH_Sign_Factor', 'SH_Obj_Std', 'SH_Skewness', 'SH_Kurtosis']
r2_8_feature_lasso, std_8_feature_lasso = lasso_group_kfold_cv(data, features_base, 'confidence')
print(f"Mean R-squared for Lasso with 10-Fold CV: {r2_8_feature_lasso:.4f} ± {std_8_feature_lasso:.4f}")

Mean R-squared for Lasso with 10-Fold CV: -0.0173 ± 0.0165


In [95]:
# Large Lasso
r2_large_lasso, std_r2_large_lasso = lasso_group_kfold_cv(data, features_large, 'confidence')
print(f"Mean R-squared for Large Lasso with 10-Fold CV: {r2_large_lasso:.4f} ± {std_r2_large_lasso:.4f}")

# Huge Lasso
r2_huge_lasso, std_r2_huge_lasso = lasso_group_kfold_cv(data, features_huge, 'confidence')
print(f"Mean R-squared for Huge Lasso with 10-Fold CV: {r2_huge_lasso:.4f} ± {std_r2_huge_lasso:.4f}")

Mean R-squared for Large Lasso with 10-Fold CV: -0.0177 ± 0.0152
Mean R-squared for Huge Lasso with 10-Fold CV: -0.0177 ± 0.0151


With control of subject_id_encoded FE

In [96]:
r2_8_feature_lasso_fe, std_8_feature_lasso = lasso_kfold_cv(data_with_dummies, features_base, 'confidence')
print(f"Mean R-squared for Lasso with 10-Fold CV: {r2_8_feature_lasso_fe:.4f} ± {std_8_feature_lasso:.4f}")

Mean R-squared for Lasso with 10-Fold CV: 0.0063 ± 0.0055


In [97]:
r2_large_lasso_fe, std_r2_large_lasso = lasso_group_kfold_cv(data_with_dummies, features_large, 'confidence')
print(f"Mean R-squared for Large Lasso with 10-Fold CV: {r2_large_lasso_fe:.4f} ± {std_r2_large_lasso:.4f}")

# Huge Lasso
r2_huge_lasso_fe, std_r2_huge_lasso = lasso_group_kfold_cv(data_with_dummies, features_huge, 'confidence')
print(f"Mean R-squared for Huge Lasso with 10-Fold CV: {r2_huge_lasso_fe:.4f} ± {std_r2_huge_lasso:.4f}")

Mean R-squared for Large Lasso with 10-Fold CV: -0.0177 ± 0.0152
Mean R-squared for Huge Lasso with 10-Fold CV: -0.0177 ± 0.0151


In [100]:
# Creating the DataFrame
dak = {
    "Model(Confidence)": [
        "Baseline Regression", "3-feature Regression", "8-feature Regression",
        "8-feature Lasso", "Large Lasso", "Huge Lasso", "Graph FE Regression"
    ],
    "R-squared (Without Subject ID FE)": [
        r2_base, r2_3_feature, r2_8_feature,
        r2_8_feature_lasso, r2_large_lasso, r2_large_lasso, r2_graph_id_fe
    ],
    "R-squared (With Subject ID FE)": [
        r2_baseline_fe, r2_3_feature_fe, r2_8_feature_fe,
        r2_8_feature_lasso_fe, r2_large_lasso_fe, r2_large_lasso_fe, r2_graph_fe
    ],
    "R-squared (Demeaned)": [
        r2_base_demeaned, r2_3_feature_demeaned, r2_8_feature_demeaned,
        r2_8_feature_lasso_demeaned, r2_large_lasso_demeaned, r2_large_lasso_demeaned, r2_graph_id_fe_demeaned
    ]
}

df = pd.DataFrame(dak)

df

Unnamed: 0,Model(Confidence),R-squared (Without Subject ID FE),R-squared (With Subject ID FE),R-squared (Demeaned)
0,Baseline Regression,0.00657568,0.64570587,0.01064144
1,3-feature Regression,0.00690656,0.64774751,0.01762215
2,8-feature Regression,0.00585844,0.64660635,0.01477908
3,8-feature Lasso,-0.01728875,0.00630441,0.01290238
4,Large Lasso,-0.01771068,-0.01771068,0.01285531
5,Huge Lasso,-0.01771068,-0.01771068,0.01285531
6,Graph FE Regression,-0.00516841,0.64123744,-3.257371509422882e+20


In [70]:
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Demean all variables except subject_id
grouped = data.groupby('subject_id')
data_demeaned = grouped.transform(lambda x: x - x.mean())


In [72]:
# Define independent variables for the baseline regression
X_base = data_demeaned[['Obj_Std', 'Skewness', 'Kurtosis', 'time', 'click', 'order', 'subject_id_encoded']]
y = data_demeaned['confidence']
# Split the data
X_train_base, X_test_base, y_train, y_test = train_test_split(X_base, data_demeaned['confidence'], test_size=0.2, random_state=42)

# Baseline regression
model_base_demeaned = LinearRegression()
model_base_demeaned.fit(X_train_base, y_train)
y_pred_base_demeaned = model_base_demeaned.predict(X_test_base)
r2_base_demeaned = r2_score(y_test, y_pred_base_demeaned)
r2_base_demeaned

0.01064144361972097

In [73]:
X_3_feature = data_demeaned[['Obj_Std', 'Skewness', 'Kurtosis', 'time', 'click', 'order', 'subject_id_encoded', 'Recency_Factor', 'Rep_Factor', 'Sign_Factor']]
X_train_3, X_test_3 = train_test_split(X_3_feature, test_size=0.2, random_state=42)

model_3_feature_demeaned = LinearRegression()
model_3_feature_demeaned.fit(X_train_3, y_train)
y_pred_3_demeaned = model_3_feature_demeaned.predict(X_test_3)
r2_3_feature_demeaned = r2_score(y_test, y_pred_3_demeaned)
r2_3_feature_demeaned

0.017622149443554003

In [74]:
X_8_feature = data_demeaned[['Obj_Std', 'Skewness', 'Kurtosis', 'time', 'click', 'order', 'subject_id_encoded', 'Recency_Factor', 'Rep_Factor', 'Sign_Factor', 'SH_Rep_Factor', 'SH_Sign_Factor', 'SH_Obj_Std', 'SH_Skewness', 'SH_Kurtosis']]
X_train_8, X_test_8 = train_test_split(X_8_feature, test_size=0.2, random_state=42)

model_8_feature = LinearRegression()
model_8_feature.fit(X_train_8, y_train)
y_pred_8 = model_8_feature.predict(X_test_8)
r2_8_feature_demeaned = r2_score(y_test, y_pred_8)
r2_8_feature_demeaned

0.014779082636650509

In [75]:
r2_8_feature_lasso_demeaned, std_8_feature_lasso_demeaned = lasso_kfold_cv(data_demeaned, features_base, 'confidence')
print(f"Mean R-squared for Lasso with 10-Fold CV: {r2_8_feature_lasso_demeaned:.4f} ± {std_8_feature_lasso_demeaned:.4f}")

Mean R-squared for Lasso with 10-Fold CV: 0.0129 ± 0.0059


In [76]:
# Calculate R-squared for Large Lasso
r2_large_lasso_demeaned, std_r2_large_demeaned = lasso_kfold_cv(data_demeaned, features_large, 'confidence')
print(f"Mean R-squared for Large Lasso with 10-Fold CV: {r2_large_lasso_demeaned:.4f} ± {std_r2_large_demeaned:.4f}")

# Calculate R-squared for Huge Lasso
r2_huge_lasso_demeaned, std_r2_huge_demeaned = lasso_kfold_cv(data_demeaned, features_huge, 'confidence')
print(f"Mean R-squared for Huge Lasso with 10-Fold CV: {r2_huge_lasso_demeaned:.4f} ± {std_r2_huge_demeaned:.4f}")

Mean R-squared for Large Lasso with 10-Fold CV: 0.0129 ± 0.0081
Mean R-squared for Huge Lasso with 10-Fold CV: 0.0118 ± 0.0079


In [77]:
# Assuming 'data' is your DataFrame and 'graph_id' is the categorical variable
graph_id_dummies = pd.get_dummies(data_demeaned['graph_id'], prefix='graph_id')
data_with_fe = pd.concat([data_demeaned, graph_id_dummies], axis=1)

# Create a list of features including the dummy variables
features = ['time', 'click', 'order', 'subject_id_encoded'] + list(graph_id_dummies.columns)

# Select these features for X
X_graph_id_fe = data_with_fe[features]

# Assuming y_train and y_test are defined or create them if not mentioned
X_train_graph_id_fe, X_test_graph_id_fe = train_test_split(X_graph_id_fe, test_size=0.2, random_state=42)
#X_train_graph_id_fe, X_test_graph_id_fe, y_train, y_test = train_test_split(X_graph_id_fe, data['confidence'], test_size=0.2, random_state=42)

# Fit the baseline regression model
model_graph_id_fe = LinearRegression()
model_graph_id_fe.fit(X_train_graph_id_fe, y_train)

# Predict and calculate the R-squared value
y_pred_graph_id_fe = model_graph_id_fe.predict(X_test_graph_id_fe)
r2_graph_id_fe_demeaned = r2_score(y_test, y_pred_graph_id_fe)

r2_graph_id_fe_demeaned

-3.257371509422882e+20