In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

import warnings
warnings.filterwarnings('ignore')
# Applying a more readable float format
pd.options.display.float_format = '{:.8f}'.format

In [2]:
data = pd.read_csv("C:/Users/hongz/Downloads/choices13k-main/reg_data_main2.csv")
# Modify graph_id as specified
data['graph_id'] = data['graph_id'] + 100 * data['reshuffle_ind']
data

Unnamed: 0,subject_id,graph_id,time,click,risk,forecast,confidence,investment,reason_risky,reason_confidence,...,gender_ind,employment_ind,education_ind,income_ind,statistics_ind,risk-taking_ind,stock knowledge_ind,frequency _ind,history_ind,technical_ind
0,5fbb4426e47b46c3e2eeb544,156,11.55000000,6,6,112,62,59,,,...,0,0,1,1,0,0,0,0,0,0
1,65981b2c1df3be0020afa351,154,57.60100000,10,8,95,14,9,The ones that had big dips in them.,A lot of them. Only because I'm slowly trying ...,...,0,1,0,0,1,1,0,1,0,0
2,655791684bb1c5db02826d17,192,48.95800000,16,7,91,100,51,If it fluctuates more than 20% within 12 months.,With a $3 investment I do not feel unconfident...,...,0,1,1,1,0,1,0,1,1,0
3,62ddbd7eb3e9431e49b46ec1,182,49.95000000,11,8,115,79,60,,,...,1,1,1,0,1,1,0,0,0,0
4,5fb13091b87dfd5888f73e05,180,45.07000000,8,5,93,20,0,Those that have big drops.,When they are inconsistent and up and down on ...,...,0,0,0,0,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10035,65a038f02fb72900ba2653fc,67,26.48100000,17,8,106,47,0,Any stocks with dramatic movements of price ei...,Those kinds of risky prices where it goes up a...,...,0,1,1,0,0,0,0,0,0,0
10036,5596ab22fdf99b2d3a68c840,13,47.07800000,6,8,101,92,0,I think ones that have wide changes up and dow...,The ones that fluctuate randomly and have vari...,...,0,1,1,0,1,0,0,1,1,0
10037,65675f2ae0bb2e4ecb9c539d,31,20.73800000,22,3,105,87,13,High-volatility stocks are seen to be riskier ...,Stock price drops that are abrupt and sharp wi...,...,0,1,1,1,1,1,0,1,0,0
10038,6296290ec41bde1525dbb77e,61,33.98900000,9,6,106,40,60,The ones going down and the ones that seem the...,The volatility in swings makes me unsure and n...,...,1,0,0,0,1,1,0,0,1,1


In [3]:
na_counts = data.isna().sum()
print(na_counts[na_counts > 0])

reason_risky          260
reason_confidence     300
statistics             40
stock knowledge      1800
frequency            1800
attention            1800
history              1800
technical            1800
dtype: int64


In [4]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
le = LabelEncoder()

# Fit and transform 'subject_id' to numeric
data['subject_id_encoded'] = le.fit_transform(data['subject_id'])

data.head()

Unnamed: 0,subject_id,graph_id,time,click,risk,forecast,confidence,investment,reason_risky,reason_confidence,...,employment_ind,education_ind,income_ind,statistics_ind,risk-taking_ind,stock knowledge_ind,frequency _ind,history_ind,technical_ind,subject_id_encoded
0,5fbb4426e47b46c3e2eeb544,156,11.55,6,6,112,62,59,,,...,0,1,1,0,0,0,0,0,0,185
1,65981b2c1df3be0020afa351,154,57.601,10,8,95,14,9,The ones that had big dips in them.,A lot of them. Only because I'm slowly trying ...,...,1,0,0,1,1,0,1,0,0,461
2,655791684bb1c5db02826d17,192,48.958,16,7,91,100,51,If it fluctuates more than 20% within 12 months.,With a $3 investment I do not feel unconfident...,...,1,1,1,0,1,0,1,1,0,438
3,62ddbd7eb3e9431e49b46ec1,182,49.95,11,8,115,79,60,,,...,1,1,0,1,1,0,0,0,0,307
4,5fb13091b87dfd5888f73e05,180,45.07,8,5,93,20,0,Those that have big drops.,When they are inconsistent and up and down on ...,...,0,0,0,0,1,0,1,1,0,183


In [5]:
# Assuming 'data' is your DataFrame and 'graph_id' is the categorical variable
graph_id_dummies = pd.get_dummies(data['graph_id'], prefix='graph_id')
data = pd.concat([data, graph_id_dummies], axis=1)
data.head()

Unnamed: 0,subject_id,graph_id,time,click,risk,forecast,confidence,investment,reason_risky,reason_confidence,...,graph_id_191,graph_id_192,graph_id_193,graph_id_194,graph_id_195,graph_id_196,graph_id_197,graph_id_198,graph_id_199,graph_id_200
0,5fbb4426e47b46c3e2eeb544,156,11.55,6,6,112,62,59,,,...,0,0,0,0,0,0,0,0,0,0
1,65981b2c1df3be0020afa351,154,57.601,10,8,95,14,9,The ones that had big dips in them.,A lot of them. Only because I'm slowly trying ...,...,0,0,0,0,0,0,0,0,0,0
2,655791684bb1c5db02826d17,192,48.958,16,7,91,100,51,If it fluctuates more than 20% within 12 months.,With a $3 investment I do not feel unconfident...,...,0,1,0,0,0,0,0,0,0,0
3,62ddbd7eb3e9431e49b46ec1,182,49.95,11,8,115,79,60,,,...,0,0,0,0,0,0,0,0,0,0
4,5fb13091b87dfd5888f73e05,180,45.07,8,5,93,20,0,Those that have big drops.,When they are inconsistent and up and down on ...,...,0,0,0,0,0,0,0,0,0,0


In [6]:
subject_id_dummies = pd.get_dummies(data['subject_id_encoded'], prefix='subject_id')
# Add these dummy variables to the data
data = pd.concat([data, subject_id_dummies], axis=1)
data.head()

Unnamed: 0,subject_id,graph_id,time,click,risk,forecast,confidence,investment,reason_risky,reason_confidence,...,subject_id_492,subject_id_493,subject_id_494,subject_id_495,subject_id_496,subject_id_497,subject_id_498,subject_id_499,subject_id_500,subject_id_501
0,5fbb4426e47b46c3e2eeb544,156,11.55,6,6,112,62,59,,,...,0,0,0,0,0,0,0,0,0,0
1,65981b2c1df3be0020afa351,154,57.601,10,8,95,14,9,The ones that had big dips in them.,A lot of them. Only because I'm slowly trying ...,...,0,0,0,0,0,0,0,0,0,0
2,655791684bb1c5db02826d17,192,48.958,16,7,91,100,51,If it fluctuates more than 20% within 12 months.,With a $3 investment I do not feel unconfident...,...,0,0,0,0,0,0,0,0,0,0
3,62ddbd7eb3e9431e49b46ec1,182,49.95,11,8,115,79,60,,,...,0,0,0,0,0,0,0,0,0,0
4,5fb13091b87dfd5888f73e05,180,45.07,8,5,93,20,0,Those that have big drops.,When they are inconsistent and up and down on ...,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Identify all 'ret_' and 'price_' columns
ret_columns = [col for col in data.columns if col.startswith('ret_')]
price_columns = [col for col in data.columns if col.startswith('price_')]

# Identify all 'ret_', 'price_', and their differences columns
ret_1st_diff_columns = [col for col in data.columns if '1st_diff' in col and col.startswith('ret_')]
ret_2nd_diff_columns = [col for col in data.columns if '2nd_diff' in col and col.startswith('ret_')]
price_1st_diff_columns = [col for col in data.columns if '1st_diff' in col and col.startswith('price_')]
price_2nd_diff_columns = [col for col in data.columns if '2nd_diff' in col and col.startswith('price_')]

# Adding first and second differences for Huge Lasso features
ret_diff_features = [col for col in data.columns if 'ret_' in col and 'diff' in col]
price_diff_features = [col for col in data.columns if 'price_' in col and 'diff' in col]

In [8]:
grouped = data.groupby('subject_id')
data_demeaned = grouped.transform(lambda x: x - x.mean())

In [9]:
from sklearn.model_selection import GroupShuffleSplit

# Assuming 'data' is your DataFrame and 'confidence' is the target variable
# 'subject_id_encoded' is a column in 'data' that you want to group by during the split

# Define your feature matrix X and target variable y
X = data_demeaned.drop('risk', axis=1)
y = data_demeaned['risk']

# Create a GroupShuffleSplit instance
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Get the index of the split, using 'subject_id_encoded' to create groups
train_idx, test_idx = next(gss.split(X, y, groups=data['subject_id_encoded']))

# Split the data into training and testing sets based on the indices
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

In [10]:
na_counts = data_demeaned.isna().sum()
print(na_counts[na_counts > 0])

statistics           40
stock knowledge    1800
frequency          1800
attention          1800
history            1800
technical          1800
dtype: int64


In [11]:
base_features = ['Obj_Std', 'Skewness', 'Kurtosis', 'time', 'click', 'order', 'subject_id_encoded']

X_train_base = X_train[base_features]
X_test_base = X_test[base_features]

# Baseline regression
model_base = LinearRegression()
model_base.fit(X_train_base, y_train)
y_pred_base = model_base.predict(X_test_base)
r2_base = r2_score(y_test, y_pred_base)
r2_base

0.02798203275762945

In [12]:
X_3_feature = ['Obj_Std', 'Skewness', 'Kurtosis', 'time', 'click', 'order', 'subject_id_encoded', 'Recency_Factor', 'Rep_Factor', 'Sign_Factor']

X_train_3_feature = X_train[X_3_feature]
X_test_3_feature = X_test[X_3_feature]

model_3_feature = LinearRegression()
model_3_feature.fit(X_train_3_feature, y_train)
y_pred_3 = model_3_feature.predict(X_test_3_feature)
r2_3_feature = r2_score(y_test, y_pred_3)
r2_3_feature

0.18713114390010455

In [13]:
X_8_feature = ['Obj_Std', 'Skewness', 'Kurtosis', 'time', 'click', 'order', 'subject_id_encoded', 'Recency_Factor', 'Rep_Factor', 'Sign_Factor', 'SH_Rep_Factor', 'SH_Sign_Factor', 'SH_Obj_Std', 'SH_Skewness', 'SH_Kurtosis']

X_train_8_feature = X_train[X_8_feature]
X_test_8_feature = X_test[X_8_feature]

model_8_feature = LinearRegression()
model_8_feature.fit(X_train_8_feature, y_train)
y_pred_8 = model_8_feature.predict(X_test_8_feature)
r2_8_feature = r2_score(y_test, y_pred_8)
r2_8_feature

0.19088698316702113

In [14]:
features = ['time', 'click', 'order', 'subject_id_encoded'] + list(graph_id_dummies.columns)

# Select these features for X
X_graph_id_train = X_train[features]
X_graph_id_test = X_test[features]

# Fit the baseline regression model
model_graph_id_fe = LinearRegression()
model_graph_id_fe.fit(X_graph_id_train, y_train)

# Predict and calculate the R-squared value
y_pred_graph_id_fe = model_graph_id_fe.predict(X_graph_id_test)
r2_graph_id_fe = r2_score(y_test, y_pred_graph_id_fe)

r2_graph_id_fe

0.22515752397068356

In [15]:
baseline_features_with_dummies = ['Obj_Std', 'Skewness', 'Kurtosis', 'time', 'click', 'order'] + list(subject_id_dummies.columns)

# Define the independent and dependent variables
X_baseline_with_dummies_train = X_train[baseline_features_with_dummies]
X_baseline_with_dummies_test = X_test[baseline_features_with_dummies]

# Fit the baseline regression model
model_baseline_with_dummies = LinearRegression()
model_baseline_with_dummies.fit(X_baseline_with_dummies_train, y_train)

# Predict and calculate the R-squared value
y_pred_baseline = model_baseline_with_dummies.predict(X_baseline_with_dummies_test)
r2_baseline_fe = r2_score(y_test, y_pred_baseline)

r2_baseline_fe

0.027982032757629338

In [16]:
# Define 3-feature model variables
features_3_feature = baseline_features_with_dummies + ['Recency_Factor', 'Rep_Factor', 'Sign_Factor']

# Split the data for 3-feature model
X_3_feature_train = X_train[features_3_feature]
X_3_feature_test = X_test[features_3_feature]

# Fit the 3-feature model
model_3_feature = LinearRegression()
model_3_feature.fit(X_3_feature_train, y_train)

# Predict and calculate R-squared
y_pred_3_feature = model_3_feature.predict(X_test_3_feature)
r2_3_feature_fe = r2_score(y_test, X_3_feature_test)
r2_3_feature_fe

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- subject_id_encoded
Feature names seen at fit time, yet now missing:
- subject_id_0
- subject_id_1
- subject_id_10
- subject_id_100
- subject_id_101
- ...


In [19]:
# # Convert 'subject_id' into dummy variables
# data_with_dummies = pd.get_dummies(data, columns=['subject_id'], drop_first=True)

# # Separate the features and the target
# features = [col for col in data_with_dummies.columns if col != 'risk']
# X = data_with_dummies[features]
# y = data_with_dummies['risk']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define 3-feature model variables
features_3_feature = ['Recency_Factor', 'Rep_Factor', 'Sign_Factor'] + [col for col in X_train.columns if col.startswith('subject_id_')]

# Split the data for 3-feature model
X_3_feature_train = X_train[features_3_feature]
X_3_feature_test = X_test[features_3_feature]

# Fit the 3-feature model
model_3_feature = LinearRegression()
model_3_feature.fit(X_3_feature_train, y_train)

# Predict and calculate R-squared
y_pred_3_feature = model_3_feature.predict(X_3_feature_test)
r2_3_feature_fe = r2_score(y_test, y_pred_3_feature)

print("R-squared for the 3-feature model:", r2_3_feature_fe)

R-squared for the 3-feature model: 0.2446519444985451


In [17]:
# Define 8-feature model variables including 'subject_id_encoded' dummies
features_8_feature = baseline_features_with_dummies + ['Recency_Factor', 'Rep_Factor', 'Sign_Factor', 'SH_Rep_Factor', 'SH_Sign_Factor', 'SH_Obj_Std', 'SH_Skewness', 'SH_Kurtosis']

# Split the data for 3-feature model
X_8_feature_train = X_train[features_8_feature]
X_8_feature_test = X_test[features_8_feature]

# Fit the 8-feature model
model_8_feature = LinearRegression()
model_8_feature.fit(X_8_feature_train, y_train)

# Predict and calculate R-squared
y_pred_8_feature = model_8_feature.predict(X_8_feature_test)
r2_8_feature_fe = r2_score(y_test, y_pred_8_feature)
r2_8_feature_fe

0.19088698316702113

In [20]:
features_large = base_features + ret_columns + price_columns
X_large_train = X_train[features_large]
X_large_test = X_test[features_large]

features_huge = features_large + ret_diff_features + price_diff_features
X_large_train = X_train[features_huge]
X_large_test = X_test[features_huge]