In [35]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from statsmodels.api import OLS, add_constant

In [24]:
data = pd.read_csv("data/feature_engineered_data.csv")
data.reset_index(drop=True, inplace=True)

In [25]:
data.head()
df = data


In [None]:
# Define features and target
predictors = ["Lagged_Sentiment_Index"]  # to add more predictors here
target_gdp = "GDP_Growth_Rate"
target_sentiment = "Sentiment_Index"

In [27]:
# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(data[predictors])
y_gdp = data[target_gdp]
y_sentiment = data[target_sentiment]

In [28]:
# Split data into train-test sets for cross-validation
X_train, X_test, y_gdp_train, y_gdp_test = train_test_split(
    X, y_gdp, test_size=0.2, random_state=42
)
_, _, y_sentiment_train, y_sentiment_test = train_test_split(
    X, y_sentiment, test_size=0.2, random_state=42
)

In [29]:
# First Lasso: Predict GDP Growth Rate
lasso_gdp = LassoCV(cv=5, random_state=42).fit(X_train, y_gdp_train)
selected_gdp_predictors = np.where(lasso_gdp.coef_ != 0)[0]

In [30]:
selected_gdp_predictors

array([0])

In [21]:
# Get the coefficients
print("Lasso coefficients:", lasso.coef_)

Lasso coefficients: [-0.          1.31614344]


In [31]:
# Second Lasso: Predict Sentiment Index
lasso_sentiment = LassoCV(cv=5, random_state=42).fit(X_train, y_sentiment_train)
selected_sentiment_predictors = np.where(lasso_sentiment.coef_ != 0)[0]

In [32]:
# Combine selected predictors
selected_features = set(selected_gdp_predictors).union(
    set(selected_sentiment_predictors)
)
combined_predictors = [predictors[i] for i in selected_features]

In [33]:
combined_predictors

['Lagged_Sentiment_Index']

In [36]:
# Final Model: OLS using selected features
X_final = data[combined_predictors]
X_final = add_constant(X_final)  # Add constant for intercept
final_model = OLS(data[target_gdp], X_final).fit()

# Display the results
print("Selected Predictors from Lasso:")
print(combined_predictors)
print("\nOLS Regression Summary:")
print(final_model.summary())

Selected Predictors from Lasso:
['Lagged_Sentiment_Index']

OLS Regression Summary:
                            OLS Regression Results                            
Dep. Variable:        GDP_Growth_Rate   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                 -0.006
Method:                 Least Squares   F-statistic:                    0.5000
Date:                Sun, 22 Dec 2024   Prob (F-statistic):              0.481
Time:                        11:05:07   Log-Likelihood:                -152.26
No. Observations:                  90   AIC:                             308.5
Df Residuals:                      88   BIC:                             313.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------