In [1]:
import datetime 
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
from os.path import join
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error,mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import statsmodels.api as sm
import pipeline

In [2]:
# Survey Aggregate data and indicator data
df_agg, df_ind = pipeline.read_and_format_data()
# Smoothing the aggregate survey data
rolling_window_size = 20
df_agg = pipeline.data_filter(df_agg, window_size=rolling_window_size)

In [3]:
DATA_BASE_PATH = '../Data'
df_meta = pd.read_parquet(join(DATA_BASE_PATH,'meta.pq')) 
df_raw = pd.read_parquet(join(DATA_BASE_PATH,'raw.pq')) 

In [4]:
df_meta.head()

Unnamed: 0,questionid,answerid,category,subcategory,tag,questiontext,alpharocquestionbank,alpharocpermanent,is_multiselect,answerordershuffle,hasdisplaydependencies,showrules,parents,answerorder,answertext
69,96,1109,Household Information,Home Ownership,Home Ownership,Do you own or rent your home?,False,False,False,False,True,,,1,Own
70,96,1110,Household Information,Home Ownership,Home Ownership,Do you own or rent your home?,False,False,False,False,True,,,2,Rent
71,96,1111,Household Information,Home Ownership,Home Ownership,Do you own or rent your home?,False,False,False,False,True,,,3,"I live in a home that someone else, besides me..."
72,97,1119,Household Information,Home Value Prediction,Home Value in 12 Months,[ASKED OF HOME OWNERS] What do you expect to h...,False,False,False,False,False,"{'atom': {'type': '3', 'type_human': 'previous...",96.0,1,I expect it to increase
73,97,1120,Household Information,Home Value Prediction,Home Value in 12 Months,[ASKED OF HOME OWNERS] What do you expect to h...,False,False,False,False,False,"{'atom': {'type': '3', 'type_human': 'previous...",96.0,2,I expect it to remain the same


In [5]:
df_raw.head()

Unnamed: 0_level_0,datesubmitted,age,gender,race,householdincome,education,urbandensity,partisanship,censusregion,censusdivision,...,Q3053A8020,Q3053A8021,Q3053A8022,Q3053A8023,Q3053A8024,Q3053A8025,Q3053A8026,Q3053A8027,Q3053A8028,Q3053A8029
respondentid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002bd51e1d7497190e6d8d94e099d05f,2019-07-23,65+,Female,White,$50k-$100k,No college degree,Rural,Modeled Republican,West,Mountain,...,0,0,0,0,0,0,0,0,0,0
aa2f797bfdc8adaad44041a304703a69,2019-07-23,45-64,Male,White,$0-$50k,No college degree,Rural,Modeled Republican,South,West South Central,...,0,0,0,0,0,0,0,0,0,0
c1c737ef955f531fbd7a61bfa8830fb7,2019-07-23,30-44,Male,White,$150k+,College degree or higher,Rural,Modeled Swing Voter,South,West South Central,...,0,0,0,0,0,0,0,0,0,0
b470df5c9350c78512f1fa26490812e7,2019-07-23,65+,Female,White,$0-$50k,No college degree,Urban,Modeled Republican,Northeast,Middle Atlantic,...,0,0,0,0,0,0,0,0,0,0
aa284c0e4f729c29ff449f9b95b0eee3,2019-07-23,65+,Male,White,$50k-$100k,College degree or higher,Exurban,Modeled Republican,West,Pacific,...,0,0,0,0,0,0,0,0,0,0


In [6]:
indicator = "CONSSENT Index"
print(f"Indicator: {indicator}")
if pd.api.types.is_numeric_dtype(df_ind[indicator]):
    df_comb = pipeline.get_combined_data_with_given_indicator(df_ind, df_agg, indicator_col=indicator)
    df_comb = df_comb.dropna(axis=1,how='all').bfill().ffill()
    print(df_comb.shape)
    X = df_comb.iloc[:,:-1]
    y = df_comb.iloc[:,-1]
    y = (y - np.mean(y))/np.std(y)

Indicator: CONSSENT Index
(49, 1231)


In [7]:
df = df_comb

In [17]:
# Assuming df is your DataFrame
X = df.drop('CONSSENT Index', axis=1)
y = df['CONSSENT Index']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Fit Lasso model
alpha = 0.01  # Tune this parameter
lasso = Lasso(alpha=alpha)
lasso.fit(X_train, y_train)

# Get coefficients and pair with feature names
coefficients = lasso.coef_
features = X.columns
feature_importance = pd.Series(coefficients, index=features)

# Sort features by importance
important_features = feature_importance.abs().sort_values(ascending=False)


# Filter out zero coefficients
important_features = important_features[important_features > 0][:50]
# print("Sorted Important Features:")
# print(important_features)
# Function to extract question and answer numbers and texts
def extract_qa_info(feature_name, df_meta):
    q_parts = feature_name.split('A')
    q_id = int(q_parts[0][1:])
    a_id = int(q_parts[1])
    
    if q_id not in df_meta['questionid'].values:
        return -1, a_id, "Question ID not found", "N/A"
    
    if a_id not in df_meta['answerid'].values:
        return -1, a_id, "N/A", "Answer ID not found"
    
    q_text = df_meta.loc[df_meta['questionid'] == q_id, 'questiontext'].values[0]
    a_text = df_meta.loc[df_meta['answerid'] == a_id, 'answertext'].values[0]
    
    return q_id, a_id, q_text, a_text

# Extracting question and answer numbers and texts
print("\nQuestion and Answer Details:")
for feature in important_features.index:
    q_id, a_id, q_text, a_text = extract_qa_info(feature, df_meta)
    if q_id != -1:
        print(f"Feature: {feature}, Question ID: {q_id}, Answer ID: {a_id}, Question Text: {q_text}, Answer Text: {a_text}")


Question and Answer Details:
Feature: Q585A1085, Question ID: 585, Answer ID: 1085, Question Text: Thinking about business in the country as a whole, do you think that during the next 12 months we'll have good times or bad times financially?, Answer Text: Good times
Feature: Q585A1087, Question ID: 585, Answer ID: 1087, Question Text: Thinking about business in the country as a whole, do you think that during the next 12 months we'll have good times or bad times financially?, Answer Text: Not sure
Feature: Q340A1072, Question ID: 340, Answer ID: 1072, Question Text: Six months from now, do you think business conditions in the United States will be...?, Answer Text: Worse than they are now
Feature: Q585A1086, Question ID: 585, Answer ID: 1086, Question Text: Thinking about business in the country as a whole, do you think that during the next 12 months we'll have good times or bad times financially?, Answer Text: Bad times
Feature: Q173A1287, Question ID: 173, Answer ID: 1287, Question 

  model = cd_fast.enet_coordinate_descent(


In [23]:
# Assuming df is your DataFrame
X = df.drop('CONSSENT Index', axis=1)
y = df['CONSSENT Index']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Fit Lasso model
alpha = 0.01  # Tune this parameter
lasso = Lasso(alpha=alpha)
lasso.fit(X_train, y_train)

# Get coefficients and pair with feature names
coefficients = lasso.coef_
features = X.columns
feature_importance = pd.Series(coefficients, index=features)

# Sort features by importance
features_ = feature_importance.abs()

zero_features = features_[features_ == 0][:140]

# Function to extract question and answer numbers and texts
def extract_qa_info(feature_name, df_meta):
    q_parts = feature_name.split('A')
    q_id = int(q_parts[0][1:])
    a_id = int(q_parts[1])
    
    if q_id not in df_meta['questionid'].values:
        return -1, a_id, "Question ID not found", "N/A"
    
    if a_id not in df_meta['answerid'].values:
        return -1, a_id, "N/A", "Answer ID not found"
    
    q_text = df_meta.loc[df_meta['questionid'] == q_id, 'questiontext'].values[0]
    a_text = df_meta.loc[df_meta['answerid'] == a_id, 'answertext'].values[0]
    
    return q_id, a_id, q_text, a_text

# Extracting question and answer numbers and texts
print("\nQuestion and Answer Details:")
for feature in zero_features.index:
    q_id, a_id, q_text, a_text = extract_qa_info(feature, df_meta)
    if q_id != -1:
        print(f"Feature: {feature}, Question ID: {q_id}, Answer ID: {a_id}, Question Text: {q_text}, Answer Text: {a_text}")


Question and Answer Details:
Feature: Q583A1079, Question ID: 583, Answer ID: 1079, Question Text: Would you say that you and your family are better off or worse off financially than you were a year ago?, Answer Text: Better off
Feature: Q583A1080, Question ID: 583, Answer ID: 1080, Question Text: Would you say that you and your family are better off or worse off financially than you were a year ago?, Answer Text: The same today as a year ago
Feature: Q583A1081, Question ID: 583, Answer ID: 1081, Question Text: Would you say that you and your family are better off or worse off financially than you were a year ago?, Answer Text: Worse off
Feature: Q584A1082, Question ID: 584, Answer ID: 1082, Question Text: In a year from now, do you think you and your family will be better off financially, worse off financially, or about the same as you are now?, Answer Text: Better off
Feature: Q584A1083, Question ID: 584, Answer ID: 1083, Question Text: In a year from now, do you think you and your 

  model = cd_fast.enet_coordinate_descent(


In [9]:
# Selecting specific columns
df_selected = df_meta[['questionid', 'questiontext']].drop_duplicates()
# Converting the selected dataframe to a CSV format string
# Index and header are set to False to exclude them from the output
csv_string = df_selected.to_csv(index=False, header=False)
# Printing the CSV string
print(csv_string)

96,Do you own or rent your home?
97,[ASKED OF HOME OWNERS] What do you expect to happen to the price of your home in the next 12 months?
98,[ASKED OF HOME OWNERS] What do you expect to happen to the price of your home in the next 5 years?
100,"[ASKED OF HOME OWNERS] In the next 12 months, do you anticipate making any renovations or improvements to your house?"
101,"[ASKED OF HOME OWNERS] In the next 12 months, do you anticipate...?"
103,"[Over the next year, do you expect to spend more or less than you currently do on the following categories?] Groceries"
104,"[Over the next year, do you expect to spend more or less than you currently do on the following categories?] Gasoline / Fuel"
105,"[Over the next year, do you expect to spend more or less than you currently do on the following categories?] Household goods (e.g. toilet paper, cleaning supplies)"
106,"[Over the next year, do you expect to spend more or less than you currently do on the following categories?] Rent / Mortgage"
107,"[