In [3]:
# retrieve data from BigQuery


from google.cloud import bigquery
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Client built with default settings (no explicit project or credentials passed).
client = bigquery.Client()

# BigQuery script in two statements:
#   1. declare MaxDate as the largest run_date found in the table;
#   2. select every column except run_date for the rows whose run_date equals
#      MaxDate, ordered by week_start ascending.
# NOTE(review): MaxDate is declared as DATETIME — confirm that matches the
# column type of run_date.
query = """
DECLARE MaxDate DATETIME
  DEFAULT (
    SELECT MAX(run_date)
    FROM `executive-orders-448515.weekly_data_collected.weekly_variables_flattened`
  );

SELECT 
  * EXCEPT(run_date)
FROM `executive-orders-448515.weekly_data_collected.weekly_variables_flattened`
WHERE run_date = MaxDate
ORDER BY week_start ASC;
"""

# Run the script, wait for it to finish, and load the returned rows into a DataFrame.
raw_data = client.query_and_wait(query)
df = raw_data.to_dataframe()

In [None]:
# index rows by week
# Parse 'week_start' as datetimes, sort chronologically, and make it the index.
# The guard keeps this cell safe to re-run: once 'week_start' has become the
# index it is no longer a column, and df['week_start'] would raise KeyError.
# A new frame is built and rebound to `df` instead of mutating in place, so a
# partially executed cell cannot leave `df` in a half-transformed state.
if 'week_start' in df.columns:
    df = (
        df.assign(week_start=pd.to_datetime(df['week_start']))
        .sort_values(by='week_start', ascending=True)
        .set_index('week_start')
    )

In [12]:
# Fill missing values in the exogenous variables with a cubic spline.
# The outcome column is excluded and the remaining columns are cast to float
# before interpolating.
# NOTE(review): the saved output of this cell shows a SciPy UnivariateSpline
# convergence warning ("s too small") — confirm the filled values are acceptable.
df_exog_interpolated = (
    df.drop(columns=['orders_outcome_var'])
    .astype(float)
    .interpolate(method='spline', order=3)  # order=3 -> cubic
)

The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.
  terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs)


In [35]:
# Build principal components from the interpolated exogenous variables.
# 'disapproving' is left out of the PCA inputs; the remaining columns are
# passed through StandardScaler before the projection.
df_to_standardize = df_exog_interpolated.drop(columns=['disapproving'])
scaler = StandardScaler()
df_standardized = scaler.fit_transform(df_to_standardize)

# Data exploration determined that 4 principal components will be retained.
pca = PCA(n_components=4)
principal_components = pca.fit_transform(df_standardized)

# Wrap the component scores in a DataFrame that reuses the input frame's index.
df_principal_components = pd.DataFrame(
    data=principal_components,
    index=df_to_standardize.index,
    columns=['PC1', 'PC2', 'PC3', 'PC4'],
)



In [38]:
# Attach the outcome column to the principal components. The join aligns on
# the index and keeps every row of the component frame (left join).
df1 = df_principal_components.join(df['orders_outcome_var'], how='left')

                 PC1       PC2       PC3       PC4  orders_outcome_var
week_start                                                            
2004-07-04 -2.496230  5.017040  1.072649 -0.341472                   3
2004-07-11 -2.496230  5.017040  1.072649 -0.341472                   6
2004-07-18 -2.496230  5.017040  1.072649 -0.341472                   0
2004-07-25 -2.496230  5.017040  1.072649 -0.341472                  12
2004-08-01 -2.370164  5.148217  1.442226 -0.202536                   0
...              ...       ...       ...       ...                 ...
2024-12-01  6.201533 -1.054575 -0.325082  1.142886                   0
2024-12-08  6.166974 -1.077202 -0.405412  1.127427                   0
2024-12-15  6.096656 -1.125358 -0.564260  1.092884                   0
2024-12-22  5.978336 -1.208337 -0.827310  1.031917                   3
2024-12-29  5.799770 -1.335433 -1.220248  0.937187                   2

[1070 rows x 5 columns]
