## Setup

In [None]:
import json
import boto3
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import IntegerType, FloatType
from snowflake.snowpark.functions import avg, sum, col, udf, call_udf, call_builtin, year
import streamlit as st
import pandas as pd
from datetime import date
from sklearn.linear_model import LinearRegression


In [None]:
%%sql
-- Prerequisite: get the Knoema "Economy Data Atlas" database from the
-- Snowflake Marketplace; this notebook queries it as ECONOMY_DATA_ATLAS.
-- The statements below must be run while logged in with the ACCOUNTADMIN role.

-- Set role and warehouse (compute)
USE ROLE accountadmin;
USE WAREHOUSE compute_wh;

-- Create the database, schema and stage for the Snowpark Python UDF
CREATE DATABASE IF NOT EXISTS sparsh;
USE DATABASE sparsh;
CREATE SCHEMA IF NOT EXISTS knoema_regression;
-- Select the schema explicitly so the stage is created inside it instead of
-- relying on CREATE SCHEMA having changed the session's current schema.
-- The Snowpark session connects to this schema and resolves
-- @knoema_regression_stage relative to it.
USE SCHEMA knoema_regression;
CREATE STAGE IF NOT EXISTS knoema_regression_stage;

In [3]:
def get_secret(secret_name, region_name="us-east-1"):
    """Fetch a secret from AWS Secrets Manager and decode it as JSON.

    Args:
        secret_name: Identifier passed as ``SecretId`` to ``get_secret_value``.
        region_name: AWS region used for the Secrets Manager client.

    Returns:
        The object produced by ``json.loads`` on the response's
        ``SecretString`` field.
    """
    secrets_client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    response = secrets_client.get_secret_value(SecretId=secret_name)
    # The secret payload is stored as a JSON document in SecretString.
    return json.loads(response['SecretString'])

In [5]:
# Read the Snowflake credentials from the "wysde" secret.
creds = get_secret("wysde")

# Account-specific values come from the secret; the database and schema are
# fixed to the ones used throughout this notebook.
connection_parameters = dict(
    account=creds["SNOWFLAKE_ACCOUNT"],
    user=creds["SNOWFLAKE_USERNAME"],
    password=creds["SNOWFLAKE_PASSWORD"],
    warehouse=creds["SNOWFLAKE_WAREHOUSE"],
    role=creds["SNOWFLAKE_ROLE"],
    database="sparsh",
    schema="knoema_regression",
)
session = Session.builder.configs(connection_parameters).create()

2023-01-16 21:21:34.972 INFO    botocore.credentials: Found credentials in shared credentials file: ~/.aws/credentials
2023-01-16 21:21:36.779 INFO    snowflake.connector.connection: Snowflake Connector for Python Version: 2.7.11, Python Version: 3.8.13, Platform: macOS-10.16-x86_64-i386-64bit
2023-01-16 21:21:36.782 INFO    snowflake.connector.connection: This connection is in OCSP Fail Open Mode. TLS Certificates would be checked for validity and revocation status. Any other Certificate Revocation related exceptions or OCSP Responder failures would be disregarded in favor of connectivity.
2023-01-16 21:21:36.783 INFO    snowflake.connector.connection: Setting use_openssl_only mode to False
2023-01-16 21:21:40.699 INFO    snowflake.snowpark.session: Snowpark Session information: 
"version" : 0.8.0,
"python.version" : 3.8.13,
"python.connector.version" : 2.7.11,
"python.connector.session.id" : 787671232426006,
"os.name" : Darwin



In [6]:
# Connection check: show the warehouse, database, schema and Snowflake version
# the session is currently using.
(
    session
    .sql("select current_warehouse() wh, current_database() db, current_schema() schema, current_version() v")
    .show()
)

2023-01-16 21:21:58.294 INFO    snowflake.connector.cursor: query: [SELECT  *  FROM (select current_warehouse() wh, current_database() db, current_s...]
2023-01-16 21:21:58.597 INFO    snowflake.connector.cursor: query execution done


---------------------------------------------------
|"WH"        |"DB"    |"SCHEMA"           |"V"    |
---------------------------------------------------
|COMPUTE_WH  |SPARSH  |KNOEMA_REGRESSION  |7.1.1  |
---------------------------------------------------



## Query data

In [7]:
# Explore the data with plain SQL: annual PCE price-index rows, first 10 by date.
# The adjacent literals concatenate into a single query string.
pce_preview_query = (
    "SELECT * FROM ECONOMY_DATA_ATLAS.ECONOMY.BEANIPA"
    " WHERE \"Table Name\" = 'Price Indexes For Personal Consumption Expenditures By Major Type Of Product'"
    " AND \"Indicator Name\" = 'Personal consumption expenditures (PCE)'"
    " AND \"Frequency\" = 'A'"
    " ORDER BY \"Date\" LIMIT 10"
)
session.sql(pce_preview_query).show()

2023-01-16 21:23:30.409 INFO    snowflake.connector.cursor: query: [SELECT  *  FROM (SELECT * FROM ECONOMY_DATA_ATLAS.ECONOMY.BEANIPA WHERE "Table N...]
2023-01-16 21:23:33.713 INFO    snowflake.connector.cursor: query execution done


----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"Table"  |"Table Name"                                        |"Table Description"  |"Table Full Name"                                   |"Table Unit"     |"Indicator"  |"Indicator Name"                         |"Indicator Description"  |"Indicator Full Name"  |"Units"          |"Scale"  |"Frequency"  |"Date"      |"Value"  |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|T20304   |Pr

In [8]:
# Apply the filters through the Snowpark DataFrame API instead of raw SQL.
# The column names are mixed-case (some contain spaces), so each one is
# wrapped in double quotes explicitly, matching how the other cells in this
# notebook reference these columns.
snow_df_pce = (session.table("ECONOMY_DATA_ATLAS.ECONOMY.BEANIPA")
                            .filter(col('"Table Name"') == 'Price Indexes For Personal Consumption Expenditures By Major Type Of Product')
                            .filter(col('"Indicator Name"') == 'Personal consumption expenditures (PCE)')
                            .filter(col('"Frequency"') == 'A')
                            .filter(col('"Date"') >= '1972-01-01'))
snow_df_pce.show(10)

2023-01-16 21:24:14.264 INFO    snowflake.connector.cursor: query: [SELECT  *  FROM ( SELECT  *  FROM ( SELECT  *  FROM ( SELECT  *  FROM ( SELECT  ...]
2023-01-16 21:24:17.341 INFO    snowflake.connector.cursor: query execution done


----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"Table"  |"Table Name"                                        |"Table Description"  |"Table Full Name"                                   |"Table Unit"     |"Indicator"  |"Indicator Name"                         |"Indicator Description"  |"Indicator Full Name"  |"Units"          |"Scale"  |"Frequency"  |"Date"      |"Value"  |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|T20304   |Pr

In [9]:
# The filters below are executed by Snowflake (Snowpark pushdown); only the
# projected Year/PCE result is materialised as a pandas DataFrame.
snow_df_pce = (
    session.table("ECONOMY_DATA_ATLAS.ECONOMY.BEANIPA")
    .filter(col('"Table Name"') == 'Price Indexes For Personal Consumption Expenditures By Major Type Of Product')
    .filter(col('"Indicator Name"') == 'Personal consumption expenditures (PCE)')
    .filter(col('"Frequency"') == 'A')
    .filter(col('"Date"') >= '1972-01-01')
)
pd_df_pce_year = (
    snow_df_pce
    .select(
        year(col('"Date"')).alias('"Year"'),
        col('"Value"').alias('PCE'),
    )
    .to_pandas()
)
pd_df_pce_year

2023-01-16 21:25:17.112 INFO    snowflake.connector.cursor: query: [SELECT year("Date") AS "Year", "Value" AS "PCE" FROM ( SELECT  *  FROM ( SELECT ...]
2023-01-16 21:25:20.238 INFO    snowflake.connector.cursor: query execution done


Unnamed: 0,Year,PCE
0,1972,22.542
1,1973,23.756
2,1974,26.229
3,1975,28.415
4,1976,29.974
5,1977,31.923
6,1978,34.145
7,1979,37.178
8,1980,41.182
9,1981,44.871


## Train the Linear Regression model

In [10]:
# Fit a linear regression of the PCE index on the year.

# reshape(-1, 1) turns the year values into a single-column 2-D feature matrix.
x = pd_df_pce_year["Year"].to_numpy().reshape(-1, 1)
y = pd_df_pce_year["PCE"].to_numpy()

model = LinearRegression()
model.fit(x, y)

# Try the fitted model on 2022.
predictYear = 2022
pce_pred = model.predict([[predictYear]])

# Show the last 5 rows of the data next to the prediction.
print(pd_df_pce_year.tail())
print(f"Prediction for {predictYear!s}: {round(pce_pred[0], 2)!s}")

    Year      PCE
45  2017  106.054
46  2018  108.317
47  2019  109.933
48  2020  111.145
49  2021  115.621
Prediction for 2022: 118.0


### Creating a User Defined Function within Snowflake to do the scoring there

In [12]:
def predict_pce(predictYear: int) -> float:
    """Predict the PCE index for one year using the fitted regression.

    Uses the module-level ``model`` trained earlier in this notebook.

    Args:
        predictYear: Year to score.

    Returns:
        The model's prediction for that year, rounded to 2 decimals.
    """
    return model.predict([[predictYear]])[0].round(2).astype(float)

# Register the function as a permanent Snowflake UDF stored on the stage.
# The keyword is `input_types` and takes a list with one entry per argument;
# the previous `input_type=` spelling is not a parameter of register().
_ = session.udf.register(predict_pce,
                        return_type=FloatType(),
                        input_types=[IntegerType()],
                        packages= ["pandas","scikit-learn"],
                        is_permanent=True, 
                        name="predict_pce_udf", 
                        replace=True,
                        stage_location="@knoema_regression_stage")

2023-01-16 21:27:22.170 INFO    snowflake.connector.cursor: query: [ls '@knoema_regression_stage']
2023-01-16 21:27:22.503 INFO    snowflake.connector.cursor: query execution done
2023-01-16 21:27:22.507 INFO    snowflake.connector.cursor: query: [SELECT "name" FROM ( SELECT  *  FROM  TABLE ( RESULT_SCAN('01a9b0dd-3200-a224-00...]
2023-01-16 21:27:23.622 INFO    snowflake.connector.cursor: query execution done
2023-01-16 21:27:23.624 INFO    snowflake.connector.cursor: query: [select package_name, version from information_schema.packages where language='py...]
2023-01-16 21:27:24.786 INFO    snowflake.connector.cursor: query execution done
2023-01-16 21:27:24.813 INFO    snowflake.connector.cursor: query: [CREATE OR REPLACE FUNCTION predict_pce_udf(arg1 BIGINT) RETURNS FLOAT LANGUAGE P...]
2023-01-16 21:27:41.077 INFO    snowflake.connector.cursor: query execution done


## Test the trained model by invoking the UDF via a SQL statement

In [13]:
# Call the registered UDF from SQL for the year 2022 and display the result.
(
    session
    .sql("select predict_pce_udf(2022)")
    .show()
)

2023-01-16 21:28:00.025 INFO    snowflake.connector.cursor: query: [SELECT  *  FROM (select predict_pce_udf(2022)) LIMIT 10]
2023-01-16 21:28:03.155 INFO    snowflake.connector.cursor: query execution done


---------------------------
|"PREDICT_PCE_UDF(2022)"  |
---------------------------
|118.0                    |
---------------------------



## Run this in the Snowflake editor

In [None]:
%%sql
-- Build a small working copy of the source table.
DROP TABLE IF EXISTS BEANIPA;

-- NOTE: LIMIT without ORDER BY does not pin down which 20 rows are kept.
CREATE TABLE BEANIPA AS
SELECT * FROM ECONOMY_DATA_ATLAS.ECONOMY.BEANIPA
WHERE year("Date") > 2000
LIMIT 20;

-- dt holds the year of "Date"; it is the value passed to the UDF below.
ALTER TABLE BEANIPA
ADD COLUMN dt INT AS year("Date");

-- Column that will receive the UDF output.
ALTER TABLE BEANIPA
ADD COLUMN prediction FLOAT;

SELECT * FROM BEANIPA;

-- Sanity check: call the UDF directly.
SELECT predict_pce_udf(2022);

-- Fill PREDICTION one year at a time.
-- The WHERE clause restricts each UPDATE to the rows of the current year;
-- without it every iteration would overwrite the prediction on all rows and
-- the whole table would end up with the value of the last year processed.
DECLARE
    c1 CURSOR FOR SELECT DISTINCT dt FROM BEANIPA;
    dtv INT;
BEGIN
  FOR record IN c1 DO
      dtv := record.dt;
      UPDATE BEANIPA f SET PREDICTION = prediction.p
      FROM (SELECT predict_pce_udf(:dtv) p ) AS prediction
      WHERE f.dt = :dtv;
  END FOR;
END;

SELECT * FROM BEANIPA;

