# Get more fancy pants data
- go to the web factset 
- request it
- back to your sheet and run it (insert the name)


# Integrate it with the model
- join the data
- so update the model features


In [15]:
import json
import pandas as pd
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F
from snowflake.snowpark.types import PandasDataFrameType, IntegerType, StringType, FloatType, DateType
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.linear_model import LinearRegression
#from snowflake.ml.registry import model_registry
from snowflake.ml.registry import registry
from snowflake.ml._internal.utils import identifier

# 3.1 Get the Data Share


https://app.snowflake.com/marketplace/listing/GZT1ZA3NLF/similarweb-ltd-global-stocks-25-000-tickers-digital-traffic-data-by-domain?search=Global%20Stocks

# 3.2 Reading Snowflake Connection Details, create a Session


In [2]:
snowflake_connection_cfg = json.loads(open("/Users/mitaylor/Documents/creds/creds_sf_azure.json").read()) # <--- Update here
session = Session.builder.configs(snowflake_connection_cfg).create()
session.sql("USE DATABASE HOL_DEMO").collect()
#session.sql("USE STAGE YOUR_STAGE").collect()
session.sql("CREATE OR REPLACE WAREHOUSE ASYNC_WH WITH WAREHOUSE_SIZE='MEDIUM' WAREHOUSE_TYPE = 'SNOWPARK-OPTIMIZED'").collect()

[Row(status='Warehouse ASYNC_WH successfully created.')]

# 3.3 Use SQL to import the data

In [4]:
sdf = session.sql("""
SELECT ts.date,
       ts.variable_name,
       ts.value
FROM FINANCIAL__ECONOMIC_ESSENTIALS.cybersyn.financial_fred_timeseries AS ts
JOIN FINANCIAL__ECONOMIC_ESSENTIALS.cybersyn.financial_fred_attributes AS att
    ON (att.variable = ts.variable)
WHERE variable_group IN ('Bank of Brazil Selic Interest Rate Target',
                         'Bank of Canada Overnight Lending Rate',
                         'Bank of England Official Bank Rate',
                         'Bank of Japan Policy-Rate Balance Rate',
                         'Bank of Mexico Official Overnight Target Rate',
                         'ECB Main Refinancing Operations Rate: Fixed Rate Tenders for Euro Area',
                         'Federal Funds Effective Rate')""")
sdf.limit(5).to_pandas()

Unnamed: 0,DATE,VARIABLE_NAME,VALUE
0,2024-01-20,ECB Main Refinancing Operations Rate: Fixed Ra...,0.045
1,2024-01-16,Bank of Canada Overnight Lending Rate,0.05
2,2023-12-02,Bank of Brazil Selic Interest Rate Target,0.1225
3,2023-11-29,Bank of England Official Bank Rate,0.0525
4,2023-11-28,Federal Funds Effective Rate,0.0533


# 3.4 Use Native Snowpark to Manipulate the Data
## 3.4.1 Filtering

TODO:

1. Trim the dataframe columns to just the two we care about for now - "DATE" and "VALUE"

Hints:

To select a specific set of spark dataframe columns you would use the syntax spark_dataframe[["col_name1", "col_name2"]], Snowpark syntax is the same



In [5]:
sdf_trimmed = sdf[["DATE", "VALUE"]]
sdf_trimmed.show()

------------------------
|"DATE"      |"VALUE"  |
------------------------
|2024-01-29  |0.1125   |
|2024-01-28  |0.1175   |
|2024-01-25  |0.0525   |
|2024-01-29  |0.045    |
|2024-01-30  |0.1125   |
|2024-01-30  |0.045    |
|2024-01-28  |0.045    |
|2024-01-28  |0.1125   |
|2024-01-30  |0.1175   |
|2024-01-26  |0.0533   |
------------------------



## 3.4.2 GroupBy

TODO:

1. Group by date to get the average overnight rates for each of the central banks 

Hints:

Spark groupBy grouped_df = df.groupBy("the column you want to group by").agg(mean("temperature")

Snowpark's "group by" is invoked via "group_by", but otherwise the same
Snowpark's "mean" function is imported in the cell below 


In [6]:
from snowflake.snowpark.functions import mean as mean_

In [7]:
sdf_grouped = sdf_trimmed.group_by("DATE").agg(mean_("VALUE"))
sdf_grouped.limit(5).to_pandas()

Unnamed: 0,DATE,AVG(VALUE)
0,2024-01-30,0.091667
1,2024-01-21,0.082075
2,2023-12-14,0.0614
3,1956-10-09,0.0512
4,2010-02-02,0.021771


In [8]:
sdf_grouped = sdf_grouped.filter((F.col("DATE") >= '2022-01-01'))

# 3.5 Join it with our Prior Data

In [9]:
sdf_ml = session.table("ML_PREDICT")
sdf_ml = sdf_ml.filter((F.col("SYMBOL") == 'IBM'))
sdf_joined = sdf_ml.join(sdf_grouped, sdf_ml.DATE == sdf_grouped.DATE, rsuffix="_right", how="left")
sdf_joined = sdf_joined.rename(F.col("AVG(VALUE)"), "NEW_FEATURE")
sdf_joined = sdf_joined.drop("DATE_RIGHT")

In [10]:
sdf_joined.show()

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"DATE"      |"OPEN"              |"HIGH"              |"LOW"               |"CLOSE"             |"SYMBOL"  |"CLOSE_M1"          |"CLOSE_M2"          |"CLOSE_M3"          |"CLOSE_M4"          |"CLOSE_M5"          |"CLOSE_PREDICT"     |"NEW_FEATURE"         |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|2022-01-06  |172.6999969482422   |175.3000030517578   |171.63999938964844  |172.0               |IBM       |174.9199981689453   |179.6999969482422   |182.009994506836    |177.57000732421875  |178.1999969482422   |172.98595

# 3.6 Train and Test the Model

In [13]:
sdf_joined = sdf_joined.na.fill(0.05)
sdf_joined_train, sdf_joined_test = sdf_joined.filter((F.col("DATE") <= '2023-01-01')), sdf_joined.filter((F.col("DATE") > '2023-01-01')) 

Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "DATE", Type: DateType(), Input Value: 0.05, Type: <class 'float'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "SYMBOL", Type: StringType(16777216), Input Value: 0.05, Type: <class 'float'>


In [18]:
REGISTRY_DATABASE_NAME = "MODEL_REGISTRY"
REGISTRY_SCHEMA_NAME = "PUBLIC"
native_registry = registry.Registry(session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME)

#train
regressor = LinearRegression
regressor = regressor(input_cols=["CLOSE_M1", "CLOSE_M2", "CLOSE_M3", "CLOSE_M4", "CLOSE_M5"],
                         label_cols=["CLOSE"],
                         output_cols=["CLOSE_PREDICT"])
regressor.fit(sdf_joined_train)

MODEL_NAME = "REGRESSION_IBM"
MODEL_VERSION = "v14"
model = native_registry.log_model(
    model_name=MODEL_NAME,
    version_name=MODEL_VERSION,
    model=regressor,
)

In [20]:
model_ = native_registry.get_model(MODEL_NAME).version(MODEL_VERSION)
model_.run(sdf_joined_test, function_name="predict").limit(10).to_pandas()

Unnamed: 0,DATE,OPEN,HIGH,LOW,CLOSE,SYMBOL,NEW_FEATURE,CLOSE_M1,CLOSE_M2,CLOSE_M3,CLOSE_M4,CLOSE_M5,CLOSE_PREDICT
0,2023-09-14,174.0,176.100006,173.580002,175.740005,IBM,0.063186,174.210007,176.300003,179.360001,178.179993,177.559998,173.147545
1,2023-08-01,196.240005,196.729996,195.279999,195.610001,IBM,0.063186,196.449997,195.830002,193.220001,194.5,193.619995,192.002038
2,2023-12-14,198.020004,199.619995,196.160004,198.110001,IBM,0.0614,197.960007,194.710007,193.179993,195.710007,194.270004,192.71849
3,2023-11-09,182.960007,184.119995,181.809998,182.410004,IBM,0.062114,182.889999,181.820007,179.229996,176.649994,190.539993,180.491222
4,2023-03-29,159.369995,161.050003,159.350006,160.770004,IBM,0.059614,157.649994,158.279999,160.25,158.929993,157.830002,157.79124
5,2023-02-13,150.949997,154.259995,150.919998,153.850006,IBM,0.058186,151.009995,150.869995,151.919998,154.649994,151.729996,151.377481
6,2023-06-30,191.630005,194.479996,191.259995,193.970001,IBM,0.061012,189.589996,189.25,188.059998,185.270004,186.679993,186.208682
7,2023-05-17,171.710007,172.929993,170.419998,172.690002,IBM,0.061043,172.070007,172.070007,172.570007,173.75,173.559998,170.482932
8,2023-09-27,172.619995,173.039993,169.050003,170.429993,IBM,0.062829,171.960007,176.080002,174.789993,173.929993,175.490005,171.425859
9,2023-08-14,177.970001,179.690002,177.309998,179.460007,IBM,0.063186,177.789993,177.970001,178.190002,179.800003,178.850006,175.606562


In [21]:
model_.run(sdf_joined_test, function_name="predict").write.save_as_table("ML_PREDICT", mode="overwrite")