# Get more fancy pants data
- go to the web factset 
- request it
- back to your sheet and run it (insert the name)


# Integrate it with the model
- join the data
- so update the model features


In [1]:
import json
import pandas as pd
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F
from snowflake.snowpark.types import PandasDataFrameType, IntegerType, StringType, FloatType, DateType
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.linear_model import LinearRegression
#from snowflake.ml.registry import model_registry
from snowflake.ml.registry import registry
from snowflake.ml._internal.utils import identifier

# 3.1 Get the Data Share


https://app.snowflake.com/marketplace/listing/GZT1ZA3NLF/similarweb-ltd-global-stocks-25-000-tickers-digital-traffic-data-by-domain?search=Global%20Stocks

# 3.2 Reading Snowflake Connection Details, create a Session


In [2]:
snowflake_connection_cfg = json.loads(open("/Users/mitaylor/Documents/creds/creds_sf_azure.json").read()) # <--- Update here
session = Session.builder.configs(snowflake_connection_cfg).create()
session.sql("USE DATABASE HOL_DEMO").collect()
#session.sql("USE STAGE YOUR_STAGE").collect()
session.sql("CREATE OR REPLACE WAREHOUSE ASYNC_WH WITH WAREHOUSE_SIZE='MEDIUM' WAREHOUSE_TYPE = 'SNOWPARK-OPTIMIZED'").collect()

[Row(status='Warehouse ASYNC_WH successfully created.')]

# 3.3 Use SQL to import the data

In [3]:
sdf = session.sql("""
SELECT ts.date,
       ts.variable_name,
       ts.value
FROM FINANCIAL__ECONOMIC_ESSENTIALS.cybersyn.financial_fred_timeseries AS ts
JOIN FINANCIAL__ECONOMIC_ESSENTIALS.cybersyn.financial_fred_attributes AS att
    ON (att.variable = ts.variable)
WHERE variable_group IN ('Bank of Brazil Selic Interest Rate Target',
                         'Bank of Canada Overnight Lending Rate',
                         'Bank of England Official Bank Rate',
                         'Bank of Japan Policy-Rate Balance Rate',
                         'Bank of Mexico Official Overnight Target Rate',
                         'ECB Main Refinancing Operations Rate: Fixed Rate Tenders for Euro Area',
                         'Federal Funds Effective Rate')""")
sdf.limit(5).to_pandas()

Unnamed: 0,DATE,VARIABLE_NAME,VALUE
0,2024-01-20,ECB Main Refinancing Operations Rate: Fixed Ra...,0.045
1,2024-01-16,Bank of Canada Overnight Lending Rate,0.05
2,2023-12-02,Bank of Brazil Selic Interest Rate Target,0.1225
3,2023-11-29,Bank of England Official Bank Rate,0.0525
4,2023-11-28,Federal Funds Effective Rate,0.0533


# 3.4 Use Native Snowpark to Manipulate the Data
## 3.4.1 Filtering

TODO:

1. Trim the dataframe columns to just the two we care about for now - "DATE" and "VALUE"

Hints:

To select a specific set of spark dataframe columns you would use the syntax spark_dataframe[["col_name1", "col_name2"]], Snowpark syntax is the same



In [4]:
sdf_trimmed = sdf[["DATE", "VALUE"]]
sdf_trimmed.show()

------------------------
|"DATE"      |"VALUE"  |
------------------------
|2024-01-20  |0.045    |
|2024-01-16  |0.05     |
|2023-12-02  |0.1225   |
|2023-11-29  |0.0525   |
|2023-11-28  |0.0533   |
|2023-11-23  |-0.001   |
|2023-11-30  |0.1125   |
|2023-11-24  |-0.001   |
|2023-11-28  |-0.001   |
|2023-11-30  |0.1225   |
------------------------



## 3.4.2 GroupBy

TODO:

1. Group by date to get the average overnight rates for each of the central banks 

Hints:

Spark groupBy grouped_df = df.groupBy("the column you want to group by").agg(mean("temperature")

Snowpark's "group by" is invoked via "group_by", but otherwise the same
Snowpark's "mean" function is imported in the cell below 


In [5]:
from snowflake.snowpark.functions import mean as mean_

In [6]:
sdf_grouped = sdf_trimmed.group_by("DATE").agg(mean_("VALUE"))
sdf_grouped.limit(5).to_pandas()

Unnamed: 0,DATE,AVG(VALUE)
0,2024-01-26,0.0718
1,2024-01-25,0.0718
2,2014-08-10,0.0356
3,1960-12-31,0.039733
4,1997-09-21,0.0293


In [7]:
sdf_grouped = sdf_grouped.filter((F.col("DATE") >= '2022-01-01'))

## 3.4.3 Join it with our Prior Data

In [8]:
sdf_ml = session.table("ML_PREDICT")
sdf_ml = sdf_ml.filter((F.col("SYMBOL") == 'IBM'))
sdf_joined = sdf_ml.join(sdf_grouped, sdf_ml.DATE == sdf_grouped.DATE, rsuffix="_right", how="left")
sdf_joined = sdf_joined.rename(F.col("AVG(VALUE)"), "NEW_FEATURE")
sdf_joined = sdf_joined.drop("DATE_RIGHT")

In [9]:
sdf_joined.show()

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"DATE"      |"OPEN"              |"HIGH"              |"LOW"               |"CLOSE"             |"SYMBOL"  |"CLOSE_M1"          |"CLOSE_M2"          |"CLOSE_M3"          |"CLOSE_M4"          |"CLOSE_M5"          |"CLOSE_PREDICT"     |"NEW_FEATURE"         |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|2022-09-12  |159.58999633789062  |164.25999450683594  |159.3000030517578   |163.42999267578125  |IBM       |157.3699951171875   |154.4600067138672   |155.9600067138672   |154.52999877929688  |155.80999755859375  |154.92362

# 3.5 Train and Test the Model

In [10]:
sdf_joined = sdf_joined.na.fill(0.05)
sdf_joined_train, sdf_joined_test = sdf_joined.filter((F.col("DATE") <= '2023-01-01')), sdf_joined.filter((F.col("DATE") > '2023-01-01')) 

Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "DATE", Type: DateType(), Input Value: 0.05, Type: <class 'float'>
Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "SYMBOL", Type: StringType(16777216), Input Value: 0.05, Type: <class 'float'>


In [11]:
REGISTRY_DATABASE_NAME = "MODEL_REGISTRY"
REGISTRY_SCHEMA_NAME = "PUBLIC"
native_registry = registry.Registry(session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME)

#train
regressor = LinearRegression
regressor = regressor(input_cols=["CLOSE_M1", "CLOSE_M2", "CLOSE_M3", "CLOSE_M4", "CLOSE_M5"],
                         label_cols=["CLOSE"],
                         output_cols=["CLOSE_PREDICT"])
regressor.fit(sdf_joined_train)

MODEL_NAME = "REGRESSION_IBM"
MODEL_VERSION = "v14"
model = native_registry.log_model(
    model_name=MODEL_NAME,
    version_name=MODEL_VERSION,
    model=regressor,
)

In [12]:
model_ = native_registry.get_model(MODEL_NAME).version(MODEL_VERSION)
model_.run(sdf_joined_test, function_name="predict").limit(10).to_pandas()

Unnamed: 0,DATE,OPEN,HIGH,LOW,CLOSE,SYMBOL,NEW_FEATURE,CLOSE_M1,CLOSE_M2,CLOSE_M3,CLOSE_M4,CLOSE_M5,CLOSE_PREDICT
0,2023-09-11,180.070007,180.300003,177.339996,179.360001,IBM,0.063186,178.179993,177.559998,182.910004,189.699997,189.460007,176.024114
1,2023-07-27,196.020004,197.199997,192.550003,193.220001,IBM,0.063186,194.5,193.619995,192.75,191.940002,193.130005,190.4534
2,2023-12-11,193.110001,193.490005,191.419998,193.179993,IBM,0.062114,195.710007,194.270004,192.320007,193.419998,189.429993,191.032851
3,2023-11-06,176.380005,179.429993,176.210007,179.229996,IBM,0.062114,176.649994,190.539993,173.970001,170.770004,170.289993,176.98865
4,2023-03-24,158.860001,160.339996,157.850006,160.25,IBM,0.059614,158.929993,157.830002,147.919998,157.399994,155.0,157.35637
5,2023-02-08,153.880005,154.580002,151.169998,151.919998,IBM,0.057471,154.649994,151.729996,165.559998,150.820007,145.429993,155.14867
6,2023-06-27,185.889999,188.389999,185.669998,188.059998,IBM,0.062457,185.270004,186.679993,187.0,175.429993,185.009995,183.468657
7,2023-05-12,173.619995,174.059998,171.0,172.570007,IBM,0.061043,173.75,173.559998,171.770004,173.5,172.190002,171.719619
8,2023-01-09,130.470001,133.410004,129.889999,130.149994,IBM,0.055329,129.619995,125.019997,126.360001,125.07,129.929993,131.507156
9,2023-08-28,180.089996,180.589996,178.550003,180.190002,IBM,0.064967,179.070007,140.360001,143.110001,137.589996,137.350006,167.375192


In [13]:
model_.run(sdf_joined_test, function_name="predict").write.save_as_table("ML_PREDICT", mode="overwrite")