# Get more fancy pants data
- Go to FactSet on the web
- Request the data
- Go back to your sheet and run it (insert the name)


# Integrate it with the model
- Join the new data to the existing model data
- Update the model features to include the new column


In [16]:
import json
import pandas as pd
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F
from snowflake.snowpark.types import PandasDataFrameType, IntegerType, StringType, FloatType, DateType
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.linear_model import LinearRegression
from snowflake.ml.registry import model_registry
from snowflake.ml._internal.utils import identifier

# 3.1 Get the Data Share


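Example Marketplace listing (Similarweb Ltd - Global Stocks, 25,000 tickers, digital traffic data by domain):
https://app.snowflake.com/marketplace/listing/GZT1ZA3NLF/similarweb-ltd-global-stocks-25-000-tickers-digital-traffic-data-by-domain?search=Global%20Stocks

Note: the query in section 3.3 reads from the `FINANCIAL__ECONOMIC_ESSENTIALS` database (`cybersyn` schema), so that data share must also be available in your account.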

# 3.2 Reading Snowflake Connection Details, create a Session


In [2]:
# Location of the JSON file holding the Snowflake connection parameters.
CREDS_PATH = "/Users/mitaylor/Documents/creds/creds_sf_azure.json" # <--- Update here

# Read the connection settings with a context manager so the file handle is
# closed as soon as the JSON has been parsed (the bare open().read() leaked it).
with open(CREDS_PATH) as creds_file:
    snowflake_connection_cfg = json.load(creds_file)

# Build the Snowpark session from the parsed settings and select the demo database.
session = Session.builder.configs(snowflake_connection_cfg).create()
session.sql("USE DATABASE HOL_DEMO").collect()
#session.sql("USE STAGE YOUR_STAGE").collect()

# (Re)create the Snowpark-optimized warehouse; note that CREATE OR REPLACE drops
# any existing ASYNC_WH first. The status row is the cell's displayed output.
session.sql("CREATE OR REPLACE WAREHOUSE ASYNC_WH WITH WAREHOUSE_SIZE='MEDIUM' WAREHOUSE_TYPE = 'SNOWPARK-OPTIMIZED'").collect()

[Row(status='Warehouse ASYNC_WH successfully created.')]

# 3.3 Use SQL to import the data

In [10]:
# Query text: date, series name and value for a fixed list of central-bank
# policy-rate variable groups, joining the timeseries table to its attributes
# table on the shared `variable` key.
central_bank_rates_sql = """
SELECT ts.date,
       ts.variable_name,
       ts.value
FROM FINANCIAL__ECONOMIC_ESSENTIALS.cybersyn.financial_fred_timeseries AS ts
JOIN FINANCIAL__ECONOMIC_ESSENTIALS.cybersyn.financial_fred_attributes AS att
    ON (att.variable = ts.variable)
WHERE variable_group IN ('Bank of Brazil Selic Interest Rate Target',
                         'Bank of Canada Overnight Lending Rate',
                         'Bank of England Official Bank Rate',
                         'Bank of Japan Policy-Rate Balance Rate',
                         'Bank of Mexico Official Overnight Target Rate',
                         'ECB Main Refinancing Operations Rate: Fixed Rate Tenders for Euro Area',
                         'Federal Funds Effective Rate')"""

# Wrap the query in a Snowpark DataFrame, then preview five rows as pandas.
sdf = session.sql(central_bank_rates_sql)
sdf.limit(5).to_pandas()

Unnamed: 0,DATE,VARIABLE_NAME,VALUE
0,2024-01-09,Bank of England Official Bank Rate,0.0525
1,2024-01-10,ECB Main Refinancing Operations Rate: Fixed Ra...,0.045
2,2023-12-19,Bank of Japan Policy-Rate Balance Rate,-0.001
3,2023-12-11,Bank of Brazil Selic Interest Rate Target,0.1225
4,2023-11-12,Bank of Mexico Official Overnight Target Rate,0.1125


# 3.4 Use Native Snowpark to Manipulate the Data
## 3.4.1 Filtering

TODO:

1. Trim the dataframe columns to just the two we care about for now - "DATE" and "VALUE"

Hints:

To select a specific set of Spark DataFrame columns you would use the syntax `spark_dataframe[["col_name1", "col_name2"]]`. The Snowpark syntax is the same.



In [11]:
# Keep only the two columns needed downstream, using list-style column selection.
columns_to_keep = ["DATE", "VALUE"]
sdf_trimmed = sdf[columns_to_keep]
sdf_trimmed.show()

------------------------
|"DATE"      |"VALUE"  |
------------------------
|2024-01-09  |0.0525   |
|2024-01-10  |0.045    |
|2023-12-19  |-0.001   |
|2023-12-11  |0.1225   |
|2023-11-12  |0.1125   |
|2014-02-24  |0.0025   |
|2024-01-30  |0.1175   |
|2024-01-26  |0.0533   |
|2024-01-29  |0.1125   |
|2024-01-28  |0.1175   |
------------------------



## 3.4.2 GroupBy

TODO:

1. Group by date to get the average overnight rate across the central banks for each date

Hints:

Spark groupBy: `grouped_df = df.groupBy("the column you want to group by").agg(mean("temperature"))`

Snowpark's "group by" is invoked via "group_by", but otherwise the same
Snowpark's "mean" function is imported in the cell below 


In [12]:
from snowflake.snowpark.functions import mean as mean_

In [13]:
# One row per DATE holding the mean of VALUE over all series on that date.
# The aggregate column comes back named "AVG(VALUE)" (see the preview below).
mean_value_expr = mean_("VALUE")
sdf_grouped = sdf_trimmed.group_by("DATE").agg(mean_value_expr)
sdf_grouped.limit(5).to_pandas()

Unnamed: 0,DATE,AVG(VALUE)
0,2024-01-19,0.0614
1,2023-11-24,0.062114
2,2023-11-28,0.062114
3,1979-11-06,0.109867
4,2009-10-27,0.021729


In [14]:
# Keep only averaged rates dated on or after the cutoff.
earliest_rate_date = '2022-01-01'
sdf_grouped = sdf_grouped.filter(F.col("DATE") >= earliest_rate_date)

# 3.5 Join it with our Prior Data

In [18]:
# Load the prepared modelling table and keep only the IBM rows.
sdf_ml = session.table("ML_PREPPED").filter(F.col("SYMBOL") == 'IBM')

# Left-join the averaged rates on DATE so every IBM row is kept; dates with no
# rate row get NULL. The right-hand DATE column is suffixed by the join, the
# aggregate is renamed to NEW_FEATURE, and the duplicate date column is dropped.
dates_match = sdf_ml.DATE == sdf_grouped.DATE
sdf_joined = (
    sdf_ml
    .join(sdf_grouped, dates_match, how="left", rsuffix="_right")
    .rename(F.col("AVG(VALUE)"), "NEW_FEATURE")
    .drop("DATE_RIGHT")
)

In [19]:
# Preview the joined frame to confirm NEW_FEATURE is present; rows whose DATE has
# no matching rate row show NULL in that column.
sdf_joined.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"DATE"      |"OPEN"              |"HIGH"              |"LOW"               |"CLOSE"             |"SYMBOL"  |"CLOSE_M1"          |"CLOSE_M2"          |"CLOSE_M3"          |"CLOSE_M4"          |"CLOSE_M5"          |"NEW_FEATURE"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|2020-09-18  |110.4000015258789   |110.87999725341795  |106.08999633789062  |106.83999633789062  |IBM       |106.83999633789062  |106.83999633789062  |106.83999633789062  |106.83999633789062  |106.83999633789062  |NULL           |
|2020-08-27  |127.14250183105467  |127.48500061035156  |123.8324966430664   

# 3.6 Train and Test the Model

In [24]:
# Placeholder used where the left join found no rate for a given date.
NEW_FEATURE_FILL_VALUE = 0.05
# Rows dated on or before this day are used for training, later rows for testing.
TRAIN_TEST_SPLIT_DATE = '2023-01-01'

# Fill only the joined-in feature. A blanket na.fill(0.05) would also overwrite
# nulls in any other compatible numeric column (prices, the CLOSE label) with 0.05.
sdf_joined = sdf_joined.na.fill(NEW_FEATURE_FILL_VALUE, subset=["NEW_FEATURE"])

# Chronological split so the test set is strictly later than the training set.
sdf_joined_train = sdf_joined.filter(F.col("DATE") <= TRAIN_TEST_SPLIT_DATE)
sdf_joined_test = sdf_joined.filter(F.col("DATE") > TRAIN_TEST_SPLIT_DATE)



In [26]:
# Where the model registry lives.
REGISTRY_DATABASE_NAME = "MODEL_REGISTRY"
REGISTRY_SCHEMA_NAME = "PUBLIC"

# NOTE(review): the recorded output of this cell shows a deprecation notice for
# model_registry.ModelRegistry (since 1.2.0) recommending snowflake.ml.registry.Registry.
registry = model_registry.ModelRegistry(
    session=session,
    database_name=REGISTRY_DATABASE_NAME,
    schema_name=REGISTRY_SCHEMA_NAME,
)

# train: the five CLOSE_M* columns plus the joined-in NEW_FEATURE predict CLOSE
feature_columns = ["CLOSE_M1", "CLOSE_M2", "CLOSE_M3", "CLOSE_M4", "CLOSE_M5", "NEW_FEATURE"]
regressor = LinearRegression(
    input_cols=feature_columns,
    label_cols=["CLOSE"],
    output_cols=["CLOSE_PREDICT"],
)
regressor.fit(sdf_joined_train)

# register the fitted model under a name/version pair
MODEL_NAME = "REGRESSION_IBM_W_NEW_DATA"
MODEL_VERSION = "v3"
model_tags = {"stage": "testing", "classifier_type": "Lin_Reg"}
model = registry.log_model(
    model_name=MODEL_NAME,
    model_version=MODEL_VERSION,
    model=regressor,
    tags=model_tags,
)

# deploy as a permanent deployment named "<model name><version>"
# NOTE(review): the recorded run ended in RuntimeError (2100) because
# snowflake-ml-python could not be satisfied for Python 3.11 in the Snowflake
# Anaconda channel; presumably raised here or in log_model - confirm before re-running.
model.deploy(
    deployment_name=MODEL_NAME + MODEL_VERSION,
    target_method="predict",
    permanent=True,
    options={"relax_version": True},
)

The `snowflake.ml.registry.model_registry.ModelRegistry` has been deprecated starting from version 1.2.0.
It will stay in the Private Preview phase. For future implementations, kindly utilize `snowflake.ml.registry.Registry`,
except when specifically required. The old model registry will be removed once all its primary functionalities are
fully integrated into the new registry.
        
  registry = model_registry.ModelRegistry(session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME)


RuntimeError: (2100) 
The model's dependencies are not available in Snowflake Anaconda Channel. 
Required packages are: "absl-py<2,>=1.4" "anyio<4,>=3.5" "cloudpickle<3,>=2.2" "numpy<2,>=1.24" "packaging<24,>=23.1" "pandas<2,>=1.5" "pyyaml<7,>=6.0" "snowflake-snowpark-python<2,>=1.11" "typing-extensions<5,>=4.7" "snowflake-ml-python<2,>=1.2" "scikit-learn<2,>=1.3"
Required Python version is: 3.11
Packages that are not available are: []
Packages that cannot meet your requirements are: ['snowflake-ml-python']
Package availability information of those you requested is: {'absl-py': [<Version('1.4.0')>], 'anyio': [<Version('3.5.0')>], 'cloudpickle': [<Version('2.2.1')>], 'numpy': [<Version('1.24.3')>, <Version('1.25.0')>, <Version('1.25.2')>, <Version('1.26.0')>, <Version('1.26.2')>, <Version('1.26.3')>], 'packaging': [<Version('23.1')>], 'pandas': [<Version('1.5.2')>, <Version('1.5.3')>], 'pyyaml': [<Version('6.0')>, <Version('6.0.1')>], 'snowflake-snowpark-python': [<Version('1.11.1')>], 'typing-extensions': [<Version('4.7.1')>], 'snowflake-ml-python': [], 'scikit-learn': [<Version('1.3.0')>]}


In [None]:
# predict
# Score the held-out split built in section 3.6. The previous argument,
# `sdf_filt_test`, is never defined in this notebook and would raise NameError on
# a fresh kernel; `sdf_joined_test` also carries the NEW_FEATURE column the model needs.
model.predict(deployment_name=MODEL_NAME + MODEL_VERSION, data=sdf_joined_test).limit(20).to_pandas()