# NOTEBOOK 1: INGEST HOUSING DATA INTO SNOWFLAKE

#### Download the housing dataset

In [None]:
import os
import tarfile
import urllib.request

# Location of the dataset archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing tarball and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the tar archive to download.
        housing_path: Directory that receives ``housing.tgz`` and the
            extracted members. It is created when it does not exist.

    Returns:
        None. The archive and its contents are written to ``housing_path``.

    Errors raised while creating the directory, downloading, or extracting
    are not caught here and propagate to the caller.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: refuse members that would
            # escape housing_path (absolute paths, "..", unsafe links).
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older interpreters have no extraction filters.
            housing_tgz.extractall(path=housing_path)

# Download and unpack the dataset into the configured local directory.
fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH)

In [None]:
# Snowpark
from snowflake.snowpark.session import Session
from snowflake.snowpark import functions as F
from snowflake.snowpark.functions import udf
from snowflake.snowpark.types import *

import pandas as pd 

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame.

    Args:
        housing_path: Directory containing ``housing.csv``.

    Returns:
        The result of ``pandas.read_csv`` on that file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Load the extracted CSV and preview its first rows.
housing = load_housing_data(housing_path=HOUSING_PATH)
housing.head()

In [None]:
import json

# Read the Snowflake connection settings from a local JSON file so that no
# credentials are hardcoded in the notebook. JSON is read as UTF-8 rather
# than with the platform's locale-dependent default encoding.
with open('creds.json', encoding='utf-8') as f:
    data = json.load(f)

# Report every absent setting at once instead of failing on the first lookup.
_required_cred_keys = ('username', 'password', 'sf_account', 'sf_wh', 'sf_db', 'sf_schema')
_missing_cred_keys = [key for key in _required_cred_keys if key not in data]
if _missing_cred_keys:
    raise KeyError(
        f"creds.json is missing required key(s): {', '.join(_missing_cred_keys)}"
    )

USERNAME = data['username']
PASSWORD = data['password']
SF_ACCOUNT = data['sf_account']
SF_WH = data['sf_wh']
SF_DB = data['sf_db']
SF_SCHEMA = data['sf_schema']

# Mapping passed to the Snowpark session builder.
CONNECTION_PARAMETERS = {
   "account": SF_ACCOUNT,
   "user": USERNAME,
   "password": PASSWORD,
   "database": SF_DB,
   "schema": SF_SCHEMA,
   "warehouse": SF_WH
}
# Open the Snowpark session used by the cells below.
session = Session.builder.configs(CONNECTION_PARAMETERS).create()

#### Create the HOUSING_DATA table

In [None]:
%%time

# DDL for the target table: nine float columns plus a varchar
# OCEAN_PROXIMITY column. Adjacent literals are joined by the parser.
query = (
    "create or replace table HOUSING_DATA ("
    "LONGITUDE float,"
    "LATITUDE float,"
    "HOUSING_MEDIAN_AGE float,"
    "TOTAL_ROOMS float,"
    "TOTAL_BEDROOMS float,"
    "POPULATION float,"
    "HOUSEHOLDS float,"
    "MEDIAN_INCOME float,"
    "MEDIAN_HOUSE_VALUE float,"
    "OCEAN_PROXIMITY varchar)"
)
        
# Run the CREATE OR REPLACE TABLE statement through the Snowpark session;
# the value returned by collect() is shown as the cell output.
session.sql(query).collect()

In [None]:
# Upper-case the column names before writing to Snowflake so they match the
# upper-case column names of the HOUSING_DATA table.
housing.columns = housing.columns.str.upper()
housing.head()

Write the pandas DataFrame into the Snowflake table named `HOUSING_DATA`.

In [None]:
# Upload the pandas DataFrame into HOUSING_DATA and keep the returned handle.
snowpark_df = session.write_pandas(housing, table_name="HOUSING_DATA")

In [None]:
# Read the table back into pandas and preview the first rows.
snowpark_df.to_pandas().head()