# Real Estate Rental ROI Prediction - Loading the Dataset Using Snowpark Python


## Loading Real Estate Ads into Snowflake

### Import the dependencies and connect to Snowflake

In [None]:
# Snowpark
from snowflake.snowpark import Session
from snowflake.snowpark.types import *
from snowflake.snowpark.functions import *

# Show the installed Snowpark version; as the cell's last expression, the
# value returned by version() is rendered as the cell output (nothing is printed)
from importlib.metadata import version
version('snowflake_snowpark_python')

In [None]:
# Other
import json

**Before connecting, make sure you have updated creds.json with the connection information for your Snowflake account.**

In [None]:
# Load the Snowflake connection parameters from the local creds.json file.
# The encoding is passed explicitly so the JSON is decoded as UTF-8 instead of
# whatever the platform's default text encoding happens to be.
with open('creds.json', encoding='utf-8') as creds_file:
    connection_parameters = json.load(creds_file)

In [None]:
# Configure a session builder with the loaded connection parameters,
# then open the Snowpark session used by the rest of the notebook
session_builder = Session.builder.configs(connection_parameters)
session = session_builder.create()

The **get_** functions can be used to get information about the current database, schema, role, etc.

In [None]:
# Ask the session for its current context, then report it on a single line
current_schema = session.get_fully_qualified_current_schema()
current_role = session.get_current_role()
current_warehouse = session.get_current_warehouse()
print(f"Current schema: {current_schema}, current role: {current_role}, current warehouse:  {current_warehouse}")

### Define the Staging Area and the Schema for the ADS table

Using SQL, we can create an internal stage and then use the **put** function to upload the **realestatesads.csv.gz** file to it.

In [None]:
stage_name = "REAL_ESTATE_DATA"

# (Re)create the internal stage that will receive the source file
create_stage_sql = f"CREATE or replace STAGE {stage_name}"
session.sql(create_stage_sql).collect()

# Upload the source file to the stage; the file name already ends in .gz,
# so auto-compression is switched off
putResult = session.file.put(
    "data/realestatesads.csv.gz",
    f"@{stage_name}",
    auto_compress=False,
)

# Display the result of the upload as the cell output
putResult

Define the schema for our **ADS** table

In [None]:
# Columns of the ADS table as (name, Snowpark type class) pairs,
# listed in the order in which the fields are declared in the schema
_ads_columns = [
    ("ADS_ID", IntegerType),
    ("ADS_CATEGORY_NAME", StringType),
    ("ADS_FIRST_PUBLICATION_DATE", TimestampType),
    ("ADS_INDEX_DATE", TimestampType),
    ("ADS_SUBJECT", StringType),
    ("ADS_PRICE", FloatType),
    ("ADS_GEO_CITY", StringType),
    ("ADS_GEO_ZIPCODE", StringType),
    ("ADS_GEO_LAT", FloatType),
    ("ADS_GEO_LNG", FloatType),
    ("ADS_OPT_URGENT", BooleanType),
    ("ADS_OWNER_TYPE", StringType),
    ("ADS_ATTR_REAL_ESTATE_TYPE", StringType),
    ("ADS_ATTR_ROOMS", IntegerType),
    ("ADS_ATTR_SQUARE", IntegerType),
    ("ADS_ATTR_GES", StringType),
    ("ADS_ATTR_ENERGY_RATE", StringType),
    ("ADS_ATTR_FAI_INCLUDED", StringType),
    ("ADS_ATTR_CHARGES_INCLUDED", StringType),
    ("ADS_ATTR_FURNISHED", StringType),
    ("ADS_CODEINSEE", StringType),
]

# Build the schema for the ADS table: one StructField per column, each with
# its own freshly instantiated type object
df_ADS_Schema = StructType(
    [StructField(column_name, column_type()) for column_name, column_type in _ads_columns]
)



Load the **realestatesads.csv.gz** file with a DataFrame reader and save it into a table

In [None]:
# Create a reader that applies the ADS schema defined above
df_Reader = session.read.schema(df_ADS_Schema)

# Point the reader at the staged file to get the data into a DataFrame
staged_file = f"@{stage_name}/realestatesads.csv.gz"
df_ADS = df_Reader.csv(staged_file)

In [None]:
# Save the DataFrame into the ADS table using the writer's "overwrite" mode
ads_writer = df_ADS.write.mode("overwrite")
ret = ads_writer.saveAsTable("ADS")

# Display whatever the save call returned as the cell output
ret


Check the result

In [None]:
import pandas as pd

# Keep only the rows of the ADS table whose category is "Locations"
is_location_ad = col("ADS_CATEGORY_NAME") == "Locations"
ADS_df_Raw = session.table("ADS").filter(is_location_ad)

# Collect at most 10 rows and show them as a pandas DataFrame
preview_rows = ADS_df_Raw.limit(10).collect()
pd.DataFrame(preview_rows)