In [1]:
import os
import sys
import socket

import sys,uuid,datetime
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext

key = os.environ["MINIO_ROOT_USER"]
secret = os.environ["MINIO_ROOT_PASSWORD"]
endpoint = os.environ["MINIO_SECRET_ENDPOINT"]
endpoint = "http://127.0.0.1:9000"
print(endpoint)

#sc.stop()

spark = SparkSession.builder \
.master("k8s://https://kubernetes.docker.internal:6443") \
.appName("playing_with_immo24") \
.config("spark.hadoop.fs.s3a.access.key", key) \
.config("spark.hadoop.fs.s3a.secret.key", secret) \
.config("spark.hadoop.fs.s3a.endpoint", endpoint) \
.config("spark.hadoop.fs.s3a.path.style.access", "true") \
.config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
.config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0,org.apache.hadoop:hadoop-aws:3.2.0,com.amazonaws:aws-java-sdk-bundle:1.11.375") \
.config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore") \
.config('spark.submit.deployMode', 'client') \
.config("spark.kubernetes.container.image", "spark:spark-docker") \
.config("spark.kubernetes.pyspark.pythonVersion", "3") \
.config("spark.kubernetes.authenticate.driver.serviceAccountName", "default") \
.config("spark.executor.instances", "1") \
.config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
.config("spark.kubernetes.executor.request.cores","0.5") \
.config("spark.kubernetes.executor.limit.cores","1") \
.config("jupyterService.jupyterPort", "30888") \
.config("serviceAccount", "spark") \
.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
.getOrCreate()

#.config("spark.driver.host", "10.1.2.104") \
#.config("spark.driver.port", "4040") \

sc = spark.sparkContext
#sc._conf.getAll()

http://127.0.0.1:9000


In [2]:
import pandas as pd
import json, ijson #made for big files which cannot be handled in memory

#visualisations
import folium
from folium import plugins

ModuleNotFoundError: No module named 'pandas'

In [2]:
path_BE = "s3a://real-estate/staging/201031_Bern_buy_0_flat.gz" 
path_SO = "s3a://real-estate/staging/201031_Solothurn_buy_0_flat.gz"
df_SO = spark.read.json(path_SO)
df = spark.read.json(path_BE)

In [3]:
from pyspark.sql.types import ArrayType, StructType
from pyspark.sql.functions import col, explode_outer

#Flatten array of structs and structs
def flatten(df):

   # compute Complex Fields (Lists and Structs) in Schema   
   complex_fields = dict([(field.name, field.dataType)
                             for field in df.schema.fields
                             if (type(field.dataType) == ArrayType or type(field.dataType) == StructType) and field.name.startswith('propertyDetails')])
   
   #print(complex_fields) 
   while len(complex_fields)!=0:    
        
      col_name=list(complex_fields.keys())[0]
      #print ("Processing :"+col_name+" Type : "+str(type(complex_fields[col_name])))
       
      if col_name in ["propertyDetails_images","propertyDetails_pdfs","propertyDetails_commuteTimes_defaultPois_transportations"]:
            #remove and skip next part
            df=df.drop(col_name)
      else:
          # if StructType then convert all sub element to columns.
          # i.e. flatten structs
          if (type(complex_fields[col_name]) == StructType):
             expanded = [col(col_name+'.'+k).alias(col_name+'_'+k) for k in [ n.name for n in  complex_fields[col_name]]]
             df=df.select("*", *expanded).drop(col_name)


          # if ArrayType then add the Array Elements as Rows using the explode function
          # i.e. explode Arrays
          elif (type(complex_fields[col_name]) == ArrayType):
             df=df.withColumn(col_name,explode_outer(col_name))
    
      # recompute remaining Complex Fields in Schema       
      complex_fields = dict([(field.name, field.dataType)
                             for field in df.schema.fields
                             if type(field.dataType) == ArrayType or  type(field.dataType) == StructType])
      #print(complex_fields)
      #print(df.count())

   return df

df_flatten_BE=flatten(df)
df_flatten_SO=flatten(df_SO)

In [4]:
df_flatten_BE.printSchema()
df_flatten_BE.count()

root
 |-- propertyDetails_accountId: long (nullable = true)
 |-- propertyDetails_availableFrom: string (nullable = true)
 |-- propertyDetails_availableFromFormatted: string (nullable = true)
 |-- propertyDetails_cityId: long (nullable = true)
 |-- propertyDetails_cityName: string (nullable = true)
 |-- propertyDetails_contactFormTypeId: long (nullable = true)
 |-- propertyDetails_countryId: long (nullable = true)
 |-- propertyDetails_description: string (nullable = true)
 |-- propertyDetails_geoAccuracy: long (nullable = true)
 |-- propertyDetails_hasNewBuildingProject: boolean (nullable = true)
 |-- propertyDetails_hasVirtualTour: boolean (nullable = true)
 |-- propertyDetails_id: long (nullable = true)
 |-- propertyDetails_isBuyRent: string (nullable = true)
 |-- propertyDetails_isHighlighted: boolean (nullable = true)
 |-- propertyDetails_isNew: boolean (nullable = true)
 |-- propertyDetails_isNewEndDate: string (nullable = true)
 |-- propertyDetails_isOnline: boolean (nullable = tr

20

In [19]:
insert_columns = '\n, '.join(df_flatten_BE.columns)
#print(insert_columns)

#update_columns = ', trg.'.join(df_flatten_BE.columns).join(' = src.')


update_columns = '\n, '.join(['trg.'+ c +' = src.'+c for c in df_flatten_BE.columns])
print(update_columns)

trg.propertyDetails_accountId = src.propertyDetails_accountId
, trg.propertyDetails_availableFrom = src.propertyDetails_availableFrom
, trg.propertyDetails_availableFromFormatted = src.propertyDetails_availableFromFormatted
, trg.propertyDetails_cityId = src.propertyDetails_cityId
, trg.propertyDetails_cityName = src.propertyDetails_cityName
, trg.propertyDetails_contactFormTypeId = src.propertyDetails_contactFormTypeId
, trg.propertyDetails_countryId = src.propertyDetails_countryId
, trg.propertyDetails_description = src.propertyDetails_description
, trg.propertyDetails_geoAccuracy = src.propertyDetails_geoAccuracy
, trg.propertyDetails_hasNewBuildingProject = src.propertyDetails_hasNewBuildingProject
, trg.propertyDetails_hasVirtualTour = src.propertyDetails_hasVirtualTour
, trg.propertyDetails_id = src.propertyDetails_id
, trg.propertyDetails_isBuyRent = src.propertyDetails_isBuyRent
, trg.propertyDetails_isHighlighted = src.propertyDetails_isHighlighted
, trg.propertyDetails_isNew 

In [20]:
from functools import reduce
from pyspark.sql import DataFrame

dfs = []
dfs.append(df_flatten_BE)
dfs.append(df_flatten_SO)

#doesn work with dataframes with different schemas
#df_all = reduce(DataFrame.unionAll, dfs)

In [None]:
from functools import reduce
from pyspark.sql import DataFrame

prop_s3_coordinates = ["s3a://real-estate/staging/201031_Bern_buy_0_flat.gz", "s3a://real-estate/staging/201031_Solothurn_buy_0_flat.gz"]

dfs = []
for path in prop_s3_coordinates:
    dfs.append(spark.read.json(path))

df_all = reduce(DataFrame.unionAll, dfs)

In [13]:
#df_flatten.select("propertyDetails_id", "propertyDetails_normalizedPrice").show()
#df_flatten.select("id", "normalizedPrice").show()
df_flatten.createOrReplaceTempView("flatten")

In [16]:
spark.sql("SELECT propertyDetails_cityName, count(*) FROM flatten GROUP BY propertyDetails_cityName").show()

+------------------------+--------+
|propertyDetails_cityName|count(1)|
+------------------------+--------+
|                    Bern|      20|
+------------------------+--------+



In [23]:
#map_SO = folium.Map(location=[47.03303, 7.8], zoom_start=12)
#map_SO

#marker_cluster = folium.MarkerCluster().add_to(map_SO)
#from folium.plugins import MarkerCluster

stops_map = folium.Map(location=[47.03303, 7.8], zoom_start=12)
marker_cluster = MarkerCluster().add_to(stops_map)
for name, row in df.iloc[:1000].iterrows():
    folium.Marker([row["propertyDetails_longitude"], row["propertyDetails_latitude"]], popup=row["propertyDetails_title"]).add_to(marker_cluster)
stops_map.create_map('stops.html')
stops_map


NameError: name 'MarkerCluster' is not defined