In [0]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [0]:
# Reuse the active SparkSession if one exists; otherwise create one
# with the application name "SCD".
spark = (
    SparkSession.builder
    .appName("SCD")
    .getOrCreate()
)

In [0]:
# Incoming snapshot. Only the header option is set (no schema and no
# inferSchema), so Spark reads every column as a string.
geoname_source = (
    spark.read.format("csv")
    .option("header", "True")
    .load("dbfs:/FileStore/tables/geoname_source.csv")
)

# Existing data that the snapshot is compared against in the cells below;
# loaded with the same reader settings.
geoname = (
    spark.read.format("csv")
    .option("header", "True")
    .load("dbfs:/FileStore/tables/geoname.csv")
)

In [0]:
# New "current" versions of records that already exist in `geoname` but whose
# tracked attributes (latitude, population) differ in `geoname_source`.
#
# The comparison is NULL-safe: a plain `!=` evaluates to NULL when either side
# is NULL, and a NULL join condition drops the row, so a change between
# "missing" and a real value would never be detected. `eqNullSafe` treats two
# NULLs as equal and NULL vs. value as different.
#
# NOTE(review): both frames are loaded from CSV without a schema, so these are
# string comparisons ("1.0" differs from "1.00") - confirm that is intended.
# NOTE(review): the join does not restrict `geoname` to its current rows; if
# `geoname` already holds closed history rows for a geonameid, each of them is
# compared against the source row as well - confirm against the target data.
same_key = geoname_source.geonameid == geoname.geonameid
tracked_attrs_changed = (
    ~geoname_source.latitude.eqNullSafe(geoname.latitude)
    | ~geoname_source.population.eqNullSafe(geoname.population)
)

new_true = (
    geoname_source.join(geoname, same_key & tracked_attrs_changed)
    # Keep only the source-side columns; the target side was needed just for
    # the comparison.
    .select(geoname_source["*"])
    # Validity window: opens today, open-ended sentinel end date.
    .withColumn("start_date", F.current_date())
    .withColumn("end_date", F.lit("9999-12-31").cast(T.TimestampType()))
    # The flag is stored as the string 'True', not a boolean.
    .withColumn("is_current", F.lit("True"))
)

geonameid,name,asciiname,alternatenames,latitude,longitude,feature_class,feature_code,country_code,cc2,admin1_code,admin2_code,admin3_code,admin4_code,population,elevation,dem,timezone,modification_date,start_date,end_date,is_current
10000278,Dongwanzi,Dongwanzi,"Dongwanzi,dong wan zi,东湾子",44.903199,116.06985,P,PPL,CN,,10,1307,,,1203494,,852,Asia/Shanghai,2021-09-20,2024-10-07,9999-12-31T00:00:00.000+0000,True
10000341,Dalongchi,Dalongchi,"Dalongchi,da long chi,大龙池",44.926783,116.11892,P,PPL,CN,,10,1307,,,1203494,,1055,Asia/Shanghai,2021-09-20,2024-10-07,9999-12-31T00:00:00.000+0000,True
10000539,Gangfang Yaozi,Gangfang Yaozi,"Gangfang Yaozi,gang fang yao zi,缸房窑子",44.891143,116.3312,P,PPL,CN,,10,1307,,,1203494,,575,Asia/Shanghai,2021-09-20,2024-10-07,9999-12-31T00:00:00.000+0000,True
10000761,Nanguan,Nanguan,"Nanguan,sa san ying,撒三营",45.379818,116.52841,P,PPL,CN,,10,1308,,,1203494,,778,Asia/Shanghai,2021-09-20,2024-10-07,9999-12-31T00:00:00.000+0000,True
10000899,Fangjialiang,Fangjialiang,"Fangjialiang,fang jia liang,房家梁",45.246718,116.6538,P,PPL,CN,,10,1308,,,1203494,,909,Asia/Shanghai,2021-09-20,2024-10-07,9999-12-31T00:00:00.000+0000,True
10000989,Liangjianfangcun,Liangjianfangcun,"Liangjianfang,Liangjianfangcun,liang jian fang,liang jian fang cun,两间房,两间房村",45.440175,116.76081,P,PPL,CN,,10,1308,,,1203494,,744,Asia/Shanghai,2021-09-20,2024-10-07,9999-12-31T00:00:00.000+0000,True
10001070,Henanying,Henanying,"Henanying,he nan ying,河南营",45.951906,116.83629,P,PPL,CN,,10,1308,,,1203494,,1180,Asia/Shanghai,2021-09-20,2024-10-07,9999-12-31T00:00:00.000+0000,True
10001109,Laodongyingcun,Laodongyingcun,"Laodongying,Laodongyingcun,lao dong ying,lao dong ying cun,老东营,老东营村",45.690909,116.87376,P,PPL,CN,,10,1308,,,1203494,,1081,Asia/Shanghai,2021-09-20,2024-10-07,9999-12-31T00:00:00.000+0000,True
1000113,Greenfountain,Greenfountain,,-36.910995,26.94347,S,FRM,ZA,,05,DC10,EC105,,1203494,,81,Africa/Johannesburg,2012-07-12,2024-10-07,9999-12-31T00:00:00.000+0000,True
1000128,Greefswald,Greefswald,,-24.433706,29.37706,S,FRM,ZA,,09,DC34,LIM341,,1203494,,550,Africa/Johannesburg,2012-07-12,2024-10-07,9999-12-31T00:00:00.000+0000,True


In [0]:
# Target rows being superseded: every `geoname` row whose geonameid appears in
# `new_true` gets today's date as end_date and the string flag 'False'.
# NOTE(review): there is no filter on the existing is_current value, so every
# matching target row is re-stamped, not only rows that were still current.
old_updated = (
    geoname.join(new_true, geoname["geonameid"] == new_true["geonameid"])
    # Keep only the target-side columns.
    .select(geoname["*"])
    # Drop and re-add so both columns end up as the last two, in this order.
    .drop("end_date", "is_current")
    .withColumn("end_date", F.current_date())
    .withColumn("is_current", F.lit("False"))
)

Out[28]: 4316920

In [0]:
# Target rows whose geonameid does not appear in `new_true`; they are carried
# over as they are. A left-anti join keeps only left rows with no match.
unaltered = (
    geoname.join(new_true, "geonameid", "left_anti")
    .select(geoname["*"])
)

Out[29]: 8633526

In [0]:
# Source rows whose geonameid is not present in `geoname` at all: these are
# inserted as current records with a validity window that opens today.
new_entries = (
    geoname_source.join(geoname, "geonameid", "left_anti")
    .withColumn("start_date", F.current_date())
    # Open-ended sentinel end date.
    .withColumn("end_date", F.lit("9999-12-31").cast(T.TimestampType()))
    # The flag is stored as the string 'True', not a boolean.
    .withColumn("is_current", F.lit("True"))
)

In [0]:
# Assemble the refreshed dimension from its four disjoint parts:
#   new_true    - new current versions of changed records
#   old_updated - the superseded versions, now closed
#   unaltered   - target rows that were not touched
#   new_entries - records seen for the first time
#
# `union` matches columns purely by position, so any difference in column order
# between the source-derived and target-derived frames would silently put
# values under the wrong column. `unionByName` aligns columns by name instead
# and raises if the column sets differ.
geo_name_scd = (
    new_true
    .unionByName(old_updated)
    .unionByName(unaltered)
    .unionByName(new_entries)
)