In [1]:
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session, placing the SQLite JDBC driver jar on the
# driver's classpath so the "org.sqlite.JDBC" driver can be loaded later.
spark = (
    SparkSession.builder
    .appName("JDBCFix")
    .config(
        "spark.driver.extraClassPath",
        '/kaggle/input/ecommerce-file/sqlite-jdbc-3.45.1.0.jar',
    )
    .getOrCreate()
)

# Echo the setting back from the live context to confirm it was applied.
print(spark.sparkContext.getConf().get("spark.driver.extraClassPath"))

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/19 04:16:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


/kaggle/input/ecommerce-file/sqlite-jdbc-3.45.1.0.jar


In [102]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from py4j.java_gateway import java_import
import sqlite3
import pandas as pd
import os

# Single source of truth for the SQLite file location: the existence check, the
# file creation and the JDBC URL below all derive from this one path, so they
# cannot drift apart if the notebook's working directory changes.
sqlite_file = '/kaggle/working/ecommerce.db'

if os.path.exists(sqlite_file):
    print('Already DB Exists')
else:
    # Connecting creates the empty database file; close the handle right away
    # so no stray connection is kept open on the file.
    sqlite3.connect(sqlite_file).close()

# JDBC URL used by every Spark/JDBC helper in this notebook.
db_file_path = f"jdbc:sqlite:{sqlite_file}"

# Read a table or sub-query from the SQLite database into a Spark DataFrame
def sqltodf(query):
    """Load ``query`` from the SQLite database and return it as a Spark DataFrame.

    ``query`` is passed to the JDBC reader as its ``dbtable`` option; callers in
    this notebook pass a parenthesised sub-select such as
    ``(select * from per_customers)``.
    """
    jdbc_reader = (
        spark.read.format("jdbc")
        .option("url", f"{db_file_path}")
        .option("driver", "org.sqlite.JDBC")
        .option("dbtable", f"{query}")
    )
    return jdbc_reader.load()

# Write a DataFrame into a database table
def _write_df_to_table(df_to_write, tbl_name, save_mode):
    """Write ``df_to_write`` into table ``tbl_name`` over JDBC using ``save_mode``.

    The table's column list is read first and the DataFrame is projected onto
    exactly those columns, in table order, so extra DataFrame columns are
    dropped before writing. The table is therefore expected to exist already,
    and each of its columns must be present in ``df_to_write``.

    ``tbl_name`` is interpolated into SQL text unescaped; pass only trusted
    table names.
    """
    # `where 1=0` matches no rows; only the column names of the result are used.
    tbl_col = sqltodf(f'(select * from {tbl_name} where 1=0)')
    df_write = df_to_write.select(*tbl_col.columns)
    df_write.write.format("jdbc") \
        .option("url", f"{db_file_path}") \
        .option("driver", "org.sqlite.JDBC") \
        .option("dbtable", f"{tbl_name}") \
        .mode(save_mode) \
        .save()


def writetodb_over(df_to_write, tbl_name):
    """Replace the contents of ``tbl_name`` with ``df_to_write`` ("overwrite" mode).

    NOTE(review): with Spark's default JDBC writer settings, overwrite mode is
    expected to drop and recreate the table, so column types declared in an
    earlier CREATE TABLE may not survive -- confirm if the declared schema matters.
    """
    _write_df_to_table(df_to_write, tbl_name, "overwrite")


# Write a DataFrame into a database table, keeping the rows already there
def writetodb_append(df_to_write, tbl_name):
    """Append the rows of ``df_to_write`` to ``tbl_name`` ("append" mode)."""
    _write_df_to_table(df_to_write, tbl_name, "append")

# Execute a SQL statement (DDL/DML) directly against the database
def excutequery(query):
    """Run a single SQL statement (DDL or DML) directly against the SQLite database.

    The statement is executed through ``java.sql.DriverManager`` inside the
    Spark driver JVM via ``executeUpdate``, so it is meant for statements that
    do not return a result set (CREATE TABLE, UPDATE, ...). Nothing is returned.

    The JDBC statement and connection are always closed, even when the
    statement fails, so no handle is left open on the database file.
    """
    gateway = spark.sparkContext._gateway
    java_import(gateway.jvm, 'java.sql.DriverManager')
    connection = gateway.jvm.DriverManager.getConnection(db_file_path)
    try:
        statement = connection.createStatement()
        try:
            statement.executeUpdate(query)
        finally:
            statement.close()
    finally:
        connection.close()

Already DB Exists


In [103]:
# Create the two working tables if they are missing: the persistent customer
# table carrying the load-tracking columns, and a one-column scratch table of
# customer ids. The statements run in order, one per excutequery call.
for query in (
    """
    CREATE TABLE IF NOT EXISTS per_customers (
        customer_id TEXT,
        customer_unique_id TEXT,
        customer_zip_code_prefix INTEGER,
        customer_city TEXT,
        customer_state TEXT,
        insert_updt_flag Text,
        insert_updt_ts timestamp,
        active_flg Text,
        file_name Text,
        created_date Date
     )""",
    """
    CREATE TABLE IF NOT EXISTS per_customers_tmp (
        customer_id TEXT
     )""",
):
    excutequery(query)

In [4]:
# Capture the names in the input directory that contain 'olist' (shell output
# is returned as a list of lines) and print that list.
output=!ls /kaggle/input/ecommerce-file/ | grep 'olist'
print(output)

['olist_customers_dataset_20251116.csv', 'olist_customers_dataset.csv', 'olist_geolocation_dataset.csv', 'olist_order_items_dataset.csv', 'olist_order_payments_dataset.csv', 'olist_order_reviews_dataset.csv', 'olist_orders_dataset.csv', 'olist_products_dataset.csv', 'olist_sellers_dataset.csv']


In [136]:
# Read the base customer file and stamp every row with the load-tracking
# columns expected by the per_customers table.
per_df = (
    spark.read.csv('/kaggle/input/ecommerce-file/olist_customers_dataset.csv', header=True, sep=',')
    .withColumn('insert_updt_flag', lit('I').cast(StringType()))
    .withColumn('insert_updt_ts', lit(current_date()).cast(DateType()))
    .withColumn('active_flg', lit('Y').cast(StringType()))
    .withColumn('file_name', lit('olist_customers_dataset').cast(StringType()))
    # Back-dated by one day, as if this data had been loaded yesterday.
    .withColumn('created_date', lit(current_date() - 1).cast(DateType()))
)

In [137]:
# Initial load: replace whatever is in per_customers with the stamped base file.
writetodb_over(df_to_write=per_df, tbl_name='per_customers')

25/11/19 05:27:18 WARN JdbcUtils: Requested isolation level 1 is not supported; falling back to default isolation level 8
25/11/19 05:27:18 WARN JdbcUtils: Requested isolation level 1 is not supported; falling back to default isolation level 8
25/11/19 05:27:18 WARN JdbcUtils: Requested isolation level 1 is not supported; falling back to default isolation level 8
                                                                                

In [138]:
# Prefix every target-side column with 'tgt_' so it cannot collide with the
# staging columns after the join.
# The loop variable is deliberately not called `col`: that name would overwrite
# pyspark.sql.functions.col brought in by the star import.
# Columns that already carry the prefix are skipped, so re-running this cell
# does not produce 'tgt_tgt_...' names.
for src_name in per_df.columns:
    if not src_name.startswith('tgt_'):
        per_df = per_df.withColumnRenamed(src_name, f'tgt_{src_name}')
per_df.show(5)

+--------------------+----------------------+----------------------------+--------------------+------------------+--------------------+------------------+--------------+--------------------+----------------+
|     tgt_customer_id|tgt_customer_unique_id|tgt_customer_zip_code_prefix|   tgt_customer_city|tgt_customer_state|tgt_insert_updt_flag|tgt_insert_updt_ts|tgt_active_flg|       tgt_file_name|tgt_created_date|
+--------------------+----------------------+----------------------------+--------------------+------------------+--------------------+------------------+--------------+--------------------+----------------+
|06b8999e2fba1a1fb...|  861eff4711a542e4b...|                       14409|              franca|                SP|                   I|        2025-11-19|             Y|olist_customers_d...|      2025-11-18|
|18955e83d337fd6b2...|  290c77bc529b7ac93...|                       09790|sao bernardo do c...|                SP|                   I|        2025-11-19|             Y

In [139]:
import pyspark
sc = spark.sparkContext

# Read the dated (incremental) customer file into the staging DataFrame and tag
# each row with its source name and the processing date.
stg_df = spark.read.csv('/kaggle/input/ecommerce-file/olist_customers_dataset_20251116.csv', header=True, sep=',')
# NOTE(review): file_name is the undated 'olist_customers_dataset' although the
# file read above is the _20251116 one -- confirm this is intended.
stg_df = stg_df.withColumn('file_name', lit('olist_customers_dataset').cast(StringType()))
# The column name has no trailing space, so it can be referenced normally
# (e.g. stg_df.file_process_date or in SQL expressions without back-ticks).
stg_df = stg_df.withColumn('file_process_date', lit(current_date()).cast(DateType()))

In [140]:
# Keep every staging row; target columns are NULL where the customer id is new.
left_df = stg_df.join(per_df, stg_df.customer_id == per_df.tgt_customer_id, 'left')

In [142]:
# Classify each staging row against the target and add the load-tracking columns:
#   insert_updt_flag  'I' when there is no matching target row,
#                     'U' when a match exists and any business attribute differs,
#                     NULL when the matched row is unchanged (no ELSE branch).
#   active_flg        always 'Y' for the incoming version.
#   insert_updt_ts    today's date.
#   created_date      today's date for new rows, else the target's created_date.
# Attributes are compared with the null-safe equality operator `<=>`, so
# NULL vs NULL counts as equal and NULL vs a value counts as different, without
# a sentinel value that mixes types or could collide with real data.
left_df=left_df.selectExpr(
    "*",
    """
    CASE WHEN tgt_insert_updt_flag is null THEN 'I'
         WHEN tgt_customer_id=customer_id and
              (
                  NOT (customer_unique_id <=> tgt_customer_unique_id) or
                  NOT (cast(customer_zip_code_prefix as int) <=> cast(tgt_customer_zip_code_prefix as int)) or
                  NOT (customer_city <=> tgt_customer_city) or
                  NOT (customer_state <=> tgt_customer_state)
              ) THEN 'U' END as insert_updt_flag
    """,
    " 'Y' as active_flg",
    "current_date() as insert_updt_ts",
    "CASE WHEN tgt_insert_updt_flag is null THEN current_date() else tgt_created_date end as created_date",
)

In [143]:
# Keep only the staging-side attributes plus the freshly computed tracking
# columns; the tgt_* helper columns are dropped here.
fleft_df = left_df.select(
    'customer_id',
    'customer_unique_id',
    'customer_zip_code_prefix',
    'customer_city',
    'customer_state',
    'file_name',
    'insert_updt_flag',
    'active_flg',
    'insert_updt_ts',
    'created_date',
)

In [144]:
# Split the classified rows: brand-new customers vs. changed existing customers.
# The predicate column is taken from left_df, the frame fleft_df was selected from.
insert_df = fleft_df.filter(left_df['insert_updt_flag'] == 'I')
upsert_df = fleft_df.filter(left_df['insert_updt_flag'] == 'U')

In [148]:
# Stage the ids of the changed customers in the scratch table so the UPDATE
# statement can find them from inside the database.
n_df = upsert_df.select('customer_id')
writetodb_over(df_to_write=n_df, tbl_name='per_customers_tmp')

25/11/19 05:34:06 WARN JdbcUtils: Requested isolation level 1 is not supported; falling back to default isolation level 8
25/11/19 05:34:06 WARN JdbcUtils: Requested isolation level 1 is not supported; falling back to default isolation level 8


In [158]:
# Expire the current version of every customer whose id was staged in
# per_customers_tmp. Only rows that are still active are touched, so versions
# expired by an earlier load keep their original insert_updt_ts.
query="""
    UPDATE per_customers as tgt
    set active_flg='N',
    insert_updt_ts=current_timestamp
    where tgt.active_flg='Y'
      and exists(select 1 from per_customers_tmp as tmp where tmp.customer_id=tgt.customer_id)
"""
excutequery(query)

In [160]:
# Add the new customers first, then the new versions of the changed customers.
writetodb_append(df_to_write=insert_df, tbl_name='per_customers')
writetodb_append(df_to_write=upsert_df, tbl_name='per_customers')

25/11/19 05:41:41 WARN JdbcUtils: Requested isolation level 1 is not supported; falling back to default isolation level 8
25/11/19 05:41:42 WARN JdbcUtils: Requested isolation level 1 is not supported; falling back to default isolation level 8
25/11/19 05:41:42 WARN JdbcUtils: Requested isolation level 1 is not supported; falling back to default isolation level 8


In [165]:
# Read the whole persistent table back to inspect the result of the load.
tbl_df = sqltodf(query='(select * from per_customers)')

In [167]:
# Row counts per combination of the load-tracking columns, as a quick check of
# how many rows were inserted, expired and re-inserted.
(
    tbl_df
    .groupBy('created_date', 'insert_updt_ts', 'active_flg', 'insert_updt_flag')
    .count()
    .select('created_date', 'insert_updt_ts', 'active_flg', 'insert_updt_flag', 'count')
    .show()
)

+------------+--------------+----------+----------------+-----+
|created_date|insert_updt_ts|active_flg|insert_updt_flag|count|
+------------+--------------+----------+----------------+-----+
|  2025-11-18|    2025-11-19|         Y|               I|99436|
|  2025-11-19|    2025-11-19|         Y|               I|    1|
|  2025-11-18|    2025-11-19|         N|               I|    5|
|  2025-11-18|    2025-11-19|         Y|               U|    5|
+------------+--------------+----------+----------------+-----+

