#### Scenario 1: Incremental data loading using Auto Loader and schema evolution in PySpark streaming

In [0]:
from pyspark.sql import SparkSession

In [0]:
# Session handle used by every cell below; getOrCreate() returns the active
# session when one exists, otherwise it builds one with the app name 'Scenario'.
spark = (
    SparkSession.builder
    .appName('Scenario')
    .getOrCreate()
)

In [0]:
# Bare expression: lets the notebook render the session object as this cell's output.
spark

In [0]:
# Batch-read the first sales CSV: first row supplies the column names and the
# column types are inferred from the data. Then render the frame.
df = (
    spark.read
    .format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .load('dbfs:/FileStore/rawsource/sales_data_first.csv')
)
df.display()

Branch_ID,Dealer_ID,Model_ID,Revenue,Units_Sold,Date_ID,Month,Year,BranchName,DealerName,Product_Name,Date
BR0006,DLR0168,Ren-M128,12971088,3,DT01236,5,2020,AC Cars Motors,Saab Motors,Renault,5/20/2020 0:00
BR0011,DLR0069,Vol-M256,14181510,3,DT01225,5,2020,Acura Motors,Geo Motors,Volkswagen,5/9/2020 0:00
BR0021,DLR0070,Vol-M257,7738896,1,DT01226,5,2020,Aixam-Mega (including Arola) Motors,Gilbern Motors,Volkswagen,5/10/2020 0:00
BR0031,DLR0071,Vol-M258,10067596,2,DT01227,5,2020,Alfa Romeo Motors,Ginetta Motors,Volkswagen,5/11/2020 0:00
BR0041,DLR0072,Vol-M259,13055810,2,DT01228,5,2020,Alpine Motors,Glas Motors,Volkswagen,5/12/2020 0:00
BR0051,DLR0073,Vol-M260,2224368,1,DT01229,5,2020,Alvis Motors,GMC Motors,Volkswagen,5/13/2020 0:00
BR0061,DLR0074,Nis-M261,11270580,3,DT01230,5,2020,"AMC, Eagle Motors",GTA Spano Motors,Nissan,5/14/2020 0:00
BR0066,DLR0169,Ren-M129,693559,1,DT01237,5,2020,Acura Motors,SAIC Motor Motors,Renault,5/21/2020 0:00
BR0071,DLR0075,Nis-M262,18784710,3,DT01231,5,2020,Anadol Motors,Gumpert Motors,Nissan,5/15/2020 0:00
BR0081,DLR0076,Nis-M263,2354637,3,DT01232,5,2020,Ariel Motors,Healey Motors,Nissan,5/16/2020 0:00


### Auto Loader

#### Streaming Read

In [0]:
# Auto Loader ('cloudFiles') streaming read of the CSV files under /FileStore/rawsource.
#   header                 - take column names from each file's first row, as the
#                            batch reads of these same files do; without it the
#                            header line is ingested as a data row and the columns
#                            are named _c0, _c1, ...
#   schemaEvolutionMode    - 'addNewColumns': columns that appear in later files
#                            are added to the tracked schema.
#   schemaLocation         - directory where Auto Loader persists the inferred schema.
df = (
    spark.readStream
    .format('cloudFiles')
    .option('cloudFiles.format', 'csv')
    .option('header', True)
    .option('cloudFiles.schemaEvolutionMode', 'addNewColumns')
    .option('cloudFiles.schemaLocation', '/FileStore/rawdestination/checkpoint')
    .load('/FileStore/rawsource')
)

#### Streaming Write

In [0]:
# Continuously append the streamed rows to a Delta location.
#   checkpointLocation - stream progress is tracked here.
#   mergeSchema        - let the sink accept columns added to the incoming schema.
#   trigger            - start a micro-batch every 3 seconds.
# Left as a bare expression so the StreamingQuery handle is shown as the cell output.
(
    df.writeStream
    .format('delta')
    .option('checkpointLocation', '/FileStore/rawdestination/checkpoint')
    .option('mergeSchema', True)
    .trigger(processingTime='3 Seconds')
    .start('/FileStore/rawdestination/data')
)

Out[16]: <pyspark.sql.streaming.query.StreamingQuery at 0x7fdfdb4bf4f0>

### Scenario 2 - Implement a solution to handle a slowly changing dimension for both the initial and incremental runs in one notebook

In [0]:
# Load every CSV under /FileStore/rawcsv (header row present, types inferred)
# and show the raw result, including any empty padding columns/rows.
df = (
    spark.read
    .format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .load('/FileStore/rawcsv')
)

df.display()

p_id,p_name,p_category,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11
1.0,cookies,food,,,,,,,,,
2.0,almonds,food,,,,,,,,,
3.0,Toothpase,merchandise,,,,,,,,,
4.0,earphones,electronics,,,,,,,,,
5.0,oil,merchandise,,,,,,,,,
6.0,shirt,merchandise,,,,,,,,,
,,,,,,,,,,,
,,,,,,,,,,,
,,,,,,,,,,,
,,,,,,,,,,,


#### Removed Nulls

In [0]:
from pyspark.sql.functions import *
# Keep only the three product columns, then drop rows that have no product id.
df = (
    df
    .select('p_id', 'p_name', 'p_category')
    .where(col('p_id').isNotNull())
)

In [0]:
from delta.tables import DeltaTable

### Upsert Operation to handle SCD

In [0]:
# Run-mode flag read by the upsert cell: 0 takes the merge (upsert) branch,
# any other value takes the plain Delta write branch.
initial_run = 0

In [0]:
# Load `df` into the product dimension stored as Delta at /FileStore/rawcsvsink.
#
# Incremental path (initial_run == 0 AND the sink already is a Delta table):
#   MERGE on p_id - matched rows are overwritten with the incoming values (no
#   history is kept), unmatched rows are inserted.
# Initial path (anything else):
#   write `df` to the sink path and register it as table `dimproducts`.
#
# The isDeltaTable() guard means a first-ever run with the flag still at 0 falls
# through to the initial write instead of failing in DeltaTable.forPath().
sink_path = '/FileStore/rawcsvsink'

if initial_run == 0 and DeltaTable.isDeltaTable(spark, sink_path):
    delta_table = DeltaTable.forPath(spark, sink_path)

    (
        delta_table.alias('trg')
        .merge(df.alias('src'), 'trg.p_id = src.p_id')
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    (
        df.write.format('delta')
        .mode('append')
        .option('path', sink_path)
        .saveAsTable('dimproducts')
    )

In [0]:
%sql

-- Inspect the product dimension table.
SELECT *
FROM dimproducts

p_id,p_name,p_category
1,cookies,food
2,almonds,food
3,Toothpase,merchandise
4,earphones,electronics
5,oil,merchandise
6,shirt,merchandise


### Scenario 3 - Python class to store window functions in PySpark

In [0]:

# Load the second sales file into its own frame, `df_new`.
df_new = (
    spark.read
    .format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .load('/FileStore/rawsource/sales_data_second.csv')
)

# Show the frame that was just loaded (previously this displayed the unrelated `df`).
df_new.display()

Branch_ID,Dealer_ID,Model_ID,Revenue,Units_Sold,Date_ID,Month,Year,BranchName,DealerName,Product_Name,Date
BR2280,DLR0048,Aud-M235,23929671,3,DT01204,4,2020,Deccan Motors,De Tomaso Motors,Audi,4/18/2020 0:00
BR2285,DLR0149,Vol-M109,13073782,2,DT01217,5,2020,Herald Motors,Panoz Motors,Volkswagen,5/1/2020 0:00
BR2290,DLR0049,Aud-M236,14157891,3,DT01205,4,2020,Herald Motors,Dodge Motors,Audi,4/19/2020 0:00
BR2295,DLR0150,Vol-M110,16770687,3,DT01218,5,2020,Zion Automobils,Panther Motors,Volvo,5/2/2020 0:00
BR2300,DLR0050,Aud-M237,5769353,1,DT01206,4,2020,Zion Automobils,Donkervoort Motors,Audi,4/20/2020 0:00
BR2305,DLR0151,Agr-M111,18708022,2,DT01219,5,2020,Atlantic Motor Company,Peel Motors,Agrale,5/3/2020 0:00
BR2310,DLR0051,Aud-M238,6666418,1,DT01207,4,2020,Atlantic Motor Company,Edsel Motors,Audi,4/21/2020 0:00
BR2315,DLR0152,Agr-M112,18804507,3,DT01220,5,2020,2008 NRHP-listed,Perodua Motors,Agrale,5/4/2020 0:00
BR2320,DLR0052,Aud-M239,5651517,3,DT01208,4,2020,2008 NRHP-listed,e.GO Mobile Motors,Audi,4/22/2020 0:00
BR2330,DLR0053,Aud-M240,21747525,3,DT01209,4,2020,Blankinship Motor Company Building,Eicher Polaris Motors,Audi,4/23/2020 0:00


In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
class window_functions:
    """Append ranking-window columns to the DataFrame held in ``self.df``.

    Each public method builds a window partitioned by ``part_cols`` and ordered
    by ``ord_cols`` descending, adds the ranking as ``new_col``, stores the
    result back on ``self.df`` and returns it, so successive calls accumulate
    columns on the same frame.

    Args (all three public methods):
        new_col: name of the column to add.
        part_cols: column name, or list of names, to partition by.
        ord_cols: column name, or list/tuple of names, to order by (descending).
    """

    # Class-level default frame. It is read when the class body executes and is
    # what an instance sees until it is given its own ``df`` (through the
    # constructor or by assigning ``obj.df``).
    df = spark.read.format('csv').option('header', True).option('inferSchema', True).load('/FileStore/rawsource/sales_data_first.csv')

    def __init__(self, df=None):
        """Optionally bind this instance to ``df`` instead of the class default."""
        if df is not None:
            self.df = df

    def _add_ranking(self, ranking_fn, new_col, part_cols, ord_cols):
        """Add ``ranking_fn()`` over the partition/descending-order window as ``new_col``."""
        # Accept a single column name (original behaviour) or several names.
        order_names = [ord_cols] if isinstance(ord_cols, str) else list(ord_cols)
        descending = [col(name).desc() for name in order_names]
        window_spec = Window.partitionBy(part_cols).orderBy(*descending)

        self.df = self.df.withColumn(new_col, ranking_fn().over(window_spec))
        return self.df

    def dense_rank_func(self, new_col, part_cols, ord_cols):
        """Add a dense_rank() column named ``new_col`` and return the updated frame."""
        return self._add_ranking(dense_rank, new_col, part_cols, ord_cols)

    def rank_func(self, new_col, part_cols, ord_cols):
        """Add a rank() column named ``new_col`` and return the updated frame."""
        return self._add_ranking(rank, new_col, part_cols, ord_cols)

    def row_number_func(self, new_col, part_cols, ord_cols):
        """Add a row_number() column named ``new_col`` and return the updated frame."""
        return self._add_ranking(row_number, new_col, part_cols, ord_cols)


In [0]:
# Instantiate the helper; until `obj.df` is assigned it uses the class-level frame.
obj = window_functions()

In [0]:
# Point this instance at the second sales file; the instance attribute shadows
# the class-level `df`, which itself is left unchanged.
obj.df = df_new

In [0]:
# Dense-rank rows within each Month by Units_Sold (highest first); the result
# replaces `df_new` and is also kept on `obj.df`.
df_new = obj.dense_rank_func(
    new_col='dense_rank_col',
    part_cols='Month',
    ord_cols='Units_Sold',
)

df_new.display()

Branch_ID,Dealer_ID,Model_ID,Revenue,Units_Sold,Date_ID,Month,Year,BranchName,DealerName,Product_Name,Date,dense_rank_col
BR2280,DLR0048,Aud-M235,23929671,3,DT01204,4,2020,Deccan Motors,De Tomaso Motors,Audi,4/18/2020 0:00,1
BR2290,DLR0049,Aud-M236,14157891,3,DT01205,4,2020,Herald Motors,Dodge Motors,Audi,4/19/2020 0:00,1
BR2320,DLR0052,Aud-M239,5651517,3,DT01208,4,2020,2008 NRHP-listed,e.GO Mobile Motors,Audi,4/22/2020 0:00,1
BR2330,DLR0053,Aud-M240,21747525,3,DT01209,4,2020,Blankinship Motor Company Building,Eicher Polaris Motors,Audi,4/23/2020 0:00,1
BR2350,DLR0055,BMW-M242,1772823,3,DT01211,4,2020,Buick Automobile Company Building,Facel Vega Motors,BMW,4/25/2020 0:00,1
BR2360,DLR0056,BMW-M243,17444340,3,DT01212,4,2020,Cadillac Automobile Company Building,Ferrari Motors,BMW,4/26/2020 0:00,1
BR2380,DLR0058,BMW-M245,3241882,2,DT01214,4,2020,Cass Motor Sales,Fiat do Brasil Motors,BMW,4/28/2020 0:00,2
BR2300,DLR0050,Aud-M237,5769353,1,DT01206,4,2020,Zion Automobils,Donkervoort Motors,Audi,4/20/2020 0:00,3
BR2310,DLR0051,Aud-M238,6666418,1,DT01207,4,2020,Atlantic Motor Company,Edsel Motors,Audi,4/21/2020 0:00,3
BR2340,DLR0054,Aud-M241,7111275,1,DT01210,4,2020,Bohn Motor Company Automobile Dealership,Elva Motors,Audi,4/24/2020 0:00,3


### Scenario 4 - Creating a Conditional Column

In [0]:
# Fresh copy of the first sales file for the conditional-column examples.
df = (
    spark.read
    .format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .load('dbfs:/FileStore/rawsource/sales_data_first.csv')
)

In [0]:
# Preview the frame before the conditional columns are added.
df.display()

Branch_ID,Dealer_ID,Model_ID,Revenue,Units_Sold,Date_ID,Month,Year,BranchName,DealerName,Product_Name,Date
BR0006,DLR0168,Ren-M128,12971088,3,DT01236,5,2020,AC Cars Motors,Saab Motors,Renault,5/20/2020 0:00
BR0011,DLR0069,Vol-M256,14181510,3,DT01225,5,2020,Acura Motors,Geo Motors,Volkswagen,5/9/2020 0:00
BR0021,DLR0070,Vol-M257,7738896,1,DT01226,5,2020,Aixam-Mega (including Arola) Motors,Gilbern Motors,Volkswagen,5/10/2020 0:00
BR0031,DLR0071,Vol-M258,10067596,2,DT01227,5,2020,Alfa Romeo Motors,Ginetta Motors,Volkswagen,5/11/2020 0:00
BR0041,DLR0072,Vol-M259,13055810,2,DT01228,5,2020,Alpine Motors,Glas Motors,Volkswagen,5/12/2020 0:00
BR0051,DLR0073,Vol-M260,2224368,1,DT01229,5,2020,Alvis Motors,GMC Motors,Volkswagen,5/13/2020 0:00
BR0061,DLR0074,Nis-M261,11270580,3,DT01230,5,2020,"AMC, Eagle Motors",GTA Spano Motors,Nissan,5/14/2020 0:00
BR0066,DLR0169,Ren-M129,693559,1,DT01237,5,2020,Acura Motors,SAIC Motor Motors,Renault,5/21/2020 0:00
BR0071,DLR0075,Nis-M262,18784710,3,DT01231,5,2020,Anadol Motors,Gumpert Motors,Nissan,5/15/2020 0:00
BR0081,DLR0076,Nis-M263,2354637,3,DT01232,5,2020,Ariel Motors,Healey Motors,Nissan,5/16/2020 0:00


### Method 1: Using When OtherWise

In [0]:
# Label each row from Units_Sold with a native column expression:
# 1 -> 'Low', 2 -> 'Medium', everything else -> 'High'.
df = df.withColumn(
    'conditional_column',
    when(col('Units_Sold') == 1, 'Low')
    .when(col('Units_Sold') == 2, 'Medium')
    .otherwise('High'),
)

df.display()

Branch_ID,Dealer_ID,Model_ID,Revenue,Units_Sold,Date_ID,Month,Year,BranchName,DealerName,Product_Name,Date,conditional_column
BR0006,DLR0168,Ren-M128,12971088,3,DT01236,5,2020,AC Cars Motors,Saab Motors,Renault,5/20/2020 0:00,High
BR0011,DLR0069,Vol-M256,14181510,3,DT01225,5,2020,Acura Motors,Geo Motors,Volkswagen,5/9/2020 0:00,High
BR0021,DLR0070,Vol-M257,7738896,1,DT01226,5,2020,Aixam-Mega (including Arola) Motors,Gilbern Motors,Volkswagen,5/10/2020 0:00,Low
BR0031,DLR0071,Vol-M258,10067596,2,DT01227,5,2020,Alfa Romeo Motors,Ginetta Motors,Volkswagen,5/11/2020 0:00,Medium
BR0041,DLR0072,Vol-M259,13055810,2,DT01228,5,2020,Alpine Motors,Glas Motors,Volkswagen,5/12/2020 0:00,Medium
BR0051,DLR0073,Vol-M260,2224368,1,DT01229,5,2020,Alvis Motors,GMC Motors,Volkswagen,5/13/2020 0:00,Low
BR0061,DLR0074,Nis-M261,11270580,3,DT01230,5,2020,"AMC, Eagle Motors",GTA Spano Motors,Nissan,5/14/2020 0:00,High
BR0066,DLR0169,Ren-M129,693559,1,DT01237,5,2020,Acura Motors,SAIC Motor Motors,Renault,5/21/2020 0:00,Low
BR0071,DLR0075,Nis-M262,18784710,3,DT01231,5,2020,Anadol Motors,Gumpert Motors,Nissan,5/15/2020 0:00,High
BR0081,DLR0076,Nis-M263,2354637,3,DT01232,5,2020,Ariel Motors,Healey Motors,Nissan,5/16/2020 0:00,High


### Method 2 using UDF

In [0]:
def conditionalcol(x):
  """Return 'Low' when x equals 1, 'Medium' when x equals 2, otherwise 'High'."""
  if x == 1:
    return 'Low'
  return 'Medium' if x == 2 else 'High'
  


In [0]:
# Wrap the Python function as a Spark UDF (no return type given, so udf()'s
# default applies). The name is rebound: from here on `conditionalcol` is the
# UDF, not the plain function.
# NOTE(review): re-running this cell alone would pass the UDF back into udf() -
# re-run the `def` cell first.
conditionalcol = udf(conditionalcol)

In [0]:
# Method 2: compute the same label through the UDF, into a second column so
# the two approaches can be compared side by side.
df = df.withColumn(
    'Conditional_Col',
    conditionalcol('Units_Sold'),
)

df.display()

Branch_ID,Dealer_ID,Model_ID,Revenue,Units_Sold,Date_ID,Month,Year,BranchName,DealerName,Product_Name,Date,conditional_column,Conditional_Col
BR0006,DLR0168,Ren-M128,12971088,3,DT01236,5,2020,AC Cars Motors,Saab Motors,Renault,5/20/2020 0:00,High,High
BR0011,DLR0069,Vol-M256,14181510,3,DT01225,5,2020,Acura Motors,Geo Motors,Volkswagen,5/9/2020 0:00,High,High
BR0021,DLR0070,Vol-M257,7738896,1,DT01226,5,2020,Aixam-Mega (including Arola) Motors,Gilbern Motors,Volkswagen,5/10/2020 0:00,Low,Low
BR0031,DLR0071,Vol-M258,10067596,2,DT01227,5,2020,Alfa Romeo Motors,Ginetta Motors,Volkswagen,5/11/2020 0:00,Medium,Medium
BR0041,DLR0072,Vol-M259,13055810,2,DT01228,5,2020,Alpine Motors,Glas Motors,Volkswagen,5/12/2020 0:00,Medium,Medium
BR0051,DLR0073,Vol-M260,2224368,1,DT01229,5,2020,Alvis Motors,GMC Motors,Volkswagen,5/13/2020 0:00,Low,Low
BR0061,DLR0074,Nis-M261,11270580,3,DT01230,5,2020,"AMC, Eagle Motors",GTA Spano Motors,Nissan,5/14/2020 0:00,High,High
BR0066,DLR0169,Ren-M129,693559,1,DT01237,5,2020,Acura Motors,SAIC Motor Motors,Renault,5/21/2020 0:00,Low,Low
BR0071,DLR0075,Nis-M262,18784710,3,DT01231,5,2020,Anadol Motors,Gumpert Motors,Nissan,5/15/2020 0:00,High,High
BR0081,DLR0076,Nis-M263,2354637,3,DT01232,5,2020,Ariel Motors,Healey Motors,Nissan,5/16/2020 0:00,High,High


### Scenario 5 - Finding the cumulative sum (CumSum) of units sold per month in the given DataFrame

In [0]:
# Running total of Units_Sold over all rows ordered by Month: each row's frame
# spans from the first row up to and including itself.
# `sum` is the pyspark.sql.functions aggregate pulled in by the star import,
# not the Python builtin.
# NOTE(review): the window has no partitionBy and orders only by Month, so the
# relative order of rows within one month is not pinned down - confirm this is
# the intended reading of "per month".
running_window = (
    Window
    .orderBy('Month')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
df.withColumn('CumSum', sum('Units_Sold').over(running_window)).display()

Branch_ID,Dealer_ID,Model_ID,Revenue,Units_Sold,Date_ID,Month,Year,BranchName,DealerName,Product_Name,Date,conditional_column,Conditional_Col,CumSum
BR1075,DLR0028,Nis-M265,946626,1,DT01096,1,2020,Lexus Motors,Buick Motors,Nissan,1/1/2020 0:00,Low,Low,1
BR1105,DLR0031,Nis-M268,2349414,3,DT01099,1,2020,Lotus Motors,Caterham Motors,Nissan,1/4/2020 0:00,High,High,4
BR1125,DLR0033,Nis-M270,5501066,2,DT01101,1,2020,LuAZ Motors,Chevrolet Motors,Nissan,1/6/2020 0:00,Medium,Medium,6
BR1135,DLR0034,Sko-M271,15299630,2,DT01102,1,2020,Lynx Motors,Chevrolet India Motors,Skoda,1/7/2020 0:00,Medium,Medium,8
BR1145,DLR0035,Sko-M272,573260,1,DT01103,1,2020,Mahindra Motors,Chrysler Motors,Skoda,1/8/2020 0:00,Low,Low,9
BR1155,DLR0036,Sko-M273,6991308,2,DT01104,1,2020,Marcos Motors,Chrysler Europe Motors,Skoda,1/9/2020 0:00,Medium,Medium,11
BR1165,DLR0037,Sko-M274,4594208,1,DT01105,1,2020,Marlin Motors,Citroën Motors,Skoda,1/10/2020 0:00,Low,Low,12
BR1175,DLR0038,Mit-M275,6379479,3,DT01106,1,2020,Maruti Motors,Cizeta-Moroder Motors,Mitsubishi,1/11/2020 0:00,High,High,15
BR1195,DLR0040,Mit-M277,12731434,2,DT01108,1,2020,Mastretta Motors,"Daewoo, Saehan, Chevrolet Korea Motors",Mitsubishi,1/13/2020 0:00,Medium,Medium,17
BR1200,DLR0207,Ren-M127,5217794,1,DT01096,1,2020,Mastretta Motors,Westfield Motors,Renault,1/1/2020 0:00,Low,Low,18


#### Scenario 6 - Create a parameterized notebook that takes values from the user to use in the filter criteria. Also, create multiple outputs based on this parameter

In [0]:
# Units_Sold values to filter on; the loop below writes one output per value.
# NOTE(review): hard-coded here - to actually take the values from the user,
# this would need to come from a notebook parameter/widget.
var_units_sold = [1,2,3]

In [0]:
# Load and preview the sales file that the parameterized filter runs against.
df = (
    spark.read
    .format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .load('dbfs:/FileStore/rawsource/sales_data_first.csv')
)
df.display()

Branch_ID,Dealer_ID,Model_ID,Revenue,Units_Sold,Date_ID,Month,Year,BranchName,DealerName,Product_Name,Date
BR0006,DLR0168,Ren-M128,12971088,3,DT01236,5,2020,AC Cars Motors,Saab Motors,Renault,5/20/2020 0:00
BR0011,DLR0069,Vol-M256,14181510,3,DT01225,5,2020,Acura Motors,Geo Motors,Volkswagen,5/9/2020 0:00
BR0021,DLR0070,Vol-M257,7738896,1,DT01226,5,2020,Aixam-Mega (including Arola) Motors,Gilbern Motors,Volkswagen,5/10/2020 0:00
BR0031,DLR0071,Vol-M258,10067596,2,DT01227,5,2020,Alfa Romeo Motors,Ginetta Motors,Volkswagen,5/11/2020 0:00
BR0041,DLR0072,Vol-M259,13055810,2,DT01228,5,2020,Alpine Motors,Glas Motors,Volkswagen,5/12/2020 0:00
BR0051,DLR0073,Vol-M260,2224368,1,DT01229,5,2020,Alvis Motors,GMC Motors,Volkswagen,5/13/2020 0:00
BR0061,DLR0074,Nis-M261,11270580,3,DT01230,5,2020,"AMC, Eagle Motors",GTA Spano Motors,Nissan,5/14/2020 0:00
BR0066,DLR0169,Ren-M129,693559,1,DT01237,5,2020,Acura Motors,SAIC Motor Motors,Renault,5/21/2020 0:00
BR0071,DLR0075,Nis-M262,18784710,3,DT01231,5,2020,Anadol Motors,Gumpert Motors,Nissan,5/15/2020 0:00
BR0081,DLR0076,Nis-M263,2354637,3,DT01232,5,2020,Ariel Motors,Healey Motors,Nissan,5/16/2020 0:00


In [0]:
# Write one CSV output folder per requested Units_Sold value.
#
# The source file is read (and its schema inferred) once, before the loop,
# instead of once per value; only the filter and the write are per-value.
sales_all = (
    spark.read
    .format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .load('dbfs:/FileStore/rawsource/sales_data_first.csv')
)

for i in var_units_sold:
    # `df` is rebound on every pass, so after the loop it holds the last subset.
    df = sales_all.filter(col('Units_Sold') == i)

    # mode('append'): re-running the cell adds more files to the same folder.
    (
        df.write
        .format('csv')
        .mode('append')
        .option('path', f'/FileStore/loopdata/units_sold = {i}')
        .save()
    )

### Scenario 7 - Data Skewness - Finding skewness in the data

In [0]:
# Reload the sales file so the partition inspection starts from the raw read.
df = (
    spark.read
    .format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .load('dbfs:/FileStore/rawsource/sales_data_first.csv')
)

In [0]:
# Tag every row with the id of the Spark partition it currently lives in.
(
    df
    .withColumn('partitionID', spark_partition_id())
    .display()
)

Branch_ID,Dealer_ID,Model_ID,Revenue,Units_Sold,Date_ID,Month,Year,BranchName,DealerName,Product_Name,Date,partitionID
BR0066,DLR0169,Ren-M129,693559,1,DT01237,5,2020,Acura Motors,SAIC Motor Motors,Renault,5/21/2020 0:00,0
BR0071,DLR0075,Nis-M262,18784710,3,DT01231,5,2020,Anadol Motors,Gumpert Motors,Nissan,5/15/2020 0:00,0
BR0141,DLR0082,Nis-M269,17522592,2,DT01238,5,2020,Autobianchi Motors,Honda Motors,Nissan,5/22/2020 0:00,0
BR1420,DLR0229,Mar-M149,3362029,1,DT01118,1,2020,NSU Motors,Herald Motors,Maruti Suzuki,1/23/2020 0:00,0
BR1240,DLR0211,Ren-M131,2717264,1,DT01100,1,2020,McLaren Motors,ZIL Motors,Renault,1/5/2020 0:00,0
BR1440,DLR0231,Mar-M151,12650379,3,DT01120,1,2020,Oltcit Motors,Atlantic Motor Company,Maruti Suzuki,1/25/2020 0:00,0
BR1195,DLR0040,Mit-M277,12731434,2,DT01108,1,2020,Mastretta Motors,"Daewoo, Saehan, Chevrolet Korea Motors",Mitsubishi,1/13/2020 0:00,0
BR0126,DLR0170,Ren-M130,8445560,2,DT01238,5,2020,Aixam-Mega (including Arola) Motors,Saleen Motors,Renault,5/22/2020 0:00,0
BR1495,DLR0070,Lin-M30,3396415,1,DT01138,2,2020,Panther Motors,Gilbern Motors,Lincoln,2/12/2020 0:00,0
BR1560,DLR0243,Hyu-M163,9443751,3,DT01132,2,2020,Plymouth Motors,Howard Motor Company Building,Hyundai,2/6/2020 0:00,0


In [0]:
# Redistribute the rows across 4 partitions.
df = df.repartition(4)

In [0]:
# Row count per partition: large differences between counts indicate skew.
(
    df
    .withColumn('partition_id', spark_partition_id())
    .groupBy('partition_id')
    .count()
    .display()
)


partition_id,count
0,23
1,24
2,24
3,24


## Scenario 8 - Querying a DataFrame with SQL as a table

In [0]:
# Load the sales file that will be exposed to SQL as a view.
df = (
    spark.read
    .format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .load('dbfs:/FileStore/rawsource/sales_data_first.csv')
)

In [0]:
# Register the frame as a *global* temporary view. Spark keeps global temp
# views in the reserved `global_temp` database, so SQL has to address this one
# as global_temp.sales_view.
df.createOrReplaceGlobalTempView('sales_view')

In [0]:
%sql
-- The view was registered with createOrReplaceGlobalTempView, so it lives in the
-- reserved global_temp database and must be referenced with that qualifier;
-- the bare name only resolves if a session-level view of the same name exists.
SELECT *
FROM global_temp.sales_view

Branch_ID,Dealer_ID,Model_ID,Revenue,Units_Sold,Date_ID,Month,Year,BranchName,DealerName,Product_Name,Date
BR0006,DLR0168,Ren-M128,12971088,3,DT01236,5,2020,AC Cars Motors,Saab Motors,Renault,5/20/2020 0:00
BR0011,DLR0069,Vol-M256,14181510,3,DT01225,5,2020,Acura Motors,Geo Motors,Volkswagen,5/9/2020 0:00
BR0021,DLR0070,Vol-M257,7738896,1,DT01226,5,2020,Aixam-Mega (including Arola) Motors,Gilbern Motors,Volkswagen,5/10/2020 0:00
BR0031,DLR0071,Vol-M258,10067596,2,DT01227,5,2020,Alfa Romeo Motors,Ginetta Motors,Volkswagen,5/11/2020 0:00
BR0041,DLR0072,Vol-M259,13055810,2,DT01228,5,2020,Alpine Motors,Glas Motors,Volkswagen,5/12/2020 0:00
BR0051,DLR0073,Vol-M260,2224368,1,DT01229,5,2020,Alvis Motors,GMC Motors,Volkswagen,5/13/2020 0:00
BR0061,DLR0074,Nis-M261,11270580,3,DT01230,5,2020,"AMC, Eagle Motors",GTA Spano Motors,Nissan,5/14/2020 0:00
BR0066,DLR0169,Ren-M129,693559,1,DT01237,5,2020,Acura Motors,SAIC Motor Motors,Renault,5/21/2020 0:00
BR0071,DLR0075,Nis-M262,18784710,3,DT01231,5,2020,Anadol Motors,Gumpert Motors,Nissan,5/15/2020 0:00
BR0081,DLR0076,Nis-M263,2354637,3,DT01232,5,2020,Ariel Motors,Healey Motors,Nissan,5/16/2020 0:00


## Scenario 9 - Create a DataFrame that reads data from the data lake and does not fail if there are any errors

In [0]:
# Attempt the read and display; on any failure report the error text instead of
# letting the exception stop the notebook. The path used here does not exist,
# which is what demonstrates the handler.
try:
    df = (
        spark.read
        .format('csv')
        .option('header', True)
        .option('inferSchema', True)
        .load('dbfs:/FileStore/rawsource/sales_data_first_first.csv')
    )

    df.display()

except Exception as err:
    print(f'You have an error - {err}')


You have an error - [PATH_NOT_FOUND] Path does not exist: dbfs:/FileStore/rawsource/sales_data_first_first.csv.


### Scenario 10 - Incrementally stream the data from a Delta table and handle the updates as well

In [0]:
%sql
-- Source Delta table for the streaming demo, stored at an explicit location.
-- IF NOT EXISTS makes the cell safe to re-run: a second run is a no-op instead
-- of failing because the table is already there.
CREATE TABLE IF NOT EXISTS deltasource
(
  id INT,
  name STRING,
  salary INT
)
USING DELTA
LOCATION '/FileStore/deltasource/source1'

In [0]:
%sql
-- Turn the deletion-vectors table property off for the source table.
ALTER TABLE deltasource SET TBLPROPERTIES ('delta.enableDeletionVectors' = False)

In [0]:
%sql
-- Seed the source table with four rows.
-- This is a plain INSERT, so every run of the cell appends the same four rows again.
INSERT INTO deltasource
VALUES
  (1, 'Priya', 2000),
  (2, 'Ansh', 2500),
  (3, 'Rahul', 2300),
  (4, 'Millie', 3000)

num_affected_rows,num_inserted_rows
4,4


In [0]:
%sql
-- Current contents of the source table.
SELECT *
FROM deltasource

id,name,salary
1,Priya,2000
2,Ansh,2500
3,Rahul,2300
4,Millie,3000
1,Priya,2000
2,Ansh,2500
3,Rahul,2300
4,Millie,3000
1,Priya,2000
2,Ansh,2500


In [0]:
# Open the Delta table `deltasource` as a streaming source.
# NOTE(review): no source options are set here; confirm how commits that update
# or delete existing rows should be treated by this stream.
df = (
    spark.readStream
    .table('deltasource')
)

In [0]:
# Stream the source rows into a Delta sink, with a 3-second micro-batch trigger
# and progress tracked in the checkpoint directory.
# Left as a bare expression so the StreamingQuery handle is shown as the cell output.
(
    df.writeStream
    .format('delta')
    .option('checkpointLocation', '/FileStore/deltasource/sink1/checkpoint')
    .option('path', '/FileStore/deltasource/sink1/data')
    .trigger(processingTime='3 Seconds')
    .start()
)

Out[76]: <pyspark.sql.streaming.query.StreamingQuery at 0x7f898fb11a60>

In [0]:
%sql
-- List the commits made to the source table (one row per table version).
DESCRIBE HISTORY deltasource

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
4,2025-03-07T13:24:45.000+0000,4838080632151284,nothingxd007@gmail.com,WRITE,"Map(mode -> Append, partitionBy -> [])",,List(841088840789268),0307-103934-r96e51eh,3.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 4, numOutputBytes -> 1082)",,Databricks-Runtime/12.2.x-scala2.12
3,2025-03-07T13:22:57.000+0000,4838080632151284,nothingxd007@gmail.com,WRITE,"Map(mode -> Append, partitionBy -> [])",,List(841088840789268),0307-103934-r96e51eh,2.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 4, numOutputBytes -> 1082)",,Databricks-Runtime/12.2.x-scala2.12
2,2025-03-07T13:11:02.000+0000,4838080632151284,nothingxd007@gmail.com,WRITE,"Map(mode -> Append, partitionBy -> [])",,List(841088840789268),0307-103934-r96e51eh,1.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 4, numOutputBytes -> 1082)",,Databricks-Runtime/12.2.x-scala2.12
1,2025-03-07T13:09:16.000+0000,4838080632151284,nothingxd007@gmail.com,SET TBLPROPERTIES,"Map(properties -> {""delta.enableDeletionVectors"":""false""})",,List(841088840789268),0307-103934-r96e51eh,0.0,WriteSerializable,True,Map(),,Databricks-Runtime/12.2.x-scala2.12
0,2025-03-07T13:08:11.000+0000,4838080632151284,nothingxd007@gmail.com,CREATE TABLE,"Map(isManaged -> false, description -> null, partitionBy -> [], properties -> {})",,List(841088840789268),0307-103934-r96e51eh,,WriteSerializable,True,Map(),,Databricks-Runtime/12.2.x-scala2.12


In [0]:
%sql
-- Show the table's columns followed by its detailed table information.
DESCRIBE EXTENDED deltasource

col_name,data_type,comment
id,int,
name,string,
salary,int,
,,
# Detailed Table Information,,
Catalog,spark_catalog,
Database,default,
Table,deltasource,
Created Time,Fri Mar 07 13:08:20 UTC 2025,
Last Access,UNKNOWN,
