# Streaming Data Processing using Spark on Databricks

Read the csv file containing the AWS keys to the databricks

In [0]:
dbutils.fs.ls("/user/hive/warehouse/") 

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import urllib

# Define the path to the Delta table
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"

# Read the Delta table to a Spark DataFrame
aws_keys_df = spark.read.format("delta").load(delta_table_path)

ACCESS_KEY = aws_keys_df.select('Access key ID').collect()[0]['Access key ID']
SECRET_KEY = aws_keys_df.select('Secret access key').collect()[0]['Secret access key']
# Encode the secrete key
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

Load the streaming data into the dataframes (df_pin, df_geo and df_user), from the kinesis streams

In [0]:

kinesis_streams = ["streaming-0ecf5ea19ac5-pin", "streaming-0ecf5ea19ac5-geo", "streaming-0ecf5ea19ac5-user"]
for stream_name in kinesis_streams:

    df_stream = spark \
    .readStream \
    .format('kinesis') \
    .option('streamName',f'{stream_name}') \
    .option('initialPosition','earliest') \
    .option('region','us-east-1') \
    .option('awsAccessKey', ACCESS_KEY) \
    .option('awsSecretKey', SECRET_KEY) \
    .load()
    
    if 'pin' in stream_name:
        df_pin = df_stream.selectExpr("CAST(data as STRING)")
        df_pin = df_pin.select(json_tuple('data', "index", "unique_id", "title", "description", "follower_count", "poster_name", "tag_list", "is_image_or_video", "image_src",  "save_location", "category", "downloaded").alias("index", "unique_id", "title", "description", "follower_count", "poster_name", "tag_list", "is_image_or_video", "image_src",  "save_location", "category", "downloaded"))
        display(df_pin.limit(5))
    elif 'geo' in stream_name:
        df_geo = df_stream.selectExpr("CAST(data as STRING)")
        df_geo = df_geo.select(json_tuple('data', 'ind', 'country', 'latitude', 'longitude', 'timestamp').alias('ind', 'country', 'latitude', 'longitude', 'timestamp'))
        display(df_geo.limit(5))
    elif 'user' in stream_name:
        df_user = df_stream.selectExpr("CAST(data as STRING)")
        df_user = df_user.select(json_tuple('data', 'ind', "first_name", "last_name", 'age', 'date_joined').alias('ind', "first_name", "last_name", 'age', 'date_joined'))
        display(df_user.limit(5))
        
  

index,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category,downloaded
7528,fbe53c66-3442-4773-b19e-d3ec6f54dddf,No Title Data Available,No description available Story format,User Info Error,User Info Error,"N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",multi-video(story page format),Image src error.,Local save in /data/mens-fashion,mens-fashion,0
2863,9bf39437-42a6-4f02-99a0-9a0383d8cd70,25 Super Fun Summer Crafts for Kids - Of Life and Lisa,Keep the kids busy this summer with these easy diy crafts and projects. Creative and…,124k,Of Life & Lisa | Lifestyle Blog,"Summer Crafts For Kids,Fun Crafts For Kids,Summer Kids,Toddler Crafts,Crafts To Do,Diy For Kids,Summer Snow,Diys For Summer,Craft Ideas For Girls",image,https://i.pinimg.com/originals/b3/bc/e2/b3bce2964e8c8975387b39660eed5f16.jpg,Local save in /data/diy-and-crafts,diy-and-crafts,1
5730,1e1f0c8b-9fcf-460b-9154-c775827206eb,Island Oasis Coupon Organizer,"Description Coupon Organizer in a fun colorful fabric -island oasis, Great Size for the ""basic"" couponer - holds up to 500 coupons with ease, and is made long enough so that you…",0,Consuelo Aguirre,"Grocery Items,Grocery Coupons,Care Organization,Coupon Organization,Extreme Couponing,Couponing 101,Life Binder,Save My Money,Love Coupons",image,https://i.pinimg.com/originals/65/bb/ea/65bbeaf458907bb079317d8303c4fa0e.jpg,Local save in /data/finance,finance,1
8304,5b6d0913-25e4-43ab-839d-85d5516f78a4,The #1 Reason You’re Not His Priority Anymore - Matthew Coast,#lovequotes #matchmaker #matchmadeinheaven #loveyourself #respectyourself,51k,Commitment Connection,"Wise Quotes,Quotable Quotes,Words Quotes,Wise Words,Quotes To Live By,Great Quotes,Motivational Quotes,Inspirational Quotes,Funny Quotes",image,https://i.pinimg.com/originals/c6/64/ee/c664ee71524fb5a6e7b7b49233f93b43.png,Local save in /data/quotes,quotes,1
8731,ea760f71-febf-4023-b592-d17396659039,20 Koi Fish Tattoos For Lucky Men,"Koi fish tattoos are a popular choice for men who want to make a statement, thanks to their rich symbolism and bold design.",211k,TheTrendSpotter,"Dr Tattoo,Wörter Tattoos,Pisces Tattoos,Tatoo Art,Dream Tattoos,Dope Tattoos,Mini Tattoos,Finger Tattoos,Body Art Tattoos",image,https://i.pinimg.com/originals/8a/0c/0a/8a0c0a7b6236565c519acd41ad1a52c0.jpg,Local save in /data/tattoos,tattoos,1


# Data Cleaning
## Data cleaning for df_pin
- Removing duplicate rows in the dataframe
- Renaming the column index to ind
- Re-ordering the column names in the dataframe
- Replacing the values of follower_count column wherever necessary.
- Converting the columns follower_count, ind, downloaded into a integer data type
- Removing any additional strings from the save_location column
- Replacing all the NA with None
- Dropping the rows where all columns have null values

In [0]:
#clean df_pin
df_pin = df_pin.dropDuplicates()
df_pin = df_pin.withColumnRenamed('index', 'ind')
df_pin = df_pin.select("ind", "unique_id", "title", "description", "follower_count", "poster_name", 
"tag_list", "is_image_or_video", "image_src", "save_location", "category", "downloaded")
df_pin = df_pin.withColumn('follower_count', regexp_replace('follower_count', '[%k]', '000'))
df_pin = df_pin.withColumn('follower_count', regexp_replace('follower_count', '[%M]', '000000'))
df_pin = df_pin.withColumn('follower_count', regexp_replace('follower_count', '[%User Info Error%]', ''))
df_pin = df_pin.withColumn('ind', df_pin['ind'].cast(IntegerType()))
df_pin = df_pin.withColumn('downloaded', df_pin['downloaded'].cast(IntegerType()))
df_pin = df_pin.withColumn('follower_count', df_pin['follower_count'].cast(IntegerType()))
df_pin = df_pin.withColumn('save_location', regexp_replace('save_location', 'Local save in *', ''))
df_pin.na.fill('None', ['is_image_or_video', 'image_src'])
df_pin.na.drop(how = "all")

display(df_pin.limit(5))

ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category,downloaded
9875,782dcbad-ff91-40a6-ba60-216efe29adb7,European Bucket List: 35 Things NOT To Miss When Traveling Europe,"35 European bucket list destinations for any traveler heading to Europe. From Cinque Terre, Italy to Iceland - there are so many amazing sites to see in Europe.",28000,Nicki,"Backpacking Europe,Europe Travel Guide,Travel Guides,Travel Packing,Traveling Europe,Travelling,Travel Backpack,Budget Travel,2 Week Europe Itinerary",image,https://i.pinimg.com/originals/71/04/1a/71041ad83ede43d9665741e719c58a86.jpg,/data/travel,travel,1
7166,4a844b03-e161-47a1-904b-591eb5dc4fb1,The Killers - Mr. Brightside - Women's T-Shirt - Heather Dark Grey / S,"Women's T-shirt. Design inspired by the rock band The Killers' hit ""Mr. Brightside"". One of the greatest song from the album Hot Fuss released in 2004. Soft and light, 100% cott…",27,Mala Rock | Rock T-shirts,"Mr Brightside,Rock T Shirts,Greatest Songs,Timeless Classic,Rock Bands,Album,T Shirts For Women,Inspired,Hot",image,https://i.pinimg.com/originals/8c/42/39/8c42391d35fcad51a4a79f7cd81bf26d.jpg,/data/mens-fashion,mens-fashion,1
2074,86ed09a7-842d-496d-9501-010c654eb340,35 Christmas Decorating Ideas We Bet You Haven't Thought Of,20 Christmas Decorating Ideas We Bet You Haven't Thought Of via @PureWow,868000,PureWow,"Holiday Centerpieces,Xmas Decorations,Centerpiece Ideas,Table Centerpieces,Valentine Decorations,Wedding Centerpieces,Outdoor Decorations,Christmas Centerpieces With Candles,Christmas Dining Table Decorations",image,https://i.pinimg.com/originals/e9/b9/f0/e9b9f01cc3b2cf41948b45854335396c.jpg,/data/christmas,christmas,1
3454,46bd3f86-b09d-4e29-9033-7ff2df595e51,What can you use to color resin?,HELPFUL RESOURCES – Check out my resin colorants resources page here with links to all the products mentioned in this article (and more). Let me know if you have any that you lo…,6000,Mixed Media Crafts,"Epoxy Resin Art,Diy Resin Art,Diy Resin Crafts,Resin Molds,Ice Resin,Resin Pour,Diy Epoxy,Diy Resin Painting,Diy Resin Dice",image,https://i.pinimg.com/originals/d4/12/78/d4127833023ca32600571ddca16f1556.jpg,/data/diy-and-crafts,diy-and-crafts,1
4913,4d2d79c6-9ca8-46c9-a38e-931c5d967804,How to Work From Home as an Event Planner,Do you love planning parties? Do you want to work-at-home? This post covers everything you need to know about becoming a home-based event planner!,110000,The Work at Home Woman,"Event Planning Tips,Event Planning Business,Party Planning,Business Ideas,Craft Business,Business Chic,Business Inspiration,Business Goals,Business Opportunities",image,https://i.pinimg.com/originals/4d/a3/a4/4da3a4730418239f1b43ce81fb032c15.jpg,/data/event-planning,event-planning,1


## Data cleaning for df_geo
- Removing duplicate rows in the dataframe
- Converting the columns ind to integer type, latitude and longitude to double type, timestamp into a timestamp data type
- Creating new column coordinates with the values to be the array of latitude and longitude column and deleting these two columns
- Re-ordering the column names in the dataframe

In [0]:
#clean df_geo
df_geo = df_geo.dropDuplicates()
df_geo = df_geo.withColumn('longitude', df_geo['longitude'].cast(DoubleType()))
df_geo = df_geo.withColumn('ind', df_geo['ind'].cast(IntegerType()))
df_geo = df_geo.withColumn("timestamp", df_geo["timestamp"].cast(TimestampType()))
df_geo = df_geo.withColumn('latitude', df_geo['latitude'].cast(DoubleType()))
df_geo = df_geo.withColumn('longitude', df_geo['longitude'].cast(DoubleType()))
df_geo = df_geo.withColumn("coordinates", array(col("latitude"), col("longitude")))
df_geo = df_geo.drop('latitude', 'longitude')
df_geo = df_geo.select("ind", "country", "coordinates", "timestamp")
display(df_geo.limit(5))

ind,country,coordinates,timestamp
8887,Botswana,"List(-28.0137, -160.708)",2021-09-19T05:27:43.000+0000
9546,Austria,"List(-70.0295, -155.428)",2018-02-10T06:16:40.000+0000
7166,Aruba,"List(-86.4063, -136.657)",2022-07-25T03:07:37.000+0000
8731,Aruba,"List(-83.104, -171.302)",2020-07-17T04:39:09.000+0000
8304,French Guiana,"List(-28.8852, -164.87)",2019-09-13T04:50:29.000+0000


## Data cleaning for df_user
- Removing duplicate rows in the dataframe
- Creating new column user_name by combining the first_name and last_name column and deleting these two columns
- Converting the date_joined column into a timestamp data type and age to integer data type
- Re-ordering the column names in the dataframe

In [0]:
#clean_df_user
df_user = df_user.dropDuplicates()
df_user = df_user.withColumn("user_name", concat(col("first_name"), lit(" "), col("last_name")))
df_user = df_user.drop("first_name", "last_name")
df_user = df_user.withColumn('date_joined', df_user['date_joined'].cast(TimestampType()))
df_user = df_user.withColumn('age', df_user['age'].cast(TimestampType()))
df_user = df_user.select("ind", "user_name", "age", "date_joined")
display(df_user.limit(5))

ind,user_name,age,date_joined
3089,Abigail Ali,,2015-10-24T11:23:51.000+0000
4137,Michael Decker,,2017-06-29T22:35:17.000+0000
7343,Anne Clayton,,2016-01-11T12:03:48.000+0000
3454,Robert Murphy,,2017-09-26T16:31:56.000+0000
10625,Christian Lang,,2017-10-10T20:09:33.000+0000


## Creating Delta Table
Creating three delta tables for the three kinesis data streams:
- 0ecf5ea19ac5_pin_table
- 0ecf5ea19ac5_geo_table
- 0ecf5ea19ac5_user_table

In [0]:
#create delta tables
tables = {df_pin :'0ecf5ea19ac5_pin_table', df_geo :'0ecf5ea19ac5_geo_table', df_user :'0ecf5ea19ac5_user_table'}
for df_name, delta_table in tables.items():
    dbutils.fs.rm(f"/tmp/kinesis/_checkpoints/{delta_table}", True)
    df_name.writeStream \
    .format("delta") \
    .outputMode("append") \
    .option("checkpointLocation", f"/tmp/kinesis/_checkpoints/{delta_table}") \
    .table(f"{delta_table}")
  