In [0]:
import pyspark
from pyspark.sql import SparkSession

In [0]:
# Obtain the SparkSession for this notebook. Databricks already runs a session,
# so getOrCreate() hands back that active session instead of starting a new one.
# The builder is reached through the imported SparkSession class rather than the
# platform-injected `spark` variable, so this cell does not rely on a name that
# the notebook itself never defines.
spark_session = SparkSession.builder.getOrCreate()

# Report the application name and Spark version of the session in use.
print(spark_session.sparkContext.appName)
print(spark_session.sparkContext.version)

# -------------------------------------------  OUTPUT -----------------------------------------------
# Databricks Shell
# 3.3.0

In [0]:

# Location of the source CSV; kept in one named constant so it is easy to change.
OFFICE_HOURS_CSV_PATH = "s3://Sample_bucket/Folder/Percent_office_hours.csv"

# Read the CSV treating the first line as a header and letting Spark infer the
# column types. The read goes through `spark_session` (bound in the session
# cell) so the notebook uses a single session handle throughout instead of
# mixing it with the ambient `spark` variable.
data = (
    spark_session.read
    .option("header", "true")
    .option("inferschema", "true")
    .csv(OFFICE_HOURS_CSV_PATH)
)

# Creating RDD from Dataframe
data_Rdd = data.rdd

In [0]:
# Minimum office-hours percentage a record must reach to survive the filter.
MIN_OFFICE_HOURS_PERCENT = 95


def flag_office_hours(row):
    """Map one input row to ``(emp_code, meets_threshold, swipe_date)``.

    ``meets_threshold`` is True when the row's ``percent_office_hours`` is at
    least ``MIN_OFFICE_HOURS_PERCENT``. A missing (None) percentage is treated
    as not meeting the threshold instead of raising ``TypeError`` on the
    comparison.
    """
    percent = row["percent_office_hours"]
    meets_threshold = percent is not None and percent >= MIN_OFFICE_HOURS_PERCENT
    return (row["emp_code"], meets_threshold, row["swipe_date"])


# getting the number of partitions
print("The number of partitions are : ", data_Rdd.getNumPartitions())

# take(1) returns the first available row as a sample; top(1) would instead
# compute the largest row by ordering, which is not what a "sample" means here.
print("Sample before cleaning =>", data_Rdd.take(1))
print("Total no of records before transformations & cleaning : ", data_Rdd.count())

# Creating a Map function for Data of office hours percentage >= 95
transformed_data = data_Rdd.map(flag_office_hours)

# Keep only the records whose threshold flag is set. The result is collected to
# the driver as a plain list, so `cleaned_data` remains a list for later use.
cleaned_data = transformed_data.filter(lambda record: record[1]).collect()
print("Total no of records after transformations & cleaning : ", len(cleaned_data))

# Guard the sample lookup: indexing an empty list would raise IndexError when
# no record meets the threshold.
print("Sample from cleaned data :", cleaned_data[0] if cleaned_data else None)

# ----------------------------------------- OUTPUT ---------------------
# The number of partitions are :  5
# Sample before cleaning => [Row(emp_code=1123533, swipe_date=20240116, percent_office_hours=56)]
# Total no of records before transformations & cleaning :  1048575
# Total no of records after transformations & cleaning :  361596
# Sample from cleaned data : (1122437, True, 20240115)