In [0]:
%run ./custom_logging

### Author: Sailesh Chauhan
### Date: 26/07/2021
### Title: Find the average late-arrival time of late-arriving employees from the provided CPU log data in CSV files.

In [0]:
%fs
ls dbfs:/FileStore/tables

path,name,size
dbfs:/FileStore/tables/CpuLogData2019_09_16.csv,CpuLogData2019_09_16.csv,177697
dbfs:/FileStore/tables/CpuLogData2019_09_17.csv,CpuLogData2019_09_17.csv,299062
dbfs:/FileStore/tables/CpuLogData2019_09_18.csv,CpuLogData2019_09_18.csv,286264
dbfs:/FileStore/tables/CpuLogData2019_09_19.csv,CpuLogData2019_09_19.csv,346015
dbfs:/FileStore/tables/CpuLogData2019_09_20.csv,CpuLogData2019_09_20.csv,136532
dbfs:/FileStore/tables/CpuLogData2019_09_21.csv,CpuLogData2019_09_21.csv,291933
dbfs:/FileStore/tables/LMS_DB.zip,LMS_DB.zip,636912


##### Mounting the Azure Storage account container in the Databricks file system

In [0]:
# Mount the Azure Blob Storage container that receives the result CSV.
# The account key is read from the Spark configuration rather than being
# hard-coded in the notebook.
AZURE_KEY=spark.conf.get('spark.azure_key')
MOUNT_POINT = "/mnt/CPU_Logs_Late_comers"

# dbutils.fs.mount raises when the mount point is already in use, so mount only
# when it is absent; this keeps the cell safe to re-run.
if not any(mount.mountPoint == MOUNT_POINT for mount in dbutils.fs.mounts()):
  dbutils.fs.mount(
    source = "wasbs://cpu-logs-latecomer@cpulogs.blob.core.windows.net",
    mount_point = MOUNT_POINT,
    extra_configs ={"fs.azure.account.key.cpulogs.blob.core.windows.net":AZURE_KEY})

#### Loading the CSV files for every CPU log date from 2019-09-16 to 2019-09-21 into dataframe df, then selecting the required columns DateTime and user_name into df_CPU.

In [0]:
# Load the six daily CPU-log CSV files (2019-09-16 .. 2019-09-21) into one
# dataframe and keep only the columns the late-comer analysis needs.
try:
  file_location=[
    'dbfs:/FileStore/tables/CpuLogData2019_09_21.csv',
    'dbfs:/FileStore/tables/CpuLogData2019_09_20.csv',
    'dbfs:/FileStore/tables/CpuLogData2019_09_19.csv',
    'dbfs:/FileStore/tables/CpuLogData2019_09_18.csv',
    'dbfs:/FileStore/tables/CpuLogData2019_09_17.csv',
    'dbfs:/FileStore/tables/CpuLogData2019_09_16.csv',
  ]

  file_type='csv'

  # CSV reader options: infer column types, treat the first row as the header.
  infer_schema = "true"
  first_row_is_header = "true"
  delimiter = ","

  df = spark.read.format(file_type) \
    .option("inferSchema", infer_schema) \
    .option("header", first_row_is_header) \
    .option("sep", delimiter) \
    .load(file_location)

  df_CPU=df.select("DateTime","user_name")
  rows=df_CPU.count()
  logger.info("All csv files loaded in data frame df_CPU")
  # Report the row count through the logger: a bare expression inside the try
  # block is never displayed, so the count was previously lost.
  logger.info("df_CPU row count : "+str(rows))
except Exception as ex:
  logger.error("files not loaded in data frame df_CPU : "+str(ex))
  # Every later cell needs df_CPU; stop here instead of letting them fail with
  # an unrelated NameError.
  raise
finally:
  logging.shutdown()

##### Adding date column to df_CPU dataframe from DateTime attribute of df_CPU

In [0]:
from pyspark.sql.functions import date_format

# Add a 'date' column holding the DateTime value formatted as yyyy-MM-dd, so
# log entries can later be grouped per calendar day.
try:
  day_column = date_format('DateTime', 'yyyy-MM-dd')
  df_CPU = df_CPU.withColumn('date', day_column)
  df_CPU.printSchema()
  logger.debug("Column typecasted to time format")
except Exception as cast_error:
  logger.error("Failed column type casting :"+str(cast_error))

logging.shutdown()

##### Creating temporary view of dataframe df_CPU as cpu_all_dates

In [0]:
# Register df_CPU as the temp view cpu_all_dates so it can be queried with SQL.
# temp_view is reused by a later cell that re-registers the view.
try:
  temp_view="cpu_all_dates"
  df_CPU.createOrReplaceTempView(temp_view)
  logger.debug("Created temprorary view cpu_all_dates")
except Exception as ex:
  # Include the exception text, as the other cells do, so the cause is logged.
  logger.error("failed to create view cpu_all_dates "+str(ex))

##### Type casting DateTime and date attributes to timestamp

In [0]:
# Query the cpu_all_dates view, converting the DateTime and date columns with
# to_timestamp; the result is kept in df.
cast_query = "select to_timestamp(DateTime) as Datetime,user_name,to_timestamp(date) as date from cpu_all_dates"
try:
  df=spark.sql(cast_query)
  logger.debug("Type casting date to timestamp from string")
except Exception as cast_error:
  logger.error("date coversion failed "+str(cast_error))

logging.shutdown()

In [0]:
# Show the first five rows of df as a quick sanity check.
try:
  preview_rows = df.take(5)
  display(preview_rows)
  logger.debug("display top 5")
except Exception as display_error:
  logger.error("Failed to display "+str(display_error))
logging.shutdown()

Datetime,user_name,date
2019-09-19T08:40:02.000+0000,iamnzm@outlook.com,2019-09-19T00:00:00.000+0000
2019-09-19T08:45:02.000+0000,iamnzm@outlook.com,2019-09-19T00:00:00.000+0000
2019-09-19T08:50:01.000+0000,iamnzm@outlook.com,2019-09-19T00:00:00.000+0000
2019-09-19T08:55:01.000+0000,iamnzm@outlook.com,2019-09-19T00:00:00.000+0000
2019-09-19T09:00:01.000+0000,iamnzm@outlook.com,2019-09-19T00:00:00.000+0000


In [0]:
# Re-register the view named by temp_view so that it now points at df.
try:
  df.createOrReplaceTempView(temp_view)
  logger.debug("cpu_all_dates view created success")
except Exception as view_error:
  logger.error('cpu_all_dates view failed '+str(view_error))

logging.shutdown()

## SQL

In [0]:
%sql
-- Total number of rows in the cpu_all_dates view.
SELECT COUNT(*)
FROM cpu_all_dates

count(1)
4122


##### Creating temporary view cpu_all_loginTime

In [0]:
%sql
-- Earliest log entry of every user on every date, exposed as log_In_Time.
-- "or replace" keeps the cell re-runnable: a plain "create temp view" fails
-- once the view already exists in the session.
create or replace temp view cpu_all_loginTime as
select min(datetime) as log_In_Time, user_name, date
from cpu_all_dates
group by date, user_name
order by log_In_Time

In [0]:
%sql
-- Display every column of the cpu_all_loginTime view.
SELECT log_In_Time, user_name, date
FROM cpu_all_loginTime

log_In_Time,user_name,date
2019-09-16T12:55:01.000+0000,bhagyashrichalke21@gmail.com,2019-09-16T00:00:00.000+0000
2019-09-16T12:55:02.000+0000,salinabodale73@gmail.com,2019-09-16T00:00:00.000+0000
2019-09-16T12:55:03.000+0000,rahilstar11@gmail.com,2019-09-16T00:00:00.000+0000
2019-09-16T13:00:01.000+0000,iamnzm@outlook.com,2019-09-16T00:00:00.000+0000
2019-09-16T13:00:01.000+0000,deepshukla292@gmail.com,2019-09-16T00:00:00.000+0000
2019-09-16T13:00:04.000+0000,sharlawar77@gmail.com,2019-09-16T00:00:00.000+0000
2019-09-17T08:25:01.000+0000,iamnzm@outlook.com,2019-09-17T00:00:00.000+0000
2019-09-17T09:30:01.000+0000,deepshukla292@gmail.com,2019-09-17T00:00:00.000+0000
2019-09-17T09:40:01.000+0000,rahilstar11@gmail.com,2019-09-17T00:00:00.000+0000
2019-09-17T10:10:01.000+0000,salinabodale73@gmail.com,2019-09-17T00:00:00.000+0000


##### Creating temporary view cpu_all_delayedby_latecount holding each employee's late login time in seconds, per user and per date.

In [0]:
%sql
-- delayedby = seconds between a user's login and the earliest login of the
-- same date (the first row of the date partition ordered by log_in_time).
-- "or replace" keeps the cell re-runnable: a plain "create temp view" fails
-- once the view already exists in the session.
create or replace temp view cpu_all_delayedby_latecount as
select
  bigint(log_in_time)
    - first(bigint(log_in_time)) over (partition by date order by log_in_time) as delayedby,
  user_name,
  date
from cpu_all_loginTime

In [0]:
%sql
-- Display every column of the cpu_all_delayedby_latecount view.
SELECT delayedby, user_name, date
FROM cpu_all_delayedby_latecount

delayedby,user_name,date
0,bhagyashrichalke21@gmail.com,2019-09-16T00:00:00.000+0000
1,salinabodale73@gmail.com,2019-09-16T00:00:00.000+0000
2,rahilstar11@gmail.com,2019-09-16T00:00:00.000+0000
300,iamnzm@outlook.com,2019-09-16T00:00:00.000+0000
300,deepshukla292@gmail.com,2019-09-16T00:00:00.000+0000
303,sharlawar77@gmail.com,2019-09-16T00:00:00.000+0000
0,iamnzm@outlook.com,2019-09-17T00:00:00.000+0000
3900,deepshukla292@gmail.com,2019-09-17T00:00:00.000+0000
4500,rahilstar11@gmail.com,2019-09-17T00:00:00.000+0000
6300,salinabodale73@gmail.com,2019-09-17T00:00:00.000+0000


#### Result table to find number of times employees logged in late and average late arrival of each employee

In [0]:
%sql
-- Per user: the summed delay divided by 6, formatted as HH:mm, plus the number
-- of rows with a non-zero delay.
-- NOTE(review): the divisor 6 is hard-coded; presumably it is the number of
-- log dates loaded. Confirm before adding or removing input files.
SELECT
  from_unixtime(round(sum(delayedby) / 6, 2), 'HH:mm') AS Avg_late_hrs_min,
  user_name,
  count(user_name) AS number_of_times_late
FROM cpu_all_delayedby_latecount
WHERE delayedby != 0
GROUP BY user_name
ORDER BY Avg_late_hrs_min DESC

Avg_late_hrs_min,user_name,number_of_times_late
01:25,salinabodale73@gmail.com,6
01:25,bhagyashrichalke21@gmail.com,5
01:22,rahilstar11@gmail.com,6
01:08,markfernandes66@gmail.com,5
01:06,sharlawar77@gmail.com,5
00:58,damodharn21@gmail.com,3
00:32,iamnzm@outlook.com,3
00:21,deepshukla292@gmail.com,4


##### Writing the result in CSV format to the mounted Azure container storage

In [0]:
# Run the late-arrival summary query and write it as CSV (with a header row)
# to the mounted Azure container.
try:
  df = spark.sql("""
            select from_unixtime(round(sum(delayedby)/6,2),'HH:mm') as Avg_late_hrs_min,user_name,count(user_name) as number_of_times_late from cpu_all_delayedby_latecount where delayedby!=0  group by user_name order by Avg_late_hrs_min desc""")
  # The default save mode raises when the target directory already exists,
  # which broke every re-run of the notebook; overwrite the previous result.
  df.write.mode("overwrite").option("header",'true').csv('/mnt/CPU_Logs_Late_comers/Employe_Late')
  logger.debug('file written succesfully')
except Exception as ex:
  logger.error('file writing unsuccesful '+str(ex))
  # The CSV is the notebook's deliverable; surface the failure instead of
  # letting the run appear successful.
  raise