## Import Libraries

In [1]:
from pyspark.sql.functions import hour,minute,second,col,avg,when
from pyspark.sql import SQLContext
import pyspark.sql.functions as sql_functions
import datetime
# Create the SQLContext used for every DataFrame read below.
# NOTE(review): `sc` is not defined in this notebook; presumably the PySpark
# kernel/shell injects the SparkContext under that name -- confirm.
sqlContext = SQLContext(sparkContext=sc)

## Read the CSV log file from the local filesystem

In [2]:
# Load the log CSV: the first line is the header and Spark infers the column types.
log_path = "file:///home/hadoopuser/Documents/Pyspark/data/sparklog2019-10-24"
df_log = sqlContext.read.csv(log_path, header=True, inferSchema=True)
df_log.show(10)

+--------------------+-------------------+-------------------+-------------------+-------------------+
|           user_name|         start_time|           end_time|          idle_time|       working_hour|
+--------------------+-------------------+-------------------+-------------------+-------------------+
|  sahil24c@gmail.com|2019-10-24 08:30:02|2019-10-24 19:25:02|2019-10-24 05:05:00|2019-10-24 05:50:00|
|magadum.iranna@gm...|2019-10-24 08:30:02|2019-10-24 19:25:01|2019-10-24 02:15:00|2019-10-24 08:39:59|
|  yathink3@gmail.com|2019-10-24 08:30:02|2019-10-24 19:25:01|2019-10-24 01:30:00|2019-10-24 09:24:59|
|  shelkeva@gmail.com|2019-10-24 08:45:01|2019-10-24 18:25:02|2019-10-24 00:30:00|2019-10-24 09:10:01|
|puruissimple@gmai...|2019-10-24 08:50:02|2019-10-24 19:25:01|2019-10-24 03:15:00|2019-10-24 07:19:59|
|sangita.awaghad19...|2019-10-24 08:50:01|2019-10-24 19:25:01|2019-10-24 01:55:00|2019-10-24 08:40:00|
|vaishusawant143@g...|2019-10-24 08:55:01|2019-10-24 19:25:01|2019-10-24 

In [3]:
# Report the dimensions (row count and column count) of the loaded log frame.
row_count = df_log.count()
column_count = len(df_log.columns)
print('Rows : ', row_count, 'Columns : ', column_count)

Rows :  88 Columns :  5


### Find the number of whole hours in each user's working time

In [4]:
# Discard the timestamp columns that the working-hour analysis does not use.
unused_columns = ['idle_time', 'start_time', 'end_time']
df_work_hours = df_log.drop(*unused_columns)

In [5]:
# Extract the hour component of each working_hour timestamp into 'work_hours'.
# The column is referenced through df_work_hours (the frame being transformed)
# rather than its parent df_log: referencing another DataFrame's column only
# resolves while the lineage happens to match, and the minute/second cells
# already reference df_work_hours.
df_hours = df_work_hours.withColumn('work_hours', hour(df_work_hours['working_hour']))
df_hours.show(2)

+--------------------+-------------------+----------+
|           user_name|       working_hour|work_hours|
+--------------------+-------------------+----------+
|  sahil24c@gmail.com|2019-10-24 05:50:00|         5|
|magadum.iranna@gm...|2019-10-24 08:39:59|         8|
+--------------------+-------------------+----------+
only showing top 2 rows



In [6]:
# Convert the whole hours to seconds (3600 seconds per hour).
hours_as_seconds = col('work_hours') * 3600
df_hours = df_hours.withColumn('hours_inSec', hours_as_seconds)
df_hours.show(2)

+--------------------+-------------------+----------+-----------+
|           user_name|       working_hour|work_hours|hours_inSec|
+--------------------+-------------------+----------+-----------+
|  sahil24c@gmail.com|2019-10-24 05:50:00|         5|      18000|
|magadum.iranna@gm...|2019-10-24 08:39:59|         8|      28800|
+--------------------+-------------------+----------+-----------+
only showing top 2 rows



### Find the number of minutes (past the hour) in each user's working time

In [7]:
# Extract the minute component of each working_hour timestamp into 'minutes'.
minutes_component = minute(col('working_hour'))
df_mins = df_work_hours.withColumn('minutes', minutes_component)
df_mins.show(2)

+--------------------+-------------------+-------+
|           user_name|       working_hour|minutes|
+--------------------+-------------------+-------+
|  sahil24c@gmail.com|2019-10-24 05:50:00|     50|
|magadum.iranna@gm...|2019-10-24 08:39:59|     39|
+--------------------+-------------------+-------+
only showing top 2 rows



In [8]:
# Convert the minute component to seconds (60 seconds per minute).
minutes_as_seconds = col('minutes') * 60
df_mins = df_mins.withColumn('mins_inSec', minutes_as_seconds)
df_mins.show(2)

+--------------------+-------------------+-------+----------+
|           user_name|       working_hour|minutes|mins_inSec|
+--------------------+-------------------+-------+----------+
|  sahil24c@gmail.com|2019-10-24 05:50:00|     50|      3000|
|magadum.iranna@gm...|2019-10-24 08:39:59|     39|      2340|
+--------------------+-------------------+-------+----------+
only showing top 2 rows



### Find the number of seconds (past the minute) in each user's working time

In [9]:
# Extract the seconds component of each working_hour timestamp into 'second'.
seconds_component = second(col('working_hour'))
df_sec = df_work_hours.withColumn('second', seconds_component)
df_sec.show(2)

+--------------------+-------------------+------+
|           user_name|       working_hour|second|
+--------------------+-------------------+------+
|  sahil24c@gmail.com|2019-10-24 05:50:00|     0|
|magadum.iranna@gm...|2019-10-24 08:39:59|    59|
+--------------------+-------------------+------+
only showing top 2 rows



### Combining all data frames into one

In [10]:
# Combine the hour and minute components (both in seconds) into one frame.
#
# This is derived directly from df_hours rather than by joining df_hours with
# df_mins on user_name. Both frames come from the same log rows, and user_name
# is not guaranteed to be unique: an inner join on it would emit k*k rows for a
# user appearing k times and would drop rows whose user_name is null, which
# skews the average computed later. Deriving the column row-by-row avoids both
# problems and the shuffle a join requires.
df_hours_min = (
    df_hours
    .withColumn('mins_inSec', minute(col('working_hour')) * 60)
    .drop('working_hour', 'work_hours')
)
df_hours_min.show(2)

+--------------------+-----------+----------+
|           user_name|hours_inSec|mins_inSec|
+--------------------+-----------+----------+
|  sahil24c@gmail.com|      18000|      3000|
|magadum.iranna@gm...|      28800|      2340|
+--------------------+-----------+----------+
only showing top 2 rows



In [11]:
# Build the frame holding hours (in seconds), minutes (in seconds) and seconds
# for every log row.
#
# The columns are derived from df_hours' own working_hour timestamp instead of
# inner-joining on user_name: user_name is not guaranteed to be unique, so a
# join would multiply rows for repeated users and drop rows with a null
# user_name, distorting the average computed from this frame.
df_h_m_s = (
    df_hours
    .withColumn('mins_inSec', minute(col('working_hour')) * 60)
    .withColumn('second', second(col('working_hour')))
    .drop('working_hour', 'work_hours')
)
df_h_m_s.show(2)

+--------------------+-----------+----------+------+
|           user_name|hours_inSec|mins_inSec|second|
+--------------------+-----------+----------+------+
|  sahil24c@gmail.com|      18000|      3000|     0|
|magadum.iranna@gm...|      28800|      2340|    59|
+--------------------+-----------+----------+------+
only showing top 2 rows



In [12]:
# Sum the three components (all in seconds) and convert back to fractional hours.
total_seconds = col('hours_inSec') + col('mins_inSec') + col('second')
df_final = df_h_m_s.withColumn('working_hours', total_seconds / 3600)
df_final.show(2)

+--------------------+-----------+----------+------+-----------------+
|           user_name|hours_inSec|mins_inSec|second|    working_hours|
+--------------------+-----------+----------+------+-----------------+
|  sahil24c@gmail.com|      18000|      3000|     0|5.833333333333333|
|magadum.iranna@gm...|      28800|      2340|    59| 8.66638888888889|
+--------------------+-----------+----------+------+-----------------+
only showing top 2 rows



### Calculate average working hours

In [13]:
# Aggregate the whole frame into a single-row frame holding the mean of 'working_hours'.
avg_work_hour = df_final.agg(avg('working_hours'))
avg_work_hour.show()

+------------------+
|avg(working_hours)|
+------------------+
| 7.404494949494952|
+------------------+



In [14]:
# Count the rows whose working hours fall below the average.
# The threshold is read from the computed aggregate (avg_work_hour) instead of
# a literal pasted from an earlier output, so the cell stays correct when the
# input log changes.
mean_working_hours = avg_work_hour.first()[0]
avg_count = df_final.filter(sql_functions.col("working_hours") < mean_working_hours)
avg_count.count()

33

In [15]:
# List the users whose working hours fall below the average.
# The threshold is read from the computed aggregate (avg_work_hour) rather
# than a hard-coded literal copied from an earlier output.
mean_working_hours = avg_work_hour.first()[0]
df_final.filter(df_final['working_hours'] < mean_working_hours).select('user_name').show()

+--------------------+
|           user_name|
+--------------------+
|  sahil24c@gmail.com|
|puruissimple@gmai...|
|vishnu23kumar@gma...|
|sargampandey27oct...|
|ayush.saraf47@gma...|
|ruchikachile30199...|
|“shivnajalisangal...|
| addyp1911@gmail.com|
|dipakalagate1991@...|
|gaikwadr576@gmail...|
|tekina.makin@gmai...|
|mishrasushil889@g...|
| blsonalib@gmail.com|
|hakepratiksha55@g...|
|vaibhavpratihar17...|
|bsaivenkatavikas@...|
| youremail@email.com|
|polelaxman001@gma...|
|er.mukulvij96@gma...|
| dileep.bs@yahoo.com|
+--------------------+
only showing top 20 rows

