## Import Libraries

In [1]:
from pyspark.sql.functions import hour,minute,second,col,avg,when
from pyspark.sql import SQLContext
import pyspark.sql.functions as sql_functions
import datetime
# Entry point for DataFrame reads below; `sc` is the SparkContext that the
# PySpark kernel is expected to provide (it is not created in this notebook).
sqlContext = SQLContext(sparkContext=sc)

## Read the CSV log file from the local filesystem

In [2]:
# Load the activity log. The `file://` scheme points at the local filesystem.
# header=True takes column names from the first line; inferSchema=True lets
# Spark infer column types from the data.
df_log = (
    sqlContext.read
    .options(header=True, inferSchema=True)
    .csv("file:///home/hadoopuser/Documents/Pyspark/data/sparklog2019-10-24")
)
df_log.show(5)

+--------------------+-------------------+-------------------+-------------------+-------------------+
|           user_name|         start_time|           end_time|          idle_time|       working_hour|
+--------------------+-------------------+-------------------+-------------------+-------------------+
|  sahil24c@gmail.com|2019-10-24 08:30:02|2019-10-24 19:25:02|2019-10-24 05:05:00|2019-10-24 05:50:00|
|magadum.iranna@gm...|2019-10-24 08:30:02|2019-10-24 19:25:01|2019-10-24 02:15:00|2019-10-24 08:39:59|
|  yathink3@gmail.com|2019-10-24 08:30:02|2019-10-24 19:25:01|2019-10-24 01:30:00|2019-10-24 09:24:59|
|  shelkeva@gmail.com|2019-10-24 08:45:01|2019-10-24 18:25:02|2019-10-24 00:30:00|2019-10-24 09:10:01|
|puruissimple@gmai...|2019-10-24 08:50:02|2019-10-24 19:25:01|2019-10-24 03:15:00|2019-10-24 07:19:59|
+--------------------+-------------------+-------------------+-------------------+-------------------+
only showing top 5 rows



In [3]:
# Report the shape of the log: total number of rows and columns.
print(
    'Rows : ', df_log.count(),
    'Columns : ', len(df_log.columns),
)

Rows :  88 Columns :  5


### Extract the hour component of each user's idle time

In [5]:
# Keep only the columns needed for the idle-time analysis.
df_idle_hours = df_log.select(col('user_name'), col('idle_time'))

# hour() extracts the hour-of-day field from the idle_time timestamp.
df_hours = df_idle_hours.withColumn('hours', hour('idle_time'))
df_hours.show(2)

+--------------------+-------------------+-----+
|           user_name|          idle_time|hours|
+--------------------+-------------------+-----+
|  sahil24c@gmail.com|2019-10-24 05:05:00|    5|
|magadum.iranna@gm...|2019-10-24 02:15:00|    2|
+--------------------+-------------------+-----+
only showing top 2 rows



In [6]:
# Convert the hour component to seconds (3600 seconds per hour).
df_hours = df_hours.withColumn('hours_inSec', col('hours') * 3600)
df_hours.show(2)

+--------------------+-------------------+-----+-----------+
|           user_name|          idle_time|hours|hours_inSec|
+--------------------+-------------------+-----+-----------+
|  sahil24c@gmail.com|2019-10-24 05:05:00|    5|      18000|
|magadum.iranna@gm...|2019-10-24 02:15:00|    2|       7200|
+--------------------+-------------------+-----+-----------+
only showing top 2 rows



### Extract the minute component of each user's idle time

In [7]:
# minute() extracts the minute field from the idle_time timestamp.
df_mins = df_idle_hours.withColumn('minutes', minute('idle_time'))
df_mins.show(2)

+--------------------+-------------------+-------+
|           user_name|          idle_time|minutes|
+--------------------+-------------------+-------+
|  sahil24c@gmail.com|2019-10-24 05:05:00|      5|
|magadum.iranna@gm...|2019-10-24 02:15:00|     15|
+--------------------+-------------------+-------+
only showing top 2 rows



In [8]:
# Convert the minute component to seconds (60 seconds per minute).
df_mins = df_mins.withColumn('mins_inSec', col('minutes') * 60)
df_mins.show(2)

+--------------------+-------------------+-------+----------+
|           user_name|          idle_time|minutes|mins_inSec|
+--------------------+-------------------+-------+----------+
|  sahil24c@gmail.com|2019-10-24 05:05:00|      5|       300|
|magadum.iranna@gm...|2019-10-24 02:15:00|     15|       900|
+--------------------+-------------------+-------+----------+
only showing top 2 rows



### Combine the hour and minute columns into one data frame

In [9]:
# Build the combined frame (user_name, hours_inSec, mins_inSec) in a single
# projection over the log instead of joining df_hours with df_mins.
# Both of those frames come from the same rows, so a join on user_name adds
# nothing. If a user appears in the log more than once, it would produce
# k*k rows for that user and pair the hours of one entry with the minutes
# of another.
# NOTE(review): the seconds field of idle_time is not included, matching the
# original calculation.
df_hours_min = df_log.select(
    col('user_name'),
    (hour('idle_time') * 3600).alias('hours_inSec'),
    (minute('idle_time') * 60).alias('mins_inSec'),
)
df_hours_min.show(2)

+--------------------+-----------+----------+
|           user_name|hours_inSec|mins_inSec|
+--------------------+-----------+----------+
|  sahil24c@gmail.com|      18000|       300|
|magadum.iranna@gm...|       7200|       900|
+--------------------+-----------+----------+
only showing top 2 rows



In [10]:
# Total idle time in fractional hours: (hour seconds + minute seconds) / 3600.
df_final = df_hours_min.withColumn(
    'idle_hours',
    (col('hours_inSec') + col('mins_inSec')) / 3600,
)
df_final.show(2)

+--------------------+-----------+----------+-----------------+
|           user_name|hours_inSec|mins_inSec|       idle_hours|
+--------------------+-----------+----------+-----------------+
|  sahil24c@gmail.com|      18000|       300|5.083333333333333|
|magadum.iranna@gm...|       7200|       900|             2.25|
+--------------------+-----------+----------+-----------------+
only showing top 2 rows



### Calculate average idle hours

In [11]:
# Average of the idle_hours column across all rows. The variable name says
# "work", but the column being averaged is idle time.
avg_work_hour = df_final.agg(avg('idle_hours'))
avg_work_hour.show()

+------------------+
|   avg(idle_hours)|
+------------------+
|2.0369318181818183|
+------------------+



In [12]:
# Count the rows whose idle time is above the average.
# The threshold is computed from the data rather than hard-coded, so it
# cannot go stale or lose precision when the input log changes.
avg_idle_hours = df_final.agg(avg('idle_hours')).first()[0]
avg_count = df_final.filter(sql_functions.col("idle_hours") > avg_idle_hours)
avg_count.count()

27

In [16]:
# List users whose idle time is above the average.
# The threshold is computed here from the data, not copied from an earlier
# cell's printed output, so this cell stays correct when the log changes.
avg_idle_hours = df_final.agg(avg('idle_hours')).first()[0]
df_final.filter(df_final['idle_hours'] > avg_idle_hours).select('user_name').show(5)

+--------------------+
|           user_name|
+--------------------+
|  sahil24c@gmail.com|
|magadum.iranna@gm...|
|puruissimple@gmai...|
|     you@example.com|
|vishnu23kumar@gma...|
+--------------------+
only showing top 5 rows

