In [51]:
from pyspark.sql import SparkSession
import getpass
# Current OS user; used to place the warehouse under /user/<username>/warehouse.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled session that runs against YARN.
# The chain is wrapped in parentheses so no line-continuation backslashes are needed.
spark = (
    SparkSession.builder
    .appName("Sneha Spark Session")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [52]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [53]:
# Hand-written sample of (log level, timestamp string) pairs.
# Month and day are not zero-padded in these strings.
log_data = [
    ('INFO', '2015-8-8 20:49:22'),
    ('WARN', '2015-1-14 20:05:00'),
    ('INFO', '2017-6-14 00:08:35'),
    ('INFO', '2016-1-18 11:50:14'),
    ('DEBUG', '2017-7-1 12:55:02'),
    ('INFO', '2014-2-26 12:34:21'),
    ('INFO', '2015-7-12 11:13:47'),
    ('INFO', '2017-4-15 01:20:18'),
    ('DEBUG', '2016-11-2 20:19:23'),
    ('INFO', '2012-8-20 10:09:44'),
]

In [54]:
# Build the sample DataFrame, naming the two tuple positions up front;
# column types are inferred from the Python values.
log_df = spark.createDataFrame(log_data, schema=['loglevel', 'logtime'])

In [55]:
# Preview the raw sample; logtime is still a plain string at this point.
log_df.show()

+--------+------------------+
|loglevel|           logtime|
+--------+------------------+
|    INFO| 2015-8-8 20:49:22|
|    WARN|2015-1-14 20:05:00|
|    INFO|2017-6-14 00:08:35|
|    INFO|2016-1-18 11:50:14|
|   DEBUG| 2017-7-1 12:55:02|
|    INFO|2014-2-26 12:34:21|
|    INFO|2015-7-12 11:13:47|
|    INFO|2017-4-15 01:20:18|
|   DEBUG|2016-11-2 20:19:23|
|    INFO|2012-8-20 10:09:44|
+--------+------------------+



In [56]:
# Check the inferred column types (both columns come out as strings).
log_df.printSchema()

root
 |-- loglevel: string (nullable = true)
 |-- logtime: string (nullable = true)



In [57]:
from pyspark.sql.functions import *

In [58]:
# Parse the string column into a timestamp, replacing it under the same column name.
new_log_df = log_df.withColumn("logtime", to_timestamp(log_df["logtime"]))

In [59]:
# Preview after conversion: the timestamps now print zero-padded.
new_log_df.show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|    INFO|2015-08-08 20:49:22|
|    WARN|2015-01-14 20:05:00|
|    INFO|2017-06-14 00:08:35|
|    INFO|2016-01-18 11:50:14|
|   DEBUG|2017-07-01 12:55:02|
|    INFO|2014-02-26 12:34:21|
|    INFO|2015-07-12 11:13:47|
|    INFO|2017-04-15 01:20:18|
|   DEBUG|2016-11-02 20:19:23|
|    INFO|2012-08-20 10:09:44|
+--------+-------------------+



In [60]:
# Verify that logtime is now a timestamp column.
new_log_df.printSchema()

root
 |-- loglevel: string (nullable = true)
 |-- logtime: timestamp (nullable = true)



In [61]:
# Register the converted sample as the temp view "serverlogs" so it can be queried with Spark SQL.
new_log_df.createOrReplaceTempView("serverlogs")

In [62]:
# Sanity-check the view by selecting every row back out of it.
spark.sql("select * from serverlogs").show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|    INFO|2015-08-08 20:49:22|
|    WARN|2015-01-14 20:05:00|
|    INFO|2017-06-14 00:08:35|
|    INFO|2016-01-18 11:50:14|
|   DEBUG|2017-07-01 12:55:02|
|    INFO|2014-02-26 12:34:21|
|    INFO|2015-07-12 11:13:47|
|    INFO|2017-04-15 01:20:18|
|   DEBUG|2016-11-02 20:19:23|
|    INFO|2012-08-20 10:09:44|
+--------+-------------------+



In [63]:
# Project each record's log level together with its full month name ('MMMM').
month_name_query = "select loglevel,date_format(logtime,'MMMM')as month from serverlogs"
spark.sql(month_name_query).show()

+--------+--------+
|loglevel|   month|
+--------+--------+
|    INFO|  August|
|    WARN| January|
|    INFO|    June|
|    INFO| January|
|   DEBUG|    July|
|    INFO|February|
|    INFO|    July|
|    INFO|   April|
|   DEBUG|November|
|    INFO|  August|
+--------+--------+



In [64]:
# Count log entries per (loglevel, month name); the select alias "month" is reused in GROUP BY.
spark.sql("select loglevel,date_format(logtime,'MMMM')as month, count(*) as total_occurance from serverlogs group by loglevel,month").show()

+--------+--------+---------------+
|loglevel|   month|total_occurance|
+--------+--------+---------------+
|    INFO|    June|              1|
|   DEBUG|    July|              1|
|    INFO|February|              1|
|    WARN| January|              1|
|    INFO|  August|              2|
|   DEBUG|November|              1|
|    INFO|   April|              1|
|    INFO| January|              1|
|    INFO|    July|              1|
+--------+--------+---------------+



In [65]:
# DDL-style schema string for the CSV: one "name type" entry per column.
schema = (
    "loglevel string,"
    "logtime timestamp"
)

In [66]:
# Load the CSV with the explicit schema so logtime is read as a timestamp.
# Note: this rebinds log_df, which previously held the 10-row sample.
log_df = spark.read.schema(schema).csv('/public/trendytech/datasets/logdata1m.csv')

In [67]:
# Peek at the first rows of the file-backed DataFrame.
log_df.show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|    INFO|2015-08-08 20:49:22|
|    WARN|2015-01-14 20:05:00|
|    INFO|2017-06-14 00:08:35|
|    INFO|2016-01-18 11:50:14|
|   DEBUG|2017-07-01 12:55:02|
|    INFO|2014-02-26 12:34:21|
|    INFO|2015-07-12 11:13:47|
|    INFO|2017-04-15 01:20:18|
|   DEBUG|2016-11-02 20:19:23|
|    INFO|2012-08-20 10:09:44|
|   DEBUG|2014-04-22 21:30:49|
|    WARN|2013-12-06 17:54:15|
|   DEBUG|2017-01-12 10:47:02|
|   DEBUG|2016-06-25 11:06:42|
|   ERROR|2015-06-28 19:25:05|
|   DEBUG|2012-06-24 01:06:37|
|    INFO|2014-12-09 09:53:54|
|   DEBUG|2015-11-08 19:20:08|
|    INFO|2017-07-21 18:34:18|
|   DEBUG|2014-12-26 06:38:42|
+--------+-------------------+
only showing top 20 rows



In [68]:
# Total number of records loaded from the CSV.
log_df.count()

1000000

In [69]:
# Re-register "serverlogs" so the SQL below runs against the CSV data
# instead of the 10-row sample registered under the same name earlier.
log_df.createOrReplaceTempView("serverlogs")

In [70]:
# Bare expression (no .show()): the DataFrame itself is the cell's output.
# NOTE(review): how it renders depends on the notebook's DataFrame repr settings — confirm.
spark.sql("select * from serverlogs")

loglevel,logtime
INFO,2015-08-08 20:49:22
WARN,2015-01-14 20:05:00
INFO,2017-06-14 00:08:35
INFO,2016-01-18 11:50:14
DEBUG,2017-07-01 12:55:02
INFO,2014-02-26 12:34:21
INFO,2015-07-12 11:13:47
INFO,2017-04-15 01:20:18
DEBUG,2016-11-02 20:19:23
INFO,2012-08-20 10:09:44


In [71]:
# Log level plus full month name ('MMMM') per record, now over the CSV-backed view.
spark.sql("select loglevel,date_format(logtime,'MMMM')as month from serverlogs").show()

+--------+--------+
|loglevel|   month|
+--------+--------+
|    INFO|  August|
|    WARN| January|
|    INFO|    June|
|    INFO| January|
|   DEBUG|    July|
|    INFO|February|
|    INFO|    July|
|    INFO|   April|
|   DEBUG|November|
|    INFO|  August|
|   DEBUG|   April|
|    WARN|December|
|   DEBUG| January|
|   DEBUG|    June|
|   ERROR|    June|
|   DEBUG|    June|
|    INFO|December|
|   DEBUG|November|
|    INFO|    July|
|   DEBUG|December|
+--------+--------+
only showing top 20 rows



In [72]:
# Count log entries per (loglevel, month name); no ORDER BY, so row order is arbitrary.
level_month_count_query = "select loglevel,date_format(logtime,'MMMM')as month, count(*) as total_occurance from serverlogs group by loglevel,month"
spark.sql(level_month_count_query).show()

+--------+---------+---------------+
|loglevel|    month|total_occurance|
+--------+---------+---------------+
|    WARN|     June|           8191|
|    INFO|     June|          29143|
|   ERROR| November|           3389|
|   FATAL|  January|             94|
|    WARN| December|           8328|
|    WARN|    March|           8165|
|   DEBUG|     July|          42085|
|   ERROR|    April|           4107|
|   ERROR|  January|           4054|
|   FATAL|September|             81|
|   FATAL|    April|             83|
|    INFO|September|          29038|
|   FATAL| November|          16797|
|   FATAL|  October|             92|
|    INFO| February|          28983|
|    WARN|    April|           8277|
|   DEBUG| December|          41749|
|   FATAL| December|             94|
|    WARN|      May|           8403|
|   ERROR|     June|           4059|
+--------+---------+---------------+
only showing top 20 rows



In [73]:
# Sorting by the month *name* orders the groups alphabetically
# (April, August, December, ...), not in calendar order.
spark.sql("select loglevel,date_format(logtime,'MMMM')as month, count(*) as total_occurance from serverlogs group by loglevel,month order by month").show()

+--------+--------+---------------+
|loglevel|   month|total_occurance|
+--------+--------+---------------+
|   DEBUG|   April|          41869|
|    INFO|   April|          29302|
|    WARN|   April|           8277|
|   ERROR|   April|           4107|
|   FATAL|   April|             83|
|   FATAL|  August|             80|
|   ERROR|  August|           3987|
|   DEBUG|  August|          42147|
|    INFO|  August|          28993|
|    WARN|  August|           8381|
|   DEBUG|December|          41749|
|    INFO|December|          28874|
|   FATAL|December|             94|
|    WARN|December|           8328|
|   ERROR|December|           4106|
|   FATAL|February|             72|
|   ERROR|February|           4013|
|    WARN|February|           8266|
|    INFO|February|          28983|
|   DEBUG|February|          41734|
+--------+--------+---------------+
only showing top 20 rows



In [74]:
# 'M' yields an unpadded month number as a string, so ORDER BY sorts it
# lexicographically: 1, 10, 11, 12, ... rather than 1, 2, 3, ...
spark.sql("select loglevel,date_format(logtime,'MMMM')as month,date_format(logtime,'M') as monthnum, count(*) as total_occurance from serverlogs group by loglevel,month,monthnum order by monthnum").show()

+--------+--------+--------+---------------+
|loglevel|   month|monthnum|total_occurance|
+--------+--------+--------+---------------+
|   ERROR| January|       1|           4054|
|   DEBUG| January|       1|          41961|
|    INFO| January|       1|          29119|
|   FATAL| January|       1|             94|
|    WARN| January|       1|           8217|
|    INFO| October|      10|          29018|
|    WARN| October|      10|           8226|
|   FATAL| October|      10|             92|
|   ERROR| October|      10|           4040|
|   DEBUG| October|      10|          41936|
|   DEBUG|November|      11|          33366|
|   FATAL|November|      11|          16797|
|    WARN|November|      11|           6616|
|   ERROR|November|      11|           3389|
|    INFO|November|      11|          23301|
|    INFO|December|      12|          28874|
|   ERROR|December|      12|           4106|
|   FATAL|December|      12|             94|
|   DEBUG|December|      12|          41749|
|    WARN|December|      12|           8328|
+--------+--------+--------+---------------+
only showing top 20 rows

In [75]:
# 'MM' zero-pads the month number (01..12), so the string sort now matches calendar order.
spark.sql("select loglevel,date_format(logtime,'MMMM')as month,date_format(logtime,'MM') as monthnum, count(*) as total_occurance from serverlogs group by loglevel,month,monthnum order by monthnum").show()

+--------+--------+--------+---------------+
|loglevel|   month|monthnum|total_occurance|
+--------+--------+--------+---------------+
|    INFO| January|      01|          29119|
|   DEBUG| January|      01|          41961|
|   FATAL| January|      01|             94|
|   ERROR| January|      01|           4054|
|    WARN| January|      01|           8217|
|    INFO|February|      02|          28983|
|    WARN|February|      02|           8266|
|   DEBUG|February|      02|          41734|
|   ERROR|February|      02|           4013|
|   FATAL|February|      02|             72|
|   DEBUG|   March|      03|          41652|
|   FATAL|   March|      03|             70|
|    WARN|   March|      03|           8165|
|   ERROR|   March|      03|           4122|
|    INFO|   March|      03|          29095|
|    WARN|   April|      04|           8277|
|   ERROR|   April|      04|           4107|
|   FATAL|   April|      04|             83|
|   DEBUG|   April|      04|          41869|
|    INFO|   April|      04|          29302|
+--------+--------+--------+---------------+
only showing top 20 rows

In [76]:
# month(logtime) returns the month as an integer directly, so monthnum sorts numerically
# without formatting to 'MM' and casting the string back to int.
# show(60) asks for up to 60 rows: 5 log levels x 12 months.
spark.sql("select loglevel,date_format(logtime,'MMMM') as month,month(logtime) as monthnum, count(*) as total_occurance from serverlogs group by loglevel,month,monthnum order by monthnum").show(60)

+--------+---------+--------+---------------+
|loglevel|    month|monthnum|total_occurance|
+--------+---------+--------+---------------+
|   ERROR|  January|       1|           4054|
|   FATAL|  January|       1|             94|
|    WARN|  January|       1|           8217|
|   DEBUG|  January|       1|          41961|
|    INFO|  January|       1|          29119|
|   ERROR| February|       2|           4013|
|    WARN| February|       2|           8266|
|    INFO| February|       2|          28983|
|   FATAL| February|       2|             72|
|   DEBUG| February|       2|          41734|
|    INFO|    March|       3|          29095|
|   FATAL|    March|       3|             70|
|   ERROR|    March|       3|           4122|
|   DEBUG|    March|       3|          41652|
|    WARN|    March|       3|           8165|
|   DEBUG|    April|       4|          41869|
|    INFO|    April|       4|          29302|
|   FATAL|    April|       4|             83|
|    WARN|    April|       4|     

In [77]:
# monthnum is no longer a grouping key: first() picks one value per (loglevel, month)
# group, and that value is used only to sort the result.
spark.sql("""select loglevel,date_format(logtime,'MMMM')as month,
            first(date_format(logtime,'MM')) as monthnum, 
            count(*) as total_occurance from serverlogs 
            group by loglevel,month order by monthnum""").show(60)

+--------+---------+--------+---------------+
|loglevel|    month|monthnum|total_occurance|
+--------+---------+--------+---------------+
|   DEBUG|  January|      01|          41961|
|   FATAL|  January|      01|             94|
|   ERROR|  January|      01|           4054|
|    INFO|  January|      01|          29119|
|    WARN|  January|      01|           8217|
|    INFO| February|      02|          28983|
|   FATAL| February|      02|             72|
|    WARN| February|      02|           8266|
|   DEBUG| February|      02|          41734|
|   ERROR| February|      02|           4013|
|   FATAL|    March|      03|             70|
|   ERROR|    March|      03|           4122|
|   DEBUG|    March|      03|          41652|
|    WARN|    March|      03|           8165|
|    INFO|    March|      03|          29095|
|   ERROR|    April|      04|           4107|
|    WARN|    April|      04|           8277|
|    INFO|    April|      04|          29302|
|   FATAL|    April|      04|     

In [78]:
# Keep the month-ordered aggregate as a DataFrame (no .show()) so later cells can reuse it.
monthly_counts_sql = """select loglevel,date_format(logtime,'MMMM')as month,
            first(date_format(logtime,'MM')) as monthnum, 
            count(*) as total_occurance from serverlogs 
            group by loglevel,month order by monthnum"""
result_df = spark.sql(monthly_counts_sql)

In [79]:
# Preview the month-ordered counts.
result_df.show()

+--------+--------+--------+---------------+
|loglevel|   month|monthnum|total_occurance|
+--------+--------+--------+---------------+
|   FATAL| January|      01|             94|
|    INFO| January|      01|          29119|
|   ERROR| January|      01|           4054|
|    WARN| January|      01|           8217|
|   DEBUG| January|      01|          41961|
|   ERROR|February|      02|           4013|
|   DEBUG|February|      02|          41734|
|   FATAL|February|      02|             72|
|    INFO|February|      02|          28983|
|    WARN|February|      02|           8266|
|   ERROR|   March|      03|           4122|
|    WARN|   March|      03|           8165|
|    INFO|   March|      03|          29095|
|   DEBUG|   March|      03|          41652|
|   FATAL|   March|      03|             70|
|   ERROR|   April|      04|           4107|
|    WARN|   April|      04|           8277|
|    INFO|   April|      04|          29302|
|   DEBUG|   April|      04|          41869|
|   FATAL|   April|      04|             83|
+--------+--------+--------+---------------+
only showing top 20 rows

In [81]:
# monthnum was only the sort key of the query; remove it from the presented result.
helper_columns = ["monthnum"]
final_result = result_df.drop(*helper_columns)

In [82]:
# Display the result without the helper column; in the recorded output the rows still appear in calendar order.
final_result.show()

+--------+--------+---------------+
|loglevel|   month|total_occurance|
+--------+--------+---------------+
|   DEBUG| January|          41961|
|    WARN| January|           8217|
|   ERROR| January|           4054|
|    INFO| January|          29119|
|   FATAL| January|             94|
|   FATAL|February|             72|
|    INFO|February|          28983|
|   DEBUG|February|          41734|
|   ERROR|February|           4013|
|    WARN|February|           8266|
|   DEBUG|   March|          41652|
|   ERROR|   March|           4122|
|   FATAL|   March|             70|
|    INFO|   March|          29095|
|    WARN|   March|           8165|
|   ERROR|   April|           4107|
|    WARN|   April|           8277|
|   FATAL|   April|             83|
|    INFO|   April|          29302|
|   DEBUG|   April|          41869|
+--------+--------+---------------+
only showing top 20 rows



In [85]:
# Un-aggregated (loglevel, month name) rows — the input shape used by the pivot cells that follow.
spark.sql("select loglevel,date_format(logtime,'MMMM')as month from serverlogs").show()

+--------+--------+
|loglevel|   month|
+--------+--------+
|    INFO|  August|
|    WARN| January|
|    INFO|    June|
|    INFO| January|
|   DEBUG|    July|
|    INFO|February|
|    INFO|    July|
|    INFO|   April|
|   DEBUG|November|
|    INFO|  August|
|   DEBUG|   April|
|    WARN|December|
|   DEBUG| January|
|   DEBUG|    June|
|   ERROR|    June|
|   DEBUG|    June|
|    INFO|December|
|   DEBUG|November|
|    INFO|    July|
|   DEBUG|December|
+--------+--------+
only showing top 20 rows



In [88]:
# Pivot month names into columns with one row per loglevel and a count in each cell.
# With no explicit value list, the month columns come out in alphabetical order.
spark.sql("select loglevel,date_format(logtime,'MMMM')as month from serverlogs").groupBy('loglevel').pivot('month').count().show()

+--------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+
|loglevel|April|August|December|February|January| July| June|March|  May|November|October|September|
+--------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+
|    INFO|29302| 28993|   28874|   28983|  29119|29300|29143|29095|28900|   23301|  29018|    29038|
|   ERROR| 4107|  3987|    4106|    4013|   4054| 3976| 4059| 4122| 4086|    3389|   4040|     4161|
|    WARN| 8277|  8381|    8328|    8266|   8217| 8222| 8191| 8165| 8403|    6616|   8226|     8352|
|   FATAL|   83|    80|      94|      72|     94|   98|   78|   70|   60|   16797|     92|       81|
|   DEBUG|41869| 42147|   41749|   41734|  41961|42085|41774|41652|41785|   33366|  41936|    41433|
+--------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+



In [89]:
# Pivoting on the zero-padded month number ('MM') instead yields columns 01..12 in calendar order.
spark.sql("select loglevel,date_format(logtime,'MM')as month from serverlogs").groupBy('loglevel').pivot('month').count().show()

+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|loglevel|   01|   02|   03|   04|   05|   06|   07|   08|   09|   10|   11|   12|
+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|    INFO|29119|28983|29095|29302|28900|29143|29300|28993|29038|29018|23301|28874|
|   ERROR| 4054| 4013| 4122| 4107| 4086| 4059| 3976| 3987| 4161| 4040| 3389| 4106|
|    WARN| 8217| 8266| 8165| 8277| 8403| 8191| 8222| 8381| 8352| 8226| 6616| 8328|
|   DEBUG|41961|41734|41652|41869|41785|41774|42085|42147|41433|41936|33366|41749|
|   FATAL|   94|   72|   70|   83|   60|   78|   98|   80|   81|   92|16797|   94|
+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+



In [94]:
# Month names in calendar order; passed to pivot() to fix the column order.
month_list = [
    'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December',
]

In [95]:
# Supplying month_list as the pivot values puts the columns in calendar order (January..December).
month_by_level = spark.sql("select loglevel,date_format(logtime,'MMMM')as month from serverlogs")
month_by_level.groupBy('loglevel').pivot('month', month_list).count().show()

+--------+-------+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+
|loglevel|January|February|March|April|  May| June| July|August|September|October|November|December|
+--------+-------+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+
|    INFO|  29119|   28983|29095|29302|28900|29143|29300| 28993|    29038|  29018|   23301|   28874|
|   ERROR|   4054|    4013| 4122| 4107| 4086| 4059| 3976|  3987|     4161|   4040|    3389|    4106|
|    WARN|   8217|    8266| 8165| 8277| 8403| 8191| 8222|  8381|     8352|   8226|    6616|    8328|
|   FATAL|     94|      72|   70|   83|   60|   78|   98|    80|       81|     92|   16797|      94|
|   DEBUG|  41961|   41734|41652|41869|41785|41774|42085| 42147|    41433|  41936|   33366|   41749|
+--------+-------+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+

