# Q.1 Process JSON file

In [1]:
import findspark

In [5]:
import os
#Assuming JAVA_HOME is set in enviroment variables  
#os.environ["SPARK_HOME"] = r"C:\Users\rosha\Downloads\Spark\spark-3.1.1-bin-hadoop2.7"

In [6]:
findspark.init()

In [7]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local") \
    .appName("Spark application") \
    .getOrCreate()

In [8]:
spark

In [37]:
from pyspark.sql.types import *
from pyspark.sql.types import StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
'''
schema = StructType([
    StructField("name",StringType(),True),
    StructField("wins", ArrayType(
      StructType([
          StructField("Type", StringType()),
          StructField("Award", StringType()),
      ])
    ),True)
])
'''

In [49]:
#df = spark.read.schema(schema).option("multiline","true").json(r"data\sample.json")
df = spark.read.json(r"data\sample.json")

In [50]:
df.show(1)

+-------+--------------------+
|   name|                wins|
+-------+--------------------+
|Gilbert|[[straight, 7♣], ...|
+-------+--------------------+
only showing top 1 row



In [51]:
df.show()

+-------+--------------------+
|   name|                wins|
+-------+--------------------+
|Gilbert|[[straight, 7♣], ...|
|  Alexa|[[two pair, 4♠], ...|
|    May|                  []|
|Deloise|[[three of a kind...|
+-------+--------------------+



In [52]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- wins: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)



### Explode

In [53]:
from pyspark.sql.functions import explode

df.select(df.name, explode(df.wins)).show(truncate=False)

+-------+---------------------+
|name   |col                  |
+-------+---------------------+
|Gilbert|[straight, 7♣]       |
|Gilbert|[one pair, 10♥]      |
|Alexa  |[two pair, 4♠]       |
|Alexa  |[two pair, 9♠]       |
|Deloise|[three of a kind, 5♣]|
+-------+---------------------+



### Flatten

In [55]:
from pyspark.sql.functions import flatten

df.select(df.name, flatten(df.wins)).show(truncate=False)

+-------+-----------------------------+
|name   |flatten(wins)                |
+-------+-----------------------------+
|Gilbert|[straight, 7♣, one pair, 10♥]|
|Alexa  |[two pair, 4♠, two pair, 9♠] |
|May    |[]                           |
|Deloise|[three of a kind, 5♣]        |
+-------+-----------------------------+



# Q.2 Process Log File

In [170]:
log_df = spark.read.text(r"C:\Users\rosha\Downloads\Spark\data\gsk.log")

In [171]:
log_df.show(truncate=False)

+----------------------------------------------------------------------------+
|value                                                                       |
+----------------------------------------------------------------------------+
|03/22 08:51:01 INFO :..settcpimage: Associate with TCP/IP image name = TCPCS|
|03/22 08:51:02 INFO :..reg_process: registering process with the system     |
|03/22 08:51:02 INFO :..reg_process: attempt OS/390 registration             |
|03/22 08:51:02 INFO :..reg_process: return from registration rc=0           |
+----------------------------------------------------------------------------+



In [172]:
log_df.printSchema()

root
 |-- value: string (nullable = true)



In [189]:
from pyspark.sql.functions import split, regexp_extract
'''
split_df = log_df.select(regexp_extract('value', r'^(\d{2}/\d{2}\s+\d{2}:\d{2}:\d{2})', 1).alias('timestamp')
                        ,regexp_extract('value', r'^(.*\s+)(\w{4})(\s+):', 2).alias('level')
                        ,regexp_extract('value', r'^(.*\s+)(:..\w+:)', 2).alias('object')
                        ,regexp_extract('value', r'^.*\s+:..\w+:\s+(\w+.*$)', 1).alias('text')
                        )
'''
split_df = log_df.select(regexp_extract('value', r'^(.*)\s+(\w+)\s+(:..\w+:)\s+(\w+.*)', 1).alias('timestamp')
                        ,regexp_extract('value', r'^(.*)\s+(\w+)\s+(:..\w+:)\s+(\w+.*)', 2).alias('level')
                        ,regexp_extract('value', r'^(.*)\s+(\w+)\s+(:..\w+:)\s+(\w+.*)', 3).alias('object')
                        ,regexp_extract('value', r'^(.*)\s+(\w+)\s+(:..\w+:)\s+(\w+.*)', 4).alias('text')
                        )

#r'^(\d{4}-\d{2}-\d{2})(\s+)(\d{2}:\d{2}:\d{2})'
#(\s+) - One or many spaces
#(.*) - Any Character with Zero or more repetitions
#\d{2} - Any digit with m Repetitions
split_df.show(truncate=False)

+--------------+-----+---------------+----------------------------------------+
|timestamp     |level|object         |text                                    |
+--------------+-----+---------------+----------------------------------------+
|03/22 08:51:01|INFO |:..settcpimage:|Associate with TCP/IP image name = TCPCS|
|03/22 08:51:02|INFO |:..reg_process:|registering process with the system     |
|03/22 08:51:02|INFO |:..reg_process:|attempt OS/390 registration             |
|03/22 08:51:02|INFO |:..reg_process:|return from registration rc=0           |
+--------------+-----+---------------+----------------------------------------+



In [190]:
split_df.printSchema()

root
 |-- timestamp: string (nullable = true)
 |-- level: string (nullable = true)
 |-- object: string (nullable = true)
 |-- text: string (nullable = true)



In [158]:
from pyspark.sql.functions import to_timestamp
log_df = split_df.select(to_timestamp(split_df.timestamp, 'MM/yy HH:mm:ss').alias('Date'),'level','object','text')
log_df.show()

+-------------------+-----+---------------+--------------------+
|               Date|level|         object|                text|
+-------------------+-----+---------------+--------------------+
|2022-03-01 08:51:01| INFO|:..settcpimage:|Associate with TC...|
|2022-03-01 08:51:02| INFO|:..reg_process:|registering proce...|
|2022-03-01 08:51:02| INFO|:..reg_process:|attempt OS/390 re...|
|2022-03-01 08:51:02| INFO|:..reg_process:|return from regis...|
+-------------------+-----+---------------+--------------------+



In [159]:
log_df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- level: string (nullable = true)
 |-- object: string (nullable = true)
 |-- text: string (nullable = true)



In [None]:
'''
abc…	      Letters
123…	      Digits
\d	          Any Digit
\D	          Any Non-digit character
.	          Any Character
\.	          Period
[abc]	      Only a, b, or c
[^abc]	      Not a, b, nor c
[a-z]	      Characters a to z
[0-9]	      Numbers 0 to 9
\w	          Any Alphanumeric character
\W	          Any Non-alphanumeric character
{m}	          m Repetitions
{m,n}	      m to n Repetitions
*	          Zero or more repetitions
+	          One or more repetitions
?	          Optional character
\s	          Any Whitespace
\S	          Any Non-whitespace character
^…$	          Starts and ends
(…)	          Capture Group
(a(bc))	      Capture Sub-group
(.*)	      Capture all6-
(abc|def)	  Matches abc or def




^            Assert position at the beginning of the line
(\w+)        Capture one or more word characters (a-zA-Z0-9_) into group 1
[ \t]*       Match any number of spaces or tab characters ([ \t] can be replaced with \h in some regex flavours such as PCRE)
.*           Match any character (except newline unless the s modifier is used)
\bby         Match a word boundary \b, followed by by literally
[ \t]+       Match one or more spaces or tab characters
(\w+)        Capture one or more word characters (a-zA-Z0-9_) into group 3
[ \t]*       Match any number of spaces or tab characters
.*           Match any character any number of times
$            Assert position at the end of the line

(.*\bby[ \t]+(\w+)[ \t]*.*)        Capture the following into group 2

'''

#https://regexone.com/

In [167]:
data = [('2345', 'Checked by John'),
('2398', 'Verified by Stacy'),
('2328', 'Verified by Srinivas than some random text'),        
('3983', 'Double Checked on 2/23/17 by Marsha')]

sc = spark.sparkContext

df = sc.parallelize(data).toDF(['ID', 'Notes'])

df.show()

+----+--------------------+
|  ID|               Notes|
+----+--------------------+
|2345|     Checked by John|
|2398|   Verified by Stacy|
|2328|Verified by Srini...|
|3983|Double Checked on...|
+----+--------------------+



In [168]:
from pyspark.sql.functions import regexp_extract, col

result = df.withColumn('Employee', regexp_extract(col('Notes'), '(.)(by)(\s+)(\w+)', 4))

result.show()

+----+--------------------+--------+
|  ID|               Notes|Employee|
+----+--------------------+--------+
|2345|     Checked by John|    John|
|2398|   Verified by Stacy|   Stacy|
|2328|Verified by Srini...|Srinivas|
|3983|Double Checked on...|  Marsha|
+----+--------------------+--------+

