### ***PySpark Configuration and Data Creation***

In [None]:
# Environment bootstrap. The /content paths below suggest this targets
# Google Colab -- NOTE(review): confirm before running elsewhere.

# Install a headless Java 8 JDK quietly, discarding the installer's stdout.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 3.1.1 / Hadoop 3.2 binary distribution from the Apache archive.
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
# Unpack the distribution into the current working directory.
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
# findspark is used below to make the unpacked Spark importable as `pyspark`.
!pip install -q findspark
import os
# Both variables must be set before findspark.init() is called.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Assumes the tarball above was extracted under /content -- TODO confirm the cwd.
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"
import findspark
# Must run before the pyspark import on the next line.
findspark.init()
from pyspark.sql import SparkSession
# Get (or reuse) a session whose master is "local[*]", i.e. Spark running locally.
spark = SparkSession.builder.master("local[*]").getOrCreate()
# Bare last expression: the notebook displays the session object.
spark

In [None]:
from pyspark.sql.types import *
# Build a one-column DataFrame of sample addresses.
#
# Every row handed to createDataFrame together with a StructType schema must be
# a tuple/list/Row with one entry per field. Passing bare strings raises
# "TypeError: StructType can not accept object 'D/O New Delhi' in type <class 'str'>",
# so each address is wrapped in a 1-tuple.
# The backslash in "C\O" is written as "\\" because "\O" is an invalid escape
# sequence in a normal Python string literal; the runtime value is unchanged.
data = [
    ("D/O New Delhi",),
    ("C\\O Tarun123 New Delhi",),
    ("C//O Tarun ₹ New Delhi",),
]
schema = StructType(
    [
      StructField('Address',StringType())
    ]
)

df = spark.createDataFrame(data = data, schema = schema)
# Two separate statements: `df.show(), df.printSchema()` would build a
# (None, None) tuple that the notebook then echoes as the cell result.
# truncate=False keeps the addresses longer than 20 characters fully visible.
df.show(truncate=False)
df.printSchema()

TypeError: StructType can not accept object 'D/O New Delhi' in type <class 'str'>

In [None]:
import csv

# Write the sample data set used by the questions below to dummy.csv in the
# current working directory.
#
# encoding='utf-8' is explicit because one address contains the rupee sign;
# relying on the platform default encoding can fail, or produce a file that is
# decoded incorrectly, on machines whose locale is not UTF-8.
# newline='' lets the csv module control line endings itself.
# The backslash in "C\O" is written as "\\" because "\O" is an invalid escape
# sequence in a normal Python string literal; the value written is unchanged.
with open('dummy.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["id","Name","Address","Date"])
    writer.writerows([
        [1,"Vinayak", "D/O New Delhi","2023-01-01 10:22"],
        [2,"Kanishk","C\\O Tarun123 New Delhi","2023-01-01 12:22"],
        [3,"Neel", "C//O Tarun ₹ New Delhi","2023-01-02 18:23"],
    ])


### ***Question 1 - REGEX filter***

In [None]:
# Load the CSV written above: comma-delimited, first line is the header, and
# column types are inferred from the data instead of defaulting to strings.
csv_reader = (
    spark.read
    .option("delimiter", ',')
    .option("inferschema", True)
    .option("header", True)
)
df = csv_reader.csv("/content/dummy.csv")
df.show(truncate=False)

+---+-------+----------------------+----------------+
|id |Name   |Address               |Date            |
+---+-------+----------------------+----------------+
|1  |Vinayak|D/O New Delhi         |2023-01-01 10:22|
|2  |Kanishk|C\O Tarun123 New Delhi|2023-01-01 12:22|
|3  |Neel   |C//O Tarun ₹ New Delhi|2023-01-02 18:23|
+---+-------+----------------------+----------------+



In [None]:
from pyspark.sql.functions import col

# Keep only addresses that start with "D/O" or "C\O", followed by one space and
# then nothing but ASCII letters, digits and spaces up to the end of the string.
# The literal backslash needs four backslashes in the source: Python's string
# parser reduces them to two, and the regex engine reads those two as a single
# escaped backslash.
exp = '^(D/O|C\\\\O) [A-Za-z0-9 ]+$'
address_matches = col("Address").rlike(exp)
df.where(address_matches).show(truncate=False)

+---+-------+----------------------+----------------+
|id |Name   |Address               |Date            |
+---+-------+----------------------+----------------+
|1  |Vinayak|D/O New Delhi         |2023-01-01 10:22|
|2  |Kanishk|C\O Tarun123 New Delhi|2023-01-01 12:22|
+---+-------+----------------------+----------------+



### ***Question 2 - DateTime Formatting***

In [None]:
from pyspark.sql.functions import col, unix_timestamp, from_unixtime

# Step 1: parse the "yyyy-MM-dd HH:mm" text in Date into Unix epoch seconds.
epoch_seconds = unix_timestamp(col("Date"), "yyyy-MM-dd HH:mm")
# Step 2: render those seconds back as text, this time with a seconds field.
date_with_seconds = from_unixtime(epoch_seconds, "yyyy-MM-dd HH:mm:ss")
# Replace the Date column with the reformatted value; df itself is not modified.
df_formatted = df.withColumn("Date", date_with_seconds)

In [None]:
# Display every row without truncating long cell values, so the reformatted
# Date column can be checked in full.
df_formatted.show(truncate=False)

+---+-------+----------------------+-------------------+
|id |Name   |Address               |Date               |
+---+-------+----------------------+-------------------+
|1  |Vinayak|D/O New Delhi         |2023-01-01 10:22:00|
|2  |Kanishk|C\O Tarun123 New Delhi|2023-01-01 12:22:00|
|3  |Neel   |C//O Tarun ₹ New Delhi|2023-01-02 18:23:00|
+---+-------+----------------------+-------------------+



In [None]:
# Print the column names and types of the reformatted frame. The output below
# reports Date as a string column, not a timestamp.
df_formatted.printSchema()

root
 |-- id: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Date: string (nullable = true)

