In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyngrok

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

import findspark
findspark.init()


Collecting pyngrok
  Downloading pyngrok-7.1.5-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.1.5


In [None]:
from pyspark.sql import SparkSession
# Create (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
# Echo the session object so the notebook displays it.
spark

In [None]:
# Column names, in the same order as the values inside each row tuple below.
columns_schema = ["id", "firstname", "lastname", "country", "state", "salary"]

# Three hand-written sample rows.
data = [
    (1, "abc", "aef", "ahi", "akl", 123),
    (2, "qbc", "qef", "qhi", "qkl", 124),
    (3, "wbc", "wef", "whi", "wkl", 125),
]

# Only column names are given, so Spark derives the column types from the values.
df = spark.createDataFrame(data, schema=columns_schema)

In [None]:
# Print the rows of df as a text table.
df.show()

+---+---------+--------+-------+-----+------+
| id|firstname|lastname|country|state|salary|
+---+---------+--------+-------+-----+------+
|  1|      abc|     aef|    ahi|  akl|   123|
|  2|      qbc|     qef|    qhi|  qkl|   124|
|  3|      wbc|     wef|    whi|  wkl|   125|
+---+---------+--------+-------+-----+------+



In [None]:
# Bare expression: the notebook echoes the DataFrame's repr (column names and types, not rows).
df

DataFrame[id: bigint, firstname: string, lastname: string, country: string, state: string, salary: bigint]

In [None]:
from pyspark.sql.types import *
# Explicit schema for names.csv: column types are declared up front instead of inferred.
schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ('id', IntegerType()),
        ('first_name', StringType()),
        ('last_name', StringType()),
    )
])

# header=True: the first line of the file holds column names, not data.
df1 = spark.read.csv("sample_data/names.csv", schema=schema, header=True)
df1.show(10)

+---+----------+---------+
| id|first_name|last_name|
+---+----------+---------+
|613|   Lucinda|    Tench|
|181|   Yevette|   Ebanks|
|963| Ekaterina|    Hamly|
|140|    Sherye| Paxforde|
|800|     Gayla|     Girk|
|985|    Dannie|   Berzin|
|931|       Cal|  Benitti|
|975|   Hilario|   Warbys|
|607|      Geri| Creeghan|
|777|      Vita|   Haldin|
+---+----------+---------+
only showing top 10 rows



In [None]:
# MOCK.csv is pipe-delimited with a header row; let Spark infer the column types.
df2 = (
    spark.read
    .options(delimiter='|', inferschema=True, header=True)
    .csv("sample_data/MOCK.csv")
)
df2.show()

+-----------+-----------+----+-----+
|CountryName|CountryCode|Year|Value|
+-----------+-----------+----+-----+
|     Russia|         RU|  92|   60|
|   Colombia|         CO|  83|   87|
|      China|         CN|  33|   13|
|     France|         FR|  47|   27|
|  Argentina|         AR|  36|   98|
|     Brazil|         BR|  54|   48|
|     Poland|         PL|  25|   98|
|  Indonesia|         ID|  97|  100|
|     Sweden|         SE|  15|   96|
|     Sweden|         SE|  51|   84|
|     Zambia|         ZM|  12|  100|
|      China|         CN|  69|   49|
|  Indonesia|         ID|  96|   38|
|     Norway|         NO|  85|   93|
|     Mexico|         MX|  78|   86|
|     Brazil|         BR|  21|   42|
|     Russia|         RU|  96|   83|
|     Brazil|         BR|  31|   80|
|     Brazil|         BR|  52|   53|
|      China|         CN|  94|   80|
+-----------+-----------+----+-----+
only showing top 20 rows



In [None]:
# Read the same pipe-delimited file with no options at all. Without a delimiter or
# header setting, every line (the header line included) lands whole in a single
# column named _c0 -- compare with df2, which was read with delimiter='|'.
df3 = spark.read.csv("sample_data/MOCK.csv")
df3.show()

+--------------------+
|                 _c0|
+--------------------+
|CountryName|Count...|
|     Russia|RU|92|60|
|   Colombia|CO|83|87|
|      China|CN|33|13|
|     France|FR|47|27|
|  Argentina|AR|36|98|
|     Brazil|BR|54|48|
|     Poland|PL|25|98|
| Indonesia|ID|97|100|
|     Sweden|SE|15|96|
|     Sweden|SE|51|84|
|    Zambia|ZM|12|100|
|      China|CN|69|49|
|  Indonesia|ID|96|38|
|     Norway|NO|85|93|
|     Mexico|MX|78|86|
|     Brazil|BR|21|42|
|     Russia|RU|96|83|
|     Brazil|BR|31|80|
|     Brazil|BR|52|53|
+--------------------+
only showing top 20 rows



In [None]:
# Print df2's columns as a tree: name, type and nullability of each.
df2.printSchema()

root
 |-- CountryName: string (nullable = true)
 |-- CountryCode: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Value: integer (nullable = true)



In [None]:
# Column names of df2 as a plain Python list.
df2.columns

['CountryName', 'CountryCode', 'Year', 'Value']

In [None]:
# Total number of rows in df2.
df2.count()

1000

In [None]:
# Show only the first 2 rows.
df2.show(2)

+-----------+-----------+----+-----+
|CountryName|CountryCode|Year|Value|
+-----------+-----------+----+-----+
|     Russia|         RU|  92|   60|
|   Colombia|         CO|  83|   87|
+-----------+-----------+----+-----+
only showing top 2 rows



In [None]:
# Give the two country columns human-readable names (with a space); df2 itself is unchanged.
df4 = (
    df2
    .withColumnRenamed('CountryName', 'Country Name')
    .withColumnRenamed('CountryCode', 'Country Code')
)
df4.show()

+------------+------------+----+-----+
|Country Name|Country Code|Year|Value|
+------------+------------+----+-----+
|      Russia|          RU|  92|   60|
|    Colombia|          CO|  83|   87|
|       China|          CN|  33|   13|
|      France|          FR|  47|   27|
|   Argentina|          AR|  36|   98|
|      Brazil|          BR|  54|   48|
|      Poland|          PL|  25|   98|
|   Indonesia|          ID|  97|  100|
|      Sweden|          SE|  15|   96|
|      Sweden|          SE|  51|   84|
|      Zambia|          ZM|  12|  100|
|       China|          CN|  69|   49|
|   Indonesia|          ID|  96|   38|
|      Norway|          NO|  85|   93|
|      Mexico|          MX|  78|   86|
|      Brazil|          BR|  21|   42|
|      Russia|          RU|  96|   83|
|      Brazil|          BR|  31|   80|
|      Brazil|          BR|  52|   53|
|       China|          CN|  94|   80|
+------------+------------+----+-----+
only showing top 20 rows



In [None]:
import pyspark.sql.functions as psf
# Keep only the rows whose 'Country Name' equals 'Russia' and display them.
df4.filter(psf.col('Country Name')=='Russia').show()

+------------+------------+----+-----+
|Country Name|Country Code|Year|Value|
+------------+------------+----+-----+
|      Russia|          RU|  92|   60|
|      Russia|          RU|  96|   83|
|      Russia|          RU|  31|    4|
|      Russia|          RU|   2|   72|
|      Russia|          RU|  25|    1|
|      Russia|          RU|  49|   22|
|      Russia|          RU|  48|   11|
|      Russia|          RU|  81|   13|
|      Russia|          RU|  95|   44|
|      Russia|          RU|  12|   77|
|      Russia|          RU|  50|   54|
|      Russia|          RU|  11|   54|
|      Russia|          RU|  72|   12|
|      Russia|          RU|  72|   49|
|      Russia|          RU|  95|  100|
|      Russia|          RU|  70|   69|
|      Russia|          RU|  14|   45|
|      Russia|          RU|   2|    9|
|      Russia|          RU|  87|   85|
|      Russia|          RU|  96|   58|
+------------+------------+----+-----+
only showing top 20 rows



In [None]:
# Sort by Value (ascending, as the output shows the smallest values first) and display the first rows.
df4.orderBy('Value').show()

+--------------------+------------+----+-----+
|        Country Name|Country Code|Year|Value|
+--------------------+------------+----+-----+
|           Indonesia|          ID|  79|    1|
|               China|          CN|  47|    1|
|           Indonesia|          ID|  74|    1|
|           Indonesia|          ID|  79|    1|
|           Indonesia|          ID|  17|    1|
|              Russia|          RU|  25|    1|
|Palestinian Terri...|          PS|  75|    1|
|               Yemen|          YE|  69|    1|
|               China|          CN|  31|    1|
|             Iceland|          IS|  31|    1|
|              Serbia|          RS|  65|    1|
|             Nigeria|          NG|  62|    1|
|               China|          CN|  37|    2|
|               China|          CN|  57|    2|
|           Indonesia|          ID|  42|    2|
|      United Kingdom|          GB|  67|    2|
|               China|          CN|  94|    2|
|               China|          CN|   8|    2|
|         Phi

In [None]:
# Project just the 'Country Code' column.
df4.select('Country Code').show()

+------------+
|Country Code|
+------------+
|          RU|
|          CO|
|          CN|
|          FR|
|          AR|
|          BR|
|          PL|
|          ID|
|          SE|
|          SE|
|          ZM|
|          CN|
|          ID|
|          NO|
|          MX|
|          BR|
|          RU|
|          BR|
|          BR|
|          CN|
+------------+
only showing top 20 rows



In [None]:
# Largest Value per country; the aggregate column is named max(Value) in the result.
df4.groupBy('Country Name').agg(psf.max('Value')).show()

+------------+----------+
|Country Name|max(Value)|
+------------+----------+
|        Chad|         4|
|      Russia|       100|
|    Paraguay|        84|
|       Yemen|        74|
|     Senegal|        43|
|      Sweden|        98|
| Philippines|       100|
|    Malaysia|        85|
|   Singapore|        73|
|      Malawi|        98|
|        Iraq|        55|
|     Germany|        86|
| Afghanistan|        97|
|    Cambodia|         4|
| Ivory Coast|        78|
|      Rwanda|        29|
|       Sudan|        32|
|       Palau|        96|
|      France|        97|
|      Greece|        94|
+------------+----------+
only showing top 20 rows



In [None]:
import csv
from pyspark.sql.functions import col, from_unixtime, unix_timestamp
# Write a small CSV fixture. The encoding is set explicitly because one address
# contains a non-ASCII character, so the bytes on disk do not depend on the
# platform's default locale encoding.
with open('dummy.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["id","Name","Address","Date"])
    writer.writerow([1,"ABC", "D/O New Delhi","2023-01-01 10:22"])
    # "\\" is an escaped backslash, so the stored text is C\O ...; a bare "\O" is an
    # invalid escape sequence that Python only tolerates with a warning.
    writer.writerow([2,"DEF","C\\O Tarun123 New Delhi","2023-01-01 12:22"])
    writer.writerow([3,"GHI", "C//O Tarun ₹ New Delhi","2023-01-02 18:23"])

df5 = spark.read.options( inferschema=True, header=True).csv("dummy.csv")

# Addresses that start with "D/O" or "C\O", followed by a space and then only
# letters, digits and spaces. Raw string: the regex receives \\ , which matches
# one literal backslash.
exp = r'^(D/O|C\\O) [A-Za-z0-9 ]+$'

df5.filter(col("Address").rlike(exp)).show(truncate=False)

# Re-render Date from "yyyy-MM-dd HH:mm" to "yyyy-MM-dd HH:mm:ss" by parsing it to
# epoch seconds and formatting it back to a string.
df6 = df5.withColumn("Date", from_unixtime(unix_timestamp(col("Date"), "yyyy-MM-dd HH:mm"), "yyyy-MM-dd HH:mm:ss"))
df6.show()

+---+----+----------------------+----------------+
|id |Name|Address               |Date            |
+---+----+----------------------+----------------+
|1  |ABC |D/O New Delhi         |2023-01-01 10:22|
|2  |DEF |C\O Tarun123 New Delhi|2023-01-01 12:22|
+---+----+----------------------+----------------+

+---+----+--------------------+-------------------+
| id|Name|             Address|               Date|
+---+----+--------------------+-------------------+
|  1| ABC|       D/O New Delhi|2023-01-01 10:22:00|
|  2| DEF|C\O Tarun123 New ...|2023-01-01 12:22:00|
|  3| GHI|C//O Tarun ₹ New ...|2023-01-02 18:23:00|
+---+----+--------------------+-------------------+



In [None]:
#AryamaanBorah
import csv
from pyspark.sql.functions import col, from_unixtime, unix_timestamp
# NOTE(review): this cell repeats the same steps as the preceding cell (same file,
# same variable names); consider keeping only one of them.
# Write a small CSV fixture. The encoding is set explicitly because one address
# contains a non-ASCII character, so the bytes on disk do not depend on the
# platform's default locale encoding.
with open('dummy.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["id","Name","Address","Date"])
    writer.writerow([1,"ABC", "D/O New Delhi","2023-01-01 10:22"])
    # "\\" is an escaped backslash, so the stored text is C\O ...; a bare "\O" is an
    # invalid escape sequence that Python only tolerates with a warning.
    writer.writerow([2,"DEF","C\\O Tarun123 New Delhi","2023-01-01 12:22"])
    writer.writerow([3,"GHI", "C//O Tarun ₹ New Delhi","2023-01-02 18:23"])

df5 = spark.read.options( inferschema=True, header=True).csv("dummy.csv")

# Addresses that start with "D/O" or "C\O", followed by a space and then only
# letters, digits and spaces. Raw string: the regex receives \\ , which matches
# one literal backslash.
exp = r'^(D/O|C\\O) [A-Za-z0-9 ]+$'

df5.filter(col("Address").rlike(exp)).show(truncate=False)

# Re-render Date from "yyyy-MM-dd HH:mm" to "yyyy-MM-dd HH:mm:ss" by parsing it to
# epoch seconds and formatting it back to a string.
df6 = df5.withColumn("Date", from_unixtime(unix_timestamp(col("Date"), "yyyy-MM-dd HH:mm"), "yyyy-MM-dd HH:mm:ss"))
df6.show()

+---+----+----------------------+----------------+
|id |Name|Address               |Date            |
+---+----+----------------------+----------------+
|1  |ABC |D/O New Delhi         |2023-01-01 10:22|
|2  |DEF |C\O Tarun123 New Delhi|2023-01-01 12:22|
+---+----+----------------------+----------------+

+---+----+--------------------+-------------------+
| id|Name|             Address|               Date|
+---+----+--------------------+-------------------+
|  1| ABC|       D/O New Delhi|2023-01-01 10:22:00|
|  2| DEF|C\O Tarun123 New ...|2023-01-01 12:22:00|
|  3| GHI|C//O Tarun ₹ New ...|2023-01-02 18:23:00|
+---+----+--------------------+-------------------+



In [None]:
import logging
# Set the root logger's threshold to INFO so the logging.info call in func() is emitted.
logging.getLogger().setLevel(logging.INFO)

def func():
  """Build a three-row demo DataFrame, print it, and log the outcome.

  Uses the notebook-level ``spark`` session. Any exception raised while creating
  or showing the DataFrame is logged (with its traceback) rather than propagated.
  The session is stopped afterwards in every case, so cells run after this one
  need a fresh session.
  """
  try:
    data = [
    (1,"abc","aef","ahi","akl",123),
    (2,"qbc","qef","qhi","qkl",124),
    (3,"wbc","wef","whi","wkl",125),
    ]
    columns_schema = ["id","firstname","lastname","country","state","salary"]
    df = spark.createDataFrame(data = data, schema = columns_schema)
    df.show()
    logging.info("Successfully Created")
  except Exception:
    # logging.exception logs at ERROR level and appends the traceback, so the
    # cause of the failure is not lost.
    logging.exception("an error has occured")
  finally:
    # The call parentheses matter: a bare `spark.stop` only looks up the method
    # and never shuts the session down.
    spark.stop()

# Run the demo function defined above.
func()

INFO:root:Successfully Created


+---+---------+--------+-------+-----+------+
| id|firstname|lastname|country|state|salary|
+---+---------+--------+-------+-----+------+
|  1|      abc|     aef|    ahi|  akl|   123|
|  2|      qbc|     qef|    qhi|  qkl|   124|
|  3|      wbc|     wef|    whi|  wkl|   125|
+---+---------+--------+-------+-----+------+

