In [1]:
from pyspark.sql import SparkSession

import getpass
# The OS-level user name is reused for the warehouse path and the app name.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled session that runs on YARN.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f"{username} | Python - Basic Transformations")
    .master("yarn")
    .getOrCreate()
)

In [2]:
# Sample rows: an integer id plus two numbers packed into one ':'-separated string.
data = [
    (67532, "9876543210:1234567890"),
    (67890, "876543210:123456780"),
    (67980, "765432190:1234567980"),
]

In [3]:
# Turn the sample tuples into a two-column DataFrame named A and B.
df = spark.createDataFrame(data, schema=["A", "B"])

In [4]:
# Print the rows with full column values (truncation disabled).
df.show(truncate=False)

+-----+---------------------+
|A    |B                    |
+-----+---------------------+
|67532|9876543210:1234567890|
|67890|876543210:123456780  |
|67980|765432190:1234567980 |
+-----+---------------------+



In [5]:
from pyspark.sql.functions import *

In [6]:
# Split B on ':' and emit one output row per piece; both frames share this expression.
exploded_b = explode(split(col("B"), ":")).alias("B")

# df2 keeps the id column next to each exploded value.
df2 = df.select("A", exploded_b)

# df3 keeps only the exploded values.
df3 = df.select(exploded_b)

In [7]:
# One row per ':'-separated value of B, with A repeated for each piece.
df2.show()

+-----+----------+
|    A|         B|
+-----+----------+
|67532|9876543210|
|67532|1234567890|
|67890| 876543210|
|67890| 123456780|
|67980| 765432190|
|67980|1234567980|
+-----+----------+



In [8]:
# The same exploded values, without column A.
df3.show()

+----------+
|         B|
+----------+
|9876543210|
|1234567890|
| 876543210|
| 123456780|
| 765432190|
|1234567980|
+----------+



In [9]:
%%sh

# Print the raw input file: a header line, pipe-delimited data lines and a trailing footer line.
cat new22.csv

H|ID|NAME|SALARY|LOCATION
101|ABC|10|HYD
102|BCD|20|CHE
103|KLO|30|MUM
104|UIN|40|BLR
105|GPA|50|PUN
T|5


In [10]:
# Read the pipe-delimited file, taking column names from its first line.
df = (
    spark.read
    .option("header", "true")
    .option("delimiter", "|")
    .csv("new22.csv")
)

In [11]:
# Inspect the parsed file. The header line has five fields but the data lines only four,
# so each value sits one column to the left of its intended name and LOCATION is null.
# The footer line 'T|5' is also read as an ordinary row.
df.show()

+---+---+----+------+--------+
|  H| ID|NAME|SALARY|LOCATION|
+---+---+----+------+--------+
|101|ABC|  10|   HYD|    null|
|102|BCD|  20|   CHE|    null|
|103|KLO|  30|   MUM|    null|
|104|UIN|  40|   BLR|    null|
|105|GPA|  50|   PUN|    null|
|  T|  5|null|  null|    null|
+---+---+----+------+--------+



In [12]:
# List of the current column names, starting with the record-type marker 'H'.
cols = df.columns
# Drop the leading 'H' so the remaining names line up with the data values.
# As the cell's last expression, the popped value is what gets displayed.
cols.pop(0)

'H'

In [13]:
# Names that remain after removing the leading 'H'.
cols

['ID', 'NAME', 'SALARY', 'LOCATION']

In [14]:
# Every current column name except the last one.
df.columns[:-1]

['H', 'ID', 'NAME', 'SALARY']

In [15]:
# The DataFrame's own column list still contains 'H'; popping from `cols` did not change it.
df.columns

['H', 'ID', 'NAME', 'SALARY', 'LOCATION']

In [16]:
# Pair each existing column (all but the last) with the name one position to its
# right, producing aliased Column expressions that undo the header/data shift.
new_list = [
    col(old_col).alias(cols[i])
    for i, old_col in enumerate(df.columns[:-1])
]

In [17]:
# Show each position next to the column name found there (all but the last column).
for i, old_col in enumerate(df.columns[:-1]):
    print(f"{i}{old_col}")

0H
1ID
2NAME
3SALARY


In [18]:
# Inspect the aliased Column expressions: each source column takes the name of its right-hand neighbour.
new_list

[Column<b'H AS `ID`'>,
 Column<b'ID AS `NAME`'>,
 Column<b'NAME AS `SALARY`'>,
 Column<b'SALARY AS `LOCATION`'>]

In [19]:
# Apply the shifted aliases; this rebinds `df` to the four correctly named columns.
# NOTE(review): `new_list` still refers to column 'H', which the new `df` no longer has,
# so this cell is presumably not safe to re-run without re-reading the file first.
df = df.select(new_list)

df.show()

+---+----+------+--------+
| ID|NAME|SALARY|LOCATION|
+---+----+------+--------+
|101| ABC|    10|     HYD|
|102| BCD|    20|     CHE|
|103| KLO|    30|     MUM|
|104| UIN|    40|     BLR|
|105| GPA|    50|     PUN|
|  T|   5|  null|    null|
+---+----+------+--------+



In [20]:
# Locate the footer row (ID == 'T'). head() yields None when no such row exists,
# so fail with a clear message instead of an AttributeError on `.NAME`.
footer_row = df.filter(col('ID') == 'T').head()
if footer_row is None:
    raise ValueError("footer row with ID 'T' not found")

# In the footer line 'T|5' the record count is the second field, which sits in NAME.
total_records_footer = footer_row.NAME
print('total records as per footer', total_records_footer)

total records as per footer 5


In [21]:
# Keep only rows whose ID is not the footer marker 'T'.
df = df.filter('ID != "T"')

In [22]:
# Data rows only; the footer row has been filtered out.
df.show()

+---+----+------+--------+
| ID|NAME|SALARY|LOCATION|
+---+----+------+--------+
|101| ABC|    10|     HYD|
|102| BCD|    20|     CHE|
|103| KLO|    30|     MUM|
|104| UIN|    40|     BLR|
|105| GPA|    50|     PUN|
+---+----+------+--------+



In [23]:
# Compare the number of data rows with the count declared in the file footer.
total_count = df.count()

# The footer value is converted with int() before comparing it to the row count.
if int(total_records_footer) == total_count:
    print("count is matching")
else:
    # Report a mismatch explicitly instead of passing over it silently.
    print(
        f"count mismatch: footer says {total_records_footer}, "
        f"found {total_count} rows"
    )

count is matching


In [24]:
# Number of data rows counted after the footer row was removed.
total_count

5

### using RDD

In [25]:
# Load the same file as an RDD of raw text lines (no CSV parsing).
rdd1 = spark.sparkContext.textFile("new22.csv")

In [26]:
# Fetch every line for inspection: header first, footer 'T|5' last.
rdd1.collect()

['H|ID|NAME|SALARY|LOCATION',
 '101|ABC|10|HYD',
 '102|BCD|20|CHE',
 '103|KLO|30|MUM',
 '104|UIN|40|BLR',
 '105|GPA|50|PUN',
 'T|5']

In [27]:
# Total number of lines, header and footer included; used below to find the last line's index.
count = rdd1.count()

In [28]:
# Tag each line with its position, drop the first (header) and last (footer) lines,
# then split the remaining lines on '|' and name the four resulting fields.
filt_df = (
    rdd1.zipWithIndex()
    .filter(lambda pair: pair[1] > 0 and pair[1] != count - 1)
    .map(lambda pair: pair[0].split("|"))
    .toDF(["ID", "NAME", "SALARY", "LOCATION"])
)

In [29]:
# Result of the RDD-based approach: data rows only, with the intended column names.
filt_df.show()

+---+----+------+--------+
| ID|NAME|SALARY|LOCATION|
+---+----+------+--------+
|101| ABC|    10|     HYD|
|102| BCD|    20|     CHE|
|103| KLO|    30|     MUM|
|104| UIN|    40|     BLR|
|105| GPA|    50|     PUN|
+---+----+------+--------+



In [30]:
# Result of the DataFrame-based approach, shown for comparison with filt_df.
df.show()

+---+----+------+--------+
| ID|NAME|SALARY|LOCATION|
+---+----+------+--------+
|101| ABC|    10|     HYD|
|102| BCD|    20|     CHE|
|103| KLO|    30|     MUM|
|104| UIN|    40|     BLR|
|105| GPA|    50|     PUN|
+---+----+------+--------+

