In [1]:
from pyspark.sql import SparkSession

import getpass
username = getpass.getuser()

spark = SparkSession. \
    builder. \
    config('spark.ui.port', '0'). \
    config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
    enableHiveSupport(). \
    appName(f'{username} | Python - Spark Metastore'). \
    master('yarn'). \
    getOrCreate()

In [2]:
data = [(100, 'Mariele',402.19),
(101, 'Natka',692.77),
(102, 'Joleen',658.17),
(103, 'Alexine',182.05)]


In [3]:
raw_df = spark.createDataFrame(data, ["id", "name", "salary"])

In [4]:
raw_df.show()

+---+-------+------+
| id|   name|salary|
+---+-------+------+
|100|Mariele|402.19|
|101|  Natka|692.77|
|102| Joleen|658.17|
|103|Alexine|182.05|
+---+-------+------+



In [5]:
raw_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: double (nullable = true)



In [6]:
from pyspark.sql.functions import *

___update each id of the employee to id+1, name to uppercase, and salary to 10% increase___

In [7]:
upd_df = raw_df. \
    withColumn("id", col("id")+1). \
    withColumn("name", upper(col("name"))). \
    withColumn("salary", round(col("salary")*110/100).cast("integer"))

In [8]:
upd_df.show()

+---+-------+------+
| id|   name|salary|
+---+-------+------+
|101|MARIELE|   442|
|102|  NATKA|   762|
|103| JOLEEN|   724|
|104|ALEXINE|   200|
+---+-------+------+



In [9]:
upd_df2 = raw_df. \
    withColumn("id", col("id") + lit(1)). \
    withColumn("name", upper(col("name"))). \
    withColumn("salary", round(col("salary") + (lit(0.1)*col("salary"))).cast("integer"))


In [10]:
upd_df2.show()

+---+-------+------+
| id|   name|salary|
+---+-------+------+
|101|MARIELE|   442|
|102|  NATKA|   762|
|103| JOLEEN|   724|
|104|ALEXINE|   200|
+---+-------+------+



In [11]:
employeeRecords = [
  (1, "John",100000),
  (2, "Paul",200000),
  (3, "Peter",300000)
]

In [12]:
emp_df = spark.createDataFrame(employeeRecords, "id INT, name STRING, salary INT")

In [13]:
emp_df.show()

+---+-----+------+
| id| name|salary|
+---+-----+------+
|  1| John|100000|
|  2| Paul|200000|
|  3|Peter|300000|
+---+-----+------+



Given the above dataframe print "name" and "salary" 

In [14]:
emp_df.select("name","salary").show()

+-----+------+
| name|salary|
+-----+------+
| John|100000|
| Paul|200000|
|Peter|300000|
+-----+------+



what are the other ways you can select 

In [15]:
emp_df.select(col("name"), emp_df["salary"]).show()

+-----+------+
| name|salary|
+-----+------+
| John|100000|
| Paul|200000|
|Peter|300000|
+-----+------+



Write a code to return dataframe having [1,2,3,4,5,6]

df1 [1,2,3,4,5,6,7,8,9,10]

df2[1,3,5,11]

df3[2,4,6,12]

In [16]:
from pyspark.sql.types import *

In [17]:
df1 = spark.createDataFrame([1,2,3,4,5,6,7,8,9,10], IntegerType())
df2 = spark.createDataFrame([1,3,5,11], IntegerType())
df3 = spark.createDataFrame([2,4,6,12], IntegerType())

In [18]:
df4 = df2.union(df3)

In [19]:
res = df4.intersect(df1).orderBy("value").show()

+-----+
|value|
+-----+
|    1|
|    2|
|    3|
|    4|
|    5|
|    6|
+-----+



### Count null values of individual columns using Spark.

```Input data
----------------------
Col1,  col2,  col3,  col4
1  ,   null,   che,   1000
2  ,   mani, hyd,   null
3  ,  Smith, null,  1200
4  ,    ram,  Del,    3200
5  ,  Krish,  null,   1400

Output is
---------------------
Col1, col2, col3, col4
0    ,   1  ,    2    ,   1


In [20]:
data = [(1 , None, "che", 1000),
(2 , "mani", "hyd", None),
(3 , "Smith", None, 1200),
(4 , "ram", "Del", 3200),
(5 , "Krish", None, 1400)]

In [21]:
raw_df = spark.createDataFrame(data, ["col1", "col2", "col3", "col4"])

In [22]:
raw_df.show()

+----+-----+----+----+
|col1| col2|col3|col4|
+----+-----+----+----+
|   1| null| che|1000|
|   2| mani| hyd|null|
|   3|Smith|null|1200|
|   4|  ram| Del|3200|
|   5|Krish|null|1400|
+----+-----+----+----+



In [23]:
res = raw_df. \
    select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in raw_df.columns])

In [24]:
res.show()

+----+----+----+----+
|col1|col2|col3|col4|
+----+----+----+----+
|   0|   1|   2|   1|
+----+----+----+----+



In [25]:
res2 = raw_df. \
    select([count(when(col(c).isNull(), c)).alias(c) for c in raw_df.columns]). \
    show()

+----+----+----+----+
|col1|col2|col3|col4|
+----+----+----+----+
|   0|   1|   2|   1|
+----+----+----+----+



### How to check if two dataframes are equal or not?

In [26]:
data = [(123, 'ABC'),
(123, 'DEF'),
(123, 'XYZ'),
(456, 'PQR'),
(456, 'MNO'),
(789, 'UVW')]

In [27]:
raw_df = spark.createDataFrame(data, "packid INT, MoleculeName STRING")

In [28]:
raw_df.show()

+------+------------+
|packid|MoleculeName|
+------+------------+
|   123|         ABC|
|   123|         DEF|
|   123|         XYZ|
|   456|         PQR|
|   456|         MNO|
|   789|         UVW|
+------+------------+



In [29]:
res = raw_df. \
    groupBy("packid"). \
    agg(concat_ws("|", collect_list("MoleculeName")).alias("PackMoleculeString")). \
    sort("packid")

In [30]:
res.show()

+------+------------------+
|packid|PackMoleculeString|
+------+------------------+
|   123|       ABC|DEF|XYZ|
|   456|           PQR|MNO|
|   789|               UVW|
+------+------------------+



https://kontext.tech/column/spark/455/tutorial-add-constant-column-to-pyspark-dataframe
    

In [31]:
data = [('01/01/2021', 5000),
('02/01/2021', 3000),
('03/01/2021', -4000),
('04/01/2021', 6000)]

In [32]:
df = spark.createDataFrame(data, "date STRING, amount INT")

In [33]:
df.show()

+----------+------+
|      date|amount|
+----------+------+
|01/01/2021|  5000|
|02/01/2021|  3000|
|03/01/2021| -4000|
|04/01/2021|  6000|
+----------+------+



In [34]:
from pyspark.sql.window import Window

In [35]:
win_spec = Window.orderBy("date").rowsBetween(Window.unboundedPreceding, Window.currentRow)
res = df. \
    withColumn("amount_new", sum("amount").over(win_spec))

In [36]:
res.show()

+----------+------+----------+
|      date|amount|amount_new|
+----------+------+----------+
|01/01/2021|  5000|      5000|
|02/01/2021|  3000|      8000|
|03/01/2021| -4000|      4000|
|04/01/2021|  6000|     10000|
+----------+------+----------+



### Remove first 3 lines from csv file

In [37]:
%%sh

cat new21.csv

country,city,value
India,Pune,100
India,Mumbai,200
India,Pune,400
India,Mumbai,200
India,Pune,600
Europe,Germany,300
Europe,NL,100

In [38]:
df = spark. \
    read. \
    csv("new21.csv")

In [39]:
df.show()

+-------+-------+-----+
|    _c0|    _c1|  _c2|
+-------+-------+-----+
|country|   city|value|
|  India|   Pune|  100|
|  India| Mumbai|  200|
|  India|   Pune|  400|
|  India| Mumbai|  200|
|  India|   Pune|  600|
| Europe|Germany|  300|
| Europe|     NL|  100|
+-------+-------+-----+



In [40]:
df2 = df.withColumn("Index", monotonically_increasing_id()). \
    filter('Index > 2'). \
    drop("Index")

In [41]:
df2.show()

+------+-------+---+
|   _c0|    _c1|_c2|
+------+-------+---+
| India|   Pune|400|
| India| Mumbai|200|
| India|   Pune|600|
|Europe|Germany|300|
|Europe|     NL|100|
+------+-------+---+



#### However the above approach can fail. It's only going to work if the first 3 rows are in the first partition. Indeed, the contract in the API is just "The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive". It is therefore not very safe to assume that they will always start from zero. 

In [42]:
df2 = df.limit(3)

In [43]:
df2

_c0,_c1,_c2
country,city,value
India,Pune,100
India,Mumbai,200


In [44]:
df3 = df.subtract(df2)

In [45]:
df3.show()

+------+-------+---+
|   _c0|    _c1|_c2|
+------+-------+---+
|Europe|Germany|300|
| India|   Pune|600|
|Europe|     NL|100|
| India|   Pune|400|
+------+-------+---+



#### Using rdd to skip lines

In [46]:
rdd1 = spark.sparkContext.textFile("new21.csv")

In [47]:
rdd1.collect()

['country,city,value',
 'India,Pune,100',
 'India,Mumbai,200',
 'India,Pune,400',
 'India,Mumbai,200',
 'India,Pune,600',
 'Europe,Germany,300',
 'Europe,NL,100']

In [48]:
filt_df = rdd1.zipWithIndex(). \
    filter(lambda x: x[1] > 2). \
    map(lambda x: x[0]). \
    map(lambda x: x.split(",")). \
    toDF()

In [49]:
filt_df.show()

+------+-------+---+
|    _1|     _2| _3|
+------+-------+---+
| India|   Pune|400|
| India| Mumbai|200|
| India|   Pune|600|
|Europe|Germany|300|
|Europe|     NL|100|
+------+-------+---+



### load dataframe, skip first 2 lines except header

In [50]:
%%sh

cat new21.csv

country,city,value
India,Pune,100
India,Mumbai,200
India,Pune,400
India,Mumbai,200
India,Pune,600
Europe,Germany,300
Europe,NL,100

In [51]:
df_with_head = spark.read.option("header", "true").csv("new21.csv")

In [52]:
df_with_head.show()

+-------+-------+-----+
|country|   city|value|
+-------+-------+-----+
|  India|   Pune|  100|
|  India| Mumbai|  200|
|  India|   Pune|  400|
|  India| Mumbai|  200|
|  India|   Pune|  600|
| Europe|Germany|  300|
| Europe|     NL|  100|
+-------+-------+-----+



In [53]:
rdd1 = df_with_head.rdd

In [54]:
rdd1.collect()

[Row(country='India', city='Pune', value='100'),
 Row(country='India', city='Mumbai', value='200'),
 Row(country='India', city='Pune', value='400'),
 Row(country='India', city='Mumbai', value='200'),
 Row(country='India', city='Pune', value='600'),
 Row(country='Europe', city='Germany', value='300'),
 Row(country='Europe', city='NL', value='100')]

In [55]:
filt_df = rdd1.zipWithIndex(). \
    filter(lambda x: x[1] > 1). \
    map(lambda x: x[0]). \
    toDF()

In [56]:
filt_df.show()

+-------+-------+-----+
|country|   city|value|
+-------+-------+-----+
|  India|   Pune|  400|
|  India| Mumbai|  200|
|  India|   Pune|  600|
| Europe|Germany|  300|
| Europe|     NL|  100|
+-------+-------+-----+



### Get the last row of a dataframe

In [57]:
expr = [last(col).alias(col) for col in df.columns]

df.agg(*expr).show()

+------+---+---+
|   _c0|_c1|_c2|
+------+---+---+
|Europe| NL|100|
+------+---+---+



### Get Last N row from the dataframe

In [58]:
df2 = df.withColumn("index", monotonically_increasing_id())

df3 = df2. \
    orderBy(desc("index")). \
    drop("index")

# N=5
df3.show(5)

+------+-------+---+
|   _c0|    _c1|_c2|
+------+-------+---+
|Europe|     NL|100|
|Europe|Germany|300|
| India|   Pune|600|
| India| Mumbai|200|
| India|   Pune|400|
+------+-------+---+
only showing top 5 rows



https://stackoverflow.com/questions/44077404/how-to-skip-lines-while-reading-a-csv-file-as-a-dataframe-using-pyspark

https://stackoverflow.com/questions/61781152/pyspark-remove-first-row-from-dataframe

### Load csv file with multiple delimiters

In [59]:
%%sh

cat delim.csv

name@|#age@|#gender
"Name1"@|#34@|#Male
"Name2"@|#60@|#Female


In [60]:
df = spark. \
    read. \
    option("header", "true"). \
    option("sep", "@|#"). \
    csv("delim.csv")

In [61]:
df.show()

+-----+---+------+
| name|age|gender|
+-----+---+------+
|Name1| 34|  Male|
|Name2| 60|Female|
+-----+---+------+



In [62]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)



In [63]:
spark.sql("USE subhayang")

In [64]:
spark.sql("SHOW tables")

database,tableName,isTemporary
subhayang,alter_col_pos,False
subhayang,covid_ind,False
subhayang,covid_ind_ext_optm,False
subhayang,cust_bucketed,False
subhayang,customers,False
subhayang,customers_bucketed,False
subhayang,eloc,False
subhayang,employee_internal,False
subhayang,employees_demo,False
subhayang,h,False


In [65]:
spark.sql("select * from customers limit 10")

customer_id,customer_fname,customer_lname,customer_email,customer_password,customer_street,customer_city,customer_state,customer_zipcode
3110,Phillip,Smith,XXXXXXXXX,XXXXXXXXX,7983 Amber Robin ...,Irwin,PA,15642
3111,Mary,Brown,XXXXXXXXX,XXXXXXXXX,8344 Sunny Embers...,Caguas,PR,725
3112,Kimberly,Marsh,XXXXXXXXX,XXXXXXXXX,2423 Tawny Rabbit...,Milwaukee,WI,53209
3113,Aaron,Smith,XXXXXXXXX,XXXXXXXXX,9539 Rustic Beaco...,Caguas,PR,725
3114,Mary,Briggs,XXXXXXXXX,XXXXXXXXX,5292 Heather Close,Billings,MT,59102
3115,Tammy,Leblanc,XXXXXXXXX,XXXXXXXXX,4494 Harvest Bay,Caguas,PR,725
3116,Mary,Smith,XXXXXXXXX,XXXXXXXXX,5841 Noble Loop,Dayton,OH,45424
3117,Jacob,Murphy,XXXXXXXXX,XXXXXXXXX,3460 Middle Shado...,Rowland Heights,CA,91748
3118,Mary,Francis,XXXXXXXXX,XXXXXXXXX,2544 Honey Rise Loop,Jacksonville,NC,28540
3119,Shirley,Hinton,XXXXXXXXX,XXXXXXXXX,1927 Harvest Lane,Caguas,PR,725
