In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port','0'). \
config("spark.sql.warehouse.dir", f"/user/itv020649/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [2]:
spark

In [3]:
order_schema = 'order_id long,order_date date,cust_id long,order_status string'

In [4]:
df = spark.read \
.format("csv") \
.schema(order_schema) \
.option("dateFormat","mm-dd-yyyy") \
.load("/public/trendytech/datasets/orders_sample2.csv")

In [5]:
df.show()

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|2013-01-25|  11599|         CLOSED|
|       2|2013-01-25|    256|PENDING_PAYMENT|
|       3|2013-01-25|  12111|       COMPLETE|
|       4|2013-01-25|   8827|         CLOSED|
|       5|2013-01-25|  11318|       COMPLETE|
|       6|2013-01-25|   7130|       COMPLETE|
|       7|2013-01-25|   4530|       COMPLETE|
|       8|2013-01-25|   2911|     PROCESSING|
|       9|2013-01-25|   5657|PENDING_PAYMENT|
|      10|2013-01-25|   5648|PENDING_PAYMENT|
+--------+----------+-------+---------------+



#### ways to create DF

spark.read spark.sql() spark.table() spark.range()

In [6]:
spark.sql("show databases").filter("namespace like '%sukum16%'")

namespace
sukum16_retail


In [7]:
spark.sql("use sukum16_retail")

In [8]:
spark.sql("show tables")

database,tableName,isTemporary
sukum16_retail,orders,False
sukum16_retail,orders_ext,False


In [9]:
df = spark.sql("select * from orders")

In [10]:
df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [11]:
spark.table("sukum16_retail.orders")

order_id,order_date,customer_id,order_status
1,2013-07-25 00:00:...,11599,CLOSED
2,2013-07-25 00:00:...,256,PENDING_PAYMENT
3,2013-07-25 00:00:...,12111,COMPLETE
4,2013-07-25 00:00:...,8827,CLOSED
5,2013-07-25 00:00:...,11318,COMPLETE
6,2013-07-25 00:00:...,7130,COMPLETE
7,2013-07-25 00:00:...,4530,COMPLETE
8,2013-07-25 00:00:...,2911,PROCESSING
9,2013-07-25 00:00:...,5657,PENDING_PAYMENT
10,2013-07-25 00:00:...,5648,PENDING_PAYMENT


In [12]:
spark.range(5)

id
0
1
2
3
4


In [13]:
spark.range(0,8)
spark.range(0,8,2)

id
0
2
4
6


to create a (mostly dummy)data frame from local list
spark.createDataFrame(list)
like to create a rdd from local list
spark.sparkContext.parallelize(list)
[]--list
()--tuple
like:
[(),
(),
()]

distrubting and structuring

In [14]:
! hadoop fs -cat /public/trendytech/retail_db/orders/part-00000 |head
#spark.createDataFrame

1,2013-07-25 00:00:00.0,11599,CLOSED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
3,2013-07-25 00:00:00.0,12111,COMPLETE
4,2013-07-25 00:00:00.0,8827,CLOSED
5,2013-07-25 00:00:00.0,11318,COMPLETE
6,2013-07-25 00:00:00.0,7130,COMPLETE
7,2013-07-25 00:00:00.0,4530,COMPLETE
8,2013-07-25 00:00:00.0,2911,PROCESSING
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT
10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT
cat: Unable to write to output stream.


In [15]:
orders_list = [(1,'2013-07-25 00:00:00.0',11599,'CLOSED'),
(2,'2013-07-25 00:00:00.0',256,'PENDING_PAYMENT'),
(3,'2013-07-25 00:00:00.0',12111,'COMPLETE')]

In [16]:
orders_raw_df = spark.createDataFrame(orders_list)

In [17]:
orders_raw_df.show()

+---+--------------------+-----+---------------+
| _1|                  _2|   _3|             _4|
+---+--------------------+-----+---------------+
|  1|2013-07-25 00:00:...|11599|         CLOSED|
|  2|2013-07-25 00:00:...|  256|PENDING_PAYMENT|
|  3|2013-07-25 00:00:...|12111|       COMPLETE|
+---+--------------------+-----+---------------+



In [18]:
orders_raw_df.printSchema()

root
 |-- _1: long (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)
 |-- _4: string (nullable = true)



how to fix the column names and how to inforce schema/datatype

In [19]:
orders_raw_df = spark.createDataFrame(orders_list)

In [20]:
orders_df = orders_raw_df.toDF('order_id','order_date','customer_id','order_status')

In [21]:
orders_df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
+--------+--------------------+-----------+---------------+



In [22]:
orders_schema = ["order_id","order_date","customer_id","order_status"]

In [23]:
orders_raw_df = spark.createDataFrame(orders_list,orders_schema)

In [24]:
orders_raw_df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
+--------+--------------------+-----------+---------------+



In [25]:
orders_raw_df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [26]:
order_schema = 'order_id string,order_date string,cust_id int,order_status string'

In [27]:
orders_raw_df = spark.createDataFrame(orders_list,orders_schema)

In [28]:
orders_raw_df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [29]:
orders_raw_df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
+--------+--------------------+-----------+---------------+



In [30]:
from pyspark.sql.functions import to_timestamp

In [31]:
df = orders_raw_df.withColumn("order_date",to_timestamp('order_date'))

In [32]:
df.show()

+--------+-------------------+-----------+---------------+
|order_id|         order_date|customer_id|   order_status|
+--------+-------------------+-----------+---------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|
+--------+-------------------+-----------+---------------+



In [33]:
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



### Converting RDD to data Frame

RDD already distrubted we are now structuring

In [34]:
orderRdd = spark.sparkContext.textFile("/public/trendytech/orders/orders.csv")

In [35]:
orderRdd.take(5)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE']

In [36]:
new_orderRdd = orderRdd.map(lambda x: (int(x.split(",")[0]),x.split(",")[1],int(x.split(",")[2]),x.split(",")[3]))

In [37]:
new_orderRdd.take(5)

[(1, '2013-07-25 00:00:00.0', 11599, 'CLOSED'),
 (2, '2013-07-25 00:00:00.0', 256, 'PENDING_PAYMENT'),
 (3, '2013-07-25 00:00:00.0', 12111, 'COMPLETE'),
 (4, '2013-07-25 00:00:00.0', 8827, 'CLOSED'),
 (5, '2013-07-25 00:00:00.0', 11318, 'COMPLETE')]

In [38]:
order_schema = 'order_id string,order_date string,cust_id int,order_status string'

In [39]:
df = spark.createDataFrame(new_orderRdd,order_schema)

In [40]:
df.show()

+--------+--------------------+-------+---------------+
|order_id|          order_date|cust_id|   order_status|
+--------+--------------------+-------+---------------+
|       1|2013-07-25 00:00:...|  11599|         CLOSED|
|       2|2013-07-25 00:00:...|    256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|  12111|       COMPLETE|
|       4|2013-07-25 00:00:...|   8827|         CLOSED|
|       5|2013-07-25 00:00:...|  11318|       COMPLETE|
|       6|2013-07-25 00:00:...|   7130|       COMPLETE|
|       7|2013-07-25 00:00:...|   4530|       COMPLETE|
|       8|2013-07-25 00:00:...|   2911|     PROCESSING|
|       9|2013-07-25 00:00:...|   5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|   5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|    918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|   1837|         CLOSED|
|      13|2013-07-25 00:00:...|   9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|   9842|     PROCESSING|
|      15|2013-07-25 00:00:...|   2568|       CO

In [41]:
df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- cust_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [42]:
df1 = new_orderRdd.toDF(order_schema)

In [43]:
df1.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- cust_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



### Nested Schema

In [44]:
! hadoop fs -ls /public/trendytech/datasets/customer_nested/

Found 3 items
-rw-r--r--   3 itv005857 supergroup          0 2023-05-18 17:40 /public/trendytech/datasets/customer_nested/_SUCCESS
-rw-r--r--   3 itv005857 supergroup         90 2023-05-18 17:40 /public/trendytech/datasets/customer_nested/part-00000-950ffc21-f8aa-4e00-8181-8ab726051097-c000.json
-rw-r--r--   3 itv005857 supergroup        173 2023-05-18 17:40 /public/trendytech/datasets/customer_nested/part-00001-950ffc21-f8aa-4e00-8181-8ab726051097-c000.json


In [45]:
! hadoop fs -cat /public/trendytech/datasets/customer_nested/part-00001-950ffc21-f8aa-4e00-8181-8ab726051097-c000.json| head

{"customer_id":2,"fullname":{"firstname":"ram","lastname":"kumar"},"city":"hyderabad"}
{"customer_id":3,"fullname":{"firstname":"vijay","lastname":"shankar"},"city":"pune"}


In [46]:
#schema DDL
ddlSchema = "customer_id long,fullname struct<firstname:string,lastname:string>,city string"


In [47]:
df = spark.read \
.format("json") \
.schema(ddlSchema) \
.load("/public/trendytech/datasets/customer_nested")

In [48]:
df.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          2|    {ram, kumar}|hyderabad|
|          3|{vijay, shankar}|     pune|
|          1| {sumit, mittal}|bangalore|
+-----------+----------------+---------+



In [49]:
from pyspark.sql.types import *

In [50]:
#struct type
customer_schema =StructType([
    StructField("customer_id",LongType()),
    StructField("fullname",StructType([StructField("firstname",StringType()),StructField("lastname",StringType())])),
    StructField("city",StringType())
])

In [51]:
df = spark.read \
.format("json") \
.schema(customer_schema) \
.load("/public/trendytech/datasets/customer_nested/*")

In [52]:
df.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          2|    {ram, kumar}|hyderabad|
|          3|{vijay, shankar}|     pune|
|          1| {sumit, mittal}|bangalore|
+-----------+----------------+---------+



In [53]:
df.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- city: string (nullable = true)



In [54]:
customer_list = [
    (1,("suraj","kumar"),"deoghar"),
    (2,("kunal","kumar"),"gurugram")]

In [55]:
ddlSchema = "customer_id long,fullname struct<firstname:string,lastname:string>,city string"


In [56]:
df = spark.createDataFrame(customer_list,customer_schema)

In [57]:
df.show()

+-----------+--------------+--------+
|customer_id|      fullname|    city|
+-----------+--------------+--------+
|          1|{suraj, kumar}| deoghar|
|          2|{kunal, kumar}|gurugram|
+-----------+--------------+--------+



In [58]:
df.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- city: string (nullable = true)



### Dataframe Transformations | select Vs selectExpr

to add a column: withColumn

to rename existing column: WithColumnRenamed

to drop a column: drop

select vs selectExpr

In [59]:
raw_df = spark.read \
.format("csv") \
.option("inferschema","true") \
.load("/public/trendytech/retail_db/order_items/part-00000")

In [60]:
! hadoop fs -cat /public/trendytech/retail_db/order_items/part-00000|head

1,1,957,1,299.98,299.98
2,2,1073,1,199.99,199.99
3,2,502,5,250.0,50.0
4,2,403,1,129.99,129.99
5,4,897,2,49.98,24.99
6,4,365,5,299.95,59.99
7,4,502,3,150.0,50.0
8,4,1014,4,199.92,49.98
9,5,957,1,299.98,299.98
10,5,365,5,299.95,59.99
cat: Unable to write to output stream.


In [61]:
raw_df.show()

+---+---+----+---+------+------+
|_c0|_c1| _c2|_c3|   _c4|   _c5|
+---+---+----+---+------+------+
|  1|  1| 957|  1|299.98|299.98|
|  2|  2|1073|  1|199.99|199.99|
|  3|  2| 502|  5| 250.0|  50.0|
|  4|  2| 403|  1|129.99|129.99|
|  5|  4| 897|  2| 49.98| 24.99|
|  6|  4| 365|  5|299.95| 59.99|
|  7|  4| 502|  3| 150.0|  50.0|
|  8|  4|1014|  4|199.92| 49.98|
|  9|  5| 957|  1|299.98|299.98|
| 10|  5| 365|  5|299.95| 59.99|
| 11|  5|1014|  2| 99.96| 49.98|
| 12|  5| 957|  1|299.98|299.98|
| 13|  5| 403|  1|129.99|129.99|
| 14|  7|1073|  1|199.99|199.99|
| 15|  7| 957|  1|299.98|299.98|
| 16|  7| 926|  5| 79.95| 15.99|
| 17|  8| 365|  3|179.97| 59.99|
| 18|  8| 365|  5|299.95| 59.99|
| 19|  8|1014|  4|199.92| 49.98|
| 20|  8| 502|  1|  50.0|  50.0|
+---+---+----+---+------+------+
only showing top 20 rows



In [62]:
raw_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: integer (nullable = true)
 |-- _c4: double (nullable = true)
 |-- _c5: double (nullable = true)



In [63]:
refined_df = raw_df.toDF("order_item_id","order_id","product_id","quantity","subtotal","product_price")

In [64]:
refined_df.show()

+-------------+--------+----------+--------+--------+-------------+
|order_item_id|order_id|product_id|quantity|subtotal|product_price|
+-------------+--------+----------+--------+--------+-------------+
|            1|       1|       957|       1|  299.98|       299.98|
|            2|       2|      1073|       1|  199.99|       199.99|
|            3|       2|       502|       5|   250.0|         50.0|
|            4|       2|       403|       1|  129.99|       129.99|
|            5|       4|       897|       2|   49.98|        24.99|
|            6|       4|       365|       5|  299.95|        59.99|
|            7|       4|       502|       3|   150.0|         50.0|
|            8|       4|      1014|       4|  199.92|        49.98|
|            9|       5|       957|       1|  299.98|       299.98|
|           10|       5|       365|       5|  299.95|        59.99|
|           11|       5|      1014|       2|   99.96|        49.98|
|           12|       5|       957|       1|  29

In [65]:
df1 = refined_df.drop("subtotal")

In [66]:
df1.show()

+-------------+--------+----------+--------+-------------+
|order_item_id|order_id|product_id|quantity|product_price|
+-------------+--------+----------+--------+-------------+
|            1|       1|       957|       1|       299.98|
|            2|       2|      1073|       1|       199.99|
|            3|       2|       502|       5|         50.0|
|            4|       2|       403|       1|       129.99|
|            5|       4|       897|       2|        24.99|
|            6|       4|       365|       5|        59.99|
|            7|       4|       502|       3|         50.0|
|            8|       4|      1014|       4|        49.98|
|            9|       5|       957|       1|       299.98|
|           10|       5|       365|       5|        59.99|
|           11|       5|      1014|       2|        49.98|
|           12|       5|       957|       1|       299.98|
|           13|       5|       403|       1|       129.99|
|           14|       7|      1073|       1|       199.9

In [67]:
df1.select("order_item_id","order_id","product_id").show()

+-------------+--------+----------+
|order_item_id|order_id|product_id|
+-------------+--------+----------+
|            1|       1|       957|
|            2|       2|      1073|
|            3|       2|       502|
|            4|       2|       403|
|            5|       4|       897|
|            6|       4|       365|
|            7|       4|       502|
|            8|       4|      1014|
|            9|       5|       957|
|           10|       5|       365|
|           11|       5|      1014|
|           12|       5|       957|
|           13|       5|       403|
|           14|       7|      1073|
|           15|       7|       957|
|           16|       7|       926|
|           17|       8|       365|
|           18|       8|       365|
|           19|       8|      1014|
|           20|       8|       502|
+-------------+--------+----------+
only showing top 20 rows



In [68]:
from pyspark.sql.functions import expr

In [69]:
#df1.select("*","product_price * quantity as subtotal") --wrong
df1.select("*",expr("product_price * quantity as subtotal")).show()

+-------------+--------+----------+--------+-------------+--------+
|order_item_id|order_id|product_id|quantity|product_price|subtotal|
+-------------+--------+----------+--------+-------------+--------+
|            1|       1|       957|       1|       299.98|  299.98|
|            2|       2|      1073|       1|       199.99|  199.99|
|            3|       2|       502|       5|         50.0|   250.0|
|            4|       2|       403|       1|       129.99|  129.99|
|            5|       4|       897|       2|        24.99|   49.98|
|            6|       4|       365|       5|        59.99|  299.95|
|            7|       4|       502|       3|         50.0|   150.0|
|            8|       4|      1014|       4|        49.98|  199.92|
|            9|       5|       957|       1|       299.98|  299.98|
|           10|       5|       365|       5|        59.99|  299.95|
|           11|       5|      1014|       2|        49.98|   99.96|
|           12|       5|       957|       1|    

In [70]:
#selectExpr to consider calculation
df1.selectExpr("*","product_price * quantity as subtotal").show()

+-------------+--------+----------+--------+-------------+--------+
|order_item_id|order_id|product_id|quantity|product_price|subtotal|
+-------------+--------+----------+--------+-------------+--------+
|            1|       1|       957|       1|       299.98|  299.98|
|            2|       2|      1073|       1|       199.99|  199.99|
|            3|       2|       502|       5|         50.0|   250.0|
|            4|       2|       403|       1|       129.99|  129.99|
|            5|       4|       897|       2|        24.99|   49.98|
|            6|       4|       365|       5|        59.99|  299.95|
|            7|       4|       502|       3|         50.0|   150.0|
|            8|       4|      1014|       4|        49.98|  199.92|
|            9|       5|       957|       1|       299.98|  299.98|
|           10|       5|       365|       5|        59.99|  299.95|
|           11|       5|      1014|       2|        49.98|   99.96|
|           12|       5|       957|       1|    

In [71]:
raw_df = spark.read \
.format("csv") \
.option("inferschema","true") \
.load("/public/trendytech/retail_db/products/part-00000")

In [72]:
raw_df.show()

+---+---+--------------------+----+------+--------------------+
|_c0|_c1|                 _c2| _c3|   _c4|                 _c5|
+---+---+--------------------+----+------+--------------------+
|  1|  2|Quest Q64 10 FT. ...|null| 59.98|http://images.acm...|
|  2|  2|Under Armour Men'...|null|129.99|http://images.acm...|
|  3|  2|Under Armour Men'...|null| 89.99|http://images.acm...|
|  4|  2|Under Armour Men'...|null| 89.99|http://images.acm...|
|  5|  2|Riddell Youth Rev...|null|199.99|http://images.acm...|
|  6|  2|Jordan Men's VI R...|null|134.99|http://images.acm...|
|  7|  2|Schutt Youth Recr...|null| 99.99|http://images.acm...|
|  8|  2|Nike Men's Vapor ...|null|129.99|http://images.acm...|
|  9|  2|Nike Adult Vapor ...|null|  50.0|http://images.acm...|
| 10|  2|Under Armour Men'...|null|129.99|http://images.acm...|
| 11|  2|Fitness Gear 300 ...|null|209.99|http://images.acm...|
| 12|  2|Under Armour Men'...|null|139.99|http://images.acm...|
| 13|  2|Under Armour Men'...|null| 89.9

In [73]:
df1 = raw_df.toDF("product_id","product_catagory_id","product_name","product_desc","product_price","product_image")

In [74]:
df1.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_catagory_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_desc: string (nullable = true)
 |-- product_price: double (nullable = true)
 |-- product_image: string (nullable = true)



In [75]:
#to transform existing column
df2 = df1.withColumn("product_price",expr("product_price * 1.2"))

In [76]:
df2.show()

+----------+-------------------+--------------------+------------+------------------+--------------------+
|product_id|product_catagory_id|        product_name|product_desc|     product_price|       product_image|
+----------+-------------------+--------------------+------------+------------------+--------------------+
|         1|                  2|Quest Q64 10 FT. ...|        null|            71.976|http://images.acm...|
|         2|                  2|Under Armour Men'...|        null|           155.988|http://images.acm...|
|         3|                  2|Under Armour Men'...|        null|107.98799999999999|http://images.acm...|
|         4|                  2|Under Armour Men'...|        null|107.98799999999999|http://images.acm...|
|         5|                  2|Riddell Youth Rev...|        null|           239.988|http://images.acm...|
|         6|                  2|Jordan Men's VI R...|        null|           161.988|http://images.acm...|
|         7|                  2|Schut

Nike 20%

Armour 10$

other 0%

In [77]:
df2 = df1.withColumn("PRODUCT_PRICE",expr("CASE WHEN product_name LIKE '%Nike%' THEN product_price * 1.2 WHEN product_name like '%Armour%' then product_price * 1.1 else product_price end"))

In [78]:
df2.show()

+----------+-------------------+--------------------+------------+------------------+--------------------+
|product_id|product_catagory_id|        product_name|product_desc|     PRODUCT_PRICE|       product_image|
+----------+-------------------+--------------------+------------+------------------+--------------------+
|         1|                  2|Quest Q64 10 FT. ...|        null|             59.98|http://images.acm...|
|         2|                  2|Under Armour Men'...|        null|142.98900000000003|http://images.acm...|
|         3|                  2|Under Armour Men'...|        null|            98.989|http://images.acm...|
|         4|                  2|Under Armour Men'...|        null|            98.989|http://images.acm...|
|         5|                  2|Riddell Youth Rev...|        null|            199.99|http://images.acm...|
|         6|                  2|Jordan Men's VI R...|        null|            134.99|http://images.acm...|
|         7|                  2|Schut

In [81]:
df2 = df1.withColumnRenamed("product_name","product_name1")

In [82]:
df2.show()

+----------+-------------------+--------------------+------------+-------------+--------------------+
|product_id|product_catagory_id|       product_name1|product_desc|product_price|       product_image|
+----------+-------------------+--------------------+------------+-------------+--------------------+
|         1|                  2|Quest Q64 10 FT. ...|        null|        59.98|http://images.acm...|
|         2|                  2|Under Armour Men'...|        null|       129.99|http://images.acm...|
|         3|                  2|Under Armour Men'...|        null|        89.99|http://images.acm...|
|         4|                  2|Under Armour Men'...|        null|        89.99|http://images.acm...|
|         5|                  2|Riddell Youth Rev...|        null|       199.99|http://images.acm...|
|         6|                  2|Jordan Men's VI R...|        null|       134.99|http://images.acm...|
|         7|                  2|Schutt Youth Recr...|        null|        99.99|ht

### Remove dups from DF

distinct()

dropDuplicates()

In [83]:
mylist = [
    (1,"kapil",34),
    (1,"kapil",34),
    (1,"satish",26),
    (2,"satish",26)
]

In [84]:
df = spark.createDataFrame(mylist).toDF("id","name","age")

In [85]:
df.show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  1| kapil| 34|
|  1| kapil| 34|
|  1|satish| 26|
|  2|satish| 26|
+---+------+---+



In [93]:
#transformation distinct (to be used if the whole record is duplicate)
#df1 = df.distinct() --considring all column
#df1 = df.select("id").distinct()---only selecting the id
#transformation dropDuplicate
#new_df = df.dropDuplicates()---same as distinct---more flexible
new_df = df.dropDuplicates(["name","age"])

In [94]:
new_df.show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  1| kapil| 34|
|  1|satish| 26|
+---+------+---+



In [95]:
df.dropDuplicates(["id"])

id,name,age
1,kapil,34
2,satish,26
