In [1]:
from pyspark.sql import SparkSession

import getpass
username = getpass.getuser()

spark = SparkSession. \
    builder. \
    config('spark.ui.port', '0'). \
    config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
    enableHiveSupport(). \
    appName(f'{username} | Python - Data Processing - Overview'). \
    master('yarn'). \
    getOrCreate()

In [2]:
data = [('A', 1),
       ('B', 3),
       ('C', 4),
       ('D', 5),]

In [3]:
df = spark.createDataFrame(data, ["COLA", "COLB"])

In [4]:
df.show()

+----+----+
|COLA|COLB|
+----+----+
|   A|   1|
|   B|   3|
|   C|   4|
|   D|   5|
+----+----+



In [5]:
from pyspark.sql import Window

from pyspark.sql.functions import *

In [6]:
df2 = df.withColumn("add above", sum(col("COLB")).over(Window.rowsBetween(Window.unboundedPreceding, Window.currentRow))).orderBy("COLA")

In [7]:
df2.show()

+----+----+---------+
|COLA|COLB|add above|
+----+----+---------+
|   A|   1|       10|
|   B|   3|       13|
|   C|   4|        4|
|   D|   5|        9|
+----+----+---------+



In [8]:
data = [(1, '27+06+2021'),
(2, '27+07+2021'),
(3, '25+08+2020')]

In [9]:
df = spark.createDataFrame(data, ["COLA", "COLB"])

In [10]:
df.show()

+----+----------+
|COLA|      COLB|
+----+----------+
|   1|27+06+2021|
|   2|27+07+2021|
|   3|25+08+2020|
+----+----------+



In [11]:
df2 = df.withColumn("id_is_odd_or_even", when(col("COLA")% 2 == 0, "EVEN").otherwise("ODD"))

df2.show()

+----+----------+-----------------+
|COLA|      COLB|id_is_odd_or_even|
+----+----------+-----------------+
|   1|27+06+2021|              ODD|
|   2|27+07+2021|             EVEN|
|   3|25+08+2020|              ODD|
+----+----------+-----------------+



In [12]:
df3 = df.withColumn("actual_date_format", to_date(col("COLB"), "dd+MM+yyyy")). \
    withColumn("day_of_week_string", date_format("actual_date_format", "E")). \
    withColumn("week_of_year", weekofyear("actual_date_format")). \
    withColumn("custom_date_fomat", concat("actual_date_format", lit(" "), "day_of_week_string", lit(" "), "week_of_year"))

df3.show()

+----+----------+------------------+------------------+------------+-----------------+
|COLA|      COLB|actual_date_format|day_of_week_string|week_of_year|custom_date_fomat|
+----+----------+------------------+------------------+------------+-----------------+
|   1|27+06+2021|        2021-06-27|               Sun|          25|2021-06-27 Sun 25|
|   2|27+07+2021|        2021-07-27|               Tue|          30|2021-07-27 Tue 30|
|   3|25+08+2020|        2020-08-25|               Tue|          35|2020-08-25 Tue 35|
+----+----------+------------------+------------------+------------+-----------------+



In [13]:
data = [(1, 'A'),
       (2, 'B'),
       (3, 'C'),
       (4, 'D')]

In [14]:
df = spark.createDataFrame(data, ["COLA", "COLB"])

In [15]:
df.show()

+----+----+
|COLA|COLB|
+----+----+
|   1|   A|
|   2|   B|
|   3|   C|
|   4|   D|
+----+----+



In [16]:
df2 = df.withColumn("COLC", expr("repeat(COLB, COLA)"))

df2.show()

+----+----+----+
|COLA|COLB|COLC|
+----+----+----+
|   1|   A|   A|
|   2|   B|  BB|
|   3|   C| CCC|
|   4|   D|DDDD|
+----+----+----+



In [17]:
%%sh

cat new23.csv

fruits_list,store
apple banana mango,more
mango,reliance
grapes papaya,smart
angeer mango,kk
orange,gr stores
orange mango,heritage
watermelon mango,Walmart
musk melon,amazon


In [18]:
rdd1 = spark.sparkContext.textFile("/user/itv736079/new23.csv")

rdd1.collect()

['fruits_list,store',
 'apple banana mango,more',
 'mango,reliance',
 'grapes papaya,smart',
 'angeer mango,kk',
 'orange,gr stores',
 'orange mango,heritage',
 'watermelon mango,Walmart',
 'musk melon,amazon']

In [19]:
rdd2 = rdd1.map(lambda x: x.split(',')). \
    map(lambda x: [x[1] for i in x[0].split(' ') if 'mango' in i])

rdd2.collect()

[[], ['more'], ['reliance'], [], ['kk'], [], ['heritage'], ['Walmart'], []]

In [20]:
%%sh

cat file1.csv

2,8
4,4
6,5
5,6
3,7
9,3


In [21]:
%%sh

cat file2.csv

2,8
4,4
6,5
5,6
3,7
9,3


In [22]:
df1 = spark.read.csv("/user/itv736079/file1.csv", "a INT, b INT")
df2 = spark.read.csv("/user/itv736079/file2.csv", "a INT, b INT")

In [23]:
from pyspark.sql.types import *

In [24]:
df3 = df1.withColumn("add1", (df1.a+df1.b).cast(IntegerType())). \
    withColumn("new", monotonically_increasing_id())

df4 = df2.withColumn("add2", (df2.a+df2.b).cast(IntegerType())). \
    withColumn("new", monotonically_increasing_id())

In [25]:
df5 = df3.join(df4, df3.new == df4.new, "inner"). \
    withColumn("add", (col("add1")+col("add2")).cast(IntegerType())). \
    drop("a", "b", "add1", "add2", "new")

In [26]:
df5.show()

+---+
|add|
+---+
| 20|
| 16|
| 22|
| 22|
| 20|
| 24|
+---+



In [27]:
data = [('05/01/2021', 'A', 400),
('15/01/2021', 'A', 300),
('06/01/2021', 'A', 700),
('10/01/2021', 'A', 100),
('12/01/2021', 'B', 300),
('14/01/2021', 'B', 200)]

In [28]:
df = spark.createDataFrame(data, ["date", "common_col", "amount"])