In [1]:
from pyspark.sql import SparkSession

import getpass
# OS account running this kernel; used to build the warehouse path and the app name.
username = getpass.getuser()

# Configure the session step by step, then create it (or reuse an existing one).
session_builder = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Spark Metastore')
    .master('yarn')
)
spark = session_builder.getOrCreate()

In [2]:
# NOTE(review): this path is pinned to one user's directory rather than derived
# from `username` -- confirm that is intentional before sharing the notebook.
csv_path = "/user/itv736079/new21.csv"

# The first line of the file is the header; column types are inferred from the data.
df = (
    spark.read
    .options(inferSchema="true", header="true")
    .csv(csv_path)
)

df.show()

+-------+-------+-----+
|country|   city|value|
+-------+-------+-----+
|  India|   Pune|  100|
|  India| Mumbai|  200|
|  India|   Pune|  400|
|  India| Mumbai|  200|
|  India|   Pune|  600|
| Europe|Germany|  300|
| Europe|     NL|  100|
+-------+-------+-----+



In [3]:
# In-memory sample rows as (country, city, value) tuples.
sample_rows = [
    ("India", "Pune", 100),
    ("India", "Mumbai", 200),
    ("India", "Pune", 400),
    ("India", "Mumbai", 200),
    ("India", "Pune", 600),
    ("Europe", "Germany", 300),
    ("Europe", "NL", 100),
]
sample_columns = ["country", "city", "value"]

# Build the DataFrame directly from the Python rows; no file is read here.
df_new = spark.createDataFrame(sample_rows, sample_columns)

In [4]:
# Print df_new as a text table (show() displays at most 20 rows by default).
df_new.show()

+-------+-------+-----+
|country|   city|value|
+-------+-------+-----+
|  India|   Pune|  100|
|  India| Mumbai|  200|
|  India|   Pune|  400|
|  India| Mumbai|  200|
|  India|   Pune|  600|
| Europe|Germany|  300|
| Europe|     NL|  100|
+-------+-------+-----+



In [5]:
from pyspark.sql.functions import *
from pyspark.sql import Window

In [6]:
# Predicate selecting the rows that get collapsed into a single total.
# eqNullSafe (SQL <=>) always evaluates to True/False, never NULL, so the
# negation used below is the exact complement: every input row ends up on
# exactly one side of the union.
is_india_pune = col("country").eqNullSafe("India") & col("city").eqNullSafe("Pune")

# One summed row for India/Pune. `sum` is called with a column name, i.e. it is
# the Spark aggregate brought in by the pyspark.sql.functions wildcard import,
# not Python's builtin.
pune_total = (
    df.filter(is_india_pune)
    .groupBy("country", "city")
    .agg(sum("value").alias("value"))
)

# All remaining rows pass through untouched. The earlier complement,
# `city != "Pune"`, silently dropped rows with a NULL city as well as rows for
# a "Pune" outside India, because the aggregate side also required India.
# unionByName matches columns by name, so the result does not depend on the
# column order of the source file.
df2 = pune_total.unionByName(df.filter(~is_india_pune))

In [7]:
# Print df2 as a text table to inspect the combined result.
df2.show()

+-------+-------+-----+
|country|   city|value|
+-------+-------+-----+
|  India|   Pune| 1100|
|  India| Mumbai|  200|
|  India| Mumbai|  200|
| Europe|Germany|  300|
| Europe|     NL|  100|
+-------+-------+-----+



In [11]:
# Window grouping rows that share the same (country, city). No ordering is
# specified, so an aggregate over it spans the whole partition.
# Previously this variable was defined (with an inconsistent "Country" casing)
# but never used, while an identical window was rebuilt inline below.
cal_window = Window.partitionBy("country", "city")

# Overwrite `value` with the partition total; unlike groupBy, this keeps one
# output row per input row.
df3 = df.withColumn("value", sum("value").over(cal_window))
df3.show()

+-------+-------+-----+
|country|   city|value|
+-------+-------+-----+
| Europe|Germany|  300|
|  India| Mumbai|  400|
|  India| Mumbai|  400|
| Europe|     NL|  100|
|  India|   Pune| 1100|
|  India|   Pune| 1100|
|  India|   Pune| 1100|
+-------+-------+-----+



In [23]:

# Rows whose city is Pune; the same predicate drives both derived columns.
is_pune = col("city") == "Pune"

# Sum of `value` across all rows sharing the row's (country, city).
city_total = sum("value").over(Window.partitionBy("country", "city"))

df3 = (
    df
    # Flag column: 1 for Pune rows, 0 for everything else.
    .withColumn("id", when(is_pune, 1).otherwise(0))
    # Pune rows take the partition total; other rows keep their own value.
    .withColumn("value", when(is_pune, city_total).otherwise(col("value")))
)

df3.show()

+-------+-------+-----+---+
|country|   city|value| id|
+-------+-------+-----+---+
| Europe|Germany|  300|  0|
|  India| Mumbai|  200|  0|
|  India| Mumbai|  200|  0|
| Europe|     NL|  100|  0|
|  India|   Pune| 1100|  1|
|  India|   Pune| 1100|  1|
|  India|   Pune| 1100|  1|
+-------+-------+-----+---+

