In [1]:
from pyspark.sql import SparkSession
import getpass
# Current OS user; used to place the SQL warehouse under /user/<username>/warehouse.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs against YARN.
# 'spark.ui.port' is set to '0' -- presumably so Spark picks any free UI port; confirm.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [None]:
# Question 1
#
# A Parquet file is stored in the data lake. Write PySpark code that reads the
# file into a DataFrame, removes the duplicate records, and writes the result
# back to the data lake.

# Read the Parquet data under the input directory.
df = spark.read.parquet("datasets/parquet/")

# No column subset is given, so rows are de-duplicated on all columns.
df_no_duplicates = df.dropDuplicates()

df_no_duplicates.show(10, truncate = False)

# Write explicitly as Parquet instead of relying on the session's default
# data source format (which a bare .save() would use).
df_no_duplicates.write.mode("overwrite").parquet("datasets/output/")

In [None]:
# Number of partitions of the RDD underlying the de-duplicated DataFrame.
partition_count = df_no_duplicates.rdd.getNumPartitions()
partition_count

In [None]:
# Question 2
"""
Question 2.
Input
col1 col2 col3
a aa 1
a aa 2
b bb 5
b bb 3
b bb 4

output:
col1 col2 col3
a aa [1,2]
b bb [5,3,4]
"""

# Input rows as (col1, col2, col3) tuples.
data = [
    ("a", "aa", 1),
    ("a", "aa", 2),
    ("b", "bb", 5),
    ("b", "bb", 3),
    ("b", "bb", 4),
]

# Column names only; the column types are inferred from the tuples.
schema = ["col1", "col2", "col3"]

df = spark.createDataFrame(data, schema)
df.show()

In [None]:
from pyspark.sql.functions import *
# Collect every col3 value of each (col1, col2) group into a single array column.
# The alias has to be applied to the aggregate Column inside agg(): calling
# .alias() on the DataFrame returned by agg() only names the DataFrame and
# leaves the column called "collect_list(col3)".
# NOTE: collect_list() does not guarantee the order of the collected elements.
df_grouped = df.groupBy("col1", "col2").agg(collect_list("col3").alias("col3"))
df_grouped.show()

In [None]:
# Expose the DataFrame to Spark SQL under the view name "temp".
df.createOrReplaceTempView("temp")

# Gather the col3 values of every (col1, col2) group into an array named col3.
grouped_query = """
select col1, col2, collect_list(col3) as col3
from temp
group by col1, col2
"""

spark.sql(grouped_query).show()

In [None]:
# Question 3
"""

input json:

{"dept_id": 101, "e_id": [10101, 10102, 10103]}
{"dept_id": 102, "e_id": [10201, 10202]}


output:
+--------+------+
|dept_id | e_id |
+--------+------+
|     101|10101 |
|     101|10102 |
|     101|10103 |
|     102|10201 |
|     102|10202 |
+--------+------+

"""

# One record per department: the department id and the list of its employee ids.
data = [
    {"dept_id": dept_id, "e_id": employee_ids}
    for dept_id, employee_ids in [
        (101, [10101, 10102, 10103]),
        (102, [10201, 10202]),
    ]
]

# The schema is inferred from the dictionaries.
df = spark.createDataFrame(data)
df.show()

In [None]:
# Flatten the e_id array: one output row per (dept_id, employee id) pair.
exploded_e_id = explode("e_id").alias("e_id")
df_explode = df.select("dept_id", exploded_e_id)

df_explode.show()

In [None]:
# Question 4
#
# Create a DataFrame in PySpark from the sample data, then:
#   1. find the average stock value on a daily basis for each stock
#   2. find the max average stock value of each stock

# Sample data: (date, stock, price)
data = [
    ("2023-01-01", "AAPL", 155.00),
    ("2023-01-01", "GOOG", 2550.00),
    ("2023-01-01", "MSFT", 310.00),
    ("2023-01-02", "AAPL", 150.00),
    ("2023-01-02", "GOOG", 2500.00),
    ("2023-01-02", "MSFT", 300.00),
]

# Create DataFrame
schema = ["date", "stock", "price"]
df = spark.createDataFrame(data, schema=schema)

# Convert string to date type
df = df.withColumn("date", to_date("date"))

# Average stock value on daily basis for each stock
df_grouped = df.groupBy("date", "stock").agg(avg("price").alias("avg_price"))
df_grouped.show()

# Max average stock value for each stock.
# `max` here is the pyspark.sql.functions aggregate brought in by the star
# import, not Python's builtin max.
df_max_avg = df_grouped.groupBy("stock").agg(max("avg_price").alias("max_avg_price"))
df_max_avg.show()

In [7]:
# Question 5

from pyspark.sql.functions import *

# (revenue, "dd-month") rows; the date strings carry no year.
data = [
    (3000, "22-may"),
    (5000, "23-may"),
    (5000, "25-may"),
    (10000, "22-june"),
    (1250, "03-july"),
]

# Prefix a fixed year and capitalise the month name ("22-may" -> "2024-22-May")
# so the string can be parsed with the "yyyy-dd-MMMM" pattern.
date_with_year = concat(lit("2024-"), initcap("date"))

df = (
    spark.createDataFrame(data, ["revenue", "date"])
    .withColumn("date", to_date(date_with_year, "yyyy-dd-MMMM"))
)

df.show()

+-------+----------+
|revenue|      date|
+-------+----------+
|   3000|2024-05-22|
|   5000|2024-05-23|
|   5000|2024-05-25|
|  10000|2024-06-22|
|   1250|2024-07-03|
+-------+----------+



In [10]:
# Add the month number extracted from the parsed date.
df_new = df.withColumn("month", month("date"))

# Sum revenue per month (`sum` is the pyspark.sql.functions aggregate from the star import).
# NOTE(review): "totalRevenew" looks like a typo for "totalRevenue"; kept as-is
# because the recorded output uses this column name.
total_revenue = sum("revenue").alias("totalRevenew")
df_grouped = df_new.groupBy("month").agg(total_revenue)

df_grouped.show()

+-----+------------+
|month|totalRevenew|
+-----+------------+
|    6|       10000|
|    5|       13000|
|    7|        1250|
+-----+------------+



In [None]:
# Question 6: reading a text file with a proper schema
"""
Name~|Age
Brayan,gomez~|25
John,Cleark~|30
Sumit,Sen~|31
"""

In [11]:
# Print the raw contents of the input file using the Hadoop filesystem CLI.
! hadoop fs -cat /user/itv016422/messedupfile.txt

Name~|Age
Brayan,gomez~|25
John,Cleark~|30
Sumit,Sen~|31



In [25]:
# Load the file as plain text: one row per line, in a single "value" column.
messed_up_path = "/user/itv016422/messedupfile.txt"
input_file = spark.read.text(messed_up_path)
input_file.show()

+----------------+
|           value|
+----------------+
|       Name~|Age|
|Brayan,gomez~|25|
| John,Cleark~|30|
|   Sumit,Sen~|31|
|                |
+----------------+



In [26]:
# The first line of the file is the header; its "~|"-separated parts become the column names.
first_row = input_file.first()
header = first_row.value
schema = header.split("~|")
schema

['Name', 'Age']

In [39]:
# Keep only data lines: drop the header line and any empty lines.
line = input_file["value"]
is_data_line = (line != header) & (line != "")
rdd = input_file.where(is_data_line).rdd
rdd.collect()


[Row(value='Brayan,gomez~|25'),
 Row(value='John,Cleark~|30'),
 Row(value='Sumit,Sen~|31')]

In [40]:
# Split each line on the "~|" delimiter; the pieces line up with the header-derived column names.
rdd2 = rdd.map(lambda row: row[0].split("~|"))

# Both columns stay strings because they come straight from str.split().
df = rdd2.toDF(schema)
df.show()

+------------+---+
|        Name|Age|
+------------+---+
|Brayan,gomez| 25|
| John,Cleark| 30|
|   Sumit,Sen| 31|
+------------+---+



In [41]:
# Question 7

# (employee_id, team_id) pairs.
data = [(1, 8), (2, 8), (3, 8), (4, 7), (5, 9), (6, 9)]

columns = ["employee_id", "team_id"]

df = spark.createDataFrame(data, schema=columns)
df.show()

+-----------+-------+
|employee_id|team_id|
+-----------+-------+
|          1|      8|
|          2|      8|
|          3|      8|
|          4|      7|
|          5|      9|
|          6|      9|
+-----------+-------+



In [49]:
# Gather the members of every team into one array per team_id.
members_per_team = df.groupBy("team_id").agg(collect_list("employee_id").alias("emp_list"))

# The team size is the length of that array.
teams_with_size = members_per_team.withColumn("team_size", size("emp_list"))

# Flatten back to one row per employee, carrying the team size along.
# NOTE(review): explode() is not aliased here, so the employee column appears
# under the default name "col" rather than "employee_id".
df_result = teams_with_size.select(explode("emp_list"), "team_size")

df_result.show()

+---+---------+
|col|team_size|
+---+---------+
|  4|        1|
|  5|        2|
|  6|        2|
|  1|        3|
|  2|        3|
|  3|        3|
+---+---------+

