## PySpark Code Practice

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import *
from pyspark.sql.functions import *  # Import the function
spark = SparkSession.builder.getOrCreate()
from pyspark.sql.functions import regexp_replace, col
from google.colab import drive

#### 1. Calculate the difference (in days) between the start_date and end_date.

In [None]:
# Sample rows: both columns hold ISO-formatted (yyyy-MM-dd) date strings.
date_pairs = [
    ("2023-01-01", "2023-02-01"),
    ("2023-03-15", "2023-03-20"),
]
df = spark.createDataFrame(date_pairs, ["start_date", "end_date"])
df.show()

# Number of days from start_date to end_date, added as a new column.
days_between = datediff(col("end_date"), col("start_date"))
dif_df = df.withColumn("date_diffrence", days_between)
dif_df.show()

+----------+----------+
|start_date|  end_date|
+----------+----------+
|2023-01-01|2023-02-01|
|2023-03-15|2023-03-20|
+----------+----------+

+----------+----------+--------------+
|start_date|  end_date|date_diffrence|
+----------+----------+--------------+
|2023-01-01|2023-02-01|            31|
|2023-03-15|2023-03-20|             5|
+----------+----------+--------------+



#### 2. Extracting Year from a Date Column

Sometimes, you may need to extract specific parts of a date, such as the year, especially when grouping data by year.

In [None]:
# Schema: a single nullable string column holding the raw event dates.
schema = StructType().add('event_date', StringType(), True)

event_rows = [("2023-04-15",), ("2022-11-30",), ("2021-08-25",)]
df = spark.createDataFrame(event_rows, schema)
df.show()

# Parse the strings into a date column, then derive the calendar year from it.
parsed_event_date = to_date(col('event_date'), 'yyyy-MM-dd')
df = df.withColumn('event_date', parsed_event_date)
df.withColumn('year', year(col('event_date'))).show()




+----------+
|event_date|
+----------+
|2023-04-15|
|2022-11-30|
|2021-08-25|
+----------+

+----------+----+
|event_date|year|
+----------+----+
|2023-04-15|2023|
|2022-11-30|2022|
|2021-08-25|2021|
+----------+----+



#### 3. Filter Records Based on Date

Filtering records based on specific date ranges is a common operation in time-series data.

#### Scenario:
Filter records where event_date is after 2023-06-01.

In [None]:
# Schema: a single nullable string column holding the raw event dates.
schema = StructType().add("event_date", StringType(), True)

# Build the sample DataFrame.
event_rows = [("2023-05-15",), ("2023-07-20",), ("2023-06-05",)]
df = spark.createDataFrame(event_rows, schema)
df.show()

# Convert event_date to a date type and keep only rows dated after '2023-06-01'.
df = df.withColumn('event_date', to_date(col("event_date"), 'yyyy-MM-dd'))
is_after_cutoff = col('event_date') > '2023-06-01'
df.filter(is_after_cutoff).show()

+----------+
|event_date|
+----------+
|2023-05-15|
|2023-07-20|
|2023-06-05|
+----------+

+----------+
|event_date|
+----------+
|2023-07-20|
|2023-06-05|
+----------+



#### 4. Add Days to Date

Adding a certain number of days to a date is crucial when planning tasks or calculating deadlines.

#### Scenario:
Add 30 days to each date in the order_date column.

In [None]:
order_rows = [("2023-01-10",), ("2023-06-15",), ("2023-07-30",)]
df = spark.createDataFrame(order_rows, ["order_date"])

# Convert order_date to a date type, then add 30 days to every row.
parsed_order_date = to_date(col('order_date'), 'yyyy-MM-dd')
df = df.withColumn("order_date", parsed_order_date)
add_df = df.withColumn("new_order_date", date_add(col('order_date'), 30))
add_df.show()



+----------+--------------+
|order_date|new_order_date|
+----------+--------------+
|2023-01-10|    2023-02-09|
|2023-06-15|    2023-07-15|
|2023-07-30|    2023-08-29|
+----------+--------------+



#### 5. Find the Maximum Date in a Column

Finding the latest date in a dataset is helpful when dealing with transaction logs or event timestamps.

#### Scenario

Find the latest payment_date in the column.


In [None]:
payment_rows = [("2023-02-15",), ("2023-06-25",), ("2023-01-10",)]
df = spark.createDataFrame(payment_rows, ["payment_date"])
df.show()

# Convert payment_date to a date type and aggregate down to the latest date.
# NOTE: `max` is expected to be pyspark.sql.functions.max (via the star import
# in the first cell), which shadows the Python builtin of the same name.
df = df.withColumn('payment_date', to_date(col('payment_date'), "yyyy-MM-dd"))
latest_payment = max('payment_date').alias("latest_payment_date")
df.agg(latest_payment).show()


+------------+
|payment_date|
+------------+
|  2023-02-15|
|  2023-06-25|
|  2023-01-10|
+------------+

+-------------------+
|latest_payment_date|
+-------------------+
|         2023-06-25|
+-------------------+



In [None]:
# df.groupBy("Catagory").agg(sum('price').alias('total_price')).show()

In [None]:
def is_palindrom(n):
  """Return True when the string form of ``n`` reads the same in both directions.

  ``n`` may be any value; it is converted with ``str()`` before comparison,
  so both numbers (121) and strings ("abba") are accepted.
  """
  text = str(n)
  return text == text[::-1]

# The demo call must sit at cell level: when indented under the function it
# follows the `return` and is never executed.
print(is_palindrom(121))

In [None]:
# df1.join(df2, on='id', how='inner').show()

In [None]:
# def sqr(x):
#   return x*x

# sqr_udf = udf(sqr, IntegerType())
#df.withColumn("Squre", sqr_udf(df["value"]))

In [None]:
# filterd_df = df.filter(df['age']>30)
# filterd_df.show()

In [None]:
from collections import Counter


def word_frequency(word):
  """Return a Counter of the whitespace-separated tokens found in ``word``."""
  tokens = word.split()
  return Counter(tokens)


print(word_frequency('A column or function parameter with name function parameter'))

Counter({'function': 2, 'parameter': 2, 'A': 1, 'column': 1, 'or': 1, 'with': 1, 'name': 1})


In [None]:
def getWordFrequency(sentence):
  """Count how often each whitespace-separated word occurs in ``sentence``.

  Returns a plain dict mapping word -> count; keys appear in order of
  first occurrence.
  """
  counts = {}
  for token in sentence.split():
    # Missing keys start from 0, so the first sighting stores 1.
    counts[token] = counts.get(token, 0) + 1
  return counts

print(getWordFrequency('A column or function parameter with name function parameter'))

{'A': 1, 'column': 1, 'or': 1, 'function': 2, 'parameter': 2, 'with': 1, 'name': 1}


Spark data design

In [None]:
from datetime import date

# Sample transactions: (customer_id, transaction date as yyyy-MM-dd, amount).
data = [
    ("cust_1", "2023-01-01", 100),
    ("cust_1", "2023-01-02", 150),
    ("cust_1", "2023-01-03", 200),
    ("cust_2", "2023-01-01", 167),
    ("cust_2", "2023-01-02", 175),
    ("cust_2", "2023-01-03", 187),
    ("cust_3", "2023-01-01", 125),
    ("cust_3", "2023-01-02", 150),
    ("cust_3", "2023-01-03", 175),
    ("cust_4", "2023-01-01", 145),
    ("cust_4", "2023-01-02", 155)
]

schema = StructType([
        StructField('customer_id', StringType(), True),
        StructField('date', StringType(), True),
        # amount must be numeric: as a string column the running total comes
        # out as a double and the rank compares values lexicographically
        # (e.g. "90" would sort above "100").
        StructField('amount', IntegerType(), True)
    ]
)
df = spark.createDataFrame(data, schema)

# Convert the date strings to a proper date type so ordering is chronological.
df = df.withColumn('date', to_date(col('date'), 'yyyy-MM-dd'))

# Window: one partition per customer, rows ordered by transaction date.
window_space = Window.partitionBy('customer_id').orderBy('date')

# 1. Previous transaction amount (null on each customer's first row).
df = df.withColumn("prev_amoun", lag("amount", 1).over(window_space))

# 2. Running total per customer, accumulated in date order.
#    `sum` is expected to be pyspark.sql.functions.sum from the star import.
df = df.withColumn("running_total", sum("amount").over(window_space))

# 3. Rank by amount per customer, largest amount first.
rank_space = Window.partitionBy("customer_id").orderBy(col("amount").desc())
df = df.withColumn("rank", rank().over(rank_space))

df.show()


+-----------+----------+------+----------+-------------+----+
|customer_id|      date|amount|prev_amoun|running_total|rank|
+-----------+----------+------+----------+-------------+----+
|     cust_1|2023-01-03|   200|       150|        450.0|   1|
|     cust_1|2023-01-02|   150|       100|        250.0|   2|
|     cust_1|2023-01-01|   100|      NULL|        100.0|   3|
|     cust_2|2023-01-03|   187|       175|        529.0|   1|
|     cust_2|2023-01-02|   175|       167|        342.0|   2|
|     cust_2|2023-01-01|   167|      NULL|        167.0|   3|
|     cust_3|2023-01-03|   175|       150|        450.0|   1|
|     cust_3|2023-01-02|   150|       125|        275.0|   2|
|     cust_3|2023-01-01|   125|      NULL|        125.0|   3|
|     cust_4|2023-01-02|   155|       145|        300.0|   1|
|     cust_4|2023-01-01|   145|      NULL|        145.0|   2|
+-----------+----------+------+----------+-------------+----+



Practice with sales

In [None]:
# --- Sales: one row per purchase (customer_id, purchase date, product_id) ---
sales_data = [
    ('A', '2021-01-01', '1'), ('A', '2021-01-02', '2'), ('A', '2021-01-03', '3'),
    ('B', '2021-01-01', '1'), ('B', '2021-01-02', '2'), ('B', '2021-01-03', '3'),
    ('C', '2021-01-01', '1'), ('C', '2021-01-02', '3'), ('C', '2021-01-03', '4'),
    ('D', '2021-01-01', '1'), ('D', '2021-01-02', '1'), ('D', '2021-01-03', '2'),
    ('E', '2021-01-01', '3'), ('E', '2021-01-02', '4'), ('E', '2021-01-03', '5'),
]
sales_schema = (
    StructType()
    .add('customer_id', StringType(), True)
    .add('date', StringType(), True)
    .add('product_id', StringType(), True)
)
sales_df = spark.createDataFrame(sales_data, sales_schema)
print('Customer Data')
sales_df.show()

# --- Menu: product catalogue (product_id, product_name, price) ---
menu_date = [
    ('1', 'Sushi', 10),
    ('2', 'Curry', 15),
    ('3', 'Pizza', 20),
    ('4', 'Salad', 5),
    ('5', 'Pasta', 12),
]
menu_schema = (
    StructType()
    .add('product_id', StringType(), True)
    .add('product_name', StringType(), True)
    .add('price', IntegerType(), True)
)
menu_df = spark.createDataFrame(menu_date, menu_schema)
print('Menu Data')
menu_df.show()

# --- Members: (customer_id, join_date) ---
member_data = [
    ('A', '2021-01-07'),
    ('B', '2021-01-09'),
    ('C', '2021-01-01'),
    ('D', '2021-01-02'),
    ('E', '2021-01-01'),
]
member_schema = (
    StructType()
    .add('customer_id', StringType(), True)
    .add('join_date', StringType(), True)
)
member_df = spark.createDataFrame(member_data, member_schema)
print('Member Data')
member_df.show()






Customer Data
+-----------+----------+----------+
|customer_id|      date|product_id|
+-----------+----------+----------+
|          A|2021-01-01|         1|
|          A|2021-01-02|         2|
|          A|2021-01-03|         3|
|          B|2021-01-01|         1|
|          B|2021-01-02|         2|
|          B|2021-01-03|         3|
|          C|2021-01-01|         1|
|          C|2021-01-02|         3|
|          C|2021-01-03|         4|
|          D|2021-01-01|         1|
|          D|2021-01-02|         1|
|          D|2021-01-03|         2|
|          E|2021-01-01|         3|
|          E|2021-01-02|         4|
|          E|2021-01-03|         5|
+-----------+----------+----------+

Menu Data
+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|         1|       Sushi|   10|
|         2|       Curry|   15|
|         3|       Pizza|   20|
|         4|       Salad|    5|
|         5|       Pasta|   12|
+----------+------------+-----+

Me

Task:
For each customer, return the latest order amount, the total amount spent, and the number of orders. Output should include the customer's name.

In [None]:
# Sample data and schemas.
# Orders: (order_id, customer_id, order_date as yyyy-MM-dd, amount).
order_data = [
    (1, 101, '2024-01-01', 100),
    (2, 102, '2024-01-02', 150),
    (3, 103, '2024-01-03', 200),
    (4, 101, '2024-01-04', 120),
    (5, 102, '2024-01-05', 180),
    (6, 103, '2024-01-06', 220),
    (7, 101, '2024-01-07', 90),
    (8, 102, '2024-01-08', 160),
    (9, 103, '2024-01-09', 210),
    (10, 101, '2024-01-10', 110)
]
order_schema = StructType([
    StructField('order_id', IntegerType(), True),
    StructField('customer_id', IntegerType(), True),
    StructField('order_date', StringType(), True),
    StructField('amount', IntegerType(), True)
])
# Customers: (customer_id, customer_name).
customer_data = [
    (101, 'Alice'),
    (102, 'Bob'),
    (103, 'Charlie')
]
customer_schema = StructType([
    StructField('customer_id', IntegerType(), True),
    StructField('customer_name', StringType(), True)
])

# Create the DataFrames and display them.
order_df = spark.createDataFrame(order_data, order_schema)
customer_df = spark.createDataFrame(customer_data, customer_schema)
order_df.show()
customer_df.show()

# Convert order_date from string to a date type so ordering is chronological.
order_df = order_df.withColumn('order_date', to_date(col("order_date"), 'yyyy-MM-dd'))
order_df.printSchema()

# 1. Latest order per customer: number each customer's orders newest-first
#    and keep only the first row.
window_space = Window.partitionBy("customer_id").orderBy(col("order_date").desc())
latest_orders_df = order_df.withColumn('rn', row_number().over(window_space))
latest_orders_df = latest_orders_df.filter(col('rn') == 1)
latest_orders_df.show()

# 2. Total amount and order count per customer, carrying the customer's name.
#    The join is an inner join, so customers without any order are omitted.
#    `sum` / `count` are expected to be the pyspark.sql.functions versions
#    brought in by the star import.
agg_df = (
    customer_df.join(order_df, on='customer_id')
    .groupBy('customer_id', 'customer_name')
    .agg(
        sum('amount').alias('total_amount'),
        count('order_id').alias('order_count'),
    )
)

# 3. Combine the aggregates with the latest order amount into the final answer.
latest_amount_df = latest_orders_df.select(
    'customer_id',
    col('amount').alias('latest_order_amount'),
)
customer_summary_df = agg_df.join(latest_amount_df, on='customer_id').select(
    'customer_id',
    'customer_name',
    'latest_order_amount',
    'total_amount',
    'order_count',
)
customer_summary_df.show()




+--------+-----------+----------+------+
|order_id|customer_id|order_date|amount|
+--------+-----------+----------+------+
|       1|        101|2024-01-01|   100|
|       2|        102|2024-01-02|   150|
|       3|        103|2024-01-03|   200|
|       4|        101|2024-01-04|   120|
|       5|        102|2024-01-05|   180|
|       6|        103|2024-01-06|   220|
|       7|        101|2024-01-07|    90|
|       8|        102|2024-01-08|   160|
|       9|        103|2024-01-09|   210|
|      10|        101|2024-01-10|   110|
+--------+-----------+----------+------+

+-----------+-------------+
|customer_id|customer_name|
+-----------+-------------+
|        101|        Alice|
|        102|          Bob|
|        103|      Charlie|
+-----------+-------------+

root
 |-- order_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- amount: integer (nullable = true)

+--------+-----------+----------+------+---+
|order_id|cus

## Question:
 Calculate the average salary and count of employees for each department.

In [None]:
# Sample data: (department, salary, employee_name)
data = [
    ("Sales", 5000, "John"),
    ("Sales", 5000, "Doe"),
    ("HR", 5000, "Jane"),
    ("HR", 5000, "Alice"),
    ("IT", 5000, "Bob"),
    ("IT", 5000, "Charlie"),
]

# Schema matching the tuples above; every column is nullable.
schema = (
    StructType()
    .add('department', StringType(), True)
    .add('salary', IntegerType(), True)
    .add('employee_name', StringType(), True)
)

# Create the DataFrame and display it.
df = spark.createDataFrame(data, schema)
df.show()

# Per-department aggregates: mean salary and number of employee_name values.
avg_salary_col = avg("salary").alias("avg_salary")
employee_count_col = count("employee_name").alias("employee_counr")
result_df = df.groupBy('department').agg(avg_salary_col, employee_count_col)

# Show the result.
result_df.show()


+----------+------+-------------+
|department|salary|employee_name|
+----------+------+-------------+
|     Sales|  5000|         John|
|     Sales|  5000|          Doe|
|        HR|  5000|         Jane|
|        HR|  5000|        Alice|
|        IT|  5000|          Bob|
|        IT|  5000|      Charlie|
+----------+------+-------------+

+----------+----------+--------------+
|department|avg_salary|employee_counr|
+----------+----------+--------------+
|     Sales|    5000.0|             2|
|        HR|    5000.0|             2|
|        IT|    5000.0|             2|
+----------+----------+--------------+



## Question:
 How would you handle null values in a DataFrame? For example, drop rows with null values in the age column.


In [None]:
data = [("Alice", 30), ("Bob", None), ("Catherine", 25),
(None, 35), ("Eve", None)]
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True)
])
df = spark.createDataFrame(data, schema)
df.show()