In [1]:
from pyspark.sql import SparkSession
import pandas as pd
from datetime import date, datetime
spark = SparkSession.builder.getOrCreate()
from pyspark.sql import functions

In [2]:
print([i for i in dir(functions) if 'date' in i.lower()])

['curdate', 'current_date', 'date_add', 'date_diff', 'date_format', 'date_from_unix_date', 'date_part', 'date_sub', 'date_trunc', 'dateadd', 'datediff', 'datepart', 'make_date', 'to_date', 'unix_date']


In [3]:
print([i for i in dir(functions) if 'time' in i.lower()])

['convert_timezone', 'current_timestamp', 'current_timezone', 'from_unixtime', 'from_utc_timestamp', 'localtimestamp', 'make_timestamp', 'make_timestamp_ltz', 'make_timestamp_ntz', 'timestamp_micros', 'timestamp_millis', 'timestamp_seconds', 'to_timestamp', 'to_timestamp_ltz', 'to_timestamp_ntz', 'to_unix_timestamp', 'to_utc_timestamp', 'try_to_timestamp', 'unix_timestamp', 'window_time']


In [5]:
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df.show()
df.printSchema()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|  Chennai|10-03-2025 14:01|
|           205|      3|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
+--------------+-------+------+---------+----------------+

root
 |-- transaction_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- amount: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- transaction_time: string (nullable = true)



In [6]:
from pyspark.sql import functions
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df.show()
df.printSchema()
df.withColumn("date", functions.date_format('transaction_time', 'yyyy-MM-dd HH:mm')).show()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|  Chennai|10-03-2025 14:01|
|           205|      3|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
+--------------+-------+------+---------+----------------+

root
 |-- transaction_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- amount: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- transaction_time: string (nullable = true)

+--------------+-------+------+---------+----------------+----+
|transaction_id|user_id|amount| location|transaction_time|date|
+--------------+-------+------+---------+----------------+---

In [9]:
from pyspark.sql import functions
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df.show(2)
df.printSchema()
df = df.withColumn("date", functions.to_date('transaction_time', 'MM-dd-yyyy HH:mm'))
df.show(2)
df.printSchema()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|
+--------------+-------+------+---------+----------------+
only showing top 2 rows

root
 |-- transaction_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- amount: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- transaction_time: string (nullable = true)

+--------------+-------+------+---------+----------------+----------+
|transaction_id|user_id|amount| location|transaction_time|      date|
+--------------+-------+------+---------+----------------+----------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|2025-10-03|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|2025-10-03|
+--------------+-------+------+---------+----------

In [10]:
from pyspark.sql import functions
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df.show(2)
df.printSchema()
df = df.withColumn("date", functions.to_date('transaction_time', 'MM-dd-yyyy HH:mm'))
df = df.withColumn("add_2_days", functions.date_add('date',2))

df.show(2)
df.printSchema()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|
+--------------+-------+------+---------+----------------+
only showing top 2 rows

root
 |-- transaction_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- amount: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- transaction_time: string (nullable = true)

+--------------+-------+------+---------+----------------+----------+----------+
|transaction_id|user_id|amount| location|transaction_time|      date|add_2_days|
+--------------+-------+------+---------+----------------+----------+----------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|2025-10-03|2025-10-05|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|2025-10-03|2025-10-

In [11]:
from pyspark.sql import functions
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df.show(2)
df.printSchema()
df = df.withColumn("date", functions.to_date('transaction_time', 'MM-dd-yyyy HH:mm'))
df = df.withColumn("date_timestamp", functions.to_timestamp('transaction_time', 'MM-dd-yyyy HH:mm'))
df = df.withColumn("add_2_days", functions.date_add('date',2))

df.show(2)
df.printSchema()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|
+--------------+-------+------+---------+----------------+
only showing top 2 rows

root
 |-- transaction_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- amount: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- transaction_time: string (nullable = true)

+--------------+-------+------+---------+----------------+----------+-------------------+----------+
|transaction_id|user_id|amount| location|transaction_time|      date|     date_timestamp|add_2_days|
+--------------+-------+------+---------+----------------+----------+-------------------+----------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|2025-10-03|2025-10-03 12:00:00|2025-10-0

In [13]:
from pyspark.sql import functions
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df.show(2)
df.printSchema()
df = df.withColumn("date", functions.to_date('transaction_time', 'MM-dd-yyyy HH:mm'))
df = df.withColumn("date_timestamp", functions.to_timestamp('transaction_time', 'MM-dd-yyyy HH:mm'))
df = df.withColumn("add_2_days", functions.date_add('date',2))
df = df.withColumn('timestamp',functions.current_timestamp())

df.show(2)
df.printSchema()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|
+--------------+-------+------+---------+----------------+
only showing top 2 rows

root
 |-- transaction_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- amount: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- transaction_time: string (nullable = true)

+--------------+-------+------+---------+----------------+----------+-------------------+----------+--------------------+
|transaction_id|user_id|amount| location|transaction_time|      date|     date_timestamp|add_2_days|           timestamp|
+--------------+-------+------+---------+----------------+----------+-------------------+----------+--------------------+
|           201|      1|   500|Hyde

In [14]:
from pyspark.sql import functions
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df.show(2)
df.printSchema()
df = df.withColumn("date", functions.to_date('transaction_time', 'MM-dd-yyyy HH:mm'))
df = df.withColumn("date_timestamp", functions.to_timestamp('transaction_time', 'MM-dd-yyyy HH:mm'))
df = df.withColumn("add_2_days", functions.date_add('date',2))
df = df.withColumn('timestamp',functions.current_timestamp())
df = df.withColumn('date_diff_col', functions.date_diff("date","add_2_days") )

df.show(2)
df.printSchema()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|
+--------------+-------+------+---------+----------------+
only showing top 2 rows

root
 |-- transaction_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- amount: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- transaction_time: string (nullable = true)

+--------------+-------+------+---------+----------------+----------+-------------------+----------+--------------------+-------------+
|transaction_id|user_id|amount| location|transaction_time|      date|     date_timestamp|add_2_days|           timestamp|date_diff_col|
+--------------+-------+------+---------+----------------+----------+-------------------+----------+--------------------+--------

In [15]:
df.show()
df.printSchema()

+--------------+-------+------+---------+----------------+----------+-------------------+----------+--------------------+-------------+
|transaction_id|user_id|amount| location|transaction_time|      date|     date_timestamp|add_2_days|           timestamp|date_diff_col|
+--------------+-------+------+---------+----------------+----------+-------------------+----------+--------------------+-------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|2025-10-03|2025-10-03 12:00:00|2025-10-05|2025-03-11 10:53:...|           -2|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|2025-10-03|2025-10-03 12:04:00|2025-10-05|2025-03-11 10:53:...|           -2|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|2025-10-03|2025-10-03 14:00:00|2025-10-05|2025-03-11 10:53:...|           -2|
|           204|      2|   250|  Chennai|10-03-2025 14:01|2025-10-03|2025-10-03 14:01:00|2025-10-05|2025-03-11 10:53:...|           -2|
|           205|      3|  1000| Banglore|10-03-2

In [16]:
df.withColumn('location_upper', functions.upper('location')).show()
df.show()
df.printSchema()

+--------------+-------+------+---------+----------------+----------+-------------------+----------+--------------------+-------------+--------------+
|transaction_id|user_id|amount| location|transaction_time|      date|     date_timestamp|add_2_days|           timestamp|date_diff_col|location_upper|
+--------------+-------+------+---------+----------------+----------+-------------------+----------+--------------------+-------------+--------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|2025-10-03|2025-10-03 12:00:00|2025-10-05|2025-03-11 10:54:...|           -2|     HYDERABAD|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|2025-10-03|2025-10-03 12:04:00|2025-10-05|2025-03-11 10:54:...|           -2|     HYDERABAD|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|2025-10-03|2025-10-03 14:00:00|2025-10-05|2025-03-11 10:54:...|           -2|     HYDERABAD|
|           204|      2|   250|  Chennai|10-03-2025 14:01|2025-10-03|2025-10-03 14:01:00|2025-

In [None]:
df.withColumn('location_upper', functions.upper('transaction_id')).show()
df.show()
df.printSchema()