In [3]:
from pyspark.sql import SparkSession
import pandas as pd
from datetime import date, datetime
spark = SparkSession.builder.getOrCreate()
from pyspark.sql.functions import col, upper, lower, length, concat_ws, regexp_replace, trim, lpad, rpad, substring
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df.show()
df.printSchema()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|  Chennai|10-03-2025 14:01|
|           205|      3|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
+--------------+-------+------+---------+----------------+

root
 |-- transaction_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- amount: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- transaction_time: string (nullable = true)



## apply date_format function on string column

In [6]:

from pyspark.sql.functions import col, date_format
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df.withColumn("date",date_format(col('transaction_time'), 'MM-yyyy-dd HH:mm')).show()

+--------------+-------+------+---------+----------------+----+
|transaction_id|user_id|amount| location|transaction_time|date|
+--------------+-------+------+---------+----------------+----+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|NULL|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|NULL|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|NULL|
|           204|      2|   250|  Chennai|10-03-2025 14:01|NULL|
|           205|      3|  1000| Banglore|10-03-2025 15:30|NULL|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|NULL|
+--------------+-------+------+---------+----------------+----+



## Convert string column to datetime format

In [9]:
from pyspark.sql.functions import to_date
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df = df.withColumn("date",to_date(col('transaction_time'), 'MM-dd-yyyy HH:mm'))
df.printSchema()

root
 |-- transaction_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- amount: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- transaction_time: string (nullable = true)
 |-- date: date (nullable = true)



In [12]:
from pyspark.sql.functions import to_date, to_datetime
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df = df.withColumn("date",to_date_time(col('transaction_time'), 'MM-dd-yyyy HH:mm'))
df.printSchema()

ImportError: cannot import name 'to_datetime' from 'pyspark.sql.functions' (C:\Users\Navya\anaconda3\Lib\site-packages\pyspark\sql\functions.py)

In [13]:
from pyspark.sql import functions
print([i for i in dir(functions) if 'date' in i.lower()])

['curdate', 'current_date', 'date_add', 'date_diff', 'date_format', 'date_from_unix_date', 'date_part', 'date_sub', 'date_trunc', 'dateadd', 'datediff', 'datepart', 'make_date', 'to_date', 'unix_date']


In [14]:
from pyspark.sql import functions
print([i for i in dir(functions) if 'time' in i.lower()])

['convert_timezone', 'current_timestamp', 'current_timezone', 'from_unixtime', 'from_utc_timestamp', 'localtimestamp', 'make_timestamp', 'make_timestamp_ltz', 'make_timestamp_ntz', 'timestamp_micros', 'timestamp_millis', 'timestamp_seconds', 'to_timestamp', 'to_timestamp_ltz', 'to_timestamp_ntz', 'to_unix_timestamp', 'to_utc_timestamp', 'try_to_timestamp', 'unix_timestamp', 'window_time']


In [17]:
from pyspark.sql.functions import to_date, to_timestamp
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df = df.withColumn("date",to_timestamp(col('transaction_time'), 'MM-dd-yyyy HH:mm'))
df.show()
df.printSchema()

+--------------+-------+------+---------+----------------+-------------------+
|transaction_id|user_id|amount| location|transaction_time|               date|
+--------------+-------+------+---------+----------------+-------------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|2025-10-03 12:00:00|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|2025-10-03 12:04:00|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|2025-10-03 14:00:00|
|           204|      2|   250|  Chennai|10-03-2025 14:01|2025-10-03 14:01:00|
|           205|      3|  1000| Banglore|10-03-2025 15:30|2025-10-03 15:30:00|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|2025-10-03 15:34:00|
+--------------+-------+------+---------+----------------+-------------------+

root
 |-- transaction_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- amount: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- transaction_time: string (nullabl

In [18]:
from pyspark.sql.functions import to_date, to_timestamp
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df = df.withColumn("to_timestamp",to_timestamp(col('transaction_time'), 'MM-dd-yyyy HH:mm'))
df = df.withColumn("to_date",to_date(col('transaction_time'), 'MM-dd-yyyy HH:mm'))

df.show()
df.printSchema()

+--------------+-------+------+---------+----------------+-------------------+----------+
|transaction_id|user_id|amount| location|transaction_time|       to_timestamp|   to_date|
+--------------+-------+------+---------+----------------+-------------------+----------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|2025-10-03 12:00:00|2025-10-03|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|2025-10-03 12:04:00|2025-10-03|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|2025-10-03 14:00:00|2025-10-03|
|           204|      2|   250|  Chennai|10-03-2025 14:01|2025-10-03 14:01:00|2025-10-03|
|           205|      3|  1000| Banglore|10-03-2025 15:30|2025-10-03 15:30:00|2025-10-03|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|2025-10-03 15:34:00|2025-10-03|
+--------------+-------+------+---------+----------------+-------------------+----------+

root
 |-- transaction_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- am

In [19]:
from pyspark.sql.functions import to_date, to_timestamp, date_add
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df = df.withColumn("to_timestamp",to_timestamp(col('transaction_time'), 'MM-dd-yyyy HH:mm'))
df = df.withColumn("to_date",to_date(col('transaction_time'), 'MM-dd-yyyy HH:mm'))
df = df.withColumn("add_2_to_date",date_add(col('to_date'), 2))

df.show()
df.printSchema()

+--------------+-------+------+---------+----------------+-------------------+----------+-------------+
|transaction_id|user_id|amount| location|transaction_time|       to_timestamp|   to_date|add_2_to_date|
+--------------+-------+------+---------+----------------+-------------------+----------+-------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|2025-10-03 12:00:00|2025-10-03|   2025-10-05|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|2025-10-03 12:04:00|2025-10-03|   2025-10-05|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|2025-10-03 14:00:00|2025-10-03|   2025-10-05|
|           204|      2|   250|  Chennai|10-03-2025 14:01|2025-10-03 14:01:00|2025-10-03|   2025-10-05|
|           205|      3|  1000| Banglore|10-03-2025 15:30|2025-10-03 15:30:00|2025-10-03|   2025-10-05|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|2025-10-03 15:34:00|2025-10-03|   2025-10-05|
+--------------+-------+------+---------+----------------+------

In [20]:
from pyspark.sql.functions import to_date, to_timestamp, date_add, date_sub
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df = df.withColumn("to_timestamp",to_timestamp(col('transaction_time'), 'MM-dd-yyyy HH:mm'))
df = df.withColumn("to_date",to_date(col('transaction_time'), 'MM-dd-yyyy HH:mm'))
df = df.withColumn("add_2_to_date",date_add(col('to_date'), 2))
df = df.withColumn("sub_2_to_date",date_sub(col('to_date'), 2))

df.show()
df.printSchema()

+--------------+-------+------+---------+----------------+-------------------+----------+-------------+-------------+
|transaction_id|user_id|amount| location|transaction_time|       to_timestamp|   to_date|add_2_to_date|sub_2_to_date|
+--------------+-------+------+---------+----------------+-------------------+----------+-------------+-------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|2025-10-03 12:00:00|2025-10-03|   2025-10-05|   2025-10-01|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|2025-10-03 12:04:00|2025-10-03|   2025-10-05|   2025-10-01|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|2025-10-03 14:00:00|2025-10-03|   2025-10-05|   2025-10-01|
|           204|      2|   250|  Chennai|10-03-2025 14:01|2025-10-03 14:01:00|2025-10-03|   2025-10-05|   2025-10-01|
|           205|      3|  1000| Banglore|10-03-2025 15:30|2025-10-03 15:30:00|2025-10-03|   2025-10-05|   2025-10-01|
|           206|      3|  1500|Hyderabad|10-03-2025 15:3

In [21]:
from pyspark.sql.functions import to_date, to_timestamp, date_add, date_sub, date_diff
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df = df.withColumn("to_timestamp",to_timestamp(col('transaction_time'), 'MM-dd-yyyy HH:mm'))
df = df.withColumn("to_date",to_date(col('transaction_time'), 'MM-dd-yyyy HH:mm'))
df = df.withColumn("add_2_to_date",date_add(col('to_date'), 2))
df = df.withColumn("sub_2_to_date",date_sub(col('to_date'), 2))
df = df.withColumn("date_diff", date_diff("to_date", "add_2_to_date"))

df.show()
df.printSchema()

+--------------+-------+------+---------+----------------+-------------------+----------+-------------+-------------+---------+
|transaction_id|user_id|amount| location|transaction_time|       to_timestamp|   to_date|add_2_to_date|sub_2_to_date|date_diff|
+--------------+-------+------+---------+----------------+-------------------+----------+-------------+-------------+---------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|2025-10-03 12:00:00|2025-10-03|   2025-10-05|   2025-10-01|       -2|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|2025-10-03 12:04:00|2025-10-03|   2025-10-05|   2025-10-01|       -2|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|2025-10-03 14:00:00|2025-10-03|   2025-10-05|   2025-10-01|       -2|
|           204|      2|   250|  Chennai|10-03-2025 14:01|2025-10-03 14:01:00|2025-10-03|   2025-10-05|   2025-10-01|       -2|
|           205|      3|  1000| Banglore|10-03-2025 15:30|2025-10-03 15:30:00|2025-10-03|   2025-10-05| 

In [22]:
from pyspark.sql.functions import to_date, to_timestamp, date_add, date_sub, date_diff
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)
df = df.withColumn("to_timestamp",to_timestamp(col('transaction_time'), 'MM-dd-yyyy HH:mm'))
df = df.withColumn("to_date",to_date(col('transaction_time'), 'MM-dd-yyyy HH:mm'))
df = df.withColumn("add_2_to_date",date_add(col('to_date'), 2))
df = df.withColumn("sub_2_to_date",date_sub(col('to_date'), 2))
df = df.withColumn("date_diff", date_diff("to_date", "add_2_to_date"))
df = df.withColumn("time_stamp_diff", date_diff("to_date", "add_2_to_date"))

df.show()
df.printSchema()

+--------------+-------+------+---------+----------------+-------------------+----------+-------------+-------------+---------+----------+
|transaction_id|user_id|amount| location|transaction_time|       to_timestamp|   to_date|add_2_to_date|sub_2_to_date|date_diff|time_stamp|
+--------------+-------+------+---------+----------------+-------------------+----------+-------------+-------------+---------+----------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|2025-10-03 12:00:00|2025-10-03|   2025-10-05|   2025-10-01|       -2|        -2|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|2025-10-03 12:04:00|2025-10-03|   2025-10-05|   2025-10-01|       -2|        -2|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|2025-10-03 14:00:00|2025-10-03|   2025-10-05|   2025-10-01|       -2|        -2|
|           204|      2|   250|  Chennai|10-03-2025 14:01|2025-10-03 14:01:00|2025-10-03|   2025-10-05|   2025-10-01|       -2|        -2|
|           205|      3|  1

In [None]:
_