In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder.getOrCreate()
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)


In [2]:
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)

In [3]:
df.printSchema()

root
 |-- transaction_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- amount: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- transaction_time: string (nullable = true)



In [4]:
df.show()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|  Chennai|10-03-2025 14:01|
|           205|      3|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
+--------------+-------+------+---------+----------------+



In [5]:
from pyspark.sql import functions
df.withColumn('tr_us', functions.concat(col('transaction_id'), col('user_id'))).show()

+--------------+-------+------+---------+----------------+-----+
|transaction_id|user_id|amount| location|transaction_time|tr_us|
+--------------+-------+------+---------+----------------+-----+
|           201|      1|   500|Hyderabad|10-03-2025 12:00| 2011|
|           202|      1|   700|Hyderabad|10-03-2025 12:04| 2021|
|           203|      2|   200|Hyderabad|10-03-2025 14:00| 2032|
|           204|      2|   250|  Chennai|10-03-2025 14:01| 2042|
|           205|      3|  1000| Banglore|10-03-2025 15:30| 2053|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34| 2063|
+--------------+-------+------+---------+----------------+-----+



In [6]:
from pyspark.sql import functions
df.withColumn('tr_us', functions.concat(col('transaction_id'))).show()

+--------------+-------+------+---------+----------------+-----+
|transaction_id|user_id|amount| location|transaction_time|tr_us|
+--------------+-------+------+---------+----------------+-----+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|  201|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|  202|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|  203|
|           204|      2|   250|  Chennai|10-03-2025 14:01|  204|
|           205|      3|  1000| Banglore|10-03-2025 15:30|  205|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|  206|
+--------------+-------+------+---------+----------------+-----+



In [9]:
from pyspark.sql import functions
df.withColumn('tr_us', functions.concat(col('transaction_id'),functions.lit('123'))).show()

+--------------+-------+------+---------+----------------+------+
|transaction_id|user_id|amount| location|transaction_time| tr_us|
+--------------+-------+------+---------+----------------+------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|201123|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|202123|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|203123|
|           204|      2|   250|  Chennai|10-03-2025 14:01|204123|
|           205|      3|  1000| Banglore|10-03-2025 15:30|205123|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|206123|
+--------------+-------+------+---------+----------------+------+



In [10]:
from pyspark.sql import functions
df.withColumn('tr_us', functions.concat('transaction_id','user_id',functions.lit('123'))).show()

+--------------+-------+------+---------+----------------+-------+
|transaction_id|user_id|amount| location|transaction_time|  tr_us|
+--------------+-------+------+---------+----------------+-------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|2011123|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|2021123|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|2032123|
|           204|      2|   250|  Chennai|10-03-2025 14:01|2042123|
|           205|      3|  1000| Banglore|10-03-2025 15:30|2053123|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|2063123|
+--------------+-------+------+---------+----------------+-------+



In [12]:
from pyspark.sql import functions
df.withColumn('tr_us', functions.concat_ws('==','transaction_id','user_id',functions.lit('123'))).show()

+--------------+-------+------+---------+----------------+-----------+
|transaction_id|user_id|amount| location|transaction_time|      tr_us|
+--------------+-------+------+---------+----------------+-----------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|201==1==123|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|202==1==123|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|203==2==123|
|           204|      2|   250|  Chennai|10-03-2025 14:01|204==2==123|
|           205|      3|  1000| Banglore|10-03-2025 15:30|205==3==123|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|206==3==123|
+--------------+-------+------+---------+----------------+-----------+



In [13]:
from pyspark.sql import functions
df.withColumn('upper_conv', functions.upper('location')).show()

+--------------+-------+------+---------+----------------+----------+
|transaction_id|user_id|amount| location|transaction_time|upper_conv|
+--------------+-------+------+---------+----------------+----------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00| HYDERABAD|
|           202|      1|   700|Hyderabad|10-03-2025 12:04| HYDERABAD|
|           203|      2|   200|Hyderabad|10-03-2025 14:00| HYDERABAD|
|           204|      2|   250|  Chennai|10-03-2025 14:01|   CHENNAI|
|           205|      3|  1000| Banglore|10-03-2025 15:30|  BANGLORE|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34| HYDERABAD|
+--------------+-------+------+---------+----------------+----------+



In [14]:
from pyspark.sql import functions
df.withColumn('upper_conv', functions.upper('transaction_id')).show()

+--------------+-------+------+---------+----------------+----------+
|transaction_id|user_id|amount| location|transaction_time|upper_conv|
+--------------+-------+------+---------+----------------+----------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|       201|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|       202|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|       203|
|           204|      2|   250|  Chennai|10-03-2025 14:01|       204|
|           205|      3|  1000| Banglore|10-03-2025 15:30|       205|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|       206|
+--------------+-------+------+---------+----------------+----------+



In [15]:
from pyspark.sql import functions
df.withColumn('upper_conv', functions.length('transaction_id')).show()

+--------------+-------+------+---------+----------------+----------+
|transaction_id|user_id|amount| location|transaction_time|upper_conv|
+--------------+-------+------+---------+----------------+----------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|         3|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|         3|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|         3|
|           204|      2|   250|  Chennai|10-03-2025 14:01|         3|
|           205|      3|  1000| Banglore|10-03-2025 15:30|         3|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|         3|
+--------------+-------+------+---------+----------------+----------+



In [18]:
from pyspark.sql import functions
from pyspark.sql import functions
df = df.withColumn('trim_ext', functions.concat(functions.lit('123'),"location", functions.lit('213')))
df.withColumn('after_trim', functions.trim('trim_ext')).show()

+--------------+-------+------+---------+----------------+---------------+---------------+
|transaction_id|user_id|amount| location|transaction_time|       trim_ext|     after_trim|
+--------------+-------+------+---------+----------------+---------------+---------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|123Hyderabad213|123Hyderabad213|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|123Hyderabad213|123Hyderabad213|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|123Hyderabad213|123Hyderabad213|
|           204|      2|   250|  Chennai|10-03-2025 14:01|  123Chennai213|  123Chennai213|
|           205|      3|  1000| Banglore|10-03-2025 15:30| 123Banglore213| 123Banglore213|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|123Hyderabad213|123Hyderabad213|
+--------------+-------+------+---------+----------------+---------------+---------------+



In [19]:
from pyspark.sql import functions
from pyspark.sql import functions
df = df.withColumn('trim_ext', functions.concat(functions.lit(' 123   '),"location", functions.lit('   213 ')))
df.withColumn('after_trim', functions.trim('trim_ext')).show()

+--------------+-------+------+---------+----------------+--------------------+--------------------+
|transaction_id|user_id|amount| location|transaction_time|            trim_ext|          after_trim|
+--------------+-------+------+---------+----------------+--------------------+--------------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00| 123   Hyderabad ...|123   Hyderabad  ...|
|           202|      1|   700|Hyderabad|10-03-2025 12:04| 123   Hyderabad ...|123   Hyderabad  ...|
|           203|      2|   200|Hyderabad|10-03-2025 14:00| 123   Hyderabad ...|123   Hyderabad  ...|
|           204|      2|   250|  Chennai|10-03-2025 14:01| 123   Chennai   ...| 123   Chennai   213|
|           205|      3|  1000| Banglore|10-03-2025 15:30| 123   Banglore  ...|123   Banglore   213|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34| 123   Hyderabad ...|123   Hyderabad  ...|
+--------------+-------+------+---------+----------------+--------------------+------------

In [20]:
from pyspark.sql import functions
from pyspark.sql import functions
df = df.withColumn('trim_ext', functions.concat(functions.lit('123'),"location", functions.lit('213')))
df.withColumn('after_trim', functions.trim('trim_ext','3')).show()

TypeError: trim() takes 1 positional argument but 2 were given

In [21]:
functions.trim?

[1;31mSignature:[0m [0mfunctions[0m[1;33m.[0m[0mtrim[0m[1;33m([0m[0mcol[0m[1;33m:[0m [1;34m'ColumnOrName'[0m[1;33m)[0m [1;33m->[0m [0mpyspark[0m[1;33m.[0m[0msql[0m[1;33m.[0m[0mcolumn[0m[1;33m.[0m[0mColumn[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Trim the spaces from both ends for the specified string column.

.. versionadded:: 1.5.0

.. versionchanged:: 3.4.0
    Supports Spark Connect.

Parameters
----------
col : :class:`~pyspark.sql.Column` or str
    target column to work on.

Returns
-------
:class:`~pyspark.sql.Column`
    trimmed values from both sides.

Examples
--------
>>> df = spark.createDataFrame(["   Spark", "Spark  ", " Spark"], "STRING")
>>> df.select(trim("value").alias("r")).withColumn("length", length("r")).show()
+-----+------+
|    r|length|
+-----+------+
|Spark|     5|
|Spark|     5|
|Spark|     5|
+-----+------+
[1;31mFile:[0m      c:\users\navya\anaconda3\lib\site-packages\pyspark\sql\functions.py
[1;31mType:[0m  

In [22]:
from pyspark.sql import functions
from pyspark.sql import functions
df = df.withColumn('trim_ext', functions.concat(functions.lit('123'),"location", functions.lit('213')))
df.withColumn('after_trim', functions.ltrim('trim_ext')).show()

+--------------+-------+------+---------+----------------+---------------+---------------+
|transaction_id|user_id|amount| location|transaction_time|       trim_ext|     after_trim|
+--------------+-------+------+---------+----------------+---------------+---------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|123Hyderabad213|123Hyderabad213|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|123Hyderabad213|123Hyderabad213|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|123Hyderabad213|123Hyderabad213|
|           204|      2|   250|  Chennai|10-03-2025 14:01|  123Chennai213|  123Chennai213|
|           205|      3|  1000| Banglore|10-03-2025 15:30| 123Banglore213| 123Banglore213|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|123Hyderabad213|123Hyderabad213|
+--------------+-------+------+---------+----------------+---------------+---------------+



In [23]:
functions.ltrim?

[1;31mSignature:[0m [0mfunctions[0m[1;33m.[0m[0mltrim[0m[1;33m([0m[0mcol[0m[1;33m:[0m [1;34m'ColumnOrName'[0m[1;33m)[0m [1;33m->[0m [0mpyspark[0m[1;33m.[0m[0msql[0m[1;33m.[0m[0mcolumn[0m[1;33m.[0m[0mColumn[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Trim the spaces from left end for the specified string value.

.. versionadded:: 1.5.0

.. versionchanged:: 3.4.0
    Supports Spark Connect.

Parameters
----------
col : :class:`~pyspark.sql.Column` or str
    target column to work on.

Returns
-------
:class:`~pyspark.sql.Column`
    left trimmed values.

Examples
--------
>>> df = spark.createDataFrame(["   Spark", "Spark  ", " Spark"], "STRING")
>>> df.select(ltrim("value").alias("r")).withColumn("length", length("r")).show()
+-------+------+
|      r|length|
+-------+------+
|  Spark|     5|
|Spark  |     7|
|  Spark|     5|
+-------+------+
[1;31mFile:[0m      c:\users\navya\anaconda3\lib\site-packages\pyspark\sql\functions.py
[1;31mType:[0

In [24]:
functions.rtrim?

[1;31mSignature:[0m [0mfunctions[0m[1;33m.[0m[0mrtrim[0m[1;33m([0m[0mcol[0m[1;33m:[0m [1;34m'ColumnOrName'[0m[1;33m)[0m [1;33m->[0m [0mpyspark[0m[1;33m.[0m[0msql[0m[1;33m.[0m[0mcolumn[0m[1;33m.[0m[0mColumn[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Trim the spaces from right end for the specified string value.

.. versionadded:: 1.5.0

.. versionchanged:: 3.4.0
    Supports Spark Connect.

Parameters
----------
col : :class:`~pyspark.sql.Column` or str
    target column to work on.

Returns
-------
:class:`~pyspark.sql.Column`
    right trimmed values.

Examples
--------
>>> df = spark.createDataFrame(["   Spark", "Spark  ", " Spark"], "STRING")
>>> df.select(rtrim("value").alias("r")).withColumn("length", length("r")).show()
+--------+------+
|       r|length|
+--------+------+
|   Spark|     8|
|   Spark|     5|
|   Spark|     6|
+--------+------+
[1;31mFile:[0m      c:\users\navya\anaconda3\lib\site-packages\pyspark\sql\functions.py
[1;31

In [27]:
print([i for i in dir(functions) if 'pad' in i.lower()])

['lpad', 'rpad']


In [28]:
from pyspark.sql import functions
from pyspark.sql import functions
df.show()


+--------------+-------+------+---------+----------------+---------------+
|transaction_id|user_id|amount| location|transaction_time|       trim_ext|
+--------------+-------+------+---------+----------------+---------------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|123Hyderabad213|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|123Hyderabad213|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|123Hyderabad213|
|           204|      2|   250|  Chennai|10-03-2025 14:01|  123Chennai213|
|           205|      3|  1000| Banglore|10-03-2025 15:30| 123Banglore213|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|123Hyderabad213|
+--------------+-------+------+---------+----------------+---------------+



In [30]:
df.withColumn('after_lpad', functions.lpad('location',10,"#")).show()

+--------------+-------+------+---------+----------------+---------------+----------+
|transaction_id|user_id|amount| location|transaction_time|       trim_ext|after_lpad|
+--------------+-------+------+---------+----------------+---------------+----------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|123Hyderabad213|#Hyderabad|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|123Hyderabad213|#Hyderabad|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|123Hyderabad213|#Hyderabad|
|           204|      2|   250|  Chennai|10-03-2025 14:01|  123Chennai213|###Chennai|
|           205|      3|  1000| Banglore|10-03-2025 15:30| 123Banglore213|##Banglore|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|123Hyderabad213|#Hyderabad|
+--------------+-------+------+---------+----------------+---------------+----------+



In [31]:
df.withColumn('after_lpad', functions.lpad('location',6,"#")).show()

+--------------+-------+------+---------+----------------+---------------+----------+
|transaction_id|user_id|amount| location|transaction_time|       trim_ext|after_lpad|
+--------------+-------+------+---------+----------------+---------------+----------+
|           201|      1|   500|Hyderabad|10-03-2025 12:00|123Hyderabad213|    Hydera|
|           202|      1|   700|Hyderabad|10-03-2025 12:04|123Hyderabad213|    Hydera|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|123Hyderabad213|    Hydera|
|           204|      2|   250|  Chennai|10-03-2025 14:01|  123Chennai213|    Chenna|
|           205|      3|  1000| Banglore|10-03-2025 15:30| 123Banglore213|    Banglo|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|123Hyderabad213|    Hydera|
+--------------+-------+------+---------+----------------+---------------+----------+



In [34]:
df = spark.read.csv("data\\explode_data.txt", header=True, inferSchema=True)
df.show()
df.printSchema()

+-----------+--------------------+----------+
|customer_id|               items|      date|
+-----------+--------------------+----------+
|        101|Laptop;Mouse;Keyb...|2025-03-09|
|        102|  Smartphone;Earbuds|2025-03-08|
|        103|              Tablet|2025-03-07|
|        104|Monitor;HDMICable...|2025-03-06|
+-----------+--------------------+----------+

root
 |-- customer_id: integer (nullable = true)
 |-- items: string (nullable = true)
 |-- date: date (nullable = true)



In [36]:
df = spark.read.csv("data\\explode_data.txt", header=True, inferSchema=True)
df.withColumn('items_split',functions.split('items',';')).show()
df.printSchema()

+-----------+--------------------+----------+--------------------+
|customer_id|               items|      date|         items_split|
+-----------+--------------------+----------+--------------------+
|        101|Laptop;Mouse;Keyb...|2025-03-09|[Laptop, Mouse, K...|
|        102|  Smartphone;Earbuds|2025-03-08|[Smartphone, Earb...|
|        103|              Tablet|2025-03-07|            [Tablet]|
|        104|Monitor;HDMICable...|2025-03-06|[Monitor, HDMICab...|
+-----------+--------------------+----------+--------------------+

root
 |-- customer_id: integer (nullable = true)
 |-- items: string (nullable = true)
 |-- date: date (nullable = true)



In [37]:
functions.split?

[1;31mSignature:[0m [0mfunctions[0m[1;33m.[0m[0msplit[0m[1;33m([0m[0mstr[0m[1;33m:[0m [1;34m'ColumnOrName'[0m[1;33m,[0m [0mpattern[0m[1;33m:[0m [0mstr[0m[1;33m,[0m [0mlimit[0m[1;33m:[0m [0mint[0m [1;33m=[0m [1;33m-[0m[1;36m1[0m[1;33m)[0m [1;33m->[0m [0mpyspark[0m[1;33m.[0m[0msql[0m[1;33m.[0m[0mcolumn[0m[1;33m.[0m[0mColumn[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Splits str around matches of the given pattern.

.. versionadded:: 1.5.0

.. versionchanged:: 3.4.0
    Supports Spark Connect.

Parameters
----------
str : :class:`~pyspark.sql.Column` or str
    a string expression to split
pattern : str
    a string representing a regular expression. The regex string should be
    a Java regular expression.
limit : int, optional
    an integer which controls the number of times `pattern` is applied.

    * ``limit > 0``: The resulting array's length will not be more than `limit`, and the
                     resulting array's 

In [38]:
df = spark.read.csv("data\\explode_data.txt", header=True, inferSchema=True)
df.withColumn('items_split',functions.split('items',';',1)).show()
df.printSchema()

+-----------+--------------------+----------+--------------------+
|customer_id|               items|      date|         items_split|
+-----------+--------------------+----------+--------------------+
|        101|Laptop;Mouse;Keyb...|2025-03-09|[Laptop;Mouse;Key...|
|        102|  Smartphone;Earbuds|2025-03-08|[Smartphone;Earbuds]|
|        103|              Tablet|2025-03-07|            [Tablet]|
|        104|Monitor;HDMICable...|2025-03-06|[Monitor;HDMICabl...|
+-----------+--------------------+----------+--------------------+

root
 |-- customer_id: integer (nullable = true)
 |-- items: string (nullable = true)
 |-- date: date (nullable = true)



In [39]:
df = spark.read.csv("data\\explode_data.txt", header=True, inferSchema=True)
df.withColumn('items_split',functions.split('items',';',1)).show(truncate=False)
df.printSchema()

+-----------+-------------------------+----------+---------------------------+
|customer_id|items                    |date      |items_split                |
+-----------+-------------------------+----------+---------------------------+
|101        |Laptop;Mouse;Keyboard    |2025-03-09|[Laptop;Mouse;Keyboard]    |
|102        |Smartphone;Earbuds       |2025-03-08|[Smartphone;Earbuds]       |
|103        |Tablet                   |2025-03-07|[Tablet]                   |
|104        |Monitor;HDMICable;Speaker|2025-03-06|[Monitor;HDMICable;Speaker]|
+-----------+-------------------------+----------+---------------------------+

root
 |-- customer_id: integer (nullable = true)
 |-- items: string (nullable = true)
 |-- date: date (nullable = true)



In [40]:
df = spark.read.csv("data\\explode_data.txt", header=True, inferSchema=True)
df.withColumn('items_split',functions.split('items',';',2)).show(truncate=False)
df.printSchema()

+-----------+-------------------------+----------+----------------------------+
|customer_id|items                    |date      |items_split                 |
+-----------+-------------------------+----------+----------------------------+
|101        |Laptop;Mouse;Keyboard    |2025-03-09|[Laptop, Mouse;Keyboard]    |
|102        |Smartphone;Earbuds       |2025-03-08|[Smartphone, Earbuds]       |
|103        |Tablet                   |2025-03-07|[Tablet]                    |
|104        |Monitor;HDMICable;Speaker|2025-03-06|[Monitor, HDMICable;Speaker]|
+-----------+-------------------------+----------+----------------------------+

root
 |-- customer_id: integer (nullable = true)
 |-- items: string (nullable = true)
 |-- date: date (nullable = true)



In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder.getOrCreate()
df = spark.read.csv("data\\joins\\user_transactions.csv", header=True, inferSchema=True)


In [2]:
df = spark.read.csv("data\\joins\\user_transactions1.csv", header=True, inferSchema=True)
df.show()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|     NULL|10-03-2025 12:00|
|           202|   NULL|   700|Hyderabad|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|     NULL|10-03-2025 14:01|
|           205|   NULL|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
+--------------+-------+------+---------+----------------+



In [3]:
from pyspark.sql import functions
df = spark.read.csv("data\\joins\\user_transactions1.csv", header=True, inferSchema=True)
df.withColumn('concat',functions.concat('user_id','amount','location')).show()

+--------------+-------+------+---------+----------------+--------------+
|transaction_id|user_id|amount| location|transaction_time|        concat|
+--------------+-------+------+---------+----------------+--------------+
|           201|      1|   500|     NULL|10-03-2025 12:00|          NULL|
|           202|   NULL|   700|Hyderabad|10-03-2025 12:04|          NULL|
|           203|      2|   200|Hyderabad|10-03-2025 14:00| 2200Hyderabad|
|           204|      2|   250|     NULL|10-03-2025 14:01|          NULL|
|           205|   NULL|  1000| Banglore|10-03-2025 15:30|          NULL|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|31500Hyderabad|
+--------------+-------+------+---------+----------------+--------------+



In [7]:
df.fillna("").show()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|         |10-03-2025 12:00|
|           202|   NULL|   700|Hyderabad|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|         |10-03-2025 14:01|
|           205|   NULL|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
+--------------+-------+------+---------+----------------+



In [9]:
df.fillna("").fillna(0).show()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|         |10-03-2025 12:00|
|           202|      0|   700|Hyderabad|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|         |10-03-2025 14:01|
|           205|      0|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
+--------------+-------+------+---------+----------------+



In [10]:
from pyspark.sql import functions
df = spark.read.csv("data\\joins\\user_transactions1.csv", header=True, inferSchema=True)
df = df.fillna("").fillna(0)
df.withColumn('concat',functions.concat('user_id','amount','location')).show()

+--------------+-------+------+---------+----------------+--------------+
|transaction_id|user_id|amount| location|transaction_time|        concat|
+--------------+-------+------+---------+----------------+--------------+
|           201|      1|   500|         |10-03-2025 12:00|          1500|
|           202|      0|   700|Hyderabad|10-03-2025 12:04| 0700Hyderabad|
|           203|      2|   200|Hyderabad|10-03-2025 14:00| 2200Hyderabad|
|           204|      2|   250|         |10-03-2025 14:01|          2250|
|           205|      0|  1000| Banglore|10-03-2025 15:30| 01000Banglore|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|31500Hyderabad|
+--------------+-------+------+---------+----------------+--------------+



In [11]:
from pyspark.sql import functions
df = spark.read.csv("data\\joins\\user_transactions1.csv", header=True, inferSchema=True)
df.show()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|     NULL|10-03-2025 12:00|
|           202|   NULL|   700|Hyderabad|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|     NULL|10-03-2025 14:01|
|           205|   NULL|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
+--------------+-------+------+---------+----------------+



In [15]:
import datetime

In [16]:
print(dir(datetime))

['MAXYEAR', 'MINYEAR', 'UTC', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'date', 'datetime', 'datetime_CAPI', 'time', 'timedelta', 'timezone', 'tzinfo']


In [17]:
print(dir(datetime.datetime))

['__add__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__rsub__', '__setattr__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', 'astimezone', 'combine', 'ctime', 'date', 'day', 'dst', 'fold', 'fromisocalendar', 'fromisoformat', 'fromordinal', 'fromtimestamp', 'hour', 'isocalendar', 'isoformat', 'isoweekday', 'max', 'microsecond', 'min', 'minute', 'month', 'now', 'replace', 'resolution', 'second', 'strftime', 'strptime', 'time', 'timestamp', 'timetuple', 'timetz', 'today', 'toordinal', 'tzinfo', 'tzname', 'utcfromtimestamp', 'utcnow', 'utcoffset', 'utctimetuple', 'weekday', 'year']


In [21]:
from datetime import datetime
#"2025-03-14"
da = datetime.strptime("2025-03-14","%Y-%m-%d")
print(da, type(da))

2025-03-14 00:00:00 <class 'datetime.datetime'>


In [22]:
from datetime import datetime
#"2025-03-14"
da = datetime.strptime("2025-03-14","%m-%Y-%d")
print(da, type(da))

ValueError: time data '2025-03-14' does not match format '%m-%Y-%d'

In [23]:
from datetime import datetime
#"2025-03-14"
da = datetime.strptime("2025-03-14","%Y-%m-%d")
print(da, type(da))

2025-03-14 00:00:00 <class 'datetime.datetime'>


In [24]:
from datetime import datetime
#"2025-03-14"
da = datetime.strptime("2025-03-14","%Y-%m-%d")
print(da, type(da))
da_str = datetime.strftime(da, '%m-%d-%Y')
print(da_str, type(da_str))

2025-03-14 00:00:00 <class 'datetime.datetime'>
03-14-2025 <class 'str'>


In [26]:
from pyspark.sql import functions
df = spark.read.csv("data\\explode_data.txt", header=True, inferSchema=True)
df.show(truncate=False)


+-----------+-------------------------+----------+
|customer_id|items                    |date      |
+-----------+-------------------------+----------+
|101        |Laptop;Mouse;Keyboard    |2025-03-09|
|102        |Smartphone;Earbuds       |2025-03-08|
|103        |Tablet                   |2025-03-07|
|104        |Monitor;HDMICable;Speaker|2025-03-06|
+-----------+-------------------------+----------+



In [27]:
from pyspark.sql import functions
df = spark.read.csv("data\\explode_data.txt", header=True, inferSchema=True)
df.withColumn('items_list',functions.split("items",";")).show(truncate=False)

+-----------+-------------------------+----------+-----------------------------+
|customer_id|items                    |date      |items_list                   |
+-----------+-------------------------+----------+-----------------------------+
|101        |Laptop;Mouse;Keyboard    |2025-03-09|[Laptop, Mouse, Keyboard]    |
|102        |Smartphone;Earbuds       |2025-03-08|[Smartphone, Earbuds]        |
|103        |Tablet                   |2025-03-07|[Tablet]                     |
|104        |Monitor;HDMICable;Speaker|2025-03-06|[Monitor, HDMICable, Speaker]|
+-----------+-------------------------+----------+-----------------------------+



In [30]:
from pyspark.sql import functions
df = spark.read.csv("data\\explode_data.txt", header=True, inferSchema=True)
df = df.withColumn('items_list',functions.split("items",";"))
df = df.drop('items')

df.withColumn('itm', functions.explode("items_list")).show(truncate=False)

+-----------+----------+-----------------------------+----------+
|customer_id|date      |items_list                   |itm       |
+-----------+----------+-----------------------------+----------+
|101        |2025-03-09|[Laptop, Mouse, Keyboard]    |Laptop    |
|101        |2025-03-09|[Laptop, Mouse, Keyboard]    |Mouse     |
|101        |2025-03-09|[Laptop, Mouse, Keyboard]    |Keyboard  |
|102        |2025-03-08|[Smartphone, Earbuds]        |Smartphone|
|102        |2025-03-08|[Smartphone, Earbuds]        |Earbuds   |
|103        |2025-03-07|[Tablet]                     |Tablet    |
|104        |2025-03-06|[Monitor, HDMICable, Speaker]|Monitor   |
|104        |2025-03-06|[Monitor, HDMICable, Speaker]|HDMICable |
|104        |2025-03-06|[Monitor, HDMICable, Speaker]|Speaker   |
+-----------+----------+-----------------------------+----------+



In [31]:
from pyspark.sql import functions
df = spark.read.csv("data\\joins\\user_transactions1.csv", header=True, inferSchema=True)
df.show()
df.dropna().show()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|     NULL|10-03-2025 12:00|
|           202|   NULL|   700|Hyderabad|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|     NULL|10-03-2025 14:01|
|           205|   NULL|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
+--------------+-------+------+---------+----------------+

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
+--------------+-------+------+---------+----------------+



In [33]:
from pyspark.sql import functions
df = spark.read.csv("data\\joins\\user_transactions1.csv", header=True, inferSchema=True)
df.show()
df.dropna("all").show()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|     NULL|10-03-2025 12:00|
|           202|   NULL|   700|Hyderabad|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|     NULL|10-03-2025 14:01|
|           205|   NULL|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
|          NULL|   NULL|  NULL|     NULL|            NULL|
+--------------+-------+------+---------+----------------+

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|     NULL|10-03-2025 12:00|
|           202|   NULL|   700|Hyderabad|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:

In [34]:
from pyspark.sql import functions
df = spark.read.csv("data\\joins\\user_transactions1.csv", header=True, inferSchema=True)
df.show()
df.dropna(subset=['user_id']).show()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|     NULL|10-03-2025 12:00|
|           202|   NULL|   700|Hyderabad|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|     NULL|10-03-2025 14:01|
|           205|   NULL|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
|          NULL|   NULL|  NULL|     NULL|            NULL|
+--------------+-------+------+---------+----------------+

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|     NULL|10-03-2025 12:00|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|     NULL|10-03-2025 14:

In [35]:
from pyspark.sql import functions
df = spark.read.csv("data\\joins\\user_transactions1.csv", header=True, inferSchema=True)
df.show()
df.dropna(subset=['user_id','location']).show()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|     NULL|10-03-2025 12:00|
|           202|   NULL|   700|     NULL|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|     NULL|10-03-2025 14:01|
|           205|   NULL|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
|          NULL|   NULL|  NULL|     NULL|            NULL|
+--------------+-------+------+---------+----------------+

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
+--------------+-------+------+---------+--------------

In [36]:
from pyspark.sql import functions
df = spark.read.csv("data\\joins\\user_transactions1.csv", header=True, inferSchema=True)
df.show()
df.dropna?

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|     NULL|10-03-2025 12:00|
|           202|   NULL|   700|     NULL|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|     NULL|10-03-2025 14:01|
|           205|   NULL|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
|          NULL|   NULL|  NULL|     NULL|            NULL|
+--------------+-------+------+---------+----------------+



[1;31mSignature:[0m
[0mdf[0m[1;33m.[0m[0mdropna[0m[1;33m([0m[1;33m
[0m    [0mhow[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'any'[0m[1;33m,[0m[1;33m
[0m    [0mthresh[0m[1;33m:[0m [0mOptional[0m[1;33m[[0m[0mint[0m[1;33m][0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0msubset[0m[1;33m:[0m [0mUnion[0m[1;33m[[0m[0mstr[0m[1;33m,[0m [0mTuple[0m[1;33m[[0m[0mstr[0m[1;33m,[0m [1;33m...[0m[1;33m][0m[1;33m,[0m [0mList[0m[1;33m[[0m[0mstr[0m[1;33m][0m[1;33m,[0m [0mNoneType[0m[1;33m][0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m [1;33m->[0m [1;34m'DataFrame'[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Returns a new :class:`DataFrame` omitting rows with null values.
:func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are aliases of each other.

.. versionadded:: 1.3.1

.. versionchanged:: 3.4.0
    Supports Spark Connect.

Parameters
----------
how : str, optional
    '

In [37]:
from pyspark.sql import functions
df = spark.read.csv("data\\joins\\user_transactions1.csv", header=True, inferSchema=True)
df.show()
df.dropna(how="any", thresh=2).show()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|     NULL|10-03-2025 12:00|
|           202|   NULL|   700|     NULL|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|     NULL|10-03-2025 14:01|
|           205|   NULL|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
|          NULL|   NULL|  NULL|     NULL|            NULL|
+--------------+-------+------+---------+----------------+

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|     NULL|10-03-2025 12:00|
|           202|   NULL|   700|     NULL|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:

In [38]:
from pyspark.sql import functions
df = spark.read.csv("data\\joins\\user_transactions1.csv", header=True, inferSchema=True)
df.show()
df.dropna(how="any", thresh=3).show()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|     NULL|10-03-2025 12:00|
|           202|   NULL|   700|     NULL|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|     NULL|10-03-2025 14:01|
|           205|   NULL|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
|          NULL|   NULL|  NULL|     NULL|            NULL|
+--------------+-------+------+---------+----------------+

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|     NULL|10-03-2025 12:00|
|           202|   NULL|   700|     NULL|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:

In [39]:
from pyspark.sql import functions
df = spark.read.csv("data\\joins\\user_transactions1.csv", header=True, inferSchema=True)
df.show()
df.dropna(thresh=4).show()

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|     NULL|10-03-2025 12:00|
|           202|   NULL|   700|     NULL|10-03-2025 12:04|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|     NULL|10-03-2025 14:01|
|           205|   NULL|  1000| Banglore|10-03-2025 15:30|
|           206|      3|  1500|Hyderabad|10-03-2025 15:34|
|          NULL|   NULL|  NULL|     NULL|            NULL|
+--------------+-------+------+---------+----------------+

+--------------+-------+------+---------+----------------+
|transaction_id|user_id|amount| location|transaction_time|
+--------------+-------+------+---------+----------------+
|           201|      1|   500|     NULL|10-03-2025 12:00|
|           203|      2|   200|Hyderabad|10-03-2025 14:00|
|           204|      2|   250|     NULL|10-03-2025 14: