In [1]:
# Notebook-wide setup: PySpark imports plus the shared `spark` session used by the cells below.
from pyspark.sql import SparkSession
from pyspark.sql.types import *  # supplies StructType, StructField, IntegerType, StringType used below
from pyspark.sql.functions import *  # wildcard import; supplies lower() used in a later cell
spark = SparkSession.builder.getOrCreate()  # shared session referenced as `spark` by later cells
from pyspark.sql.functions import regexp_replace, col  # NOTE(review): presumably redundant with the wildcard import above
from google.colab import drive  # NOTE(review): not referenced in the visible cells; only importable on Google Colab


## 1. Create an empty DataFrame

In [2]:
# Obtain a session for this example.
# NOTE(review): a session was already created in the setup cell, so getOrCreate()
# presumably returns that same session and master/appName do not start a new one — confirm.
Spark2 = (
    SparkSession.builder
    .master("local")
    .appName("EmptyDataFrame")
    .getOrCreate()
)

# No rows are supplied; the DDL string alone defines the two columns.
empty_df = Spark2.createDataFrame(data=[], schema="id INT, name STRING")
empty_df.show()

+---+----+
| id|name|
+---+----+
+---+----+



## 2. Convert RDD to DataFrame

In [3]:
# Column names to attach when the RDD of tuples becomes a DataFrame.
columns = ["id", "name"]

# Distribute two (id, name) tuples as an RDD, then convert it.
rdd = spark.sparkContext.parallelize([
    (1, "ABC"),
    (2, "DEF"),
])
df_from_rdd = rdd.toDF(schema=columns)
df_from_rdd.show()

+---+----+
| id|name|
+---+----+
|  1| ABC|
|  2| DEF|
+---+----+



## 3. Convert DataFrame to Pandas

In [4]:
# Convert the Spark DataFrame to a pandas DataFrame and print it.
# NOTE(review): `pands_df` looks like a typo for `pandas_df`; the name is left unchanged.
pands_df = df_from_rdd.toPandas()
print(pands_df)

   id name
0   1  ABC
1   2  DEF


## 4. show()

In [5]:
# show() prints the DataFrame's rows as a text table.
df_from_rdd.show()

+---+----+
| id|name|
+---+----+
|  1| ABC|
|  2| DEF|
+---+----+



## 5. StructType & StructField

In [6]:
# Three (id, name) records to load with an explicit schema.
data = [
    (1, "ABC"),
    (2, "DEF"),
    (3, "GHI"),
]

# Declare each column's name, type and nullability rather than relying on inference.
schema = StructType(
    [
        StructField(name="id", dataType=IntegerType(), nullable=True),
        StructField(name="name", dataType=StringType(), nullable=True),
    ]
)
df_with_schema = spark.createDataFrame(data, schema=schema)
df_with_schema.show()

+---+----+
| id|name|
+---+----+
|  1| ABC|
|  2| DEF|
|  3| GHI|
+---+----+



## 6. Column Class

In [7]:
# Append a lower-cased copy of `name`.
# NOTE(review): the column name "loweerCase_format" contains a typo, but later
# cells refer to it by this exact name, so it is kept as-is.
df = df_with_schema.withColumn(
    "loweerCase_format",
    lower(col("name")),
)
df.show()

+---+----+-----------------+
| id|name|loweerCase_format|
+---+----+-----------------+
|  1| ABC|              abc|
|  2| DEF|              def|
|  3| GHI|              ghi|
+---+----+-----------------+



## 7. select()

In [8]:
# Project only the lower-cased column and display it.
(
    df
    .select(col("loweerCase_format"))
    .show()
)

+-----------------+
|loweerCase_format|
+-----------------+
|              abc|
|              def|
|              ghi|
+-----------------+



## 8. collect()
collect() returns all the rows as a list of Row objects.

In [9]:
# Gather every row of `df` into a Python list of Row objects and print the list.
rows = df.collect()
print(rows)

[Row(id=1, name='ABC', loweerCase_format='abc'), Row(id=2, name='DEF', loweerCase_format='def'), Row(id=3, name='GHI', loweerCase_format='ghi')]


## 9. withColumn()
This method is used to add or modify a column.

In [10]:
# Add id_squared, computed by multiplying the id column by itself.
df_with_column = df.withColumn(
    "id_squared",
    df["id"] * df["id"],
)
df_with_column.show()

+---+----+-----------------+----------+
| id|name|loweerCase_format|id_squared|
+---+----+-----------------+----------+
|  1| ABC|              abc|         1|
|  2| DEF|              def|         4|
|  3| GHI|              ghi|         9|
+---+----+-----------------+----------+



## 10. withColumnRenamed()
Renames an existing column in the DataFrame

In [11]:
# Rename both derived columns to camelCase, one withColumnRenamed() call per column.
df_renamed = (
    df_with_column
    .withColumnRenamed("id_squared", "idSquared")
    .withColumnRenamed("loweerCase_format", "loweerCaseFormat")
)
df_renamed.show()

+---+----+----------------+---------+
| id|name|loweerCaseFormat|idSquared|
+---+----+----------------+---------+
|  1| ABC|             abc|        1|
|  2| DEF|             def|        4|
|  3| GHI|             ghi|        9|
+---+----+----------------+---------+



## 11. where() & filter()
Both methods filter rows based on a condition; where() is an alias for filter().

In [12]:
# Keep only the rows whose id is greater than 1.
df_filter = df_renamed.filter(
    df_renamed["id"] > 1
)
df_filter.show()

+---+----+----------------+---------+
| id|name|loweerCaseFormat|idSquared|
+---+----+----------------+---------+
|  2| DEF|             def|        4|
|  3| GHI|             ghi|        9|
+---+----+----------------+---------+



In [13]:
# Keep only the rows of df_renamed whose id is greater than 2.
# The predicate is built from df_renamed itself (the DataFrame being filtered)
# rather than from the derived df_filter, so the column reference does not
# depend on another DataFrame's lineage.
df_where = df_renamed.where(df_renamed["id"] > 2)
df_where.show()

+---+----+----------------+---------+
| id|name|loweerCaseFormat|idSquared|
+---+----+----------------+---------+
|  3| GHI|             ghi|        9|
+---+----+----------------+---------+



## 12. drop() & dropDuplicates()
Used to drop a column or remove duplicate rows.

In [14]:
# Remove the idSquared column; the remaining columns are kept.
df_drop = df_renamed.drop(
    "idSquared"
)
df_drop.show()


+---+----+----------------+
| id|name|loweerCaseFormat|
+---+----+----------------+
|  1| ABC|             abc|
|  2| DEF|             def|
|  3| GHI|             ghi|
+---+----+----------------+



In [15]:
# Sample (id, name, age) records; the (1, "Alice", 30) record appears twice on
# purpose so the next cell has a duplicate to remove.
data_v2 = [(1, "Alice", 30), (2, "Bob", 25), (1, "Alice", 30), (3, "Charlie", 25)]

df_v2 = spark.createDataFrame(
    data_v2,
    schema=["id", "name", "age"],
)
df_v2.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 30|
|  2|    Bob| 25|
|  1|  Alice| 30|
|  3|Charlie| 25|
+---+-------+---+



In [16]:
# Remove fully duplicated rows (here the repeated (1, "Alice", 30) record).
# The printed row order differs from the input order, so do not rely on it.
df_drop_duplicates = df_v2.dropDuplicates()
df_drop_duplicates.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  2|    Bob| 25|
|  1|  Alice| 30|
|  3|Charlie| 25|
+---+-------+---+



## 13. orderBy() and sort()
Both methods sort the rows of a DataFrame; orderBy() is an alias for sort().

In [17]:
# Sort the de-duplicated rows by id, largest first.
sorted_df = df_drop_duplicates.sort(
    df_drop_duplicates["id"].desc()
)
sorted_df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 25|
|  2|    Bob| 25|
|  1|  Alice| 30|
+---+-------+---+



In [18]:
# Order the de-duplicated rows by id, smallest first.
df_order_by = df_drop_duplicates.orderBy(
    df_drop_duplicates["id"].asc()
)
df_order_by.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 30|
|  2|    Bob| 25|
|  3|Charlie| 25|
+---+-------+---+



## 14. groupBy()
Used for group-by operations.

In [19]:
# Count how many rows share each id.
# NOTE(review): the column was created as "id"; "Id" presumably resolves only
# because column-name matching is case-insensitive by default — consider "id"
# for consistency (the output header would then read "id").
df_grouped = (
    df_drop_duplicates
    .groupBy("Id")
    .count()
)
df_grouped.show()

+---+-----+
| Id|count|
+---+-----+
|  1|    1|
|  3|    1|
|  2|    1|
+---+-----+



## 15. join()

In [21]:
# Lookup table that maps each id to a subject.
df2 = spark.createDataFrame(
    [(1, "Math"), (2, "Science"), (3, "English")],
    schema=["id", "subject"],
)
df2.show()

# Inner join on the shared id column; the result carries a single id column.
df_joined = df_order_by.join(
    df2,
    on="id",
    how="inner",
)
df_joined.show()



+---+-------+
| id|subject|
+---+-------+
|  1|   Math|
|  2|Science|
|  3|English|
+---+-------+

+---+-------+---+-------+
| id|   name|age|subject|
+---+-------+---+-------+
|  1|  Alice| 30|   Math|
|  3|Charlie| 25|English|
|  2|    Bob| 25|Science|
+---+-------+---+-------+



## 16. union() & unionAll()

In [27]:
# Show the first input, then build and show a second frame with the same column names.
df_with_schema.show()
df_3 = spark.createDataFrame(
    [(3, "JKL"), (4, "MNO")],
    schema=["id", "name"],
)
df_3.show()

# union() stacks the two frames: df_3's rows first, then df_with_schema's rows.
(
    df_3
    .union(df_with_schema)
    .show()
)


+---+----+
| id|name|
+---+----+
|  1| ABC|
|  2| DEF|
|  3| GHI|
+---+----+

+---+----+
| id|name|
+---+----+
|  3| JKL|
|  4| MNO|
+---+----+

+---+----+
| id|name|
+---+----+
|  3| JKL|
|  4| MNO|
|  1| ABC|
|  2| DEF|
|  3| GHI|
+---+----+



## 17. unionByName()

In [28]:
# unionByName() appends df_3's rows to df_with_schema, matching columns by name.
unionByName_df = df_with_schema.unionByName(df_3)
unionByName_df.show()

+---+----+
| id|name|
+---+----+
|  1| ABC|
|  2| DEF|
|  3| GHI|
|  3| JKL|
|  4| MNO|
+---+----+



## 18. UDF (User Defined Function)

In [29]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def add_exclamation(name):
    """Return ``name`` with a trailing "!" appended.

    Args:
        name: The string to decorate, or None (a null column value reaches a
            Python UDF as None).

    Returns:
        ``name + "!"``, or None when ``name`` is None so that null inputs stay
        null instead of raising ``TypeError`` inside the UDF.
    """
    if name is None:
        return None
    return name + "!"

# Wrap the Python function as a Spark UDF that returns strings.
add_udf = udf(
    f=add_exclamation,
    returnType=StringType(),
)

# Apply the UDF to the name column and store the result as excited_name.
df_udf = df_from_rdd.withColumn(
    "excited_name",
    add_udf("name"),
)
df_udf.show()



+---+----+------------+
| id|name|excited_name|
+---+----+------------+
|  1| ABC|        ABC!|
|  2| DEF|        DEF!|
+---+----+------------+



## 19. transform()

In [30]:
# transform() passes the DataFrame to the given function and returns its result.
# The ** operator produces a floating-point column (1.0 and 4.0 in the output).
# NOTE(review): "id_squard" looks like a typo for "id_squared"; kept as-is.
df_transformed = df_from_rdd.transform(
    lambda frame: frame.withColumn("id_squard", frame["id"] ** 2)
)
df_transformed.show()

+---+----+---------+
| id|name|id_squard|
+---+----+---------+
|  1| ABC|      1.0|
|  2| DEF|      4.0|
+---+----+---------+



## 20. apply() (emulated with rdd.map())

In [31]:
# Apply a function to every row: drop down to the underlying RDD, double each
# id while keeping the name, then rebuild a DataFrame with the same column names.
df_applied = (
    df_from_rdd.rdd
    .map(lambda record: (record.id * 2, record.name))
    .toDF(["id", "name"])
)
df_applied.show()

+---+----+
| id|name|
+---+----+
|  2| ABC|
|  4| DEF|
+---+----+

