In [1]:
# Entry point for creating/obtaining a Spark session.
from pyspark.sql import SparkSession
# Schema building blocks (StructType, StructField, IntegerType, StringType, ...).
from pyspark.sql.types import *
# NOTE: this wildcard also brings in names such as `sum`, `min` and `max`,
# which shadow the Python builtins of the same name from here on.
from pyspark.sql.functions import *  # column functions used below, e.g. lower() and col()
# Session used by the later cells; getOrCreate() reuses a running session if one exists.
spark = SparkSession.builder.getOrCreate()
# Redundant with the wildcard import above: both names are already in scope.
from pyspark.sql.functions import regexp_replace, col
# `drive` is not referenced in the cells visible here.
# NOTE(review): presumably this import only succeeds inside Google Colab - confirm before running elsewhere.
from google.colab import drive


## 1. Create an empty DataFrame

In [2]:
# Ask the builder for a session named "EmptyDataFrame" on a local master.
# getOrCreate() hands back an already-running session when there is one, so
# after the first cell has run this refers to the same session as `spark`.
Spark2 = (
    SparkSession.builder
    .master("local")
    .appName("EmptyDataFrame")
    .getOrCreate()
)

# No rows are supplied; the DDL string alone defines the two typed columns.
empty_df = Spark2.createDataFrame([], schema="id INT, name STRING")
empty_df.show()

+---+----+
| id|name|
+---+----+
+---+----+



## 2. Convert RDD to DataFrame

In [4]:
# Column names to attach to the tuple fields, in positional order.
columns = ["id", "name"]

# Two (id, name) tuples distributed as an RDD through the session's SparkContext.
rdd = spark.sparkContext.parallelize([(1, "ABC"), (2, "DEF")])

# toDF() turns the RDD into a DataFrame, labelling the fields with `columns`.
df_from_rdd = rdd.toDF(schema=columns)
df_from_rdd.show()

+---+----+
| id|name|
+---+----+
|  1| ABC|
|  2| DEF|
+---+----+



## 3. Convert DataFrame to Pandas

In [6]:
# toPandas() returns the Spark DataFrame's rows as a pandas DataFrame.
pands_df = df_from_rdd.toPandas()
# Plain-text rendering: the pandas index (0, 1) appears as the first column.
print(pands_df)

   id name
0   1  ABC
1   2  DEF


## 4. show()

In [7]:
# show() prints the frame as an ASCII table; the arguments below are its
# defaults (at most 20 rows, long cell values truncated), spelled out explicitly.
df_from_rdd.show(n=20, truncate=True)

+---+----+
| id|name|
+---+----+
|  1| ABC|
|  2| DEF|
+---+----+



## 5. StructType & StructField

In [10]:
# Explicit schema: an integer `id` and a string `name`, both nullable.
schema = StructType(
    [
        StructField("id", IntegerType(), nullable=True),
        StructField("name", StringType(), nullable=True),
    ]
)

# Three sample (id, name) rows.
data = [(1, "ABC"), (2, "DEF"), (3, "GHI")]

# Build the DataFrame with the declared schema instead of letting Spark infer one.
df_with_schema = spark.createDataFrame(data, schema=schema)
df_with_schema.show()

+---+----+
| id|name|
+---+----+
|  1| ABC|
|  2| DEF|
|  3| GHI|
+---+----+



## 6. Column Class

In [11]:
# col("name") is a Column expression; lower() maps it to its lower-cased form,
# and withColumn() appends the result under the given column name.
df = df_with_schema.withColumn(
    "loweerCase_format",
    lower(col("name")),
)
df.show()

+---+----+-----------------+
| id|name|loweerCase_format|
+---+----+-----------------+
|  1| ABC|              abc|
|  2| DEF|              def|
|  3| GHI|              ghi|
+---+----+-----------------+



## 7. select()

In [12]:
# Project a single column; select() accepts a column name as well as a Column object.
df.select("loweerCase_format").show()

+-----------------+
|loweerCase_format|
+-----------------+
|              abc|
|              def|
|              ghi|
+-----------------+



## 8. collect()
collect() returns all the rows as a list of Row objects.

In [15]:
# collect() returns every row of `df` as a Python list of Row objects.
rows = df.collect()
# Printing the list shows each Row with its field names and values.
print(rows)

[Row(id=1, name='ABC', loweerCase_format='abc'), Row(id=2, name='DEF', loweerCase_format='def'), Row(id=3, name='GHI', loweerCase_format='ghi')]


## 9. withColumn()
This method is used to add or modify a column.

In [20]:
# Append `id_squared`, computed row by row as id * id.
df_with_column = df.withColumn(
    "id_squared",
    col("id") * col("id"),
)
df_with_column.show()

+---+----+-----------------+----------+
| id|name|loweerCase_format|id_squared|
+---+----+-----------------+----------+
|  1| ABC|              abc|         1|
|  2| DEF|              def|         4|
|  3| GHI|              ghi|         9|
+---+----+-----------------+----------+



## 10. withColumnRenamed()
Renames an existing column in the DataFrame.

In [21]:
# Two renames applied in sequence; each withColumnRenamed() call changes one
# column's name and leaves the data untouched.
df_renamed = (
    df_with_column
    .withColumnRenamed("id_squared", "idSquared")
    .withColumnRenamed("loweerCase_format", "loweerCaseFormat")
)
df_renamed.show()

+---+----+----------------+---------+
| id|name|loweerCaseFormat|idSquared|
+---+----+----------------+---------+
|  1| ABC|             abc|        1|
|  2| DEF|             def|        4|
|  3| GHI|             ghi|        9|
+---+----+----------------+---------+



## 11. where() & filter()
Both methods are used to filter rows based on conditions.

In [22]:
# Keep only the rows whose id is strictly greater than 1.
id_above_one = col("id") > 1
df_filter = df_renamed.filter(id_above_one)
df_filter.show()

+---+----+----------------+---------+
| id|name|loweerCaseFormat|idSquared|
+---+----+----------------+---------+
|  2| DEF|             def|        4|
|  3| GHI|             ghi|        9|
+---+----+----------------+---------+



In [24]:
# where() is an alias of filter(). The predicate is built from the same
# DataFrame that is being filtered (df_renamed): a Column taken from a
# different frame only resolves when the two frames happen to share lineage,
# which makes the condition fragile and misleading to read.
df_where = df_renamed.where(df_renamed["id"] > 2)
df_where.show()

+---+----+----------------+---------+
| id|name|loweerCaseFormat|idSquared|
+---+----+----------------+---------+
|  3| GHI|             ghi|        9|
+---+----+----------------+---------+



## 12. drop() & dropDuplicates()
Used to drop a column or remove duplicate rows.

In [25]:
# drop() returns a new DataFrame without the named column; df_renamed itself is unchanged.
df_drop = df_renamed.drop(
    "idSquared",
)
df_drop.show()


+---+----+----------------+
| id|name|loweerCaseFormat|
+---+----+----------------+
|  1| ABC|             abc|
|  2| DEF|             def|
|  3| GHI|             ghi|
+---+----+----------------+



In [31]:
# Sample (id, name, age) rows. Note that (1, "Alice", 30) appears twice.
data_v2 = [(1, "Alice", 30), (2, "Bob", 25), (1, "Alice", 30), (3, "Charlie", 25)]

# Only column names are given, so Spark infers the column types from the values.
df_v2 = spark.createDataFrame(
    data_v2,
    schema=["id", "name", "age"],
)
df_v2.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 30|
|  2|    Bob| 25|
|  1|  Alice| 30|
|  3|Charlie| 25|
+---+-------+---+



In [34]:
# With no column subset given, dropDuplicates() compares entire rows, so only
# the repeated (1, "Alice", 30) row is removed.
df_drop_duplicates = df_v2.dropDuplicates()
# The rows may come back in a different order than they were inserted.
df_drop_duplicates.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  2|    Bob| 25|
|  1|  Alice| 30|
|  3|Charlie| 25|
+---+-------+---+



## 13. orderBy() and sort()
These methods are used for sorting data in DataFrame.

In [37]:
# Sort by id, largest first.
sorted_df = df_drop_duplicates.sort("id", ascending=False)
sorted_df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 25|
|  2|    Bob| 25|
|  1|  Alice| 30|
+---+-------+---+



In [40]:
# orderBy() is interchangeable with sort(); here the ids are put in ascending order.
ascending_id = col("id").asc()
df_order_by = df_drop_duplicates.orderBy(ascending_id)
df_order_by.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 30|
|  2|    Bob| 25|
|  3|Charlie| 25|
+---+-------+---+



## 14. groupBy()
Used for group-by operations.

In [42]:
# Group on the column under its real name, `id`. A differently-cased spelling
# such as "Id" resolves only while spark.sql.caseSensitive is false (the
# default) and is echoed verbatim into the result's column header.
# count() adds a `count` column holding the number of rows in each group.
df_grouped = df_drop_duplicates.groupBy("id").count()
df_grouped.show()

+---+-----+
| Id|count|
+---+-----+
|  1|    1|
|  3|    1|
|  2|    1|
+---+-----+

