In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *  # Import the function
spark = SparkSession.builder.getOrCreate()
from pyspark.sql.functions import regexp_replace, col
from google.colab import drive


## 1. Create an empty DataFrame

In [2]:
Spark2 = SparkSession.builder.master("local").appName("EmptyDataFrame").getOrCreate()
#Empty DataFrame with Nodata
empty_df = Spark2.createDataFrame([], "id INT, name STRING")
empty_df.show()

+---+----+
| id|name|
+---+----+
+---+----+



## 2. Convert RDD to DataFrame

In [4]:
rdd = spark.sparkContext.parallelize([(1, "ABC"),(2, "DEF")])
columns = ["id", "name"]
df_from_rdd = rdd.toDF(columns)
df_from_rdd.show()

+---+----+
| id|name|
+---+----+
|  1| ABC|
|  2| DEF|
+---+----+



## 3. Convert DataFrame to Pandas

In [6]:
pands_df = df_from_rdd.toPandas()
print(pands_df)

   id name
0   1  ABC
1   2  DEF


## 4. show()

In [7]:
df_from_rdd.show()

+---+----+
| id|name|
+---+----+
|  1| ABC|
|  2| DEF|
+---+----+



## StructType & StructField

In [10]:
data =[(1, "ABC"), (2, "DEF"), (3, "GHI")]

schema = StructType([
    StructField('id', IntegerType(), True),
    StructField("name", StringType(), True)
])
df_with_schema= spark.createDataFrame(data, schema)
df_with_schema.show()

+---+----+
| id|name|
+---+----+
|  1| ABC|
|  2| DEF|
|  3| GHI|
+---+----+



## 6. Column Class

In [11]:
df = df_with_schema.withColumn('loweerCase_format', lower(col('name')))
df.show()

+---+----+-----------------+
| id|name|loweerCase_format|
+---+----+-----------------+
|  1| ABC|              abc|
|  2| DEF|              def|
|  3| GHI|              ghi|
+---+----+-----------------+



## 7. select()

In [12]:
df.select(col('loweerCase_format')).show()

+-----------------+
|loweerCase_format|
+-----------------+
|              abc|
|              def|
|              ghi|
+-----------------+



## 8. collect()
collect() returns all the rows as a list of Row objects.

In [15]:
rows = df.collect()
print(rows)

[Row(id=1, name='ABC', loweerCase_format='abc'), Row(id=2, name='DEF', loweerCase_format='def'), Row(id=3, name='GHI', loweerCase_format='ghi')]


## 9. withColumn()
This method is used to add or modify a column.

In [18]:
df_with_column = df.withColumn("id_squared", df["id"]*df[id])
df_with_column.show()

PySparkTypeError: [NOT_COLUMN_OR_FLOAT_OR_INT_OR_LIST_OR_STR] Argument `item` should be a column, float, integer, list or string, got builtin_function_or_method.