In [None]:
# What is a schema in PySpark?
# A schema defines the structure of a DataFrame, i.e., the column names, their data types, and whether each column may contain nulls.
# Think of it as a blueprint for your data.
# Schemas help Spark understand the type of each column, optimize queries, and avoid type errors.
# You can define it manually or let Spark infer it automatically.

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Explicit schema built field by field: add(column name, data type, nullable).
# Every column is declared nullable (third argument True).
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("salary", DoubleType(), True)
)


In [None]:
# 5️⃣ Examples of common data types
# | DataType          | Description                                     |
# | ----------------- | ----------------------------------------------- |
# | `StringType()`    | Text / string                                   |
# | `IntegerType()`   | 32-bit signed integers                          |
# | `LongType()`      | 64-bit signed (long) integers                   |
# | `DoubleType()`    | Double-precision (64-bit) floating point numbers |
# | `FloatType()`     | Single-precision (32-bit) floating point numbers |
# | `BooleanType()`   | True/False                                      |
# | `TimestampType()` | Timestamp (date and time)                       |


### Using DDL schema while reading a CSV

In [None]:
# DDL Schema in PySpark
# DDL (Data Definition Language) schema lets you define a schema as a string instead of using StructType objects.
# It is very compact and convenient for quick definitions, especially when reading files.
# Spark will parse the string and create the corresponding schema.

In [None]:
from pyspark.sql import SparkSession

# Obtain a SparkSession for this example (getOrCreate returns an existing one if available).
spark = (
    SparkSession.builder
    .appName("DDL_Schema_Example")
    .getOrCreate()
)

# Schema written as a DDL string: comma-separated "column TYPE" pairs.
ddl_schema = """name STRING,
                age INT,
                salary DOUBLE"""

# Read the CSV with the first line treated as a header and the DDL schema
# applied explicitly rather than inferred.
df = (
    spark.read
    .schema(ddl_schema)
    .option("header", True)
    .csv("path/to/file.csv")
)
df.show()
# Uncomment to inspect the column names and types that were applied:
# df.printSchema()