In [6]:
from pyspark.sql import SparkSession

# Default Schema

In [7]:
spark = SparkSession.builder.appName("Example").getOrCreate()

In [8]:
data = [(1, "Alice", 29), (2, "Alex", 30)]
df = spark.createDataFrame(data, ["id", "name", "age"])
df.show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 29|
|  2| Alex| 30|
+---+-----+---+



# Explicit Schema

In [9]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType([StructField("id", IntegerType(), True),
                     StructField("name", StringType(), True),
                     StructField("age", IntegerType(), True)])

df = spark.createDataFrame(data, schema)
df.printSchema()
df.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 29|
|  2| Alex| 30|
+---+-----+---+



In [11]:
# schema as a string
data = [(1, "misty",12), (2, "makato", 14)]
schema = "id INT, name STRING, age INT"

# Corrected line: createDataFrame should be called on the spark session object
df_new = spark.createDataFrame(data, schema=schema)
df_new.show()


+---+------+---+
| id|  name|age|
+---+------+---+
|  1| misty| 12|
|  2|makato| 14|
+---+------+---+



In [12]:
# schema string with float and boolean types
schema = "id INT, name STRING, salary FLOAT, is_active BOOLEAN"
data = [(1, "Alice", 50000.0, True), (2, "Bob", 60000.50, False)]
df = spark.createDataFrame(data, schema = schema)
df.show()

+---+-----+-------+---------+
| id| name| salary|is_active|
+---+-----+-------+---------+
|  1|Alice|50000.0|     true|
|  2|  Bob|60000.5|    false|
+---+-----+-------+---------+



In [14]:
# schema string with float and boolean types

schema = "id INT, name STRING, salary FLOAT, is_active BOOLEAN"
data = [(1, "Alice", 50000.75, True), (2,"Alex", 60000.50, False)]
spark.createDataFrame(data, schema = schema)

DataFrame[id: int, name: string, salary: float, is_active: boolean]

In [15]:
# schema string with date and timestamp

from datetime import date, datetime
schema = "id INT, name STRING, date DATE, timestamp TIMESTAMP"

data = [(1, "alice", date(2023, 1,15), datetime(2024,3,10,14,30,0)),
        (2,"bob", date(2023,1,15), datetime(2024,3,10,14,30,0))]

spark.createDataFrame(data, schema= schema)

DataFrame[id: int, name: string, date: date, timestamp: timestamp]

# Using a List of Dictionaries

In [16]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Example").getOrCreate()

data = [{"id":1, "name":"alice", "age":29},
        {"id":2, "name":"alex", "age":30}]

spark.createDataFrame(data)
df.show()

+---+-----+-------+---------+
| id| name| salary|is_active|
+---+-----+-------+---------+
|  1|Alice|50000.0|     true|
|  2|  Bob|60000.5|    false|
+---+-----+-------+---------+



# Reading CSV Files

In [None]:
# basic csv files

df.spark.readformat("csv").load("/path/to/sample.csv")

# csv with header
df = spark.read.option("header", True).csv("/path/to/sample.csv")

# multiple other options
df = spark.read.option("inferSchema", True).option("delimiter",",").csv("/path/to/sample.csv")

# with defined schema
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, IntegralType

schema = StructType([StructField("name", StringType(), True),
                     StructField("age", IntegerType(), True)])

df.spark.read.format("csv").schema(schema).load("/path/to/sample.csv")

# JSON FILES

In [None]:
# Basic JSON file
df = spark.read.format("json").load("/path/to/sample.json")

# JSON with multi-line records
df = spark.read.option("multiline", True).json("/path/to/sample.json")

# JSON with a defined schema
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
StructField("name", StringType(), True),
StructField("age", IntegerType(), True)])

df = spark.read.format("json").schema(schema).load("/path/to/sample.json")

# CSV FILES

In [None]:
#Basic write to CSV
df.write.csv("/path/to/output_csv")

# With header
df.write.option("header", True).csv("/path/to/output_csv")

# With multiple options
df.write.option("header", True)\.option("delimiter", ",")\.option("quote", '"')\.csv("/path/to/output_csv")

# Overwrite existing files
df.write.mode("overwrite").option("header", True).csv("/path/to/output_csv")

# Append to existing data
df.write.mode("append").option("header", True).csv("/path/to/output_csv")

# Write as a single file
df.coalesce(1).write.option("header", True).csv("/path/to/output_csv")

In [None]:
# Basic JSON write
df.write.json("/path/to/output_json")

# Overwrite mode
df.write.mode("overwrite").json("/path/to/output_json")

# Append mode
df.write.mode("append").json("/path/to/output_json")

# Pretty format (for readability)
df.write.option("compression", "none").json("/path/to/output_json")

# Partitioned output
df.write.partitionBy("column_name").json("/path/to/output_json")

# Parquet Files

In [None]:
# Basic Parquet write
df.write.parquet("/path/to/output_parquet")

# Overwrite mode
df.write.mode("overwrite").parquet("/path/to/output_parquet")

# Append mode
df.write.mode("append").parquet("/path/to/output_parquet")

# Partitioned output
df.write.partitionBy("column_name").parquet("/path/to/output_parquet")

# Compression options (default is snappy)
df.write.option("compression", "gzip").parquet("/path/to/output_parquet")

# ORC FILES

In [None]:
# Basic ORC write
df.write.orc("/path/to/output_orc")

# Overwrite mode
df.write.mode("overwrite").orc("/path/to/output_orc")

# Append mode
df.write.mode("append").orc("/path/to/output_orc")

# Partitioned output
df.write.partitionBy("column_name").orc("/path/to/output_orc")

# Compression options
df.write.option("compression", "zlib").orc("/path/to/output_orc")