In [None]:
from pyspark.sql import SparkSession

# Location of the Parquet file to load
parquet_file_path = 'path_to_parquet_file.parquet'

# Start (or reuse) the Spark session used by this example
spark = (
    SparkSession.builder
    .appName("Read Parquet Example")
    .getOrCreate()
)

# Load the file into a DataFrame, then print a preview of its rows
df_parquet = spark.read.parquet(parquet_file_path)
df_parquet.show()

# Shut the session down once the example is finished
spark.stop()


In [None]:
# The example cells above finish with spark.stop(), so `spark` may refer to a
# stopped session here. Obtain a live session before reading; getOrCreate()
# returns the active session when one exists and builds a new one otherwise.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Merging schemas: when reading multiple Parquet files with different but
# compatible schemas, ask Spark to merge them into a single schema.
df = spark.read.option("mergeSchema", "true").parquet("path/to/different/schemas/")

# Filtering files: restrict which Parquet files are read by passing a glob
# pattern (or a subset of files) directly to read.parquet.
df = spark.read.parquet("path/to/parquet/files/part-*.parquet")

# Reading specific columns: select only the columns you need, which can
# improve performance by reducing I/O.
df = spark.read.parquet("path/to/parquet/file.parquet").select("column1", "column2")

# Read the Parquet file using read.format().
# `parquet_file_path` is expected to have been set by an earlier cell.
df_parquet = spark.read.format("parquet").load(parquet_file_path)

In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session for this minimal read example
spark = SparkSession.builder.appName('test').getOrCreate()

# Spark cannot read from an empty path string, so use the same placeholder
# as the other examples; replace it with a real Parquet file or directory.
parquet_file_path = 'path_to_parquet_file.parquet'

# Load the Parquet data and print a preview of its rows
df = spark.read.parquet(parquet_file_path)
df.show()

In [None]:
from pyspark.sql import SparkSession

# Folder that holds the Parquet files to load
parquet_directory_path = 'path_to_parquet_directory/'

# Start (or reuse) the Spark session used by this example
spark = (
    SparkSession.builder
    .appName("Read Multiple Parquet Files")
    .getOrCreate()
)

# Append a wildcard to the directory path so every entry inside it is read
df_parquet = spark.read.parquet(f"{parquet_directory_path}*")

# Print a preview of the combined DataFrame
df_parquet.show()

# Shut the session down once the example is finished
spark.stop()


In [None]:
from pyspark.sql import SparkSession

# Location of the Parquet file to load
parquet_file_path = 'path_to_parquet_file.parquet'

# Start (or reuse) the Spark session used by this example
spark = (
    SparkSession.builder
    .appName("Read Parquet Format Example")
    .getOrCreate()
)

# Generic reader API: name the format explicitly, then load the path
parquet_reader = spark.read.format("parquet")
df_parquet = parquet_reader.load(parquet_file_path)

# Print a preview of the DataFrame
df_parquet.show()

# Shut the session down once the example is finished
spark.stop()


In [None]:
# The preceding example cell ends with spark.stop(), so re-acquire a live
# session first; getOrCreate() reuses the active session when one exists.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Read Parquet through the generic format()/load() reader API.
# `parquet_file_path` is expected to have been set by an earlier cell.
df = spark.read.format('parquet').load(parquet_file_path)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Location of the Parquet file to load
parquet_file_path = 'path_to_parquet_file.parquet'

# Explicit schema: a nullable string column and a nullable integer column
schema = StructType(
    [
        StructField("Name", StringType(), nullable=True),
        StructField("Age", IntegerType(), nullable=True),
    ]
)

# Start (or reuse) the Spark session used by this example
spark = (
    SparkSession.builder
    .appName("Read Parquet with Schema")
    .getOrCreate()
)

# Hand the schema to the reader before loading the file
df_parquet = spark.read.schema(schema).parquet(parquet_file_path)

# Print a preview of the DataFrame
df_parquet.show()

# Shut the session down once the example is finished
spark.stop()


In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by this example
spark = (
    SparkSession.builder
    .appName("Write Parquet Example")
    .getOrCreate()
)

# Small in-memory sample; in practice the DataFrame usually comes from
# earlier processing steps
data = [("James", 34), ("Anna", 28), ("Lee", 23)]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, schema=columns)

# Destination for the Parquet output
output_parquet_path = 'output_path.parquet'

# Write the DataFrame as Parquet, replacing any existing output at the path
df.write.mode('overwrite').parquet(output_parquet_path)

# Shut the session down once the example is finished
spark.stop()


In [None]:
# Assume 'df' is your DataFrame and that the SparkSession it belongs to is
# still running (recreate 'df' first if spark.stop() has been called).
#
# Every example below writes to the same output path. The writer's default
# save mode raises an error when the target already exists, so each call sets
# mode "overwrite" explicitly; this keeps the cell runnable top to bottom.

# To save the DataFrame as a Parquet file
df.write.format("parquet").mode("overwrite").save("path/to/output/file.parquet")
# This is equivalent to the previous example
df.write.mode("overwrite").parquet("path/to/output/file.parquet")
# Save mode passed as a keyword argument; 'output_parquet_path' is expected
# to have been set by the earlier "Write Parquet Example" cell
df.write.parquet(output_parquet_path, mode='overwrite')
# Choose the compression codec
df.write.format("parquet").option("compression", "snappy").mode("overwrite").save("path/to/output/file.parquet")
# Replace any existing output at the path
df.write.format("parquet").mode("overwrite").save("path/to/output/file.parquet")
# Partition the output by a column ("column_name" is a placeholder; it must
# be replaced with a column that exists in 'df')
df.write.format("parquet").partitionBy("column_name").mode("overwrite").save("path/to/output/file.parquet")
# Supply the destination through the "path" option instead of save(path)
df.write.format("parquet").option("path", "path/to/output/file.parquet").mode("overwrite").save()
# NOTE(review): "mergeSchema" is documented as a Parquet *read* option; it is
# not expected to change what the writer produces. Kept for reference only.
df.write.format("parquet").option("mergeSchema", "true").mode("overwrite").save("path/to/output/file.parquet")
# Several settings combined in one write (same mergeSchema caveat as above)
(
    df.write.format("parquet")
    .option("compression", "snappy")
    .option("mergeSchema", "true")
    .partitionBy("column_name")
    .mode("overwrite")
    .save("path/to/output/file.parquet")
)
# To write a DataFrame to a Parquet file with a specific mode
df.write.mode("overwrite").parquet("path/to/output/file.parquet")




In [None]:
from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder \
    .appName("Read CSV Example") \
    .getOrCreate()

# Path to your CSV file
csv_file_path = 'path_to_csv_file.csv'

# Read the CSV file.
# The chain is wrapped in parentheses rather than continued with backslashes:
# a backslash continuation cannot be followed by a trailing comment, and a
# line without the backslash ends the statement, which made the original
# chain a syntax error.
df_csv = (
    spark.read.format("csv")
    .option("header", "true")  # Assumes the first row is a header
    .option("inferSchema", "true")  # Infers the input schema automatically from data
    .load(csv_file_path)
)

# Show the first few rows of the DataFrame
df_csv.show()

# Stop the Spark session
spark.stop()


In [None]:
from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder \
    .appName("Read Multiple CSV Files") \
    .getOrCreate()

# Multiple CSV file paths
csv_file_path1 = 'path_to_first_csv_file.csv'
csv_file_path2 = 'path_to_second_csv_file.csv'

# Read multiple CSV files.
# load() takes its paths as a single argument (a string or a list of
# strings); its second positional parameter is the format name, so the
# paths must be passed together as one list rather than as separate arguments.
df_csv = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load([csv_file_path1, csv_file_path2])
)

# Show the first few rows of the DataFrame
df_csv.show()

# Stop the Spark session
spark.stop()


In [None]:
from pyspark.sql import SparkSession

# Folder that holds the CSV files to load
directory_path = 'path_to_csv_directory/'

# Start (or reuse) the Spark session used by this example
spark = (
    SparkSession.builder
    .appName("Read CSV Files with Wildcard")
    .getOrCreate()
)

# Configure the CSV reader: first row is the header, column types inferred
csv_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
)

# Load every file in the directory matching the pattern (adjust as needed)
df_csv = csv_reader.load(f"{directory_path}*.csv")

# Print a preview of the DataFrame
df_csv.show()

# Shut the session down once the example is finished
spark.stop()


In [None]:
from pyspark.sql import SparkSession

# Folder that holds the CSV files to load
directory_path = 'path_to_csv_directory/'

# Start (or reuse) the Spark session used by this example
spark = (
    SparkSession.builder
    .appName("Read All CSV Files in Directory")
    .getOrCreate()
)

# Passing the directory itself loads the CSV files it contains; the first
# row is treated as the header and column types are inferred
df_csv = spark.read.csv(
    directory_path,
    header=True,
    inferSchema=True,
)

# Print a preview of the DataFrame
df_csv.show()

# Shut the session down once the example is finished
spark.stop()


In [None]:
from pyspark.sql import SparkSession

# Location of the CSV file to load
csv_file_path = 'path_to_csv_file.csv'

# Start (or reuse) the Spark session used by this example
spark = (
    SparkSession.builder
    .appName("Read Single CSV File")
    .getOrCreate()
)

# Load the file, treating the first row as the header and inferring types
df_csv = spark.read.csv(
    csv_file_path,
    header=True,
    inferSchema=True,
)

# Print a preview of the DataFrame
df_csv.show()

# Shut the session down once the example is finished
spark.stop()


In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by this example
spark = (
    SparkSession.builder
    .appName("Write CSV Example")
    .getOrCreate()
)

# Small in-memory sample; in practice the DataFrame usually comes from
# earlier processing steps
data = [("James", 34), ("Anna", 28), ("Lee", 23)]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, schema=columns)

# Destination for the CSV output
output_csv_path = 'output_path.csv'

# Write the DataFrame as CSV with a header row, replacing existing output
(
    df.write.format("csv")
    .option("header", "true")
    .mode("overwrite")
    .save(output_csv_path)
)

# Shut the session down once the example is finished
spark.stop()


In [None]:
# Assumes 'df' is a DataFrame whose SparkSession is still running (recreate
# 'df' first if spark.stop() has been called).
#
# Every example below writes to the same output path. The writer's default
# save mode raises an error when the target already exists, so each call sets
# mode "overwrite" explicitly; this keeps the cell runnable top to bottom.

# Use a custom field delimiter
df.write.format("csv").option("delimiter", ";").mode("overwrite").save("path/to/output/file.csv")
# Write the column names as the first row
df.write.format("csv").option("header", "true").mode("overwrite").save("path/to/output/file.csv")
# Set the quote character
df.write.format("csv").option("quote", "\"").mode("overwrite").save("path/to/output/file.csv")
# Set the escape character
df.write.format("csv").option("escape", "\\").mode("overwrite").save("path/to/output/file.csv")
# Choose the text written for null values
df.write.format("csv").option("nullValue", "NULL").mode("overwrite").save("path/to/output/file.csv")
# Compress the output; "compression" is the documented CSV option name
df.write.format("csv").option("compression", "gzip").mode("overwrite").save("path/to/output/file.csv")
# Replace any existing output at the path
df.write.format("csv").mode("overwrite").save("path/to/output/file.csv")
# Quote every value, not only those that need it
df.write.format("csv").option("quoteAll", "true").mode("overwrite").save("path/to/output/file.csv")
# Several options combined in one write. Parentheses replace the backslash
# continuations so no stray trailing backslash can splice the next line
# into this statement.
(
    df.write.format("csv")
    .option("delimiter", ";")
    .option("header", "true")
    .option("nullValue", "NULL")
    .option("compression", "gzip")
    .mode("overwrite")
    .save("path/to/output/file.csv")
)
# To write a DataFrame to a CSV file with a specific mode
df.write.mode("overwrite").csv("path/to/output/file.csv")

