# PySpark Functions and Usage Examples

## 1. SparkSession

In [None]:

from pyspark.sql import SparkSession

# Initialize SparkSession; getOrCreate() reuses an already-running session
# if there is one instead of building a second one.
spark = SparkSession.builder.appName("ExampleApp").getOrCreate()

# Get the SparkContext that belongs to this session
sc = spark.sparkContext

# Stop SparkSession -- only once ALL work is finished.
# The sections below keep using `spark` and `sc`; stopping the session here
# would make every later cell fail, so the call is left as a reminder to run
# at the very end of the notebook:
# spark.stop()


## 2. DataFrame API

In [None]:

# Build a small DataFrame from in-memory rows plus a list of column names
data = [
    (1, "Alice", 29),
    (2, "Bob", 31),
]
columns = ["id", "name", "age"]
df = spark.createDataFrame(data, schema=columns)

# Inspect the DataFrame
df.show()          # prints the first 20 rows
df.printSchema()   # prints the schema
df.columns         # column names
df.dtypes          # column names paired with their types


In [None]:

# Keep only the rows whose age is greater than 30
filtered_df = df.where(df["age"] > 30)

# Project the name and age columns
selected_df = df.select(df["name"], df["age"])

# Sort by age, largest first
sorted_df = df.orderBy(df["age"].desc())


In [None]:

from pyspark.sql.functions import avg, count

# Number of rows for each distinct age
(
    df.groupBy("age")
    .count()
    .show()
)

# Average age over the whole DataFrame, reported as "average_age"
(
    df.groupBy()
    .agg(avg("age").alias("average_age"))
    .show()
)


## 3. SQL Operations

In [None]:

# Register the DataFrame as a temporary view so SQL can refer to it by name
df.createOrReplaceTempView("people")

# Run a SQL query against that view and display the rows it returns
query = "SELECT name, age FROM people WHERE age > 30"
result = spark.sql(query)
result.show()


## 4. RDD (Resilient Distributed Dataset)

In [None]:

# Turn a local Python list into an RDD
rdd = sc.parallelize([1, 2, 3, 4, 5])

# Derive two new RDDs: values above 3, and every value doubled
filtered_rdd = rdd.filter(lambda value: value > 3)
mapped_rdd = rdd.map(lambda value: value * 2)

# Gather the filtered elements into a Python list and print them
collected = filtered_rdd.collect()
print(collected)  # [4, 5]


In [None]:

# Reduce: combine all elements pairwise with the given function.
# Despite the name, sum_rdd holds the reduced value itself, not an RDD.
sum_rdd = rdd.reduce(lambda x, y: x + y)

# Aggregations
# The element count is stored as `element_count` rather than `count` so it
# does not overwrite the `count` function imported from
# pyspark.sql.functions earlier in this file.
element_count = rdd.count()
max_value = rdd.max()


## 5. DataFrame Functions

In [None]:

from pyspark.sql.functions import col, lit, when

from pyspark.sql.functions import upper, lower

# Add three derived columns in a single chained expression:
#   new_column - age multiplied by 2
#   is_adult   - True when age >= 18, otherwise False
#   name_upper - name converted to upper case
df = (
    df.withColumn("new_column", col("age") * 2)
    .withColumn(
        "is_adult",
        when(col("age") >= 18, lit(True)).otherwise(lit(False)),
    )
    .withColumn("name_upper", upper(col("name")))
)


## 6. MLlib (Machine Learning Library)

In [None]:

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Pack the "age" column into a single vector column called "features"
assembler = VectorAssembler(
    inputCols=["age"],
    outputCol="features",
)
data = assembler.transform(df)

# Fit a linear regression on the assembled data.
# "age" is both the only feature and the label here, so this cell only
# demonstrates the API calls rather than a meaningful model.
lr = LinearRegression(
    featuresCol="features",
    labelCol="age",
)
model = lr.fit(data)


## 7. Streaming (DStream API)

In [None]:

from pyspark.streaming import StreamingContext

# Streaming context on top of the existing SparkContext, 1-second batches
ssc = StreamingContext(sc, batchDuration=1)

# Text lines arriving on a TCP socket at localhost:9999
lines = ssc.socketTextStream("localhost", 9999)

# Word count: split each line on spaces, pair every word with 1,
# then add up the ones for each word
words = lines.flatMap(lambda text: text.split(" "))
word_pairs = words.map(lambda token: (token, 1))
word_counts = word_pairs.reduceByKey(lambda left, right: left + right)
word_counts.pprint()

# Start the stream, then wait here until it terminates
ssc.start()
ssc.awaitTermination()


## 8. GraphFrames

In [None]:

from graphframes import GraphFrame

# Vertex and edge DataFrames: vertices carry "id", edges carry "src"/"dst"
vertex_rows = [(1, "Alice"), (2, "Bob")]
edge_rows = [(1, 2)]
vertices = spark.createDataFrame(vertex_rows, ["id", "name"])
edges = spark.createDataFrame(edge_rows, ["src", "dst"])

# Build the graph from the two DataFrames
g = GraphFrame(vertices, edges)

# PageRank: 10 iterations with a reset probability of 0.15
result = g.pageRank(maxIter=10, resetProbability=0.15)
result.vertices.show()
