# PySpark Examples
This notebook demonstrates common PySpark operations and transformations.

## 1. Initialize Spark

In [None]:
import findspark

# Run findspark's setup before pyspark is imported (same order as before),
# so the pyspark import below can be resolved.
findspark.init()

from pyspark.sql import SparkSession

# Build the session used by every later cell: named "PySpark Examples",
# with the driver memory setting at 2g. getOrCreate() hands back an
# already-running session if there is one.
spark = (
    SparkSession.builder
    .appName("PySpark Examples")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

## 2. Create Sample DataFrames

In [None]:
# --- Raw rows -------------------------------------------------------------
# One tuple per employee: (id, name, age, department, salary)
employees_data = [
    (1, "John Doe", 30, "Engineering", 75000),
    (2, "Jane Smith", 25, "Marketing", 65000),
    (3, "Bob Johnson", 35, "Engineering", 85000),
    (4, "Alice Brown", 28, "Marketing", 67000),
    (5, "Charlie Wilson", 40, "Engineering", 95000),
]

# One tuple per department: (department, location).
# "Sales" has no matching employee row above.
departments_data = [
    ("Engineering", "New York"),
    ("Marketing", "San Francisco"),
    ("Sales", "Chicago"),
]

# --- DataFrames -----------------------------------------------------------
# Only column names are supplied; column types are left for Spark to infer
# from the Python values.
employees_df = spark.createDataFrame(
    employees_data,
    schema=["id", "name", "age", "department", "salary"],
)
departments_df = spark.createDataFrame(
    departments_data,
    schema=["department", "location"],
)

# --- Preview --------------------------------------------------------------
print("Employees DataFrame:")
employees_df.show()

print("\nDepartments DataFrame:")
departments_df.show()

## 3. Basic DataFrame Operations

In [None]:
# Import the functions module under the conventional `F` alias rather than
# pulling names in directly: `from pyspark.sql.functions import sum` would
# replace Python's built-in sum() for every cell that runs afterwards.
from pyspark.sql import functions as F

# Projection: keep only the name and salary columns.
print("Select name and salary:")
employees_df.select("name", "salary").show()

# Row filter: keep employees whose salary is strictly above 70000.
print("\nEmployees with salary > 70000:")
employees_df.filter(F.col("salary") > 70000).show()

# Aggregation: one output row per department with its mean salary,
# exposed as the column "avg_salary".
print("\nAverage salary by department:")
(
    employees_df
    .groupBy("department")
    .agg(F.avg("salary").alias("avg_salary"))
    .show()
)

## 4. Joining DataFrames

In [None]:
# Attach each employee's department location by joining on the shared
# "department" column. The join is an inner join (the default, spelled out
# here), so a department with no employees -- "Sales" in the sample data --
# does not appear in the result.
print("Join employees with their department locations:")
joined_df = employees_df.join(departments_df, on="department", how="inner")
joined_df.show()

## 5. SQL Queries

In [None]:
# SQL text: employees earning more than 70000 together with their
# department's location, highest salary first.
query = """
SELECT e.name, e.department, e.salary, d.location
FROM employees e
JOIN departments d ON e.department = d.department
WHERE e.salary > 70000
ORDER BY e.salary DESC
"""

# Register both DataFrames under the table names the query refers to.
# createOrReplaceTempView overwrites an existing view of the same name,
# so this cell can be re-run safely.
departments_df.createOrReplaceTempView("departments")
employees_df.createOrReplaceTempView("employees")

# Execute the query and print the resulting rows.
print("SQL query result:")
spark.sql(query).show()

## 6. Converting to Pandas

In [None]:
# Convert the Spark DataFrame to a pandas DataFrame. toPandas() collects
# every row onto the driver, so only use it on data small enough to fit in
# driver memory (the five-row sample here is fine).
pandas_df = employees_df.toPandas()
print("Pandas DataFrame:")
# Leave the frame as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text dump produced by print().
pandas_df

## 7. Clean Up

In [None]:
# Stop the Spark session now that the examples are finished. Cells above
# that use `spark` need a fresh session (re-run the initialization cell)
# before they can be executed again.
spark.stop()