<a href="https://colab.research.google.com/github/stephenindia1/Python-Faker/blob/main/Colab_PySpark_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import findspark

# Keep this call ahead of the pyspark imports: findspark locates the Spark
# installation so that the `pyspark` package can be imported afterwards.
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col

# Build the notebook-wide session in local mode. getOrCreate() hands back an
# already-running session if there is one, so re-running this cell is safe.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("PySparkJupyterExample")
    .getOrCreate()
)

print("SparkSession created successfully!")

SparkSession created successfully!


In [None]:
# Build a small PySpark DataFrame from in-memory Python data.

# Column names, in the same order as the fields of each row tuple below.
# Only names are given here, so column types are left for Spark to infer.
schema = ["Name", "Age", "City"]

# One tuple per row: (Name, Age, City).
data = [
    ("Alice", 25, "New York"),
    ("Bob", 30, "London"),
    ("Charlie", 35, "Paris"),
    ("David", 28, "New York"),
    ("Eve", 32, "London"),
]

# Hand the rows and the column names to the active session.
df = spark.createDataFrame(data=data, schema=schema)

print("DataFrame created:")
df.show()

DataFrame created:
+-------+---+--------+
|   Name|Age|    City|
+-------+---+--------+
|  Alice| 25|New York|
|    Bob| 30|  London|
|Charlie| 35|   Paris|
|  David| 28|New York|
|    Eve| 32|  London|
+-------+---+--------+



In [None]:
# Common DataFrame operations: row filtering, column projection, and a
# derived column. Each step starts from the original `df` and yields a new
# DataFrame, so `df` itself is never modified.

# Rows where Age is strictly greater than 30
df_filtered = df.where(col("Age") > 30)
print("Filtered DataFrame (Age > 30):")
df_filtered.show()

# Projection down to the Name and City columns
df_selected = df.select(["Name", "City"])
print("Selected columns:")
df_selected.show()

# Derived 'AgeGroup' label: under 30 -> "Young", 30 to 39 -> "Middle-aged",
# anything else -> "Senior"
age_group_expr = (
    when(col("Age") < 30, "Young")
    .when((col("Age") >= 30) & (col("Age") < 40), "Middle-aged")
    .otherwise("Senior")
)
df_with_age_group = df.withColumn("AgeGroup", age_group_expr)
print("DataFrame with AgeGroup:")
df_with_age_group.show()

Filtered DataFrame (Age > 30):
+-------+---+------+
|   Name|Age|  City|
+-------+---+------+
|Charlie| 35| Paris|
|    Eve| 32|London|
+-------+---+------+

Selected columns:
+-------+--------+
|   Name|    City|
+-------+--------+
|  Alice|New York|
|    Bob|  London|
|Charlie|   Paris|
|  David|New York|
|    Eve|  London|
+-------+--------+

DataFrame with AgeGroup:
+-------+---+--------+-----------+
|   Name|Age|    City|   AgeGroup|
+-------+---+--------+-----------+
|  Alice| 25|New York|      Young|
|    Bob| 30|  London|Middle-aged|
|Charlie| 35|   Paris|Middle-aged|
|  David| 28|New York|      Young|
|    Eve| 32|  London|Middle-aged|
+-------+---+--------+-----------+



In [None]:
# Stop the SparkSession created in the first cell. Run this cell last: the
# cells above all use `spark` (or DataFrames built from it), so re-run the
# first cell before executing any of them again.

spark.stop()
print("SparkSession stopped.")

SparkSession stopped.
