# Getting Started with PySpark
### Spark Session

In [0]:
# Echo the session object to confirm it exists. `spark` is not created anywhere
# in this notebook, so it is presumably injected by the hosting runtime —
# TODO confirm before running in another environment.
spark

<pyspark.sql.connect.session.SparkSession at 0xffef589d6c30>

### Creating a DataFrame

In [0]:
# Column names, in the same order as the fields of each row tuple below.
columns = ["product", "category", "price"]

# Sample catalogue: one (product, category, price) tuple per row.
data = [
    ("iPhone", "Electronics", 999),
    ("Samsung", "Electronics", 799),
    ("MacBook", "Electronics", 1299),
    ("Table", "Furniture", 299),
]

# Only names are supplied, so Spark infers each column's type from the data.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-------+-----------+-----+
|product|   category|price|
+-------+-----------+-----+
| iPhone|Electronics|  999|
|Samsung|Electronics|  799|
|MacBook|Electronics| 1299|
|  Table|  Furniture|  299|
+-------+-----------+-----+



### Understanding the Schema

In [0]:
# Print each column's name, data type and nullability as a tree.
df.printSchema()

root
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: long (nullable = true)



### Basic Data Exploration Commands

View the first few rows

In [0]:
# Display only the first two rows instead of the default page of results.
df.show(n=2)

+-------+-----------+-----+
|product|   category|price|
+-------+-----------+-----+
| iPhone|Electronics|  999|
|Samsung|Electronics|  799|
+-------+-----------+-----+
only showing top 2 rows


Count the number of rows

In [0]:
# Return the number of rows in the DataFrame; as the cell's last expression,
# the value is displayed as the cell output.
df.count()


4

Select specific columns

In [0]:
# Project only the two columns of interest, then display the result.
product_prices = df.select("product", "price")
product_prices.show()

+-------+-----+
|product|price|
+-------+-----+
| iPhone|  999|
|Samsung|  799|
|MacBook| 1299|
|  Table|  299|
+-------+-----+



### Filtering Data

In [0]:
# Build the row predicate first, then keep only the rows that satisfy it.
is_over_500 = df["price"] > 500
df.filter(is_over_500).show()

+-------+-----------+-----+
|product|   category|price|
+-------+-----------+-----+
| iPhone|Electronics|  999|
|Samsung|Electronics|  799|
|MacBook|Electronics| 1299|
+-------+-----------+-----+



### Adding a New Column (Basic Transformation)

In [0]:
from pyspark.sql.functions import col

# Column expression for the price plus 10% (multiplying by 1.1).
taxed_price = col("price") * 1.1

# withColumn returns a new DataFrame; `df` itself is left unchanged.
df_with_tax = df.withColumn("price_with_tax", taxed_price)
df_with_tax.show()

+-------+-----------+-----+------------------+
|product|   category|price|    price_with_tax|
+-------+-----------+-----+------------------+
| iPhone|Electronics|  999|            1098.9|
|Samsung|Electronics|  799| 878.9000000000001|
|MacBook|Electronics| 1299|            1428.9|
|  Table|  Furniture|  299|328.90000000000003|
+-------+-----------+-----+------------------+



### Basic Aggregations

In [0]:
# Import the functions module under an alias instead of importing `max` and
# `min` by name: a by-name import rebinds Python's built-in max()/min() for
# every later cell in the kernel, so e.g. max([1, 2]) would stop working.
from pyspark.sql import functions as F

# Per-category price summary: mean, highest and lowest price.
df.groupBy("category").agg(
    F.avg("price").alias("avg_price"),
    F.max("price").alias("max_price"),
    F.min("price").alias("min_price"),
).show()

+-----------+------------------+---------+---------+
|   category|         avg_price|max_price|min_price|
+-----------+------------------+---------+---------+
|Electronics|1032.3333333333333|     1299|      799|
|  Furniture|             299.0|      299|      299|
+-----------+------------------+---------+---------+



### Create a Temporary View (Bridge to SQL)

In [0]:
# Register the DataFrame under the name "products" (replacing any temporary
# view already using that name) so SQL queries can refer to it.
df.createOrReplaceTempView("products")

### Running SQL on the same DataFrame

In [0]:
# Average price per category, expressed in SQL against the "products" view.
category_avg_query = """
    SELECT category, AVG(price) AS avg_price
    FROM products
    GROUP BY category
"""

spark.sql(category_avg_query).show()

+-----------+------------------+
|   category|         avg_price|
+-----------+------------------+
|Electronics|1032.3333333333333|
|  Furniture|             299.0|
+-----------+------------------+

