# 📦 Mount Google Drive
We begin by mounting Google Drive so the notebook can read the dataset (`listings.csv`) stored in the Drive workspace.

In [None]:
from google.colab import drive

# Attach Google Drive under /content/drive/; the load cell reads
# listings.csv from the MyDrive folder beneath this mount point.
drive.mount(mountpoint='/content/drive/')

# ⚙️ Install Spark and Dependencies
This cell installs a headless Java 11 JDK, downloads and unpacks Spark 3.4.1, and installs `findspark`, which the next section uses to initialize the Spark environment.

In [None]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://mirrors.huaweicloud.com/apache/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar -xvf spark-3.4.1-bin-hadoop3.tgz
!pip install -q findspark

# 🚀 Initialize SparkSession
We configure environment variables and start a Spark session for distributed data processing.

In [None]:
import os

# Tell findspark/pyspark where Java and the unpacked Spark distribution live.
# NOTE(review): SPARK_HOME assumes the archive was extracted in /content — confirm.
os.environ.update({
    'JAVA_HOME': '/usr/lib/jvm/java-11-openjdk-amd64',
    'SPARK_HOME': '/content/spark-3.4.1-bin-hadoop3',
})

import findspark

# Must run after the environment variables are set and before pyspark is
# imported, so keep the statement order in this cell as-is.
findspark.init()

from pyspark.sql import SparkSession

# Create the session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('AirbnbReviews')
    .getOrCreate()
)

# 📥 Load Airbnb Dataset
We load the CSV file into a Spark DataFrame and inspect its schema and sample rows.

In [None]:
# Read the listings CSV from the mounted Drive: the first row supplies the
# column names and column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/drive/MyDrive/listings.csv')
)

# Quick sanity check: inferred schema, then the first five rows.
df.printSchema()
df.show(n=5)

# 🧹 Data Cleaning
We filter out rows with a missing price and convert the `last_review` column to a proper date type.

In [None]:
from pyspark.sql.functions import col, to_date

# Build the cleaned frame in one pass from the raw `df`:
#   1. keep only rows whose price is not null,
#   2. replace last_review with its date-typed conversion.
df_clean = (
    df
    .where(col('price').isNotNull())
    .withColumn('last_review', to_date('last_review'))
)

# Preview the columns touched by the cleaning step (default row count).
df_clean.select(['id', 'name', 'price', 'last_review']).show()

# 📊 Aggregation: Price by Room Type
We calculate average prices grouped by room type to identify pricing trends.

In [None]:
# Average price per room type, most expensive first.
# 'avg(price)' is the column name Spark generates for the avg aggregate.
(
    df_clean
    .groupBy('room_type')
    .avg('price')
    .sort('avg(price)', ascending=False)
    .show()
)

# 🏘️ Aggregation: Listings by Neighborhood
We count the number of listings per neighborhood to identify popular areas.

In [None]:
# Number of listings per neighbourhood, busiest first.
# 'count' is the column produced by the grouped count().
(
    df_clean
    .groupBy('neighbourhood')
    .count()
    .sort('count', ascending=False)
    .show()
)