**Displaying Hosts and Their Properties in Neighbourhoods:**

In [None]:
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from google.colab import auth
from google.colab import drive


Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=00d90c5240a89f50fb2a2ea5394615ce5adc779cf8ac55a5d3565e53af9659c0
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


**Reading the Streamed JSON Documents from MongoDB:**

In [None]:
# The same database/collection URI is used for both reading and writing.
mongo_uri = "mongodb://localhost:27017/project_budt737.airbnb_data"
mongo_connector_package = "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1"

# Apply the MongoDB options one by one, then create (or reuse) the session.
session_builder = SparkSession.builder.appName("MongoDBExportToCSV")
for option_key, option_value in (
    ("spark.mongodb.input.uri", mongo_uri),
    ("spark.mongodb.output.uri", mongo_uri),
    ("spark.jars.packages", mongo_connector_package),
):
    session_builder = session_builder.config(option_key, option_value)
spark = session_builder.getOrCreate()

# Only log errors so the cell outputs stay readable.
spark.sparkContext.setLogLevel("ERROR")

In [None]:
# No reader options are passed here, so the "mongo" source relies on the
# spark.mongodb.* settings configured on the session.
mongo_reader = spark.read.format("mongo")
df = mongo_reader.load()

In [None]:
pip install networkx matplotlib



**Creating Graph:**

In [None]:
# Build a host -> neighbourhood edge list from the documents loaded into `df`.
#
# This cell reuses the Spark session that `df` was loaded with; SparkSession is
# already imported and instantiated in the MongoDB setup cell, so no second
# builder (with a different app name) is created here.

# Vertex table: host id plus the descriptive columns of each document.
vertices = (
    df.select("host_id", "name", "room_type", "neighbourhood")
    .withColumnRenamed("host_id", "vertexId")
)

# Edge table: host (src) -> neighbourhood (dst), carrying the coordinates.
edges = (
    df.select("host_id", "neighbourhood", "geolocation")
    .withColumnRenamed("host_id", "src")
    .withColumnRenamed("neighbourhood", "dst")
)

# Create DataFrames for vertices and edges
vertices_df = vertices.select("vertexId", "name", "room_type", "neighbourhood")
edges_df = edges.select("src", "dst", "geolocation")

# Keep both tables queryable from Spark SQL under the names "vertices" and "edges".
vertices_df.createOrReplaceTempView("vertices")
edges_df.createOrReplaceTempView("edges")

# Project the graph rows directly from `df` instead of joining the "edges" and
# "vertices" views on host id. host_id is not a unique key: a host with k
# documents would produce k * k joined rows, pairing each geolocation with the
# names of that host's other documents. Selecting from `df` yields exactly one
# row per document with its own name and coordinates.
# The IS NOT NULL filter keeps the row set the inner join produced for
# single-document hosts (an equality join never matches a null key).
# NOTE(review): dst_name repeats the document name, exactly as the previous
# query did; since dst is the neighbourhood, confirm whether dst_name should
# carry the neighbourhood instead.
graph = df.where("host_id IS NOT NULL").selectExpr(
    "host_id AS src",
    "neighbourhood AS dst",
    "name AS src_name",
    "name AS dst_name",
    "geolocation",
)

# Replace nulls in the string columns with a placeholder; a string fill value
# leaves non-string columns such as the geolocation struct untouched.
graph_filled = graph.na.fill("Unknown")

# Show the first rows without truncating long cell values
graph_filled.show(truncate=False)



+---------+---------------------+--------------------------------------------------+--------------------------------------------------+-----------------------------------------+
|src      |dst                  |src_name                                          |dst_name                                          |geolocation                              |
+---------+---------------------+--------------------------------------------------+--------------------------------------------------+-----------------------------------------+
|3633025  |Surry Hills          |Light and sunny loft apt in Surry!                |Light and sunny loft apt in Surry!                |{-33.88605195495153, 151.20842616840682} |
|106400634|Unknown              |Easy access studio apt in NE DC                   |Easy access studio apt in NE DC                   |{38.92319239542878, -76.99360052167921}  |
|19980922 |Buckman              |Private & central PDX room with free breakfast    |Private & central PDX room

In [None]:
!pip install graphframes


Collecting graphframes
  Downloading graphframes-0.6-py2.py3-none-any.whl (18 kB)
Collecting nose (from graphframes)
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nose, graphframes
Successfully installed graphframes-0.6 nose-1.3.7


**Geolocation Map:**

In [None]:
pip install folium




In [None]:
import folium
from pyspark.sql.functions import expr

# Number of graph rows to plot and the initial zoom level of the map.
MAP_SAMPLE_SIZE = 90
MAP_ZOOM_START = 5

# Take a small subset of the graph and pull the coordinates out of the
# geolocation struct into flat columns.
sample_map_data = (
    graph_filled
    .limit(MAP_SAMPLE_SIZE)
    .withColumn("latitude", expr("geolocation.lat"))
    .withColumn("longitude", expr("geolocation.lon"))
)

# Collect the sample exactly once. limit() without an ordering is not
# guaranteed to return the same rows on every evaluation, so computing the
# centre and the markers in separate Spark actions could use different rows.
# Rows without coordinates are skipped because a marker cannot be placed for them.
map_rows = [
    row
    for row in sample_map_data.collect()
    if row["latitude"] is not None and row["longitude"] is not None
]
if not map_rows:
    raise ValueError("No rows with usable coordinates in the sample; cannot build the map.")

# Centre the map on the mean coordinates of the rows that will be plotted.
center_latitude = sum(row["latitude"] for row in map_rows) / len(map_rows)
center_longitude = sum(row["longitude"] for row in map_rows) / len(map_rows)

m = folium.Map(location=[center_latitude, center_longitude], zoom_start=MAP_ZOOM_START)

# Add one marker per row; parse_html=True makes folium escape the popup text.
for row in map_rows:
    popup_text = f"{row['src_name']} to {row['dst_name']}"
    folium.Marker(
        location=[row["latitude"], row["longitude"]],
        popup=folium.Popup(popup_text, parse_html=True),
    ).add_to(m)

# Display the map
m
