In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StringType
from pyspark.sql.functions import pandas_udf
import pandas as pd
import numpy as np
import pickle

In [2]:
# Detect whether the notebook is running on Google Colab by probing for the
# `google.colab` package. Only an import failure means "not on Colab"; any
# other exception (e.g. KeyboardInterrupt) is allowed to propagate instead of
# being silently swallowed by a bare `except`.
try:
    import google.colab  # noqa: F401 -- imported only to test availability
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

In [3]:
if IN_COLAB:
    !apt-get install openjdk-8-jdk-headless -qq > /dev/null
    !wget -q https://dlcdn.apache.org/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz
    !tar xf spark-3.5.5-bin-hadoop3.tgz
    !mv spark-3.5.5-bin-hadoop3 spark
    !pip install -q findspark
    import os
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["SPARK_HOME"] = "/content/spark"

In [4]:
# Let findspark put the Spark installation on sys.path, then start (or reuse)
# a SparkSession that runs locally under the application name 'Spark'.
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

spark_url = 'local'
spark = (
    SparkSession.builder
    .master(spark_url)
    .appName('Spark')
    .getOrCreate()
)

---

In [None]:
# Loading File from Google Drive
!gdown 'https://drive.google.com/uc?id=1PE-fNRsLA9qqEHmaiof0xV-Uz_5ZfCuO' --output df_with_cluster.csv

Downloading...
From (original): https://drive.google.com/uc?id=1Xbce69FUS1DOgRkOOmi6VT2eysYlSSFJ
From (redirected): https://drive.google.com/uc?id=1Xbce69FUS1DOgRkOOmi6VT2eysYlSSFJ&confirm=t&uuid=01633e0f-2398-4371-9858-7702f9a32e65
To: /content/df_with_cluster.csv
100% 637M/637M [00:06<00:00, 103MB/s]


In [17]:
# -------------- TABLE 1: Showing Example Comment ---------------------

# Load the clustered comments: the first line is treated as the header and
# column types are inferred from the data.
df_with_clustering = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('df_with_cluster.csv')
)

# Keep the cluster label (exposed as 'cluster') and the comment text, and
# discard every row whose label begins with 'noise'.
df = (
    df_with_clustering
    .select(col('cluster_id').alias('cluster'), col('comment'))
    .where(~col('cluster').startswith('noise'))
)

# Collapse to one partition so Spark writes a single part file (with header)
# under the 'example_comment' output directory, replacing any previous run.
(
    df
    .coalesce(1)
    .write
    .mode('overwrite')
    .option('header', True)
    .csv('example_comment')
)

---

In [None]:
!gdown 'https://drive.google.com/uc?id=12RR873XXJrFnJiKJWkqYNnzAfUB-sDyL' --output bright.csv
!gdown 'https://drive.google.com/uc?id=1KupegXEIkR8hURDfWE7qGM9HRlWY-iCT' --output piti.csv

Downloading...
From (original): https://drive.google.com/uc?id=165cvX8TlwZrShLMBpvEg7v2iZgZlG8h-
From (redirected): https://drive.google.com/uc?id=165cvX8TlwZrShLMBpvEg7v2iZgZlG8h-&confirm=t&uuid=4688ac35-407c-42fe-8485-8d7153d88c53
To: /content/bright.csv
100% 135M/135M [00:01<00:00, 133MB/s]
Downloading...
From: https://drive.google.com/uc?id=1KupegXEIkR8hURDfWE7qGM9HRlWY-iCT
To: /content/piti.csv
100% 4.10M/4.10M [00:00<00:00, 196MB/s]


In [None]:
# -------------- TABLE 2: Map and Graph visualization ---------------------

# 1. piti: 'cluster_id', 'num_times', 'status'
# 2. bright: 'cluster_id', 'cluster_desc', 'lat', 'long', 'organization'
# 3. keen: zone scraping to get 'zone'

# 1. Load the pre-trained KNN model from Google Drive
!gdown --fuzzy "https://drive.google.com/file/d/1w21nVLe-xYaQ3ZiP5nOY_f9R_t9dMbII/view?usp=drive_link"

with open("knn.pkl", "rb") as f:
    knn = pickle.load(f)

# 2. Read the CSV
piti_df = spark.read.csv('piti.csv', header=True, inferSchema=True)
bright_df = spark.read.csv('bright.csv', header=True, inferSchema=True)
bright_df = bright_df.withColumnRenamed("lon", "long")

# 3. Merge the tables on 'cluster_id'
piti_df = piti_df.withColumn("cluster_id", col("cluster_id").cast("string"))
bright_df = bright_df.withColumn("cluster_id", col("cluster_id").cast("string"))
merged_df = piti_df.join(bright_df, on='cluster_id', how='inner')

# 4. Define the pandas_udf function to apply the KNN model and predict the zone
@pandas_udf(StringType())
def predict_zone_pandas_udf(lat: pd.Series, lon: pd.Series) -> pd.Series:
    """Predict a zone label for each (latitude, longitude) pair in a batch.

    Uses the module-level ``knn`` model, which is captured by this function
    and shipped to the workers together with the UDF.

    Args:
        lat: Latitudes for one batch of rows.
        lon: Longitudes for the same rows, aligned with ``lat``.

    Returns:
        A Series with one entry per input row: the model's prediction where
        both coordinates are numeric, and ``None`` (a null zone) where either
        coordinate is missing or cannot be parsed as a number.
    """
    # Nulls and unparseable values become NaN so they can be masked out.
    # NOTE(review): assumes the model rejects NaN input (as scikit-learn
    # estimators do) -- without the mask one bad row would fail the whole job.
    lat_num = pd.to_numeric(lat, errors="coerce")
    lon_num = pd.to_numeric(lon, errors="coerce")
    valid = (lat_num.notna() & lon_num.notna()).to_numpy()

    # Start with all-null output and fill in only the rows we can predict.
    zones = np.full(len(lat), None, dtype=object)
    if valid.any():  # also skips predict() on an empty or all-invalid batch
        coords = np.column_stack(
            (lat_num.to_numpy()[valid], lon_num.to_numpy()[valid])
        )  # shape (n_valid, 2): one [lat, lon] row per valid input
        zones[valid] = knn.predict(coords)
    return pd.Series(zones)

# 5. Add a 'zone' column by running the KNN UDF over each row's 'lat' / 'long'
merged_df = merged_df.withColumn(
    'zone',
    predict_zone_pandas_udf(col('lat'), col('long')),
)

# 6. Write the result as CSV with a header row. coalesce(1) funnels the data
#    into one partition so a single part file lands in the 'cluster_data'
#    output directory; any previous output there is overwritten.
(
    merged_df
    .coalesce(1)
    .write
    .mode('overwrite')
    .option('header', True)
    .csv('cluster_data')
)

Downloading...
From: https://drive.google.com/uc?id=1w21nVLe-xYaQ3ZiP5nOY_f9R_t9dMbII
To: /content/knn.pkl
100% 23.9M/23.9M [00:00<00:00, 106MB/s]
