In [58]:
!pip install gdown

# to upgrade
!pip install --upgrade gdown

!pip install pyspark



In [59]:
from pathlib import Path

import gdown

# Google Drive share link for the raw Traffy export; fuzzy=True lets gdown
# extract the file id from a ".../view?usp=sharing" style URL.
url = "https://drive.google.com/file/d/13xxXYVtwlP50BXGOHpnb8bGlNVHxoxyZ/view?usp=sharing"
output = "bangkok_traffy.csv"

# The file is about 1 GB, so skip the download when it is already on disk.
# Delete bangkok_traffy.csv to force a fresh download.
if not Path(output).exists():
    downloaded = gdown.download(url=url, output=output, fuzzy=True)
    # Some gdown versions report a failed download by returning None
    # instead of raising, so fail loudly here rather than in a later cell.
    if downloaded is None:
        raise RuntimeError(f"gdown could not download {url}")

# Show the local path as the cell output.
output

Downloading...
From (original): https://drive.google.com/uc?id=13xxXYVtwlP50BXGOHpnb8bGlNVHxoxyZ
From (redirected): https://drive.google.com/uc?id=13xxXYVtwlP50BXGOHpnb8bGlNVHxoxyZ&confirm=t&uuid=b34f896e-3a88-4d63-b7ee-2cd56a4ee4ed
To: /content/bangkok_traffy.csv
100%|██████████| 1.07G/1.07G [00:20<00:00, 51.1MB/s]


'bangkok_traffy.csv'

Fix broken CSV before read_csv

In [72]:
import csv

# Copy bangkok_traffy.csv to cleaned_traffy.csv, re-serialising every record
# through the csv module and dropping records that parse to one field or fewer
# (blank lines and single-column fragments).
# errors='replace' substitutes undecodable bytes instead of raising.
with open('bangkok_traffy.csv', 'r', encoding='utf-8', errors='replace') as raw_file, \
     open('cleaned_traffy.csv', 'w', encoding='utf-8', newline='') as cleaned_file:
    csv.writer(cleaned_file).writerows(
        record for record in csv.reader(raw_file) if len(record) > 1
    )

Read the cleaned CSV into a Spark DataFrame

In [73]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, regexp_replace, trim

spark = SparkSession.builder.appName("TraffyAnalysis").getOrCreate()

# cleaned_traffy.csv was written by Python's csv.writer, which wraps fields
# containing commas, quotes or line breaks in double quotes and escapes an
# embedded quote by doubling it. Spark's CSV reader defaults do not match that:
#   * multiLine=True lets a quoted field span several physical lines, so a
#     comment with a line break stays one record instead of being split into
#     broken rows;
#   * escape='"' makes Spark treat "" inside a quoted field as a literal quote
#     (the default escape character is a backslash).
# Trade-off: a multiLine CSV file cannot be split across tasks while reading.
df = (
    spark.read
    .option("header", True)
    .option("encoding", "utf-8")
    .option("multiLine", True)
    .option("quote", '"')
    .option("escape", '"')
    .csv("cleaned_traffy.csv")
)
df.head(5)

[Row(ticket_id='2021-FYJTFP', type='{ความสะอาด}', organization='เขตบางซื่อ', comment='ขยะเยอะ', photo='https://storage.googleapis.com/traffy_public_bucket/attachment/2021-09/3063e748259afbb7171467e19b92e9cc1f1a5826.jpg', photo_after=None, coords='100.53084,13.81865', address='12/14 ถนน กรุงเทพ- นนทบุรี แขวง บางซื่อ เขตบางซื่อ กรุงเทพมหานคร 10800 ประเทศไทย', subdistrict=None, district=None, province='กรุงเทพมหานคร', timestamp='2021-09-03 12:51:09.453003+00', state='เสร็จสิ้น', star=None, count_reopen='0', last_activity='2022-06-04 15:34:14.609206+00'),
 Row(ticket_id='2021-CGPMUN', type='{น้ำท่วม,ร้องเรียน}', organization='เขตประเวศ,ฝ่ายโยธา เขตประเวศ', comment='น้ำท่วมเวลาฝนตกและทะลุเข้าบ้านเดือดร้อนมากทุกๆปีจะมีเครื่องสูบน้ำแต่ปีนี้ไม่มีกทม.ปล่อยทิ้ง ชุมชนเคหะนคร1แปลง2(ซ.เฉลิมพระเกียรติร.9ซอง22 วัดตะกล่ำ ประเวศ)', photo='https://storage.googleapis.com/traffy_public_bucket/attachment/2021-09/41ef2b1b465b708db17363a6d5fd8c5336266df7.jpg', photo_after='https://storage.googleapis.com/traf

Drop rows with NA values in the required columns

In [74]:
# Columns that must be present on every ticket; a row is removed when any of
# them is null.
required_columns = ["organization", "comment", "timestamp", "state", "last_activity"]
df = df.dropna(subset=required_columns)

Split coords into longitude and latitude columns


In [75]:
# coords holds two comma-separated numbers: the first becomes longitude and
# the second latitude, both cast to double. The original string column is
# dropped afterwards.
coord_parts = split(col("coords"), ",")
df = (
    df.withColumn("longitude", coord_parts[0].cast("double"))
      .withColumn("latitude", coord_parts[1].cast("double"))
      .drop("coords")
)

Clean comment text

In [76]:
import re
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def clean_text(text):
    """Normalise a free-text comment.

    Newlines are turned into spaces, then every character that is not Thai
    (U+0E00-U+0E7F), an ASCII letter, an ASCII digit or whitespace is removed,
    and finally leading/trailing whitespace is stripped.

    Args:
        text: The raw comment, or None.

    Returns:
        The cleaned string; an empty string when ``text`` is None or empty.
    """
    # Guard clause: nothing to clean for None / "".
    if not text:
        return ""

    single_line = re.sub(r'\n', ' ', text)
    allowed_only = re.sub(r'[^\u0E00-\u0E7Fa-zA-Z0-9\s]', '', single_line)
    return allowed_only.strip()

# Wrap clean_text as a Spark UDF that returns a string, then overwrite the
# comment column with its cleaned version.
clean_comment_udf = udf(clean_text, returnType=StringType())

comment_cleaned = clean_comment_udf(col("comment"))
df = df.withColumn("comment", comment_cleaned)

Download result.csv

In [78]:
import os
import shutil
import glob

output_dir = "cleaned_result"

# Write the cleaned DataFrame before looking for its part file. Without this
# step the cell only works when cleaned_result/ is left over from an earlier
# session. coalesce(1) makes Spark emit a single part-*.csv file, and
# mode("overwrite") keeps the cell re-runnable.
df.coalesce(1).write.mode("overwrite").option("header", True).csv(output_dir)

# Spark names its output part-<id>.csv inside the directory; pick that file up
# and fail with a clear message if it is not there.
part_files = glob.glob(os.path.join(output_dir, "part-*.csv"))
if not part_files:
    raise FileNotFoundError(f"No part-*.csv file found in {output_dir!r}")
part_file = part_files[0]

shutil.move(part_file, "result.csv")

# Colab-only helper that triggers a browser download of the file.
from google.colab import files
files.download("result.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>