In [15]:
# import findspark
# findspark.init()

In [16]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [17]:
spark

#pre process


In [18]:
import pyspark.pandas as ps
import pandas as pd

In [19]:
df = spark.read.csv("spark/bangkok_traffy.csv", header=True, inferSchema=True,multiLine = True,escape="\"",sep=",",encoding='utf-8')

In [20]:
df = df.select('ticket_id','type','comment')

In [21]:
df.printSchema()

root
 |-- ticket_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- comment: string (nullable = true)



In [22]:
df.show(5,truncate = False)

+-----------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+
|ticket_id  |type               |comment                                                                                                                                                   |
+-----------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+
|2021-9LHDM6|{}                 |ไม่มีภาพ                                                                                                                                                  |
|2021-FYJTFP|{ความสะอาด}        |ขยะเยอะ                                                                                                                                                   |
|2021-8GKAR9|{สายไฟ}            |1. เถาวัลย์งอดบนสายไฟ 

In [23]:
from pyspark.sql.functions import col, isnull
filtered_nan_bracket_df = df.filter((col("type") == "{}") | col("type").isNull())

# Count the number of rows in the filtered DataFrame
row_count = filtered_nan_bracket_df.count()
print(row_count)

62540


In [24]:
non_matching_df = df.exceptAll(filtered_nan_bracket_df)

# Show the non-matching rows (optional)df.show()

# Alternatively, you can assign the filtered DataFrame directly
bangkok_traffy_df = non_matching_df

In [26]:
filtered_nan_bracket_df.toPandas().to_csv('nan_bucket_df.csv')

In [None]:
bangkok_traffy_df.count()

208676

In [None]:
from pyspark.sql.functions import col, udf,length,substring,regexp_replace,split
from pyspark.sql.types import ArrayType, IntegerType

In [None]:
types = ['ถนน','ทางเท้า','แสงสว่าง','ความปลอดภัย','น้ำท่วม','ความสะอาด','กีดขวาง',
        'ท่อระบายน้ำ','สะพาน','จราจร','สายไฟ','คลอง','เสียงรบกวน','ต้นไม้','ร้องเรียน',
        'ป้าย','สัตว์จรจัด',"PM25",'สอบถาม','เสนอแนะ','คนจรจัด','การเดินทาง','ห้องน้ำ','ป้ายจราจร']

In [None]:
def string_to_list(string):
    clean_string = regexp_replace(string, "[{}]", "")
    return split(clean_string, ",")

# Apply the UDF to convert the string_col to a list_col
bangkok_traffy_df = bangkok_traffy_df.withColumn("type", string_to_list(df["type"]))

# Show the resulting DataFrame
bangkok_traffy_df.show(truncate=False)

+-----------+----------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ticket_id  |type                        |comment                                                                                                                                                                                                                                                                                                                                                    

In [None]:
def cal_function(x):
    x = x.split()
    return x

In [None]:
bangkok_traffy_df.printSchema()

root
 |-- ticket_id: string (nullable = true)
 |-- type: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- comment: string (nullable = true)



In [None]:
pandas_df = bangkok_traffy_df.toPandas()

In [None]:
spark.stop()

In [None]:
bangkok_traffy_df = pandas_df.copy()

In [None]:
bangkok_traffy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208676 entries, 0 to 208675
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   ticket_id  207089 non-null  object
 1   type       208676 non-null  object
 2   comment    207089 non-null  object
dtypes: object(3)
memory usage: 4.8+ MB


In [None]:
bangkok_traffy_df['type'][0]

['ถนน']

In [None]:
for i in types:
    bangkok_traffy_df[i] = 0

In [None]:
def add_column_for_each_type(rows):
    _list = [0 for i in range(len(types))]
    for _type in rows["type"]:
        if _type[-2:] == "\n":
            _type = _type[-2:]    
        if _type == "PM2.5": _type = "PM25"
        _list[types.index(_type)] = 1
    rows[types] = _list
    rows["labels"] = _list
    return rows
bangkok_traffy_df = bangkok_traffy_df.apply(add_column_for_each_type,axis = "columns")

In [None]:
bangkok_traffy_df

Unnamed: 0,ticket_id,type,comment,ถนน,ทางเท้า,แสงสว่าง,ความปลอดภัย,น้ำท่วม,ความสะอาด,กีดขวาง,...,ป้าย,สัตว์จรจัด,PM25,สอบถาม,เสนอแนะ,คนจรจัด,การเดินทาง,ห้องน้ำ,ป้ายจราจร,labels
0,2021-BEJ9PP,[ถนน],สะพานลอยกีดขวางทางเท้า ถนนเล็ก ควรทำเป็นทางข้า...,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2021-DDF9VX,"[คลอง, ความปลอดภัย, ทางเท้า]",ฟุตบาทยุบ และแคบมาก อันตรายที่จะพลัดตกลงคลองที...,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
2,2022-8TN97P,[จราจร],เสาไฟจราจรเอียงจะล้ม,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
3,2022-GPUWCP,"[สายไฟ, ต้นไม้]",ต้นไม้สู้ชนสายไฟ,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ..."
4,2022-KNYYC3,"[คลอง, ความสะอาด]",การทิ้งขยะลงในลำคลอง,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208671,2023-4989VN,"[ถนน, คลอง]",📢🔊🌳🍃🎋แจ้ง มีวัชพืช เลื้อยพัน และไต่ระดับขึ้นสู...,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
208672,UNWWLC,"[เสียงรบกวน, ถนน]",‘กรุณาอ่านข้อมูลบรรยายประกอบการปักหมุด’\n* ปัญ...,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
208673,PT24KT,[สัตว์จรจัด],ปัญหา : ช่วยจัดการกับนกพิราบ จำนวนมาก\nจุดสังเ...,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
208674,2023-GL4KWC,[ทางเท้า],ทางเท้าแบบนี้สภาพไม่ดีเลย กรุณาปรับปรุงโดยด่วน,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
bangkok_traffy_df["comment"].isna().sum()

1587

# multilabel classification (text)

In [None]:
filtered_nan_comment_df = bangkok_traffy_df.loc[bangkok_traffy_df["comment"].isna()]

In [None]:
print(len(filtered_nan_comment_df))

1587


In [None]:
tmp_text_df = bangkok_traffy_df.drop(filtered_nan_comment_df.index.to_numpy())
print(len(tmp_text_df))

207089


In [None]:
tmp_text_df.to_csv('cleaned_data.csv')

In [None]:
df = tmp_text_df.sample(200)

In [None]:
df

Unnamed: 0,ticket_id,type,comment,ถนน,ทางเท้า,แสงสว่าง,ความปลอดภัย,น้ำท่วม,ความสะอาด,กีดขวาง,...,ป้าย,สัตว์จรจัด,PM25,สอบถาม,เสนอแนะ,คนจรจัด,การเดินทาง,ห้องน้ำ,ป้ายจราจร,labels
204348,2023-AWGD73,[แสงสว่าง],ปกติจะต้องมีไฟติดไหมคะ อุโมงค์ทางรอด อันนี้มืด...,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
110563,2022-3H8P6D,[จราจร],รถติด,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
162314,2023-MNFBDB,[น้ำท่วม],ร่องระบายน้ำไม่ได้จริง ร่องระบายน้ำไม่เชื่อมต่...,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
204310,2023-NAR3G4,[ป้าย],ป้ายติดอยู่ที่ไฟสัญญาณจราจร,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4217,2022-96Q4QA,"[สะพาน, ถนน]",รถสองแถวจอดแช่บริเวณขาวแดงใกล้สี่แยกทำให้เกิดป...,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191165,2022-39CCPN,"[ความสะอาด, ต้นไม้]",แจ้งเรื่องต้นไม้ ขยะไปนานมากแล้ว,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
62335,2023-N8GHVZ,[ทางเท้า],นั่งล้างของบนฟุตบาท นั่งเเบบปิดฟุตบาท คนเดินไม...,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7734,2022-GVCT2P,"[น้ำท่วม, ทางเท้า]",การปะปาขุดฟุตบาท​แล้วไม่ปูกระเบื้องให้เหมือนเด...,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
31471,2022-9UHBWD,[น้ำท่วม],น้ำท่วม,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['comment'], df['labels'], test_size=0.1, random_state=42)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train , stratify=y_train , test_size=1/9, random_state=42)

In [None]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [None]:
from transformers import AutoTokenizer,BertTokenizer
# import torch
# import torch.nn as nn
# from torch.utils.data import Dataset, DataLoader

In [None]:
tokenizer = AutoTokenizer.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased")
# class Dataset(torch.utils.data.Dataset):

#     def __init__(self, df):

#         self.labels = df['labels'].values
       
#         self.texts = [tokenizer(text, 
#                                padding='max_length', max_length = 256, truncation=True,
#                                 return_tensors="pt") for text in df['comment']]

#     def __len__(self):
#         return len(self.labels)

#     def get_batch_labels(self, idx):
#         # Fetch a batch of labels
        
#         return self.labels[idx]
        

#     def get_batch_texts(self, idx):
#         # Fetch a batch of inputs
#         return self.texts[idx]

#     def __getitem__(self, idx):

#         batch_texts = self.get_batch_texts(idx)
#         batch_y = self.get_batch_labels(idx)
        

#         return batch_texts, batch_y

OSError: Model name 'airesearch/wangchanberta-base-att-spm-uncased' was not found in tokenizers model name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased). We assumed 'airesearch/wangchanberta-base-att-spm-uncased' was a path or url to a directory containing vocabulary files named ['vocab.txt'] but couldn't find such vocabulary files at this path or url.