# Exploratory Data Analysis
---

In [1]:
from pathlib import Path

# Absolute path of the current working directory; later cells build asset paths from it.
path = Path.cwd()

## Load stanza model
---

In [2]:
import stanza
import spacy_stanza

# Fetch the Stanza resources for Indonesian ("id").
stanza.download("id")

# Build the spaCy-wrapped Stanza pipeline, restricted to the listed processors.
nlp = spacy_stanza.load_pipeline(
    "id",
    processors="tokenize,mwt,pos,lemma",
)

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 47.6MB/s]                    
2023-05-05 22:11:31 INFO: Downloading default packages for language: id (Indonesian) ...
2023-05-05 22:11:31 INFO: File exists: C:\Users\c10nGp4\stanza_resources\id\default.zip
2023-05-05 22:11:33 INFO: Finished downloading models and saved to C:\Users\c10nGp4\stanza_resources.
2023-05-05 22:11:33 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 45.9MB/s]                    
2023-05-05 22:11:34 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | g

### Initialize doc extensions
---

In [3]:
from spacy.tokens import Doc

# Register the custom per-document attributes (default None). force=True is
# passed so that re-running this cell does not fail on an existing registration.
for extension_name in ("id", "target_emotion"):
    Doc.set_extension(extension_name, default=None, force=True)

### Test one sample
---

In [4]:
from prettytable import PrettyTable

def process_sample(text: str):
    """Print ``text`` and a table of its per-token annotations.

    The text is echoed first (followed by a blank line), then passed through
    the notebook-level ``nlp`` pipeline. One table row is printed per token,
    holding the token index, surface text, lemma, POS tag and the string form
    of its morphological features.
    """
    print(text, end="\n\n")

    # One row per token, in document order.
    rows = [
        [tok.i, tok.text, tok.lemma_, tok.pos_, str(tok.morph)]
        for tok in nlp(text)
    ]

    report = PrettyTable()
    report.field_names = ["I", "Token", "Lemma", "POS", "Morph"]
    report.align = "l"  # left-align every column
    for row in rows:
        report.add_row(row)

    print(report)

### Load dataset
---

In [5]:
import pandas as pd

# The dataset file uses ";" as its field separator.
df = pd.read_csv(
    path / "assets/dataset.csv",
    sep=";",
)

In [6]:
# Run the demo on the text of the row labelled 0.
first_text = df.at[0, "text"]
process_sample(first_text)

saya tidak merasa terhina

+---+---------+---------+------+-----------------------------------------------+
| I | Token   | Lemma   | POS  | Morph                                         |
+---+---------+---------+------+-----------------------------------------------+
| 0 | saya    | saya    | PRON | Number=Sing|Person=1|Polite=Form|PronType=Prs |
| 1 | tidak   | tidak   | PART | Polarity=Neg                                  |
| 2 | merasa  | rasa    | VERB | Mood=Ind|Voice=Act                            |
| 3 | terhina | terhina | ADJ  |                                               |
+---+---------+---------+------+-----------------------------------------------+


## Dirty Text Preprocessing
---

### Prepare the texts with their contexts
---

In [7]:
# # zip text with its context
# texts_with_contexts = [
#     (
#         df.at[i, "text"],
#         {
#             "id": i,
#             "target_emotion": df.at[i, "target_emotion"]
#         }
#     )
#     for i in df.index
# ]

### Process the texts
---

In [8]:
# from tqdm import tqdm

# # batch processing
# docs_with_contexts = tqdm(nlp.pipe(texts_with_contexts, as_tuples=True), total=len(texts_with_contexts))

### Set the doc extensions
---

In [9]:
# docs = []

# for doc, contexts in docs_with_contexts:
#     for key, val in contexts.items():
#         doc._.set(key, val)

#     docs.append(doc)

### Save Dirty Text Preprocessing
---

In [10]:
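# NOTE: this cell is disabled (commented out). The file it would write,
# "assets/docs.preprocessing.dirty.spacy", is the one read back in "Load processed docs".
# from spacy.tokens import DocBin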

# DocBin(
#     docs=docs,
#     store_user_data=True
# ).to_disk(path / "assets/docs.preprocessing.dirty.spacy")

## Explore Data
---

### Load processed docs
---

In [11]:
from spacy.tokens import DocBin

# Read the serialized DocBin from disk, then materialize its docs using the
# pipeline's vocab.
doc_bin = DocBin().from_disk(path / "assets/docs.preprocessing.dirty.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))

### Part of Speech list
---

In [12]:
# Every distinct `pos_` value found across all tokens of all docs.
pos = {token.pos_ for doc in docs for token in doc}

pos

{'ADJ',
 'ADP',
 'ADV',
 'AUX',
 'CCONJ',
 'DET',
 'INTJ',
 'NOUN',
 'NUM',
 'PART',
 'PRON',
 'PROPN',
 'PUNCT',
 'SCONJ',
 'SYM',
 'VERB',
 'X'}

### Most common lemmas of specific POS tag
---

In [13]:
from collections import Counter

# Lemma of every token tagged "X", in corpus order.
lemmas = [
    token.lemma_
    for doc in docs
    for token in doc
    if token.pos_ == "X"
]

# Rank the collected lemmas from most to least frequent.
Counter(lemmas).most_common()

[('a', 37),
 ('s', 18),
 ('haha', 8),
 ('feel', 6),
 ('href', 5),
 ('wah', 4),
 ('bla', 4),
 ('i', 4),
 ('and', 4),
 ('hah', 3),
 ('hahaha', 3),
 ('the', 3),
 ('kok', 2),
 ('amp', 2),
 ('eh', 2),
 ('got', 2),
 ('hmmm', 2),
 ('oh', 2),
 ('ahaha', 1),
 ('pitch', 1),
 ('in', 1),
 ('hihi', 1),
 ('lho', 1),
 ('lol', 1),
 ('pics', 1),
 ('yay', 1),
 ('aryans', 1),
 ('boo', 1),
 ('light', 1),
 ('who', 1),
 ('boss', 1),
 ('my', 1),
 ('find', 1),
 ('b', 1),
 ('for', 1),
 ('down', 1),
 ('xd', 1),
 ('hehe', 1),
 ('whooooo', 1),
 ('crack', 1),
 ('whos', 1),
 ('craaaaaaack', 1),
 ('wow', 1),
 ('super', 1),
 ('ehh', 1),
 ('sih', 1),
 ('agn', 1),
 ('gt', 1),
 ('momo', 1),
 ('to', 1),
 ('btw', 1)]

### Text cleaning
---

In [20]:
# Collect the text of every doc that contains a standalone "a" token.
# Each doc contributes at most once, however many "a" tokens it holds.
# (The output below shows many of these end in fragments like "a href http ...".)
errors = [
    doc.text
    for doc in docs
    if any(token.text == "a" for token in doc)
]

print(len(errors))
errors

148


['saya berhenti merasa sangat lelah a href http provokingbeauty',
 'saya merasa sangat bingung a href http twitter',
 'saya terkadang merasa tidak disukai di tempat kerja dan berpikir orang mungkin membicarakan saya rel bookmark Terkadang saya merasa tidak disukai di tempat kerja dan berpikir orang mungkin membicarakan saya april a class url fn n href http www',
 'saya benar-benar merasa bahwa kita sedang berkembang menuju masyarakat yang lebih tak kenal takut secara bertahap membuang batas-batas yang tampaknya kaku seperti paranoia atas keamanan barang-barang seseorang a href http www',
 'saya harus merasa seperti wanita mandiri yang sukses a la takdir anak no',
 'saya suka musim gugur dan segala sesuatu yang menyertainya meskipun saya merasa saya semakin bersemangat untuk natal terlalu dini tahun ini saya dan teman-teman saya termasuk a href http andthenwear',
 'saya benar-benar merasa seperti barang panas yang mondar-mandir di jalan di dalamnya a href http',
 'aku merasa hatiku ada 

In [34]:
import re

# Probe each flagged text for the literal fragment "a href".
# The \b word boundaries stop the pattern from matching inside ordinary words
# that merely end in "a" (e.g. "saya href" must not count as a hit).
# TODO: variants such as "a class url fn n href" are not matched by this pattern yet.
anchor_pattern = re.compile(r'\ba href\b')

for error in errors:
    matches = anchor_pattern.findall(error)

    print(error)
    print(matches)
    print()

saya berhenti merasa sangat lelah a href http provokingbeauty
['a href']

saya merasa sangat bingung a href http twitter
['a href']

saya terkadang merasa tidak disukai di tempat kerja dan berpikir orang mungkin membicarakan saya rel bookmark Terkadang saya merasa tidak disukai di tempat kerja dan berpikir orang mungkin membicarakan saya april a class url fn n href http www
[]

saya benar-benar merasa bahwa kita sedang berkembang menuju masyarakat yang lebih tak kenal takut secara bertahap membuang batas-batas yang tampaknya kaku seperti paranoia atas keamanan barang-barang seseorang a href http www
['a href']

saya harus merasa seperti wanita mandiri yang sukses a la takdir anak no
[]

saya suka musim gugur dan segala sesuatu yang menyertainya meskipun saya merasa saya semakin bersemangat untuk natal terlalu dini tahun ini saya dan teman-teman saya termasuk a href http andthenwear
['a href']

saya benar-benar merasa seperti barang panas yang mondar-mandir di jalan di dalamnya a href h