# Exploratory Data Analysis
---

In [1]:
from pathlib import Path

path = Path().absolute()

## Load dataset
---

In [2]:
import pandas as pd

df = pd.read_csv(path / "assets/dataset.csv", delimiter=";")

## Load spacy-stanza model
---

In [3]:
import stanza

# download indonesian model
stanza.download("id")

import spacy_stanza

# initialize pipeline
nlp = spacy_stanza.load_pipeline("id", processors="tokenize,mwt,pos,lemma")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-02-05 00:22:29 INFO: Downloading default packages for language: id (Indonesian) ...
2023-02-05 00:22:29 INFO: File exists: C:\Users\novia\stanza_resources\id\default.zip
2023-02-05 00:22:31 INFO: Finished downloading models and saved to C:\Users\novia\stanza_resources.
2023-02-05 00:22:32 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-02-05 00:22:32 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |

2023-02-05 00:22:32 INFO: Use device: cpu
2023-02-05 00:22:32 INFO: Loading: tokenize
2023-02-05 00:22:32 INFO: Loading: mwt
2023-02-05 00:22:32 INFO: Loading: pos
2023-02-05 00:22:32 INFO: Loading: lemma
2023-02-05 00:22:32 INFO: Done loading processors!


### Test one sample
---

In [4]:
i = 0
row = df.loc[i]
print(row, end="\n\n")

from prettytable import PrettyTable

table = PrettyTable()
table.field_names = ["I", "Token", "Lemma", "POS", "Morph"]
table.align = "l"

for token in nlp(row["text"]):
    table.add_row([token.i, token.text, token.lemma_, token.pos_, str(token.morph)])

print(table)

text              saya tidak merasa terhina
target_emotion                        sedih
Name: 0, dtype: object

+---+---------+---------+------+-----------------------------------------------+
| I | Token   | Lemma   | POS  | Morph                                         |
+---+---------+---------+------+-----------------------------------------------+
| 0 | saya    | saya    | PRON | Number=Sing|Person=1|Polite=Form|PronType=Prs |
| 1 | tidak   | tidak   | PART | Polarity=Neg                                  |
| 2 | merasa  | rasa    | VERB | Mood=Ind|Voice=Act                            |
| 3 | terhina | terhina | ADJ  |                                               |
+---+---------+---------+------+-----------------------------------------------+


## Dirty Text Preprocessing
---

In [5]:
# zip text with its context
texts_with_contexts = [
    (
        df.at[i, "text"],
        {
            "id": i,
            "target_emotion": df.at[i, "target_emotion"]
        }
    )
    for i in df.index
]

In [6]:
from tqdm import tqdm

# batch processing
docs_with_contexts = tqdm(nlp.pipe(texts_with_contexts, as_tuples=True), total=len(texts_with_contexts))

  0%|          | 0/17701 [00:00<?, ?it/s]

### Initialize doc context
---

In [7]:
from spacy.tokens import Doc

Doc.set_extension("id", default=None, force=True)
Doc.set_extension("target_emotion", default=None, force=True)

### Set the docs contexts
---

In [8]:
docs = []

for doc, contexts in docs_with_contexts:
    for key, val in contexts.items():
        doc._.set(key, val)

    docs.append(doc)

  doc = self._ensure_doc(doc_like)
Words: ['saya', 'melepaskan', 'permusuhan', 'yang', 'terhadap', 'siapa', 'pun', 'yang', 'saya', 'rasa', 'telah', 'mengani', 'aa', 'saya']
Entities: []
  doc = self._ensure_doc(doc_like)
Words: ['saya', 'merasa', 'bermanfaat', 'untuk', 'mendokumemtasin', 'nya', 'untuk', 'orang-orang', 'yang', 'tidak', 'terbiasa', 'dengan', 'file', 'batch']
Entities: []
  doc = self._ensure_doc(doc_like)
Words: ['Saya', 'bisa', 'merasakan', 'ketidakakaban', 'nya', 'dan', 'saya', 'tidak', 'bisa', 'menghentikan', 'tubuh', 'saya', 'untuk', 'memberi', 'nya', 'respons', 'positif']
Entities: []
  doc = self._ensure_doc(doc_like)
Words: ['saya', 'harus', 'terbuka', 'dengan', 'mereka', 'seperti', 'saya', 'dengan', 'beberapa', 'teman', 'saya', 'ketika', 'saya', 'merasa', 'bahwa', 'mereka', 'telah', 'mengani', 'aa', 'saya']
Entities: []
  doc = self._ensure_doc(doc_like)
Words: ['saya', 'bisa', 'menghabiskan', 'hidup', 'saya', 'mengutuk', 'orang', 'lain', 'saya', 'merasa', 'telah

### Save Dirty Text Preprocessing
---

In [9]:
from spacy.tokens import DocBin

DocBin(
    docs=docs,
    store_user_data=True
).to_disk(path / "assets/docs.preprocessing.dirtyX`.spacy")