In [None]:
import datetime as dt
import os
import re
import shutil
import warnings
from collections import Counter

import emoji
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import stanza
import tensorflow as tf
import torch
import tqdm.auto as tqdm
from nltk import ngrams
from nltk import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from textblob import TextBlob
from transformers import pipeline
from wordcloud import WordCloud

# Install a global "ignore" filter: with no category/module arguments this
# suppresses every warning (including deprecation warnings) for the rest of
# the kernel session.
warnings.filterwarnings("ignore")

# **Import Data**

# **Filter Data**

# **Text Preprocessing**

In [None]:
def data_cleaning_pipeline(text: str):
    """Normalise one raw text before labeling.

    Steps, in order: strip emojis, rewrite slang with ``replace_slang``
    (defined elsewhere in the notebook), remove URLs, remove @mentions and
    #hashtags, remove digits, lowercase, and drop repeated words while keeping
    the first occurrence of each.

    Args:
        text: Raw text. Any non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned, lowercased text with single spaces between words
        (may be empty).
    """
    if not isinstance(text, str):
        text = str(text)

    cleaned = emoji.replace_emoji(text, replace='')
    cleaned = replace_slang(cleaned)
    cleaned = re.sub(r'http\S+', '', cleaned)        # URLs
    cleaned = re.sub(r'\B[@#]\w+\b', '', cleaned)    # @mentions and #hashtags
    cleaned = re.sub(r'\d+', '', cleaned)            # digits

    # Lowercase BEFORE de-duplicating so case variants ("Bagus bagus") count as
    # the same word. split() already collapses any run of whitespace, so no
    # separate whitespace-normalisation pass is needed.
    words = cleaned.lower().split()

    # dict.fromkeys keeps first-occurrence order in O(n); the previous
    # sorted(set(...), key=words.index) was quadratic in the number of words.
    return ' '.join(dict.fromkeys(words))

In [None]:
# Clean only the non-missing texts: data_cleaning_pipeline str()-casts its
# input, so a NaN passed through it would come back as the literal text "nan"
# and the dropna below could never remove it.
cleaned_text = df['full_text'].map(data_cleaning_pipeline, na_action='ignore')

# Texts that are empty after cleaning (nothing left once emojis, URLs,
# mentions/hashtags and digits are stripped) are turned into NaN so they are
# dropped together with the originally missing rows.
df['full_text'] = cleaned_text.mask(cleaned_text == '')
df.dropna(subset=["full_text"], inplace=True)
df.drop_duplicates(subset=['full_text'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# The Indonesian Stanza pipeline gets its own name: the labeling cell further
# down rebinds `nlp` to a transformers pipeline, which would break lemmastanZa
# whenever it is called after that cell has run.
stanza.download("id")
stanza_nlp = stanza.Pipeline("id")
nlp = stanza_nlp  # backward-compatible alias for cells that still use `nlp`

def lemmastanZa(teks):
  """Lemmatise a text with the Indonesian Stanza pipeline.

  Args:
    teks: Text to lemmatise.

  Returns:
    str: The lemmas of all words in all sentences, joined by single spaces.
    A word without a lemma is kept in its surface form.
  """
  doc = stanza_nlp(teks)
  return " ".join(
      # `lemma` can be None; fall back to the word text so join() never
      # receives a non-string.
      word.lemma or word.text
      for sentence in doc.sentences
      for word in sentence.words
  )

# **Labeling Data**

In [None]:
# Hugging Face model id; the same id is used for the weights and the tokenizer.
pretrained_name = "w11wo/indonesian-roberta-base-sentiment-classifier"

# NOTE: this rebinds the notebook-level name `nlp`, replacing whatever an
# earlier cell assigned to it.
nlp = pipeline(task="sentiment-analysis", model=pretrained_name, tokenizer=pretrained_name)

def analyze_sentiment(text):
    """Classify one text with the notebook-level sentiment pipeline ``nlp``.

    Args:
        text: Text to classify.

    Returns:
        dict: The first prediction returned by ``nlp`` for ``text``. When
        ``text`` is missing or not a ``str``, or when the pipeline raises, the
        fallback ``{"label": "neutral", "score": 0.0}`` is returned instead
        (errors are printed, not re-raised).
    """
    fallback = {"label": "neutral", "score": 0.0}

    # A single isinstance check covers None / NaN / pd.NA (none of them are
    # str) and, unlike pd.isnull, does not raise on array-like inputs.
    if not isinstance(text, str):
        return fallback

    try:
        # truncation=True clips inputs longer than the model's maximum sequence
        # length; without it such texts typically raise inside the model and
        # end up silently labelled with the neutral fallback below.
        return nlp(text, truncation=True)[0]
    except Exception as e:
        print(f"Error processing text: {text}, Error: {e}")
        return fallback

# Stringify only the non-null entries: a blanket astype(str) would turn NaN
# into the literal text "nan", which slips past the null guard in
# analyze_sentiment and gets scored by the model as if it were a real text.
full_text = data['full_text']
data['full_text'] = full_text.where(full_text.isna(), full_text.astype(str))
data['sentiment_result'] = data['full_text'].apply(analyze_sentiment)

# Split each result dict into its own column.
data['sentiment'] = data['sentiment_result'].apply(lambda x: x['label'])
data['sentiment_score'] = data['sentiment_result'].apply(lambda x: x['score'])