<a href="https://colab.research.google.com/github/rizqinursulistiasari/analisis-sentimen-rohingya/blob/main/1_preprocessing_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PreProcessing**

In [None]:
import pandas as pd

# Read the raw CSV file with pandas
nama_file = "Data Mentah.csv"  # Replace with the actual file name
# NOTE(review): ISO-8859-1 decoding never fails, but if the file is really
# UTF-8, multi-byte characters (emoji, accents) arrive as mojibake that the
# emoji pattern used later cannot match — confirm the file's real encoding.
Data = pd.read_csv(nama_file, encoding='ISO-8859-1')

# Preview only the first rows instead of rendering the whole frame
Data.head()

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame
Data.info()

In [None]:
# Initial preprocessing step: keep only the text column in a separate working
# frame. The explicit copy guarantees that the columns added to `df` later
# never touch `Data`.
df = Data[['Content']].copy()
df.head()

In [None]:
# Case folding: lowercase every text.
# Missing values become '' and non-string values their string form, so the
# cleaning functions applied afterwards always receive a str (a NaN would make
# the regex calls raise TypeError).
df['Case Folding'] = df['Content'].fillna('').astype(str).str.lower()
df.head()

In [None]:
#Cleaning
import re
import string
import nltk

def remove_URL(tweet):
  """Return `tweet` with every URL removed.

  A URL is anything starting with ``http://``/``https://`` or ``www.`` and
  running up to the next whitespace character. Surrounding spaces are kept.
  """
  return re.sub(r'https?://\S+|www\.\S+', '', tweet)

def remove_emoji(tweet):
  """Return `tweet` with every run of characters from the listed ranges deleted.

  The ranges are concatenated into a single character class. Matches are
  replaced by the empty string, so no space is inserted where a symbol stood.

  NOTE(review): the ranges U+24C2..U+1F251 and U+10000..U+10FFFF are very
  broad; together they also cover non-emoji scripts (e.g. CJK ideographs).
  """
  ranges = (
    "\U0001F600-\U0001F64F",
    "\U0001F300-\U0001F5FF",
    "\U0001F680-\U0001F6FF",
    "\U0001F1E0-\U0001F1FF",
    "\U000024C2-\U0001F251",
    "\U0001f926-\U0001f937",
    "\U00010000-\U0010ffff",
    "\u2600-\u2B55",
    "\u200d",
    "\u23cf",
    "\u23e9",
    "\u231a",
    "\ufe0f",
    "\u3030",
  )
  char_class = "[" + "".join(ranges) + "]+"
  return re.sub(char_class, '', tweet, flags=re.UNICODE)

def remove_angka_dll(tweet):
  """Strip numbers, tickers, retweet markers, hashtags, mentions and symbols.

  The substitutions run in the order written below. The result contains only
  ASCII letters and spaces and is stripped at both ends; inner runs of spaces
  are not collapsed.

  The leading retweet marker is matched case-insensitively because this
  function is applied to text that has already been lower-cased, where a
  case-sensitive ``RT`` could never match.
  """
  tweet = re.sub(r'\d+', '', tweet)  # remove digits
  tweet = re.sub(r'\$\w*', '', tweet)  # remove stock tickers such as $GE
  tweet = re.sub(r'^RT[\s]+', '', tweet, flags=re.IGNORECASE)  # remove leading RT / rt
  tweet = re.sub(r'#[^\s]+', '', tweet)  # remove hashtags
  tweet = re.sub(r'@[^\s]+', '', tweet)  # remove mentions
  tweet = re.sub(r'\n', ' ', tweet)  # turn newlines into spaces
  tweet = re.sub(r'&amp', '', tweet)  # remove the HTML entity &amp
  tweet = re.sub(r'[^A-Za-z ]+', ' ', tweet)  # replace non-alphabetic characters by a space
  return tweet.strip()

# Run the cleaning steps in order on the lower-cased text:
# URLs first, then emoji, then numbers / hashtags / mentions / symbols.
df['Cleaning'] = (
    df['Case Folding']
    .apply(remove_URL)
    .apply(remove_emoji)
    .apply(remove_angka_dll)
)
df.head()

In [None]:
# Normalization
# Load the normalization dictionary; its 'Kata Asal' and 'Kata Normalisasi'
# columns are read further down in this cell.
kamus_df = pd.read_csv('Kamus Normalisasi Rohingya.csv')

# Normalize a text with the normalization dictionary
def normalisasi(str_text, kamus=None):
    """Replace dictionary words found in `str_text` by their normalized form.

    Entries are applied one after another in dictionary order, so a later
    entry sees the result of the earlier ones. A key is only replaced where it
    stands as a whole word: a plain substring replacement would corrupt longer
    words that merely contain a short key (e.g. "gak" inside "enggak").

    Args:
        str_text: the text to normalize.
        kamus: mapping {original word: normalized word}. A NaN value means
            "delete the word". Defaults to the module-level `norm` dictionary.

    Returns:
        The normalized text.
    """
    if kamus is None:
        kamus = norm  # module-level dictionary built from kamus_df
    for kata_asal, kata_normal in kamus.items():
        # Skip unusable keys (NaN, empty) and, cheaply, keys that cannot occur.
        if not isinstance(kata_asal, str) or not kata_asal or kata_asal not in str_text:
            continue
        pengganti = str(kata_normal) if pd.notna(kata_normal) else ''
        # Only demand a word boundary on a side where the key itself begins or
        # ends with a word character, so keys padded with spaces still match.
        awal = r'(?<!\w)' if (kata_asal[0].isalnum() or kata_asal[0] == '_') else ''
        akhir = r'(?!\w)' if (kata_asal[-1].isalnum() or kata_asal[-1] == '_') else ''
        pola = awal + re.escape(kata_asal) + akhir
        # A callable replacement keeps backslashes in the value literal.
        str_text = re.sub(pola, lambda _m: pengganti, str_text)
    return str_text

# Build the normalization dictionary (original word -> normalized word) from
# kamus_df. Rows without an original word are dropped and keys are forced to
# str: a NaN or numeric key cannot be searched for in a text.
norm = dict(zip(
    kamus_df.dropna(subset=['Kata Asal'])['Kata Asal'].astype(str),
    kamus_df.dropna(subset=['Kata Asal'])['Kata Normalisasi'],
))

# Normalize the 'Cleaning' column of df
df['Normalization'] = df['Cleaning'].apply(normalisasi)
df.head()

In [None]:
# Tokenization: split every normalized text on whitespace into a list of words
df['Tokenization'] = df['Normalization'].str.split()
df.head()

In [None]:
#Filtering/Stopword Removal
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Take the Indonesian stop-word list (or another language as needed) and keep
# "tidak" out of it, so that word is not filtered from the tokens
# (presumably because negation matters for the sentiment — confirm).
stop_words = set(stopwords.words('indonesian'))
stop_words.discard('tidak')  # discard: no KeyError should the list ever lack the word

# Filtering step; the word "tidak" is not removed
def remove_stopwords(text):
  """Return the tokens of `text` that are not in the module-level `stop_words`.

  `text` is iterated token by token; order and duplicates are preserved.
  """
  kept = []
  for token in text:
    if token in stop_words:
      continue
    kept.append(token)
  return kept

# Drop stop words from every token list
df['Filtering'] = df['Tokenization'].apply(remove_stopwords)
df.head()

In [None]:
#Steamming Data
!pip install Sastrawi

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

# Create the Sastrawi stemmer once; stem_text reuses it for every token
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_text(text):
  """Return a new list holding `stemmer.stem(token)` for each token of `text`.

  Uses the module-level `stemmer`; order and length of the input are kept.
  """
  return list(map(stemmer.stem, text))

# Stem every filtered token list and join the stems back into one
# space-separated string per row
df['Stemming'] = df['Filtering'].apply(lambda tokens: ' '.join(stem_text(tokens)))
df.head()

In [28]:
# Save the result to a CSV file (UTF-8, without the index column).
# NOTE: the 'Tokenization' and 'Filtering' columns hold Python lists; CSV
# stores them as their string form, so they come back as plain strings when
# the file is read again.
df.to_csv('Rohingya Preprocessing.csv', encoding='utf8', index=False)