In [1]:
# Load necessary library and module
import warnings
warnings.filterwarnings('ignore')

import sys
import pandas as pd
import seaborn as sns
import numpy as np
import re
import string
import unicodedata
import nltk

nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [57]:
# Load the MLBB dataset from CSV, decoding the file as ISO-8859-1 (Latin-1)
mlbb = pd.read_csv(
    'dataset/baruambil-20062022-MLBB.csv',
    encoding='ISO-8859-1',
)

In [90]:
# Apply text pre-processing to DataFrame
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Public import path: `tqdm._tqdm_notebook` is a deprecated private shim
# kept only for backwards compatibility.
from tqdm.notebook import tqdm_notebook

# Register `progress_apply` on pandas objects so the per-step cells below
# can show a progress bar while mapping a function over a column.
tqdm_notebook.pandas()

# Stemmer instances shared by the cells that follow.
snowball = nltk.stem.SnowballStemmer('english')
porter_stemmer = PorterStemmer()

# Word list used by the English-dictionary filter; kept as the last
# expression so the cell still displays the download status.
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [58]:
def casefolding(text):
    """Return ``text`` converted to lowercase."""
    return text.lower()

In [59]:
# Lowercase every entry of the 'content' column (progress bar via tqdm).
mlbb['case_folding'] = mlbb['content'].progress_apply(casefolding)

  0%|          | 0/7 [00:00<?, ?it/s]

In [60]:
# Write the current frame to CSV (no index column, UTF-8 with BOM).
mlbb.to_csv(
    "hasil/1_testmanualisasi_casefolding.csv",
    index=False,
    encoding='utf-8-sig',
)

In [61]:
def filtertext(text):
    """Remove URLs, e-mail addresses, @mentions, #hashtags and numbers from ``text``.

    Every removed span is replaced by a single space; runs of spaces that
    result from this are not collapsed here.

    The order of the passes is significant:

    * URLs are removed before numbers, so a link that contains digits is
      dropped whole instead of being split into fragments that the URL
      pattern can no longer reach.
    * E-mail addresses are removed before @mentions, otherwise the mention
      pattern consumes ``@domain`` and leaves the local part and the TLD
      behind.
    * Numbers are removed last, so digits inside links, addresses, mentions
      and hashtags disappear together with their token.

    Args:
        text: String to clean.

    Returns:
        The cleaned string.
    """
    text = text.strip()                                        # Remove surrounding whitespace
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)         # Remove URLs
    text = re.sub(r'pic\.twitter\.com\S+', ' ', text)          # Remove scheme-less Twitter picture links
    text = re.sub(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+', ' ', text)  # Remove e-mail addresses
    text = re.sub(r'\@([\w]+)', ' ', text)                     # Remove @mentions
    text = re.sub(r'\#([\w]+)', ' ', text)                     # Remove #hashtags
    text = re.sub(r'\S*@\S*\s?', ' ', text)                    # Remove any leftover token containing '@'
    text = re.sub(r'[-+]?[0-9]+', ' ', text)                   # Remove numbers (optionally signed)
    return text

In [62]:
# Run filtertext over the lowercased column, then snapshot the frame to CSV.
mlbb['filter_text'] = mlbb['case_folding'].progress_apply(filtertext)
mlbb.to_csv(
    "hasil/1_testmanualisasi_filter1.csv",
    index=False,
    encoding='utf-8-sig',
)

  0%|          | 0/7 [00:00<?, ?it/s]

In [63]:
def filter_punctuation(text):
    """Replace punctuation and symbol characters in ``text`` with spaces.

    Two substitutions run in sequence. The first replaces every character
    that is neither a word character nor whitespace. The second replaces an
    explicit list of symbols; this list includes the underscore, which the
    first pattern keeps because ``\\w`` matches it.
    """
    patterns = (
        r'[^\w\s]',                                          # anything that is not word/whitespace
        r'[!$%^&*@#()_+|~=`{}\[\]%\-:";\'<>?,.\/]',          # step 5: explicit symbol list
    )
    for pattern in patterns:
        text = re.sub(pattern, ' ', text)
    return text

In [64]:
# Run filter_punctuation over the 'filter_text' column, then snapshot the frame to CSV.
mlbb['filter_punctuation'] = mlbb['filter_text'].progress_apply(filter_punctuation)
mlbb.to_csv(
    "hasil/1_testmanualisasi_filter1.csv",
    index=False,
    encoding='utf-8-sig',
)

  0%|          | 0/7 [00:00<?, ?it/s]

In [65]:
def filter_space(text):
    """Collapse elongated letters and tidy up spaces in ``text``.

    Steps, in order:

    1. Any ASCII letter repeated three or more times in a row is reduced to
       a single occurrence (e.g. ``yukkk`` and ``yukkkkk`` both become
       ``yuk``). Doubled letters are left alone.
    2. Runs of spaces are collapsed to one space.
    3. Leading and trailing spaces are removed.

    Args:
        text: String to normalise.

    Returns:
        The normalised string.
    """
    # Step 7: `{2,}` makes the back-reference cover the whole run; matching
    # exactly three characters would leave a remainder for runs of 4, 5, ...
    text = re.sub(r'([a-zA-Z])\1{2,}', r'\1', text)
    text = re.sub(' +', ' ', text)                        # Collapse multiple spaces
    # Step 9: after the collapse there is at most one space at each end.
    return text.strip(' ')

In [66]:
# Run filter_space over the 'filter_punctuation' column, then snapshot the frame to CSV.
mlbb['filter_space'] = mlbb['filter_punctuation'].progress_apply(filter_space)
mlbb.to_csv(
    "hasil/1_testmanualisasi_filter1.csv",
    index=False,
    encoding='utf-8-sig',
)

  0%|          | 0/7 [00:00<?, ?it/s]

In [67]:
def filter_ascii(text):
    """Return ``text`` with every non-ASCII character removed.

    The text is NFKD-normalised first, so an accented letter is split into
    its base letter plus combining marks; encoding to ASCII with
    ``errors='ignore'`` then drops the marks (and anything else outside
    ASCII) while the base letter survives.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [68]:
# Run filter_ascii over the 'filter_space' column, then snapshot the frame to CSV.
mlbb['filter_ascii'] = mlbb['filter_space'].progress_apply(filter_ascii)
mlbb.to_csv(
    "hasil/1_testmanualisasi_filter1.csv",
    index=False,
    encoding='utf-8-sig',
)

  0%|          | 0/7 [00:00<?, ?it/s]

In [69]:
def tokenisasi(text):
    """Split ``text`` into a list of tokens with NLTK's ``word_tokenize``."""
    return word_tokenize(text)

In [81]:
# Tokenize the 'content' column, then snapshot the frame to CSV.
# NOTE(review): this reads the raw 'content' column, not the cleaned
# 'filter_ascii' column produced by the previous step — confirm that is intended.
mlbb['tokenisasi_content'] = mlbb['content'].progress_apply(tokenisasi)
mlbb.to_csv(
    "hasil/1_testmanualisasi_filter1.csv",
    index=False,
    encoding='utf-8-sig',
)

  0%|          | 0/7 [00:00<?, ?it/s]

In [82]:
# Vocabulary cache for nonenglish(); built on first use so that the word
# list is not reloaded and re-hashed for every row.
_ENGLISH_VOCAB = None


def nonenglish(text):
    """Drop alphabetic tokens that are not in NLTK's English word list.

    ``text`` is tokenized with ``word_tokenize``. A token is kept when its
    lowercase form is in ``nltk.corpus.words`` or when it is not purely
    alphabetic (``str.isalpha()`` is false). The kept tokens are joined
    with single spaces.

    Args:
        text: String to filter.

    Returns:
        The filtered tokens joined by single spaces.
    """
    global _ENGLISH_VOCAB
    if _ENGLISH_VOCAB is None:
        _ENGLISH_VOCAB = frozenset(nltk.corpus.words.words())

    kept_tokens = [
        token for token in word_tokenize(text)
        if token.lower() in _ENGLISH_VOCAB or not token.isalpha()
    ]
    return ' '.join(kept_tokens)

In [75]:
# Stop-word cache for stopwordfilter(); built on first use so the list is
# read once and membership tests are set look-ups.
_ENGLISH_STOPWORDS = None


def stopwordfilter(text):
    """Remove NLTK English stop words from ``text``.

    ``text`` is tokenized with ``word_tokenize``; tokens that appear in
    ``stopwords.words('english')`` are dropped and the rest are joined with
    single spaces. The comparison is exact (case-sensitive).

    Args:
        text: String to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    kept_tokens = [
        token for token in word_tokenize(text)
        if token not in _ENGLISH_STOPWORDS
    ]
    return ' '.join(kept_tokens)

In [86]:
# Run stopwordfilter over the 'filter_ascii' column, then snapshot the frame to CSV.
mlbb['stopwordfilter'] = mlbb['filter_ascii'].progress_apply(stopwordfilter)
mlbb.to_csv(
    "hasil/1_testmanualisasi_filter1.csv",
    index=False,
    encoding='utf-8-sig',
)

  0%|          | 0/7 [00:00<?, ?it/s]

In [95]:
def stemming(text):
    """Stem every token of ``text`` and join the stems with single spaces.

    Tokens come from ``word_tokenize``; each one is passed through the
    module-level ``snowball`` stemmer.
    """
    return ' '.join(snowball.stem(token) for token in word_tokenize(text))

In [96]:
# Run stemming over the 'stopwordfilter' column, then snapshot the frame to CSV.
mlbb['stemming_word'] = mlbb['stopwordfilter'].progress_apply(stemming)
mlbb.to_csv(
    "hasil/1_testmanualisasi_filter1.csv",
    index=False,
    encoding='utf-8-sig',
)

  0%|          | 0/7 [00:00<?, ?it/s]

In [99]:
# Run nonenglish over the 'stemming_word' column, then snapshot the frame to CSV.
# NOTE(review): here the dictionary filter sees already-stemmed text, whereas
# text_preprocessing() filters first and stems afterwards — confirm which
# order is intended.
mlbb['filter_noneng_baru'] = mlbb['stemming_word'].progress_apply(nonenglish)
mlbb.to_csv(
    "hasil/1_testmanualisasi_filter1.csv",
    index=False,
    encoding='utf-8-sig',
)

  0%|          | 0/7 [00:00<?, ?it/s]

In [100]:
def text_preprocessing(text):
    """Run the complete cleaning pipeline on one string.

    The pipeline is the composition of the single-step functions defined
    earlier in this notebook, applied in this order:

    1. ``casefolding``        - lowercase
    2. ``filtertext``         - strip; remove numbers, URLs, mentions,
                                hashtags and e-mail addresses
    3. ``filter_punctuation`` - replace punctuation / symbols with spaces
    4. ``filter_space``       - collapse elongated letters and extra spaces
    5. ``filter_ascii``       - drop non-ASCII characters
    6. ``stopwordfilter``     - remove English stop words
    7. ``nonenglish``         - drop alphabetic tokens not in the English
                                word list
    8. ``stemming``           - Snowball-stem every remaining token

    Reusing the step functions keeps this pipeline and the step-by-step
    walkthrough from drifting apart.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, stemmed tokens joined by single spaces.
    """
    pipeline = (
        casefolding,
        filtertext,
        filter_punctuation,
        filter_space,
        filter_ascii,
        stopwordfilter,
        nonenglish,
        stemming,
    )
    for step in pipeline:
        text = step(text)
    return text

In [101]:
# Run the full pipeline over the raw 'content' column and save the result.
mlbb['clean_content'] = mlbb['content'].progress_apply(text_preprocessing)
mlbb.to_csv(
    "hasil/mlbb_clean_content_after_preprocessing_07072022_bab4.csv",
    index=False,
    encoding='utf-8-sig',
)

  0%|          | 0/7 [00:00<?, ?it/s]