## Import Library

In [None]:
!pip install Sastrawi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import csv
import Sastrawi
import re
import nltk
import pandas as pd 
import numpy as np
import sklearn
import matplotlib.pyplot as plt


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive inside the Colab runtime; the CSV/TXT inputs read by the
# cells below live under /content/drive/MyDrive. This only works on Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Dataset

In [None]:
# Load the labelled iPhone tweets and discard the metadata columns that the
# preprocessing steps below do not use.
dataset = pd.read_csv('/content/drive/MyDrive/Dataset TA/iphone_labelling.csv')
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and no longer accepts from 2.0 onwards.
dataset = dataset.drop(columns=['date', 'jenis', 'username'])
# NOTE: `df` is a second name for the same DataFrame object, not a copy, so
# every column added to `df` later also appears on `dataset`.
df = dataset
df

Unnamed: 0,tweet,kamera,baterai,desain,harga,spesifikasi
0,Pake iphone enak \nAku pengen punya spesifika...,-1,0,-1,-1,1
1,OKE! Kamera iPhone XR Digandrungi Penyuka Foto...,1,0,1,-1,1
2,"Kalau dari segi spesifikasi mending iphone 11,...",-1,0,-1,-1,1
3,"Kamera depan IPhone 8 Super Canggih, desain ju...",1,0,1,-1,1
4,Kok aku pengen balik pakai iphone ya. Kangen s...,-1,0,-1,-1,1
...,...,...,...,...,...,...
4995,"iPhone seri terbaru ke atas, lebih bagus lagi ...",1,-1,1,-1,1
4996,"iphone kameranya semakin jernih,desain, dan sp...",1,-1,1,-1,1
4997,klo pengen iPhone mending naikin budget dikit ...,1,-1,1,-1,1
4998,"tipe iphone yg lebih bagus itu xr,kameranya je...",1,-1,1,-1,1


## Preprocessing

### Cleaning Data

In [None]:
import string 
import re #regex library

def hapus_tweet_special(text):
    """Strip tweet-specific noise from ``text`` and collapse its whitespace.

    Steps, in order:
      1. Replace the literal two-character sequences backslash-t, backslash-n
         and backslash-u with a space, then delete any remaining backslash.
      2. Replace every non-ASCII character (emoji, CJK, ...) with ``?``.
      3. Delete ``@mentions`` (word characters, ``_`` and ``-``).
      4. Replace remaining ``@``/``#`` tags and ``scheme://...`` links with a
         space, then collapse all whitespace runs to single spaces.
      5. Replace any left-over bare ``http://`` / ``https://`` prefix with a space.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned string. Punctuation, digits and letter case are untouched.
    """
    # Escaped tab / newline / unicode markers stored as literal text, followed
    # by any backslash that is still left over.
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # 'replace' substitutes one '?' per non-ASCII character rather than
    # deleting it.
    text = text.encode('ascii', 'replace').decode('ascii')
    # Mentions are deleted outright (no space is inserted in their place).
    text = re.sub(r"[@][\w_-]+", "", text)
    # Raw string: the original plain literal relied on the invalid escapes
    # \w, \/ and \S, which Python reports as a SyntaxWarning from 3.12 on.
    # split()/join() also normalises every whitespace run to a single space.
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
    # A scheme with nothing after it is not matched by the pattern above.
    return text.replace("http://", " ").replace("https://", " ")
                
# Create the 'cleaning' column from the raw tweets; each later step rewrites it.
df['cleaning'] = df['tweet'].apply(hapus_tweet_special)

#hapus number
def hapus_number(text):
    """Return ``text`` with every run of digits deleted.

    Nothing is inserted where the digits were, so the spaces around a removed
    number are left as they are.
    """
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Strip digits from the already-cleaned text.
df['cleaning'] = df['cleaning'].apply(hapus_number)

#hapus punctuation
def hapus_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``.

    Characters are dropped, not replaced, so words joined only by punctuation
    end up fused together.
    """
    return "".join(char for char in text if char not in string.punctuation)

# Remove ASCII punctuation from the cleaned text.
df['cleaning'] = df['cleaning'].apply(hapus_punctuation)

#hapus whitespace leading & trailing
def hapus_whitespace_LT(text):
    """Return ``text`` with leading and trailing whitespace removed."""
    without_leading = text.lstrip()
    return without_leading.rstrip()

# Trim whitespace at both ends of the cleaned text.
df['cleaning'] = df['cleaning'].apply(hapus_whitespace_LT)

#hapus multiple whitespace into single whitespace
def hapus_whitespace_multiple(text):
    """Collapse every run of whitespace in ``text`` into one space.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with each whitespace run (spaces, tabs, newlines) replaced by
        a single ``' '``. Leading/trailing runs become one space; they are not
        stripped.
    """
    # Raw string: a plain '\s+' literal is an invalid escape sequence that
    # Python reports as a SyntaxWarning from 3.12 on.
    return re.sub(r'\s+', ' ', text)

# Collapse repeated whitespace inside the cleaned text.
df['cleaning'] = df['cleaning'].apply(hapus_whitespace_multiple)

# hapus single char
def hapus_singl_char(text):
    """Delete every ASCII letter that stands alone as a one-character word.

    Only the letter itself is removed; the whitespace that surrounded it stays.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

# Drop one-letter words. This runs after the whitespace clean-up steps and the
# pattern deletes only the letter, so 'cleaning' can still contain a leading
# space or a double space where a letter used to be.
df['cleaning'] = df['cleaning'].apply(hapus_singl_char)

# drop duplicates (currently disabled)
#df.drop_duplicates(subset=['cleaning'], inplace=True)

# Preview the rows labelled 0 through 100 (.loc slices include the end label).
df.loc[0:100,['tweet', 'cleaning']]

Unnamed: 0,tweet,cleaning
0,Pake iphone enak \nAku pengen punya spesifika...,Pake iphone enak Aku pengen punya spesifikasi ...
1,OKE! Kamera iPhone XR Digandrungi Penyuka Foto...,OKE Kamera iPhone XR Digandrungi Penyuka Fotog...
2,"Kalau dari segi spesifikasi mending iphone 11,...",Kalau dari segi spesifikasi mending iphone tap...
3,"Kamera depan IPhone 8 Super Canggih, desain ju...",Kamera depan IPhone Super Canggih desain juga ...
4,Kok aku pengen balik pakai iphone ya. Kangen s...,Kok aku pengen balik pakai iphone ya Kangen sa...
...,...,...
96,Hasil kamera iPhone 13 pro max emang luar bias...,Hasil kamera iPhone pro max emang luar biasa y...
97,pantes hasilnya selalu cakepp pake kamera ipho...,pantes hasilnya selalu cakepp pake kamera ipho...
98,cowo kpop kalo pake kamera iphone no filter tu...,cowo kpop kalo pake kamera iphone no filter tu...
99,w udh mantengin review review di yutub. Ujung-...,udh mantengin review review di yutub Ujunguju...


### Case Folding

In [None]:
# Case folding: lower-case the cleaned text into its own column so the
# mixed-case 'cleaning' column stays available for comparison.
df['case_folding'] = df['cleaning'].str.lower()
# Preview the rows labelled 0 through 100 (.loc slices include the end label).
df.loc[0:100,['cleaning', 'case_folding']]

Unnamed: 0,cleaning,case_folding
0,Pake iphone enak Aku pengen punya spesifikasi ...,pake iphone enak aku pengen punya spesifikasi ...
1,OKE Kamera iPhone XR Digandrungi Penyuka Fotog...,oke kamera iphone xr digandrungi penyuka fotog...
2,Kalau dari segi spesifikasi mending iphone tap...,kalau dari segi spesifikasi mending iphone tap...
3,Kamera depan IPhone Super Canggih desain juga ...,kamera depan iphone super canggih desain juga ...
4,Kok aku pengen balik pakai iphone ya Kangen sa...,kok aku pengen balik pakai iphone ya kangen sa...
...,...,...
96,Hasil kamera iPhone pro max emang luar biasa y...,hasil kamera iphone pro max emang luar biasa y...
97,pantes hasilnya selalu cakepp pake kamera ipho...,pantes hasilnya selalu cakepp pake kamera ipho...
98,cowo kpop kalo pake kamera iphone no filter tu...,cowo kpop kalo pake kamera iphone no filter tu...
99,udh mantengin review review di yutub Ujunguju...,udh mantengin review review di yutub ujunguju...


### Tokenize

In [None]:
# Tokenizer data for word_tokenize. Older NLTK releases load 'punkt'; NLTK
# 3.8.2 and later load 'punkt_tab' instead and raise LookupError without it,
# so both are requested to keep this cell working on either version.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

# ------ Tokenizing ---------

# NLTK word tokenize
def word_tokenize_wrapper(text):
    """Tokenize ``text`` by delegating to NLTK's ``word_tokenize``.

    Exists so the tokenizer can be handed to ``Series.apply`` under a local name.
    """
    tokens = word_tokenize(text)
    return tokens

# Tokenize each lower-cased tweet; 'tokenize' holds one token list per row.
df['tokenize'] = df['case_folding'].apply(word_tokenize_wrapper)

# Preview the rows labelled 0 through 100 (.loc slices include the end label).
df.loc[0:100,['case_folding', 'tokenize']]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,case_folding,tokenize
0,pake iphone enak aku pengen punya spesifikasi ...,"[pake, iphone, enak, aku, pengen, punya, spesi..."
1,oke kamera iphone xr digandrungi penyuka fotog...,"[oke, kamera, iphone, xr, digandrungi, penyuka..."
2,kalau dari segi spesifikasi mending iphone tap...,"[kalau, dari, segi, spesifikasi, mending, ipho..."
3,kamera depan iphone super canggih desain juga ...,"[kamera, depan, iphone, super, canggih, desain..."
4,kok aku pengen balik pakai iphone ya kangen sa...,"[kok, aku, pengen, balik, pakai, iphone, ya, k..."
...,...,...
96,hasil kamera iphone pro max emang luar biasa y...,"[hasil, kamera, iphone, pro, max, emang, luar,..."
97,pantes hasilnya selalu cakepp pake kamera ipho...,"[pantes, hasilnya, selalu, cakepp, pake, kamer..."
98,cowo kpop kalo pake kamera iphone no filter tu...,"[cowo, kpop, kalo, pake, kamera, iphone, no, f..."
99,udh mantengin review review di yutub ujunguju...,"[udh, mantengin, review, review, di, yutub, uj..."


### Normalisasi

In [None]:
# Load the normalisation dictionary and turn it into a plain dict for fast
# per-token lookup. Presumably the first CSV column is the informal spelling
# and the second its standard form -- TODO confirm against the file.
normalized_word = pd.read_csv("/content/drive/MyDrive/Dataset TA/kamus_normalisasi.csv")

normalized_word_dict = {}

# Columns are addressed by position through .iloc. The previous row[0]/row[1]
# used integer keys on a label-indexed Series, a positional fallback that
# pandas deprecated in 2.1. setdefault keeps the first mapping seen for a term,
# matching the earlier "only add if missing" behaviour.
for raw_term, standard_term in zip(normalized_word.iloc[:, 0], normalized_word.iloc[:, 1]):
    normalized_word_dict.setdefault(raw_term, standard_term)

def normalized_term(document):
    """Map each token of ``document`` through ``normalized_word_dict``.

    Tokens without an entry in the dictionary are passed through unchanged.
    Returns a new list; ``document`` itself is not modified.
    """
    return [normalized_word_dict.get(term, term) for term in document]

# Replace informal tokens with their dictionary form, row by row.
df['normalisasi'] = df['tokenize'].apply(normalized_term)

# Preview the rows labelled 0 through 100 (.loc slices include the end label).
df.loc[0:100,['tokenize', 'normalisasi']]

Unnamed: 0,tokenize,normalisasi
0,"[pake, iphone, enak, aku, pengen, punya, spesi...","[pakai, iphone, enak, aku, pengin, punya, spes..."
1,"[oke, kamera, iphone, xr, digandrungi, penyuka...","[oke, kamera, iphone, xr, digandrungi, penyuka..."
2,"[kalau, dari, segi, spesifikasi, mending, ipho...","[kalau, dari, segi, spesifikasi, mending, ipho..."
3,"[kamera, depan, iphone, super, canggih, desain...","[kamera, depan, iphone, super, canggih, desain..."
4,"[kok, aku, pengen, balik, pakai, iphone, ya, k...","[kok, aku, pengin, balik, pakai, iphone, ya, k..."
...,...,...
96,"[hasil, kamera, iphone, pro, max, emang, luar,...","[hasil, kamera, iphone, pro, max, memang, luar..."
97,"[pantes, hasilnya, selalu, cakepp, pake, kamer...","[pantas, hasilnya, selalu, cakep, pakai, kamer..."
98,"[cowo, kpop, kalo, pake, kamera, iphone, no, f...","[cowok, kpop, kalau, pakai, kamera, iphone, no..."
99,"[udh, mantengin, review, review, di, yutub, uj...","[sudah, mantengin, review, review, di, youtube..."


### Stopwords

In [None]:
nltk.download('stopwords')
from nltk.corpus import stopwords

# Start from NLTK's Indonesian stopword list. Despite its name, list_stopwords
# ends up as a set so that membership tests during filtering are fast.
list_stopwords = set(stopwords.words('indonesian'))

# Manually added stopwords (informal spellings, fillers, laughter, retweet markers)
list_stopwords.update([
    "dg", "yg", "aja", "dgn", "kalo", "u", 'klo',
    'ny', 'amp', 'biar', 'bikin', 'bilang',
    'gak', 'tdk', 'krn', 'nya', 'nih', 'sih',
    'jd', 'tau', 'ga', 'tuh', 'utk', 'ya',
    'si', 'jgn', 'sdh', 'n', 'rt', 't',
    'nyg', 'hehe', 'pen', 'd', 'nan', 'loh',
    '&amp', 'yah', 'jir', 'w', 'hihi', 'haha', 'wkwk',
])

# ----------------------- add stopwords from txt file -----------------------------------
# The file is read through pandas into a single 'stopwords' column.
txt_stopword = pd.read_csv(
    "/content/drive/MyDrive/Dataset TA/stopwords.txt",
    names=["stopwords"],
    header=None,
)

# Only the first row is used, split on single spaces. Presumably the file is
# one line of space-separated words -- TODO confirm; any further rows are ignored.
list_stopwords.update(txt_stopword["stopwords"][0].split(' '))

# ---------------------------------------------------------------------------------------


#remove stopword pada list token
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``.

    Order and duplicates of the remaining tokens are preserved; a new list is
    returned and ``words`` is left untouched.
    """
    kept = []
    for token in words:
        if token in list_stopwords:
            continue
        kept.append(token)
    return kept

# Filter stopwords out of the normalised token lists.
df['stopwords'] = df['normalisasi'].apply(stopwords_removal) 

# Preview the rows labelled 0 through 100 (.loc slices include the end label).
df.loc[0:100,['normalisasi', 'stopwords']]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,normalisasi,stopwords
0,"[pakai, iphone, enak, aku, pengin, punya, spes...","[pakai, iphone, pengin, spesifikasi, bagus, du..."
1,"[oke, kamera, iphone, xr, digandrungi, penyuka...","[kamera, iphone, xr, digandrungi, penyuka, fot..."
2,"[kalau, dari, segi, spesifikasi, mending, ipho...","[segi, spesifikasi, mending, iphone, harganya,..."
3,"[kamera, depan, iphone, super, canggih, desain...","[kamera, iphone, super, canggih, desain, lucu,..."
4,"[kok, aku, pengin, balik, pakai, iphone, ya, k...","[pengin, pakai, iphone, kangen, spesifikasi, b..."
...,...,...
96,"[hasil, kamera, iphone, pro, max, memang, luar...","[hasil, kamera, iphone, pro, max, bangga, hasi..."
97,"[pantas, hasilnya, selalu, cakep, pakai, kamer...","[hasilnya, cakep, pakai, kamera, iphone, iphon..."
98,"[cowok, kpop, kalau, pakai, kamera, iphone, no...","[cowok, kpop, pakai, kamera, iphone, filter, g..."
99,"[sudah, mantengin, review, review, di, youtube...","[mantengin, review, review, youtube, ujungujun..."


### Stemming

In [None]:
!pip install swifter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting swifter
  Downloading swifter-1.3.4.tar.gz (830 kB)
[K     |████████████████████████████████| 830 kB 5.4 MB/s 
Collecting psutil>=5.6.6
  Downloading psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (280 kB)
[K     |████████████████████████████████| 280 kB 52.7 MB/s 
Collecting jedi>=0.10
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 44.4 MB/s 
Building wheels for collected packages: swifter
  Building wheel for swifter (setup.py) ... [?25l[?25hdone
  Created wheel for swifter: filename=swifter-1.3.4-py3-none-any.whl size=16321 sha256=25f27a4c32fbeb752527611cca207b04c30c6f98374b45c8ffe820067bb9d53f
  Stored in directory: /root/.cache/pip/wheels/08/66/b4/921e351e63d88696932279d6163e125727c9da70ed8ca38419
Successfully built swifter
Installing collected p

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter


# Create the Sastrawi stemmer once; it is reused for every term below.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    """Return the stem of ``term`` as produced by the module-level ``stemmer``."""
    stem = stemmer.stem(term)
    return stem

# Collect each distinct token once, in first-seen order, so the stemmer is
# called a single time per unique term instead of once per occurrence.
term_dict = dict.fromkeys(
    token for tokens in df['stopwords'] for token in tokens
)

# Map every distinct token to its stem.
term_dict = {token: stemmed_wrapper(token) for token in term_dict}

# apply stemmed term to dataframe
def get_stemmed_term(document):
    """Look up the precomputed stem of every token in ``document``.

    Stems come from ``term_dict``; a token missing from it raises ``KeyError``.
    Returns a new list in the same order as ``document``.
    """
    stems = []
    for term in document:
        stems.append(term_dict[term])
    return stems

# Replace every token with its cached stem; swifter's accessor runs the apply
# (it is what prints the "Pandas Apply" progress bar).
df['stemming'] = df['stopwords'].swifter.apply(get_stemmed_term)

# Preview the rows labelled 0 through 100 (.loc slices include the end label).
df.loc[0:100,['stopwords', 'stemming']]

Pandas Apply:   0%|          | 0/5000 [00:00<?, ?it/s]

Unnamed: 0,stopwords,stemming
0,"[pakai, iphone, pengin, spesifikasi, bagus, du...","[pakai, iphone, pengin, spesifikasi, bagus, du..."
1,"[kamera, iphone, xr, digandrungi, penyuka, fot...","[kamera, iphone, xr, gandrung, suka, fotografi..."
2,"[segi, spesifikasi, mending, iphone, harganya,...","[segi, spesifikasi, mending, iphone, harga, ma..."
3,"[kamera, iphone, super, canggih, desain, lucu,...","[kamera, iphone, super, canggih, desain, lucu,..."
4,"[pengin, pakai, iphone, kangen, spesifikasi, b...","[pengin, pakai, iphone, kangen, spesifikasi, b..."
...,...,...
96,"[hasil, kamera, iphone, pro, max, bangga, hasi...","[hasil, kamera, iphone, pro, max, bangga, hasi..."
97,"[hasilnya, cakep, pakai, kamera, iphone, iphon...","[hasil, cakep, pakai, kamera, iphone, iphone, ..."
98,"[cowok, kpop, pakai, kamera, iphone, filter, g...","[cowok, kpop, pakai, kamera, iphone, filter, g..."
99,"[mantengin, review, review, youtube, ujungujun...","[mantengin, review, review, youtube, ujungujun..."


### Final Preprocessing

In [None]:
# Keep only the raw tweet, the final stemmed tokens and the five aspect labels;
# the intermediate preprocessing columns are left behind on `df`.
df_prepro = df[['tweet','stemming','kamera', 'baterai', 'desain', 'harga', 'spesifikasi']].copy(deep=True)
df_prepro

Unnamed: 0,tweet,stemming,kamera,baterai,desain,harga,spesifikasi
0,Pake iphone enak \nAku pengen punya spesifika...,"[pakai, iphone, pengin, spesifikasi, bagus, du...",-1,0,-1,-1,1
1,OKE! Kamera iPhone XR Digandrungi Penyuka Foto...,"[kamera, iphone, xr, gandrung, suka, fotografi...",1,0,1,-1,1
2,"Kalau dari segi spesifikasi mending iphone 11,...","[segi, spesifikasi, mending, iphone, harga, ma...",-1,0,-1,-1,1
3,"Kamera depan IPhone 8 Super Canggih, desain ju...","[kamera, iphone, super, canggih, desain, lucu,...",1,0,1,-1,1
4,Kok aku pengen balik pakai iphone ya. Kangen s...,"[pengin, pakai, iphone, kangen, spesifikasi, b...",-1,0,-1,-1,1
...,...,...,...,...,...,...,...
4995,"iPhone seri terbaru ke atas, lebih bagus lagi ...","[iphone, seri, baru, bagus, prokameranya, jern...",1,-1,1,-1,1
4996,"iphone kameranya semakin jernih,desain, dan sp...","[iphone, kamera, jernihdesain, spesifikasi, ip...",1,-1,1,-1,1
4997,klo pengen iPhone mending naikin budget dikit ...,"[pengin, iphone, mending, naikin, budget, diki...",1,-1,1,-1,1
4998,"tipe iphone yg lebih bagus itu xr,kameranya je...","[tipe, iphone, bagus, xrkameranya, jernihdesai...",1,-1,1,-1,1


In [None]:
# Export to CSV, without the index column.
# NOTE(review): 'stemming' holds Python lists, which CSV can only store as
# text; whoever reads this file back presumably has to parse that column into
# lists again -- confirm against the consumer.
# NOTE(review): the relative path writes to the current working directory, not
# to the mounted Drive folder the inputs came from -- confirm this is intended.
df_prepro.to_csv('Preprocessing.csv', index=False)