<a href="https://colab.research.google.com/github/salsaaakusuma/Analisis-Sentimen-SWOT/blob/main/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PREPROCESSING DATA**

In [None]:
import pandas as pd

# Load the raw dataset from the CSV file, decoding it as Latin-1.
# NOTE(review): confirm the file really is Latin-1 encoded — TODO verify.
data = pd.read_csv('data_coc.csv', encoding='latin1')

# Show the first five rows as a quick sanity check.
data.head(5)

Unnamed: 0,ID,Username,Text,Created At
0,coc_1,sprintgkle,Clash of champions ini cast nya punya beragam ...,17 Juni 2024
1,coc_2,coldzonkbie,gue suka bgt sm university war - terus gamau b...,17 Juni 2024
2,coc_3,Q_Nyitt,UGM -&gt; Matematika -&gt; IPK 4.0 Kek nya ser...,18 Juni 2024
3,coc_4,todayis__gaby,Clash of champions seru juga degdegannya ada b...,29 Juni 2024
4,coc_5,yourjungx,abis nonton clash of champions terus champions...,29 Juni 2024


In [None]:
import re

def cleansing(text):
    """Strip noise from a raw tweet and return the cleaned text.

    Steps, in order: decode HTML entities, remove URLs, @mentions,
    #hashtags, digits, non-ASCII characters and punctuation, then collapse
    runs of whitespace and trim both ends.

    Args:
        text: Raw tweet text. Non-string values (for example the NaN that
            pandas uses for a missing cell) are treated as empty text.

    Returns:
        The cleaned string, or ``''`` when the input is not a string.
    """
    # Local import keeps this function independent of other cells' imports.
    import html

    if not isinstance(text, str):
        return ''

    # Decode entities such as "&gt;" or "&amp;" first; otherwise removing the
    # "&" and ";" below leaves stray tokens like "gt" or "amp" in the text.
    text = html.unescape(text)
    text = re.sub(r"http\S+|www\S+", '', text)   # URLs
    text = re.sub(r"@\w+", '', text)             # @mentions
    text = re.sub(r"#\w+", '', text)             # #hashtags
    text = re.sub(r"\d+", '', text)              # digits
    text = re.sub(r"[^\x00-\x7F]+", '', text)    # non-ASCII (emoji, etc.)
    text = re.sub(r"[^\w\s]", '', text)          # punctuation
    text = re.sub(r"\s+", ' ', text).strip()     # normalise whitespace
    return text

In [None]:
# Apply `cleansing` to every value of the 'Text' column and store the result
# in a new 'Cleansing' column; the 'Text' column itself is left as it is.
data['Cleansing'] = data['Text'].apply(cleansing)

# Preview the first five rows.
data.head(5)

Unnamed: 0,ID,Username,Text,Created At,Cleansing
0,coc_1,sprintgkle,Clash of champions ini cast nya punya beragam ...,17 Juni 2024,Clash of champions ini cast nya punya beragam ...
1,coc_2,coldzonkbie,gue suka bgt sm university war - terus gamau b...,17 Juni 2024,gue suka bgt sm university war terus gamau ber...
2,coc_3,Q_Nyitt,UGM -&gt; Matematika -&gt; IPK 4.0 Kek nya ser...,18 Juni 2024,UGM gt Matematika gt IPK Kek nya seru nih nont...
3,coc_4,todayis__gaby,Clash of champions seru juga degdegannya ada b...,29 Juni 2024,Clash of champions seru juga degdegannya ada b...
4,coc_5,yourjungx,abis nonton clash of champions terus champions...,29 Juni 2024,abis nonton clash of champions terus champions...


In [None]:
def case_folding(text):
    """Return ``text`` with every character converted to lowercase."""
    lowered = text.lower()
    return lowered

In [None]:
# Lowercase the cleaned text and keep it in its own 'Case_Folding' column.
data['Case_Folding'] = data['Cleansing'].apply(case_folding)

# Preview the first five rows.
data.head(5)

Unnamed: 0,ID,Username,Text,Created At,Cleansing,Case_Folding
0,coc_1,sprintgkle,Clash of champions ini cast nya punya beragam ...,17 Juni 2024,Clash of champions ini cast nya punya beragam ...,clash of champions ini cast nya punya beragam ...
1,coc_2,coldzonkbie,gue suka bgt sm university war - terus gamau b...,17 Juni 2024,gue suka bgt sm university war terus gamau ber...,gue suka bgt sm university war terus gamau ber...
2,coc_3,Q_Nyitt,UGM -&gt; Matematika -&gt; IPK 4.0 Kek nya ser...,18 Juni 2024,UGM gt Matematika gt IPK Kek nya seru nih nont...,ugm gt matematika gt ipk kek nya seru nih nont...
3,coc_4,todayis__gaby,Clash of champions seru juga degdegannya ada b...,29 Juni 2024,Clash of champions seru juga degdegannya ada b...,clash of champions seru juga degdegannya ada b...
4,coc_5,yourjungx,abis nonton clash of champions terus champions...,29 Juni 2024,abis nonton clash of champions terus champions...,abis nonton clash of champions terus champions...


In [None]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK 'stopwords' corpus.
# NOTE(review): none of the cells reviewed here use `stopwords`; confirm
# whether this download is still needed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def load_normalization_dict(file_path):
    """Load a word-replacement mapping from a ``key=value`` text file.

    Each line containing ``=`` is split at the first ``=``; the left part is
    the key and the right part is the value. Whitespace around both parts is
    removed so that ``gue = saya`` and ``gue=saya`` produce the same entry.
    Lines without ``=`` and lines with an empty key are skipped.

    Args:
        file_path: Path to the UTF-8 encoded dictionary file.

    Returns:
        dict mapping each key to its replacement string. When a key occurs
        more than once, the last occurrence wins.
    """
    normalization_dict = {}
    # "utf-8-sig" reads plain UTF-8 too and drops a leading BOM, which would
    # otherwise end up as part of the first key.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            key, separator, value = line.partition('=')
            if not separator:
                continue
            key = key.strip()
            if not key:
                # An empty key can never match a token, so do not store it.
                continue
            normalization_dict[key] = value.strip()
    return normalization_dict

# Load the word-replacement table once; `normalize_text` below reads this
# module-level name.
normalization_dict = load_normalization_dict('normalization_dict.txt')

def normalize_text(text, mapping=None):
    """Replace each whitespace-separated word using a lookup table.

    Args:
        text: Input string; it is split on whitespace.
        mapping: Optional dict of word -> replacement. When omitted, the
            module-level ``normalization_dict`` is used, so existing calls
            such as ``Series.apply(normalize_text)`` behave as before.

    Returns:
        The words joined by single spaces. Words found in the mapping are
        replaced; all other words are kept unchanged.
    """
    lookup = normalization_dict if mapping is None else mapping
    return ' '.join(lookup.get(word, word) for word in text.split())

In [None]:
# Run the dictionary-based word replacement on the lowercased text and store
# the result in 'Normalized_Text'.
data['Normalized_Text'] = data['Case_Folding'].apply(normalize_text)
# Preview the first five rows.
data.head(5)

Unnamed: 0,ID,Username,Text,Created At,Cleansing,Case_Folding,Normalized_Text
0,coc_1,sprintgkle,Clash of champions ini cast nya punya beragam ...,17 Juni 2024,Clash of champions ini cast nya punya beragam ...,clash of champions ini cast nya punya beragam ...,clash of champions ini cast nya punya beragam ...
1,coc_2,coldzonkbie,gue suka bgt sm university war - terus gamau b...,17 Juni 2024,gue suka bgt sm university war terus gamau ber...,gue suka bgt sm university war terus gamau ber...,saya suka banget sama university war terus tid...
2,coc_3,Q_Nyitt,UGM -&gt; Matematika -&gt; IPK 4.0 Kek nya ser...,18 Juni 2024,UGM gt Matematika gt IPK Kek nya seru nih nont...,ugm gt matematika gt ipk kek nya seru nih nont...,ugm gitu matematika gitu ipk seperti nya seru ...
3,coc_4,todayis__gaby,Clash of champions seru juga degdegannya ada b...,29 Juni 2024,Clash of champions seru juga degdegannya ada b...,clash of champions seru juga degdegannya ada b...,clash of champions seru juga degdegannya ada b...
4,coc_5,yourjungx,abis nonton clash of champions terus champions...,29 Juni 2024,abis nonton clash of champions terus champions...,abis nonton clash of champions terus champions...,habis menonton clash of champions terus champi...


In [None]:
def tokenize(text):
    """Split ``text`` on whitespace and return the resulting list of tokens."""
    return text.split()

In [None]:
# Split each normalized sentence into a list of tokens; every cell of the
# 'Tokenizing' column holds a Python list.
data['Tokenizing'] = data['Normalized_Text'].apply(tokenize)

# Preview the first five rows.
data.head(5)

Unnamed: 0,ID,Username,Text,Created At,Cleansing,Case_Folding,Normalized_Text,Tokenizing
0,coc_1,sprintgkle,Clash of champions ini cast nya punya beragam ...,17 Juni 2024,Clash of champions ini cast nya punya beragam ...,clash of champions ini cast nya punya beragam ...,clash of champions ini cast nya punya beragam ...,"[clash, of, champions, ini, cast, nya, punya, ..."
1,coc_2,coldzonkbie,gue suka bgt sm university war - terus gamau b...,17 Juni 2024,gue suka bgt sm university war terus gamau ber...,gue suka bgt sm university war terus gamau ber...,saya suka banget sama university war terus tid...,"[saya, suka, banget, sama, university, war, te..."
2,coc_3,Q_Nyitt,UGM -&gt; Matematika -&gt; IPK 4.0 Kek nya ser...,18 Juni 2024,UGM gt Matematika gt IPK Kek nya seru nih nont...,ugm gt matematika gt ipk kek nya seru nih nont...,ugm gitu matematika gitu ipk seperti nya seru ...,"[ugm, gitu, matematika, gitu, ipk, seperti, ny..."
3,coc_4,todayis__gaby,Clash of champions seru juga degdegannya ada b...,29 Juni 2024,Clash of champions seru juga degdegannya ada b...,clash of champions seru juga degdegannya ada b...,clash of champions seru juga degdegannya ada b...,"[clash, of, champions, seru, juga, degdegannya..."
4,coc_5,yourjungx,abis nonton clash of champions terus champions...,29 Juni 2024,abis nonton clash of champions terus champions...,abis nonton clash of champions terus champions...,habis menonton clash of champions terus champi...,"[habis, menonton, clash, of, champions, terus,..."


In [None]:
pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Shared stemmer instance, created the first time it is needed.
_STEMMER = None


def _get_stemmer():
    """Create the Sastrawi stemmer on first use and reuse it afterwards."""
    global _STEMMER
    if _STEMMER is None:
        _STEMMER = StemmerFactory().create_stemmer()
    return _STEMMER


def stemming(text_cleaning, stemmer=None):
    """Stem every token in a list.

    Args:
        text_cleaning: Iterable of word tokens.
        stemmer: Optional object exposing ``stem(word)``. When omitted, one
            shared Sastrawi stemmer is used, so the factory is not rebuilt
            for every row when this function is passed to ``Series.apply``.

    Returns:
        List with the stemmed form of each token, in the original order.
    """
    if stemmer is None:
        stemmer = _get_stemmer()

    return [stemmer.stem(word) for word in text_cleaning]

In [None]:
# Stem every token list; each cell of the 'Stemming' column holds a list of
# stemmed tokens.
data['Stemming'] = data['Tokenizing'].apply(stemming)

# Preview the first five rows.
data.head(5)

Unnamed: 0,ID,Username,Text,Created At,Cleansing,Case_Folding,Normalized_Text,Tokenizing,Stemming
0,coc_1,sprintgkle,Clash of champions ini cast nya punya beragam ...,17 Juni 2024,Clash of champions ini cast nya punya beragam ...,clash of champions ini cast nya punya beragam ...,clash of champions ini cast nya punya beragam ...,"[clash, of, champions, ini, cast, nya, punya, ...","[clash, of, champions, ini, cast, nya, punya, ..."
1,coc_2,coldzonkbie,gue suka bgt sm university war - terus gamau b...,17 Juni 2024,gue suka bgt sm university war terus gamau ber...,gue suka bgt sm university war terus gamau ber...,saya suka banget sama university war terus tid...,"[saya, suka, banget, sama, university, war, te...","[saya, suka, banget, sama, university, war, te..."
2,coc_3,Q_Nyitt,UGM -&gt; Matematika -&gt; IPK 4.0 Kek nya ser...,18 Juni 2024,UGM gt Matematika gt IPK Kek nya seru nih nont...,ugm gt matematika gt ipk kek nya seru nih nont...,ugm gitu matematika gitu ipk seperti nya seru ...,"[ugm, gitu, matematika, gitu, ipk, seperti, ny...","[ugm, gitu, matematika, gitu, ipk, seperti, ny..."
3,coc_4,todayis__gaby,Clash of champions seru juga degdegannya ada b...,29 Juni 2024,Clash of champions seru juga degdegannya ada b...,clash of champions seru juga degdegannya ada b...,clash of champions seru juga degdegannya ada b...,"[clash, of, champions, seru, juga, degdegannya...","[clash, of, champions, seru, juga, degdegannya..."
4,coc_5,yourjungx,abis nonton clash of champions terus champions...,29 Juni 2024,abis nonton clash of champions terus champions...,abis nonton clash of champions terus champions...,habis menonton clash of champions terus champi...,"[habis, menonton, clash, of, champions, terus,...","[habis, tonton, clash, of, champions, terus, c..."


In [None]:
# Write every column, including the intermediate ones, to CSV without the
# row index. 'Tokenizing' and 'Stemming' hold Python lists, so they are
# written as their string representation.
# NOTE(review): reading this file back yields plain strings for those two
# columns — confirm that downstream code parses them.
data.to_csv('hasil_preprocessing.csv', index=False)