## Import Library

In [1]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import emoji

import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords

## Read Dataset

### 01 Party

In [2]:
data_01_1 = pd.read_csv('kubu_01.csv')
data_01_2 = pd.read_csv('dataset/#01.csv')
data_01_3 = pd.read_csv('dataset/anies.csv')
data_01_4 = pd.read_csv('dataset/cak_imin.csv')

### 02 Party

In [None]:
# data_02_1 = pd.read_csv('kubu_02.csv')
# data_02_2 = pd.read_csv('dataset/#02.csv')
# data_02_3 = pd.read_csv('dataset/prabowo.csv')
# data_02_4 = pd.read_csv('dataset/gibran.csv')

### 03 Party

In [3]:
data_03_1 = pd.read_csv('kubu_03.csv')
data_03_2 = pd.read_csv('dataset/#03.csv')
data_03_3 = pd.read_csv('dataset/ganjar.csv')
data_03_4 = pd.read_csv('dataset/mahfud.csv')

## Merged Dataset

In [4]:
merged_kubu_01 = pd.concat([data_01_1, data_01_2, data_01_3, data_01_4], ignore_index=True)
# merged_kubu_02 = pd.concat([data_02_1, data_02_2, data_02_3], ignore_index=True)
merged_kubu_03 = pd.concat([data_03_1, data_03_2, data_03_3], ignore_index=True)

In [5]:
merged_kubu_01.shape

(256133, 15)

In [None]:
# merged_kubu_02.shape

In [6]:
merged_kubu_03.shape

(145956, 15)

## Drop Unnecessary Column 

In [7]:
column_names = ['created_at', 'id_str', 'full_text', 'lang', 'location',]

In [8]:
merged_kubu_01 = merged_kubu_01[column_names]
# merged_kubu_02 = merged_kubu_02[column_names]
merged_kubu_03 = merged_kubu_03[column_names]

In [9]:
merged_kubu_01.columns

Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')

In [None]:
# merged_kubu_02.columns

In [10]:
merged_kubu_03.columns

Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')

In [11]:
merged_kubu_01.to_csv('all_kubu_01.csv', index=False)
# merged_kubu_02.to_csv('all_kubu_02.csv', index=False)
merged_kubu_03.to_csv('all_kubu_03.csv', index=False)

In [12]:
merged_kubu_01.head()

Unnamed: 0,created_at,id_str,full_text,lang,location
0,Wed Nov 01 13:57:05 +0000 2023,1719715185009791323,Bakal calon presiden Anies Baswedan menawarkan...,in,NKRI
1,Wed Nov 01 13:28:15 +0000 2023,1719707929064984595,@DPP_PKB @cakimiNOW @aniesbaswedan Menang calo...,in,
2,Wed Nov 01 12:49:12 +0000 2023,1719698099839598743,#TemanPemilih kamu perlu tahu inilah Profil Ba...,in,Kepulauan Meranti
3,Wed Nov 01 10:40:34 +0000 2023,1719665731682574510,Jelang Pemilu 2024 Ini Visi dan Misi Pasangan ...,in,"JKT, Indonesia"
4,Wed Nov 01 10:17:39 +0000 2023,1719659964279791750,untuk mengambil keputusan yang strategis terma...,in,


In [None]:
# merged_kubu_02.head()

In [13]:
merged_kubu_03.head()

Unnamed: 0,created_at,id_str,full_text,lang,location
0,Wed Nov 01 23:50:57 +0000 2023,1719864638417072610,@Nikmatul_Sg @ganjarpranowo Sehat slalu pak Ga...,in,
1,Wed Nov 01 23:08:42 +0000 2023,1719854004728218083,@ganjarpranowo Pak Ganjar dan Pak Mahfud MD Ad...,in,
2,Wed Nov 01 22:39:15 +0000 2023,1719846593959252412,@Melihat_Indo Rahajeng Rawuh Di Bali Kedatanga...,in,
3,Wed Nov 01 16:00:38 +0000 2023,1719746277183988034,Kedatangan bakal calon presiden (bacapres) 202...,in,
4,Wed Nov 01 15:59:34 +0000 2023,1719746008316604750,Kedatangan bakal calon presiden (bacapres) 202...,in,


## Transform Column 'created_at' to Date

In [14]:
def simplify_and_sort_created_at(df):
    df['created_at'] = pd.to_datetime(df['created_at'], format='%a %b %d %H:%M:%S %z %Y').dt.date
    return df.sort_values(by='created_at')

In [15]:
merged_kubu_01 = simplify_and_sort_created_at(merged_kubu_01)
# merged_kubu_02 = simplify_and_sort_created_at(merged_kubu_02)
merged_kubu_03 = simplify_and_sort_created_at(merged_kubu_03)

In [16]:
merged_kubu_01.head()


Unnamed: 0,created_at,id_str,full_text,lang,location
0,2023-11-01,1719715185009791323,Bakal calon presiden Anies Baswedan menawarkan...,in,NKRI
241331,2023-11-01,1719588971724833271,Ketua Umum (Ketum) Partai Kebangkitan Bangsa (...,in,
241332,2023-11-01,1719670803023692019,Ÿæÿ¥ÿßŸàÿ±- JUI ⁄©€í ÿ¥ŸÖŸàŸÑ€åÿ™€å Ÿæÿ±Ÿà⁄Øÿ±ÿßŸÖ ŸÖ€å⁄∫ ŸÜŸàÿ¨ŸàÿßŸÜŸà⁄∫ ⁄©€å ...,ur,Pakistan
241333,2023-11-01,1719673301428006975,#ŸÇŸàŸÖ_⁄©ÿß_ÿ≠ŸàÿµŸÑ€Å_ÿπŸÖÿ±ÿßŸÜ_ÿÆÿßŸÜ @TeamiPians These are ...,en,
241334,2023-11-01,1719683033991966826,Anies Santri Tenan Cak Imin Santri Tenanan. Se...,in,"Papua Barat, Indonesia"


In [None]:
# merged_kubu_02.head()

In [17]:
merged_kubu_03.head()

Unnamed: 0,created_at,id_str,full_text,lang,location
0,2023-11-01,1719864638417072610,@Nikmatul_Sg @ganjarpranowo Sehat slalu pak Ga...,in,
97412,2023-11-01,1719856723031412799,@Dennysiregar7 Belajarlah memahami ucapan pres...,in,
97411,2023-11-01,1719856723157188787,@Nikmatul_Sg @ganjarpranowo betulaaah... Pak G...,in,
97410,2023-11-01,1719856742971101218,Bantah Sri Mulyani Masuk TPN Ganjar-Mahfud Ars...,in,Jakarta
97409,2023-11-01,1719856758469087593,@Melihat_Indo Rahajeng Rawuh Di Bali Pak Ganja...,in,


# Pre process

## Drop Null

In [18]:
merged_kubu_01.dropna(inplace=True)
merged_kubu_01.isnull().sum()

created_at    0
id_str        0
full_text     0
lang          0
location      0
dtype: int64

In [None]:
# merged_kubu_02.dropna(inplace=True)
# merged_kubu_02.isnull().sum()

In [19]:
merged_kubu_03.dropna(inplace=True)
merged_kubu_03.isnull().sum()

created_at    0
id_str        0
full_text     0
lang          0
location      0
dtype: int64

## Drop Duplicate

In [20]:
merged_kubu_01.duplicated(subset=['id_str', 'full_text']).sum()
# merged_kubu_02.duplicated(subset=['id_str', 'full_text']).sum()
merged_kubu_03.duplicated(subset=['id_str', 'full_text']).sum()

np.int64(1768)

In [21]:
merged_kubu_01.drop_duplicates(subset=['id_str', 'full_text'])

Unnamed: 0,created_at,id_str,full_text,lang,location
0,2023-11-01,1719715185009791323,Bakal calon presiden Anies Baswedan menawarkan...,in,NKRI
241332,2023-11-01,1719670803023692019,Ÿæÿ¥ÿßŸàÿ±- JUI ⁄©€í ÿ¥ŸÖŸàŸÑ€åÿ™€å Ÿæÿ±Ÿà⁄Øÿ±ÿßŸÖ ŸÖ€å⁄∫ ŸÜŸàÿ¨ŸàÿßŸÜŸà⁄∫ ⁄©€å ...,ur,Pakistan
241334,2023-11-01,1719683033991966826,Anies Santri Tenan Cak Imin Santri Tenanan. Se...,in,"Papua Barat, Indonesia"
241335,2023-11-01,1719507597274038752,Ia dipanggil bay muzayadah. Rasulullah SAW per...,in,Nilai
241336,2023-11-01,1719687853624746016,Komitmen Paslon Anies-Muhaimin (AMIN) terhadap...,in,"Depok, Indonesia"
...,...,...,...,...,...
226247,2024-02-14,1757849900979548175,@ImranARaja1 kpk to azam umid war ja skte h na...,in,"Trentino-Alto Adige, Italia"
226249,2024-02-14,1757849880456802731,@ibadovnazim223 bu daha iyi,tr,"Frankfurt on the Main, Germany"
226250,2024-02-14,1757849860135673929,@CuanJuan01 @mhmmd_jati Idealis tapi tidak rea...,in,Bali
226252,2024-02-14,1757849828967723163,@unnesmenfess Kenapa kalau ini konteks diskusi...,in,"Semarang, Jawa Tengah"


In [None]:
# merged_kubu_02.drop_duplicates(subset=['id_str', 'full_text'])

In [22]:
merged_kubu_03.drop_duplicates(subset=['id_str', 'full_text'])

Unnamed: 0,created_at,id_str,full_text,lang,location
97410,2023-11-01,1719856742971101218,Bantah Sri Mulyani Masuk TPN Ganjar-Mahfud Ars...,in,Jakarta
97404,2023-11-01,1719856879810355514,@ganjarpranowo Beruntung nya bisa duduk satu m...,in,"Grogol Petamburan, Indonesia"
97402,2023-11-01,1719856885795549186,@ganjarpranowo Pak Ganjar tetep yg terbaik.. s...,in,"Kembangan, Indonesia"
97400,2023-11-01,1719856929210851776,@kiki_daliyo @ganjarpranowo Rahajeng Rawuh Di ...,in,Indonesia
97414,2023-11-01,1719856716500828491,@ganjarpranowo Kapan lagi pemuda bali bisa lan...,in,Jakarta Capital Region
...,...,...,...,...,...
93536,2024-02-14,1757871840494784802,Di liat secara simple aja dah keliatan emg yan...,in,"Malang, Jawa Timur"
93528,2024-02-14,1757872221689966934,@godig_iN @RH25chanel63462 @Ryan_Al_Faqir Arti...,in,Pulau Andalas Raya
93525,2024-02-14,1757872272118055251,SIREKAP LO MENDING GAUSAH BIKIN APS DEH,in,PALESTINE üáµüá∏
93522,2024-02-14,1757872390087086463,Berharap menang 01/03? Lawannya anak presiden ...,in,Las venturas


## Drop Tweet That Aren't ID

In [23]:
print(len(merged_kubu_01))

143295


In [None]:
# print(len(merged_kubu_02))

In [24]:
print(len(merged_kubu_03))

75397


In [25]:
merged_kubu_01 = merged_kubu_01[merged_kubu_01['lang'] == 'in']
# merged_kubu_02 = merged_kubu_02[merged_kubu_02['lang'] == 'in']
merged_kubu_03 = merged_kubu_03[merged_kubu_03['lang'] == 'in']

## Drop @Account

### Party 01

In [26]:
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].str.replace(r'@(?!\w*(anies|anis)\w*)\w+', '', regex=True).str.strip()
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].str.replace(r'@\w*(anies|anis)\w*', 'anies',  regex=True ).str.strip()

In [27]:
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].str.replace(r'@(?!\w*(cakimin)\w*)\w+', '', regex=True).str.strip()
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].str.replace(r'@\w*(cakimin)\w*', 'cak imin',  regex=True ).str.strip()

### Party 02

In [None]:
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].str.replace(r'@(?!\w*(prabowo)\w*)\w+', '', regex=True).str.strip()
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].str.replace(r'@\w*(prabowo)\w*', 'prabowo',  regex=True ).str.strip()

In [None]:
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].str.replace(r'@(?!\w*(gibran)\w*)\w+', '', regex=True).str.strip()
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].str.replace(r'@\w*(gibran)\w*', 'gibran',  regex=True ).str.strip()

### Party 03

In [28]:
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].str.replace(r'@(?!\w*(ganjar)\w*)\w+', '', regex=True).str.strip()
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].str.replace(r'@\w*(ganjar)\w*', 'ganjar',  regex=True ).str.strip()

In [29]:
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].str.replace(r'@(?!\w*(mahfud)\w*)\w+', '', regex=True).str.strip()
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].str.replace(r'@\w*(mahfud)\w*', 'mahfud md',  regex=True ).str.strip()

## Drop Link in Colum Full Text

In [30]:
def remove_links(text):
    if not isinstance(text, str):
        return text
    return re.sub(r'https?://\S+', '', text)

In [31]:
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].apply(remove_links)

In [None]:
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].apply(remove_links)

In [32]:
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].apply(remove_links)

## Filter Data

### Party 01

In [33]:
keyword_pattern = r'\banies\b|\banis\b|\b01\b|\bcak imin\b|\bimin\b'
date_pattern = r'\b01/\d{2}/\d{4}\b'
number_pattern = r'\d*01\d*'

In [34]:
keyword_mask = merged_kubu_01['full_text'].str.contains(keyword_pattern, case=False, na=False)
date_mask = merged_kubu_01['full_text'].str.contains(date_pattern, na=False)
number_mask = merged_kubu_01['full_text'].str.contains(number_pattern, na=False)

In [35]:
final_mask = keyword_mask | ~(date_mask | number_mask)

In [36]:
merged_kubu_01 = merged_kubu_01[final_mask]

In [37]:
print(len(merged_kubu_01))

125541


### Party 02

In [None]:
# keyword_pattern = r'\bprabowo\b|\bgibran\b|\b02\b'
# date_pattern = r'\b02/\d{2}/\d{4}\b'
# number_pattern = r'\d*02\d*'

In [None]:
# keyword_mask = merged_kubu_02['full_text'].str.contains(keyword_pattern, case=False, na=False)
# date_mask = merged_kubu_02['full_text'].str.contains(date_pattern, na=False)
# number_mask = merged_kubu_02['full_text'].str.contains(number_pattern, na=False)

In [None]:
# final_mask = keyword_mask | ~(date_mask | number_mask)

In [None]:
# merged_kubu_02 = merged_kubu_02[final_mask]

In [None]:
# print(len(merged_kubu_02))

### Party 03

In [38]:
keyword_pattern = r'\bganjar\b|\bmahfud\b|\b03\b'
date_pattern = r'\b03/\d{2}/\d{4}\b'
number_pattern = r'\d*03\d*'

In [39]:
keyword_mask = merged_kubu_03['full_text'].str.contains(keyword_pattern, case=False, na=False)
date_mask = merged_kubu_03['full_text'].str.contains(date_pattern, na=False)
number_mask = merged_kubu_03['full_text'].str.contains(number_pattern, na=False)

In [40]:
final_mask = keyword_mask | ~(date_mask | number_mask)

In [41]:
merged_kubu_03 = merged_kubu_03[final_mask]

In [42]:
print(len(merged_kubu_03))

73051


## Replace &amp to &

In [43]:
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].str.replace(r'&amp', '&', regex=False)
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].str.replace(r'&amp', '&', regex=False)
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].str.replace(r'&amp', '&', regex=False)

## Normalization

In [44]:
def normalize_comments(text):
    # Remove emojis by filtering out any character in emoji.EMOJI_DATA
    emojis = [c for c in text if c not in emoji.EMOJI_DATA]

    # Buat text menjadi lower case
    text = text.lower()
    

    # Bersihkan teks dari karakter khusus
    text = re.sub(r'[^\w\s]', ' ', text)

    # Bersihkan karakter yang berulang
    normal_regex = re.compile(r"(.)\1{1,}") # compiling regex pattern for a repeating character in a word (e.g., haiiii -> 'i' is repeated several times)
    text = normal_regex.sub(r"\1\1", text) # removing the repeating characters 
    
    # Hapus multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
       
    return text

### Party 01

In [45]:
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].apply(normalize_comments)
merged_kubu_01.head()

Unnamed: 0,created_at,id_str,full_text,lang,location
0,2023-11-01,1719715185009791323,bakal calon presiden anies baswedan menawarkan...,in,NKRI
241334,2023-11-01,1719683033991966826,anies santri tenan cak imin santri tenanan sem...,in,"Papua Barat, Indonesia"
241335,2023-11-01,1719507597274038752,ia dipanggil bay muzayadah rasulullah saw pern...,in,Nilai
241336,2023-11-01,1719687853624746016,komitmen paslon anies muhaimin amin terhadap p...,in,"Depok, Indonesia"
241340,2023-11-01,1719650853089640904,ibu2 yang mengandung kena dengar kisah nusayba...,in,Kuala Lumpur Federal Territory


### Party 02

In [None]:
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].apply(normalize_comments)
# merged_kubu_02.head()

### Party 03

In [46]:
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].apply(normalize_comments)
merged_kubu_03.head()

Unnamed: 0,created_at,id_str,full_text,lang,location
97410,2023-11-01,1719856742971101218,bantah sri mulyani masuk tpn ganjar mahfud ars...,in,Jakarta
97404,2023-11-01,1719856879810355514,ganjar beruntung nya bisa duduk satu meja deng...,in,"Grogol Petamburan, Indonesia"
97402,2023-11-01,1719856885795549186,ganjar pak ganjar tetep yg terbaik semangat pa...,in,"Kembangan, Indonesia"
97400,2023-11-01,1719856929210851776,ganjar rahajeng rawuh di bali pak ganjar semog...,in,Indonesia
97414,2023-11-01,1719856716500828491,ganjar kapan lagi pemuda bali bisa langsung di...,in,Jakarta Capital Region


## Standarization

In [47]:
import json

def load_dict_from_json_file(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        dictionary = json.load(file)
    return dictionary

In [48]:
my_dict = load_dict_from_json_file('combined_slang_words.txt')

In [49]:
def normalize_slang_words(text, slang_words_dict=my_dict):
    words = text.split()
    standarized_words = []
    for word in words:
        standarized_word = slang_words_dict.get(word, word) # Mengembalikan kata asli jika kata tidak ditemukan dalam dict 
        standarized_words.append(standarized_word)
    return ' '.join(standarized_words)

### Party 01

In [50]:
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].apply(normalize_slang_words)
merged_kubu_01.head()

Unnamed: 0,created_at,id_str,full_text,lang,location
0,2023-11-01,1719715185009791323,bakal calon presiden anies baswedan menawarkan...,in,NKRI
241334,2023-11-01,1719683033991966826,anies santri tenan cak imin santri tenanan sem...,in,"Papua Barat, Indonesia"
241335,2023-11-01,1719507597274038752,ia dipanggil bay muzayadah rasulullah saw pern...,in,Nilai
241336,2023-11-01,1719687853624746016,komitmen paslon anies muhaimin amin terhadap p...,in,"Depok, Indonesia"
241340,2023-11-01,1719650853089640904,ibu2 yang mengandung kena dengar kisah nusayba...,in,Kuala Lumpur Federal Territory


### Party 02

In [None]:
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].apply(normalize_slang_words)
# merged_kubu_02.head()

### Party 03

In [51]:
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].apply(normalize_slang_words)
merged_kubu_03.head()

Unnamed: 0,created_at,id_str,full_text,lang,location
97410,2023-11-01,1719856742971101218,bantah sri mulyani masuk tpn ganjar mahfud ars...,in,Jakarta
97404,2023-11-01,1719856879810355514,ganjar beruntung nya bisa duduk satu meja deng...,in,"Grogol Petamburan, Indonesia"
97402,2023-11-01,1719856885795549186,ganjar pak ganjar tetep yang terbaik semangat ...,in,"Kembangan, Indonesia"
97400,2023-11-01,1719856929210851776,ganjar rahajeng rawuh di bali pak ganjar semog...,in,Indonesia
97414,2023-11-01,1719856716500828491,ganjar kapan lagi pemuda bali bisa langsung di...,in,Jakarta Capital Region


In [52]:
slangwords_collection = pd.read_csv('new_kamusalay.csv', header=None, encoding='latin-1')
replacements = dict(zip(slangwords_collection[0], slangwords_collection[1])) 

### Party 01

In [53]:
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].apply(
    lambda text: normalize_slang_words(text, replacements))
merged_kubu_01.head()

Unnamed: 0,created_at,id_str,full_text,lang,location
0,2023-11-01,1719715185009791323,bakal calon presiden anies baswedan menawarkan...,in,NKRI
241334,2023-11-01,1719683033991966826,anies santri benaran cak imin santri tenanan s...,in,"Papua Barat, Indonesia"
241335,2023-11-01,1719507597274038752,ia dipanggil selamat tinggal muzayadah rasulul...,in,Nilai
241336,2023-11-01,1719687853624746016,komitmen pasangan calon anies muhaimin amin te...,in,"Depok, Indonesia"
241340,2023-11-01,1719650853089640904,ibu ibu yang mengandung kena dengar kisah nusa...,in,Kuala Lumpur Federal Territory


### Party 02

In [None]:
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].apply(
#     lambda text: normalize_slang_words(text, replacements))
# merged_kubu_02.head()

### Party 03

In [54]:
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].apply(
    lambda text: normalize_slang_words(text, replacements))
merged_kubu_03.head()

Unnamed: 0,created_at,id_str,full_text,lang,location
97410,2023-11-01,1719856742971101218,bantah sri mulyani masuk tpn ganjar mahfud ars...,in,Jakarta
97404,2023-11-01,1719856879810355514,ganjar beruntung nya bisa duduk satu meja deng...,in,"Grogol Petamburan, Indonesia"
97402,2023-11-01,1719856885795549186,ganjar pak ganjar tetap yang terbaik semangat ...,in,"Kembangan, Indonesia"
97400,2023-11-01,1719856929210851776,ganjar rahajeng rawuh di bali pak ganjar semog...,in,Indonesia
97414,2023-11-01,1719856716500828491,ganjar kapan lagi pemuda bali bisa langsung di...,in,Jakarta Capital Region


## Removing Stop Words

In [55]:
def remove_whitespace(text):
    if not isinstance(text, str):
        return text  
    
    return ' '.join(text.split())

In [56]:
def remove_stopwords(text):
    if not isinstance(text, str):
        return text  
    text = remove_whitespace(text)
    stop_words = set(stopwords.words('indonesian')) 
    stop_words.update([])
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

In [57]:
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].apply(remove_stopwords)
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].apply(remove_stopwords)
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].apply(remove_stopwords)

## Save Processed CSV for Labeling

In [58]:
merged_kubu_01.to_csv('processed_merged_kubu_01.csv',index=False)
# merged_kubu_02.to_csv('processed_merged_kubu_02.csv',index=False, encoding='utf-8-sig')
merged_kubu_03.to_csv('processed_merged_kubu_03.csv',index=False)

## Stemming Data

In [59]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [60]:
def stemming(text):
    if not isinstance(text, str):
        return text  
    
    token = nltk.word_tokenize(text)
    stem_kalimat = []
    for k in token:
        stem_kata = stemmer.stem(k)
        stem_kalimat.append(stem_kata)

    stem_kalimat_str = ' '.join(stem_kalimat)
    return stem_kalimat_str

In [61]:
from tqdm import tqdm
tqdm.pandas()

### Party 01

In [62]:
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].progress_apply(stemming)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 125541/125541 [2:28:03<00:00, 14.13it/s]  


In [64]:
merged_kubu_01.head()

Unnamed: 0,created_at,id_str,full_text,lang,location
0,2023-11-01,1719715185009791323,calon presiden anies baswedan tawar program ke...,in,NKRI
241334,2023-11-01,1719683033991966826,anies santri benar cak imin santri tenanan mog...,in,"Papua Barat, Indonesia"
241335,2023-11-01,1719507597274038752,panggil selamat tinggal muzayadah rasulullah s...,in,Nilai
241336,2023-11-01,1719687853624746016,komitmen pasang calon anies muhaimin amin pesa...,in,"Depok, Indonesia"
241340,2023-11-01,1719650853089640904,kandung kena dengar kisah nusaybah binti kaab ...,in,Kuala Lumpur Federal Territory


### Party 02

In [None]:
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].progress_apply(stemming)

In [None]:
# merged_kubu_02.head()

### Party 03

In [65]:
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].progress_apply(stemming)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 73051/73051 [36:22<00:00, 33.47it/s]  


In [66]:
merged_kubu_03.head()

Unnamed: 0,created_at,id_str,full_text,lang,location
97410,2023-11-01,1719856742971101218,bantah sri mulyani masuk tpn ganjar mahfud ars...,in,Jakarta
97404,2023-11-01,1719856879810355514,ganjar untung nya duduk meja ganjar iya,in,"Grogol Petamburan, Indonesia"
97402,2023-11-01,1719856885795549186,ganjar ganjar baik semangat bagi ilmu,in,"Kembangan, Indonesia"
97400,2023-11-01,1719856929210851776,ganjar rahajeng rawuh bal ganjar moga mudah la...,in,Indonesia
97414,2023-11-01,1719856716500828491,ganjar pemuda bal langsung diskusi ganjar insp...,in,Jakarta Capital Region


In [67]:
merged_kubu_01.to_csv('stemmed_merged_kubu_01.csv', index=False)
# merged_kubu_02.to_csv('stemmed_merged_kubu_02.csv', index=False)
merged_kubu_03.to_csv('stemmed_merged_kubu_03.csv', index=False)

## EDA

In [None]:
from wordcloud import WordCloud

### Party 01

In [None]:
merged_kubu_01.dropna(subset=['full_text'], inplace=True)
token_data = [row.split() for row in merged_kubu_01['full_text']]
all_words_no_stopwords = ' '.join([' '.join(tokens) for tokens in token_data])

In [None]:
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words_no_stopwords)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # Hilangkan axis
plt.show()

In [None]:
from collections import Counter

# Pecah string panjang menjadi list of words (tokenisasi)
token_list = all_words_no_stopwords.split()

# Hitung frekuensi kata menggunakan Counter
word_counts = Counter(token_list)

# Ambil 20 kata yang paling sering muncul
most_common_words = word_counts.most_common(20)

# Pisahkan kata dan frekuensinya untuk plotting
words, frequencies = zip(*most_common_words)

# Plot Bar Chart untuk kata-kata paling sering
plt.figure(figsize=(10, 6))
bars = plt.barh(words, frequencies, color='lightgreen')

# Menambahkan label frekuensi di dalam batang
for bar, frequency in zip(bars, frequencies):
    plt.text(bar.get_width() - 100,  # Mengatur agar teks berada sedikit di dalam batang
             bar.get_y() + bar.get_height() / 2,  # Posisi vertikal
             f'{frequency}',  # Nilai frekuensi yang akan ditampilkan
             va='center', ha='right', color='black', fontsize=10)  # Posisi dan gaya teks

# Label sumbu
plt.xlabel('Frekuensi')
plt.ylabel('Kata')
plt.title('Top 20 Kata Terbanyak')

# Membalik sumbu y agar kata dengan frekuensi tertinggi di atas
plt.gca().invert_yaxis()

# Tampilkan plot
plt.show()

In [None]:
import seaborn as sns

merged_kubu_01['created_at'] = pd.to_datetime(merged_kubu_01['created_at'])
tweets_per_day = merged_kubu_01.groupby(merged_kubu_01['created_at'].dt.date).size()
tweets_per_month = merged_kubu_01.groupby(merged_kubu_01['created_at'].dt.to_period('M')).size()

fig, axes = plt.subplots(2, 1, figsize=(12, 20))

# Per Day
sns.lineplot(ax=axes[0], x=tweets_per_day.index, y=tweets_per_day.values, marker='o', linewidth=2)
axes[0].set_title('Number of Tweets Per Day', fontsize=14)
axes[0].set_xlabel('Date', fontsize=12)
axes[0].set_ylabel('Number of Tweets', fontsize=12)
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(True)

# Per Month
sns.barplot(ax=axes[1], x=tweets_per_month.index.astype(str), y=tweets_per_month.values, palette="Blues_r")
axes[1].set_title('Number of Tweets Per Month', fontsize=14)
axes[1].set_xlabel('Month', fontsize=12)
axes[1].set_ylabel('Number of Tweets', fontsize=12)
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(axis='y')

### Party 02

In [None]:
# merged_kubu_02.dropna(subset=['full_text'], inplace=True)
# token_data = [row.split() for row in merged_kubu_02['full_text']]
# all_words_no_stopwords = ' '.join([' '.join(tokens) for tokens in token_data])

In [None]:
# wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words_no_stopwords)
# plt.figure(figsize=(10, 5))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')  # Hilangkan axis
# plt.show()

In [None]:
# # Pecah string panjang menjadi list of words (tokenisasi)
# token_list = all_words_no_stopwords.split()

# # Hitung frekuensi kata menggunakan Counter
# word_counts = Counter(token_list)

# # Ambil 20 kata yang paling sering muncul
# most_common_words = word_counts.most_common(20)

# # Pisahkan kata dan frekuensinya untuk plotting
# words, frequencies = zip(*most_common_words)

# # Plot Bar Chart untuk kata-kata paling sering
# plt.figure(figsize=(10, 6))
# bars = plt.barh(words, frequencies, color='lightgreen')

# # Menambahkan label frekuensi di dalam batang
# for bar, frequency in zip(bars, frequencies):
#     plt.text(bar.get_width() - 100,  # Mengatur agar teks berada sedikit di dalam batang
#              bar.get_y() + bar.get_height() / 2,  # Posisi vertikal
#              f'{frequency}',  # Nilai frekuensi yang akan ditampilkan
#              va='center', ha='right', color='black', fontsize=10)  # Posisi dan gaya teks

# # Label sumbu
# plt.xlabel('Frekuensi')
# plt.ylabel('Kata')
# plt.title('Top 20 Kata Terbanyak')

# # Membalik sumbu y agar kata dengan frekuensi tertinggi di atas
# plt.gca().invert_yaxis()

# # Tampilkan plot
# plt.show()

In [None]:
# merged_kubu_02['created_at'] = pd.to_datetime(merged_kubu_02['created_at'])
# tweets_per_day = merged_kubu_02.groupby(merged_kubu_02['created_at'].dt.date).size()
# tweets_per_month = merged_kubu_02.groupby(merged_kubu_02['created_at'].dt.to_period('M')).size()

# fig, axes = plt.subplots(2, 1, figsize=(12, 20))

# # Per Day
# sns.lineplot(ax=axes[0], x=tweets_per_day.index, y=tweets_per_day.values, marker='o', linewidth=2)
# axes[0].set_title('Number of Tweets Per Day', fontsize=14)
# axes[0].set_xlabel('Date', fontsize=12)
# axes[0].set_ylabel('Number of Tweets', fontsize=12)
# axes[0].tick_params(axis='x', rotation=45)
# axes[0].grid(True)

# # Per Month
# sns.barplot(ax=axes[1], x=tweets_per_month.index.astype(str), y=tweets_per_month.values, palette="Blues_r")
# axes[1].set_title('Number of Tweets Per Month', fontsize=14)
# axes[1].set_xlabel('Month', fontsize=12)
# axes[1].set_ylabel('Number of Tweets', fontsize=12)
# axes[1].tick_params(axis='x', rotation=45)
# axes[1].grid(axis='y')

### Party 03

In [None]:
merged_kubu_03.dropna(subset=['full_text'], inplace=True)
token_data = [row.split() for row in merged_kubu_03['full_text']]
all_words_no_stopwords = ' '.join([' '.join(tokens) for tokens in token_data])

In [None]:
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words_no_stopwords)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # Hilangkan axis
plt.show()

In [None]:
# Pecah string panjang menjadi list of words (tokenisasi)
token_list = all_words_no_stopwords.split()

# Hitung frekuensi kata menggunakan Counter
word_counts = Counter(token_list)

# Ambil 20 kata yang paling sering muncul
most_common_words = word_counts.most_common(20)

# Pisahkan kata dan frekuensinya untuk plotting
words, frequencies = zip(*most_common_words)

# Plot Bar Chart untuk kata-kata paling sering
plt.figure(figsize=(10, 6))
bars = plt.barh(words, frequencies, color='lightgreen')

# Menambahkan label frekuensi di dalam batang
for bar, frequency in zip(bars, frequencies):
    plt.text(bar.get_width() - 100,  # Mengatur agar teks berada sedikit di dalam batang
             bar.get_y() + bar.get_height() / 2,  # Posisi vertikal
             f'{frequency}',  # Nilai frekuensi yang akan ditampilkan
             va='center', ha='right', color='black', fontsize=10)  # Posisi dan gaya teks

# Label sumbu
plt.xlabel('Frekuensi')
plt.ylabel('Kata')
plt.title('Top 20 Kata Terbanyak')

# Membalik sumbu y agar kata dengan frekuensi tertinggi di atas
plt.gca().invert_yaxis()

# Tampilkan plot
plt.show()

In [None]:
merged_kubu_03['created_at'] = pd.to_datetime(merged_kubu_03['created_at'])
tweets_per_day = merged_kubu_03.groupby(merged_kubu_03['created_at'].dt.date).size()
tweets_per_month = merged_kubu_03.groupby(merged_kubu_03['created_at'].dt.to_period('M')).size()

fig, axes = plt.subplots(2, 1, figsize=(12, 20))

# Per Day
sns.lineplot(ax=axes[0], x=tweets_per_day.index, y=tweets_per_day.values, marker='o', linewidth=2)
axes[0].set_title('Number of Tweets Per Day', fontsize=14)
axes[0].set_xlabel('Date', fontsize=12)
axes[0].set_ylabel('Number of Tweets', fontsize=12)
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(True)

# Per Month
sns.barplot(ax=axes[1], x=tweets_per_month.index.astype(str), y=tweets_per_month.values, palette="Blues_r")
axes[1].set_title('Number of Tweets Per Month', fontsize=14)
axes[1].set_xlabel('Month', fontsize=12)
axes[1].set_ylabel('Number of Tweets', fontsize=12)
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(axis='y')