# Library

In [1]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import emoji
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords

# Read Dataset

In [2]:
data_gerindra = pd.read_csv('dataset/gerindra.csv')
data_golkar = pd.read_csv('dataset/golkar.csv')
data_pan = pd.read_csv('dataset/partai amanat nasional.csv')
data_demokrat = pd.read_csv('dataset/partai demokrat.csv')
data_pks = pd.read_csv('dataset/partai keadilan sejahtera.csv')
data_nasdem = pd.read_csv('dataset/partai nasional demokrat.csv')
data_pdip = pd.read_csv('dataset/pdip.csv')
data_pkb = pd.read_csv('dataset/PKB.csv')

## Drop Unnecessary Column 

In [3]:
column_names = ['created_at', 'id_str', 'full_text', 'lang', 'location',]

In [4]:
data_gerindra = data_gerindra[column_names]
data_golkar = data_golkar[column_names]
data_pan = data_pan[column_names]
data_demokrat = data_demokrat[column_names]
data_pks = data_pks[column_names]
data_nasdem = data_nasdem[column_names]
data_pdip = data_pdip[column_names]
data_pkb = data_pkb[column_names]

In [5]:
data_partai = [data_gerindra, data_golkar, data_pan, data_demokrat, data_pks, data_nasdem, data_pdip, data_pkb]

In [6]:
for i in range(len(data_partai)):
    print(data_partai[i].columns)

Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')
Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')
Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')
Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')
Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')
Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')
Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')
Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')


In [7]:
for i in range(len(data_partai)):
    print(data_partai[i].head())
    print("==================================")

                       created_at               id_str  \
0  Wed Nov 01 09:21:28 +0000 2023  1719645824047788466   
1  Wed Nov 01 08:49:05 +0000 2023  1719637674661912853   
2  Wed Nov 01 07:15:56 +0000 2023  1719614232235475176   
3  Thu Nov 02 11:51:48 +0000 2023  1720046042262204510   
4  Thu Nov 02 11:41:55 +0000 2023  1720043558672159108   

                                           full_text lang location  
0  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
1  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
2  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
3  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  
4  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  
                       created_at               id_str  \
0  Wed Nov 01 09:21:28 +0000 2023  1719645824047788466   
1  Wed Nov 01 08:49:05 +0000 2023  1719637674661912853   
2  Wed Nov 01 07:15:56 +0000 2023  1719614232235475176   
3  Th

## Transform Column 'created_at' to Date

In [8]:
def simplify_and_sort_created_at(df):
    df['created_at'] = pd.to_datetime(df['created_at'], format='%a %b %d %H:%M:%S %z %Y').dt.date
    return df.sort_values(by='created_at')

In [9]:
for i in range(len(data_partai)):
    simplify_and_sort_created_at(data_partai[i])

In [10]:
for i in range(len(data_partai)):
    print(data_partai[i].head())
    print("==================================")

   created_at               id_str  \
0  2023-11-01  1719645824047788466   
1  2023-11-01  1719637674661912853   
2  2023-11-01  1719614232235475176   
3  2023-11-02  1720046042262204510   
4  2023-11-02  1720043558672159108   

                                           full_text lang location  
0  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
1  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
2  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
3  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  
4  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  
   created_at               id_str  \
0  2023-11-01  1719645824047788466   
1  2023-11-01  1719637674661912853   
2  2023-11-01  1719614232235475176   
3  2023-11-02  1720045422960595334   
4  2023-11-02  1720039369808154652   

                                           full_text lang location  
0  Sementara itu pasangan Prabowo-Gibran diusung ...   in  

## Preprocess

### Drop Null

In [11]:
# for i in range(len(data_partai)):
#     data_partai[i].dropna(inplace=True)
#     print(data_partai[i].isnull().sum())
#     print("==================================")

### Drop Duplicates

In [12]:
print("Data Gerindra: ", data_gerindra.head())

Data Gerindra:     created_at               id_str  \
0  2023-11-01  1719645824047788466   
1  2023-11-01  1719637674661912853   
2  2023-11-01  1719614232235475176   
3  2023-11-02  1720046042262204510   
4  2023-11-02  1720043558672159108   

                                           full_text lang location  
0  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
1  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
2  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
3  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  
4  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  


In [13]:
for i in range(len(data_partai)):
    print(data_partai[i].duplicated(subset=['id_str', 'full_text']).sum())
    print("==================================")

0
0
0
0
0
0
0
0


In [14]:
print("Data Gerindra: ", data_gerindra.head())

Data Gerindra:     created_at               id_str  \
0  2023-11-01  1719645824047788466   
1  2023-11-01  1719637674661912853   
2  2023-11-01  1719614232235475176   
3  2023-11-02  1720046042262204510   
4  2023-11-02  1720043558672159108   

                                           full_text lang location  
0  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
1  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
2  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
3  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  
4  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  


In [15]:
for i in range(len(data_partai)):
    data_partai[i] = data_partai[i].drop_duplicates(subset=['id_str', 'full_text'])

## Drop Tweet That Aren't ID

In [16]:
for i in range(len(data_partai)):
    data_partai[i] = data_partai[i][data_partai[i]['lang'] == 'in']

## Drop @ Account

### Partain Gerindra

In [17]:
data_gerindra['full_text'] = data_gerindra['full_text'].str.replace(r'@(?!\w*(Gerindra)\w*)\w+', '', regex=True).str.strip()
data_gerindra['full_text'] = data_gerindra['full_text'].str.replace(r'(?i)@\w*(Gerindra)\w*', 'Gerindra',  regex=True ).str.strip()

### Partai Golkar

In [18]:
data_golkar['full_text'] = data_golkar['full_text'].str.replace(r'@(?!\w*(Golkar)\w*)\w+', '', regex=True).str.strip()
data_golkar['full_text'] = data_golkar['full_text'].str.replace(r'@\w*(Golkar)\w*', 'Golkar',  regex=True ).str.strip()

### PAN

In [19]:
data_pan['full_text'] = data_pan['full_text'].str.replace(r'(?i)@(?!\w*(Official_PAN)\w*)\w+', '', regex=True).str.strip()
data_pan['full_text'] = data_pan['full_text'].str.replace(r'(?i)@\w*(Official_PAN)\w*', 'PAN',  regex=True ).str.strip()

### Partai Demokrat

In [20]:
data_demokrat['full_text'] = data_demokrat['full_text'].str.replace(r'(?i)@(?!\w*(PDemokrat)\w*)\w+', '', regex=True).str.strip()
data_demokrat['full_text'] = data_demokrat['full_text'].str.replace(r'(?i)@\w*(PDemokrat)\w*', 'Demokrat',  regex=True ).str.strip()

### PKS

In [21]:
data_pks['full_text'] = data_pks['full_text'].str.replace(r'(?i)@(?!\w*(PKSejahtera)\w*)\w+', '', regex=True).str.strip()
data_pks['full_text'] = data_pks['full_text'].str.replace(r'(?i)@\w*(PKSejahtera)\w*', 'PKS',  regex=True ).str.strip()

### Nasdem

In [22]:
data_nasdem['full_text'] = data_nasdem['full_text'].str.replace(r'(?i)@(?!\w*(Nasdem)\w*)\w+', '', regex=True).str.strip()
data_nasdem['full_text'] = data_nasdem['full_text'].str.replace(r'(?i)@\w*(Nasdem)\w*', 'Nasdem',  regex=True ).str.strip()

### PDIP

In [23]:
data_pdip['full_text'] = data_pdip['full_text'].str.replace(r'(?i)@(?!\w*(PDI_Perjuangan)\w*)\w+', '', regex=True).str.strip()
data_pdip['full_text'] = data_pdip['full_text'].str.replace(r'(?i)@\w*(PDI_Perjuangan)\w*', 'PDIP',  regex=True ).str.strip()

### PKB

In [24]:
data_pkb['full_text'] = data_pkb['full_text'].str.replace(r'(?i)@(?!\w*(PKB)\w*)\w+', '', regex=True).str.strip()
data_pkb['full_text'] = data_pkb['full_text'].str.replace(r'(?i)@\w*(PKB)\w*', 'PKB',  regex=True ).str.strip()

## Drop Link in Colum Full Text

In [25]:
def remove_links(text):
    if not isinstance(text, str):
        return text
    return re.sub(r'https?://\S+', '', text)

In [26]:
for i in range(len(data_partai)):
    data_partai[i]['full_text'] = data_partai[i]['full_text'].apply(remove_links)
    print(data_partai[i].head())

   created_at               id_str  \
0  2023-11-01  1719645824047788466   
1  2023-11-01  1719637674661912853   
2  2023-11-01  1719614232235475176   
3  2023-11-02  1720046042262204510   
4  2023-11-02  1720043558672159108   

                                           full_text lang location  
0  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
1  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
2  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
3  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  
4  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  
   created_at               id_str  \
0  2023-11-01  1719645824047788466   
1  2023-11-01  1719637674661912853   
2  2023-11-01  1719614232235475176   
3  2023-11-02  1720045422960595334   
4  2023-11-02  1720039369808154652   

                                           full_text lang location  
0  Sementara itu pasangan Prabowo-Gibran diusung ...   in  

## Filter Data

### Gerindra

In [27]:
keyword_pattern = r'\b(?:gerindra|prabowo[-\w]*|gibran)\b'

In [28]:
data_gerindra['full_text'] = data_gerindra['full_text'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())

In [29]:
data_gerindra = data_gerindra[data_gerindra['full_text'].str.contains(keyword_pattern, case=False, na=False)]

In [30]:
data_gerindra['full_text'] = data_gerindra['full_text'].apply(lambda x: ' '.join([word for word in x.split() if re.search(keyword_pattern, word, re.IGNORECASE)]))

### Golkar

In [31]:
keyword_pattern = r'\bgolkar\b|\bgolongan karya\b|\bprabowo\b|\bgibran\b'

In [32]:
data_golkar['full_text'] = data_golkar['full_text'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())

In [33]:
data_golkar = data_golkar[data_golkar['full_text'].str.contains(keyword_pattern, case=False, na=False)]

In [34]:
data_golkar['full_text'] = data_golkar['full_text'].apply(lambda x: ' '.join([word for word in x.split() if re.search(keyword_pattern, word, re.IGNORECASE)]))

### PAN

In [35]:
keyword_pattern = r'\bPAN\b|\bpartai amanat nasional\b|\bprabowo\b|\bgibran\b'

In [36]:
data_pan['full_text'] = data_pan['full_text'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())

In [37]:
data_pan = data_pan[data_pan['full_text'].str.contains(keyword_pattern, case=False, na=False)]

In [38]:
data_pan['full_text'] = data_pan['full_text'].apply(lambda x: ' '.join([word for word in x.split() if re.search(keyword_pattern, word, re.IGNORECASE)]))

### Demokrat

In [39]:
keyword_pattern = r'\bdemokrat\b|\bprabowo\b|\bgibran\b'

In [40]:
data_demokrat['full_text'] = data_demokrat['full_text'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())

In [41]:
data_demokrat = data_demokrat[data_demokrat['full_text'].str.contains(keyword_pattern, case=False, na=False)]

In [42]:
data_demokrat['full_text'] = data_demokrat['full_text'].apply(lambda x: ' '.join([word for word in x.split() if re.search(keyword_pattern, word, re.IGNORECASE)]))

### PKS

In [43]:
keyword_pattern = r'\bPKS\b|\bPartai Keadilan Sejahtera\b|\banies\b|\banis\b|cak[\s\-_]?imin|\bimin\b'

In [44]:
data_pks['full_text'] = data_pks['full_text'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())

In [45]:
data_pks = data_pks[data_pks['full_text'].str.contains(keyword_pattern, case=False, na=False)]

In [46]:
data_pks['full_text'] = data_pks['full_text'].apply(lambda x: ' '.join([word for word in x.split() if re.search(keyword_pattern, word, re.IGNORECASE)]))

### Nasdem

In [47]:
keyword_pattern = r'\bNasdem\b|\bPartai Nasional Demokrat\b|\banies\b|\banis\b|cak[\s\-_]?imin|\bimin\b'

In [48]:
data_nasdem['full_text'] = data_nasdem['full_text'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())

In [49]:
data_nasdem = data_nasdem[data_nasdem['full_text'].str.contains(keyword_pattern, case=False, na=False)]

In [50]:
data_nasdem['full_text'] = data_nasdem['full_text'].apply(lambda x: ' '.join([word for word in x.split() if re.search(keyword_pattern, word, re.IGNORECASE)]))

### PDIP

In [51]:
keyword_pattern = r'\bPDIP\b|\bPartai Demokrasi Indonesia Perjuangan\b|\bganjar\b|\bmahfud\b'

In [52]:
data_pdip['full_text'] = data_pdip['full_text'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())

In [53]:
data_pdip = data_pdip[data_pdip['full_text'].str.contains(keyword_pattern, case=False, na=False)]

In [54]:
data_pdip['full_text'] = data_pdip['full_text'].apply(lambda x: ' '.join([word for word in x.split() if re.search(keyword_pattern, word, re.IGNORECASE)]))

### PKB

In [55]:
keyword_pattern = r'\bPKB\b|\bPartai Kebangkitan Bangsa\b|\banies\b|\banis\b|cak[\s\-_]?imin|\bimin\b'

In [56]:
data_pkb['full_text'] = data_pkb['full_text'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())

In [57]:
data_pkb = data_pkb[data_pkb['full_text'].str.contains(keyword_pattern, case=False, na=False)]

In [58]:
data_pkb['full_text'] = data_pkb['full_text'].apply(lambda x: ' '.join([word for word in x.split() if re.search(keyword_pattern, word, re.IGNORECASE)]))

In [59]:
for i in range(len(data_partai)):
    print(len(data_partai[i]))
    print("==================================")

4228
2700
1599
3400
3267
2494
881
1253


## Replace &amp to &

In [60]:
for i in range(len(data_partai)):
    data_partai[i]['full_text'] = data_partai[i]['full_text'].str.replace(r'&amp', '&', regex=False)

In [61]:
def normalize_comments(text):
    # Remove emojis by filtering out any character in emoji.EMOJI_DATA
    emojis = [c for c in text if c not in emoji.EMOJI_DATA]

    # Buat text menjadi lower case
    text = text.lower()
    

    # Bersihkan teks dari karakter khusus
    text = re.sub(r'[^\w\s]', ' ', text)

    # Bersihkan karakter yang berulang
    normal_regex = re.compile(r"(.)\1{1,}") # compiling regex pattern for a repeating character in a word (e.g., haiiii -> 'i' is repeated several times)
    text = normal_regex.sub(r"\1\1", text) # removing the repeating characters 
    
    # Hapus multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
       
    return text

In [62]:
for i in range(len(data_partai)):
    data_partai[i]['full_text'] = data_partai[i]['full_text'].apply(normalize_comments)

In [63]:
for i in range(len(data_partai)):
    data_partai[i] = data_partai[i].drop_duplicates(subset=['id_str', 'full_text'])

In [64]:
nama_partai = ['gerindra', 'golkar', 'pan', 'demokrat', 'pks', 'nasdem', 'pdip', 'pkb']
for i in range(len(data_partai)):
    data_partai[i].to_csv(f'dataset/{nama_partai[i]}_full_context.csv', index=False)

## Standarization

In [65]:
import json

def load_dict_from_json_file(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        dictionary = json.load(file)
    return dictionary

In [66]:
my_dict = load_dict_from_json_file('combined_slang_words.txt')

In [67]:
def normalize_slang_words(text, slang_words_dict=my_dict):
    words = text.split()
    standarized_words = []
    for word in words:
        standarized_word = slang_words_dict.get(word, word) # Mengembalikan kata asli jika kata tidak ditemukan dalam dict 
        standarized_words.append(standarized_word)
    return ' '.join(standarized_words)

In [68]:
for i in range(len(data_partai)):
    data_partai[i]['full_text'] = data_partai[i]['full_text'].apply(normalize_slang_words)

## Removing Stop Words

In [69]:
def remove_whitespace(text):
    if not isinstance(text, str):
        return text  
    
    return ' '.join(text.split())

In [70]:
def remove_stopwords(text):
    if not isinstance(text, str):
        return text  
    text = remove_whitespace(text)
    stop_words = set(stopwords.words('indonesian')) 
    stop_words.update(['iya', 'engga', 'gak', 'enggak', 'ga', 'dn', 'klu', 'klo', 'kalo', 'klo', 'nya'])
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

In [71]:
for i in range(len(data_partai)):
    data_partai[i]['full_text'] = data_partai[i]['full_text'].apply(remove_stopwords)

## Stemming Data

In [72]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [73]:
def stemming(text):
    if not isinstance(text, str):
        return text  
    
    token = nltk.word_tokenize(text)
    stem_kalimat = []
    for k in token:
        stem_kata = stemmer.stem(k)
        stem_kalimat.append(stem_kata)

    stem_kalimat_str = ' '.join(stem_kalimat)
    return stem_kalimat_str

In [74]:
from tqdm import tqdm
tqdm.pandas()

In [75]:
for i in range(len(data_partai)):
    data_partai[i]['full_text'] = data_partai[i]['full_text'].progress_apply(stemming)

100%|██████████| 4228/4228 [18:53<00:00,  3.73it/s]  
100%|██████████| 2700/2700 [08:59<00:00,  5.01it/s] 
100%|██████████| 1599/1599 [03:12<00:00,  8.31it/s]
100%|██████████| 3400/3400 [08:06<00:00,  6.98it/s]
100%|██████████| 3267/3267 [03:13<00:00, 16.87it/s] 
100%|██████████| 2494/2494 [04:32<00:00,  9.14it/s]
100%|██████████| 881/881 [00:58<00:00, 14.98it/s]
100%|██████████| 1253/1253 [01:17<00:00, 16.26it/s]


In [76]:
for i in range(len(data_partai)):
    data_partai[i].to_csv(f'dataset/{nama_partai[i]}_stemmed.csv', index=False)