# Library

In [197]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import emoji
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords

# Read Dataset

In [198]:
data_gerindra = pd.read_csv('dataset/gerindra.csv')
data_golkar = pd.read_csv('dataset/golkar.csv')
data_pan = pd.read_csv('dataset/partai amanat nasional.csv')
data_demokrat = pd.read_csv('dataset/partai demokrat.csv')
data_pks = pd.read_csv('dataset/partai keadilan sejahtera.csv')
data_nasdem = pd.read_csv('dataset/partai nasional demokrat.csv')
data_pdip = pd.read_csv('dataset/pdip.csv')
data_pkb = pd.read_csv('dataset/PKB.csv')

## Drop Unnecessary Column 

In [199]:
column_names = ['created_at', 'id_str', 'full_text', 'lang', 'location',]

In [200]:
data_gerindra = data_gerindra[column_names]
data_golkar = data_golkar[column_names]
data_pan = data_pan[column_names]
data_demokrat = data_demokrat[column_names]
data_pks = data_pks[column_names]
data_nasdem = data_nasdem[column_names]
data_pdip = data_pdip[column_names]
data_pkb = data_pkb[column_names]

In [201]:
data_partai = [data_gerindra, data_golkar, data_pan, data_demokrat, data_pks, data_nasdem, data_pdip, data_pkb]

In [202]:
for i in range(len(data_partai)):
    print(data_partai[i].columns)

Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')
Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')
Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')
Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')
Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')
Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')
Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')
Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')


In [203]:
for i in range(len(data_partai)):
    print(data_partai[i].head())
    print("==================================")

                       created_at               id_str  \
0  Wed Nov 01 09:21:28 +0000 2023  1719645824047788466   
1  Wed Nov 01 08:49:05 +0000 2023  1719637674661912853   
2  Wed Nov 01 07:15:56 +0000 2023  1719614232235475176   
3  Thu Nov 02 11:51:48 +0000 2023  1720046042262204510   
4  Thu Nov 02 11:41:55 +0000 2023  1720043558672159108   

                                           full_text lang location  
0  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
1  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
2  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
3  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  
4  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  
                       created_at               id_str  \
0  Wed Nov 01 09:21:28 +0000 2023  1719645824047788466   
1  Wed Nov 01 08:49:05 +0000 2023  1719637674661912853   
2  Wed Nov 01 07:15:56 +0000 2023  1719614232235475176   
3  Th

## Transform Column 'created_at' to Date

In [204]:
def simplify_and_sort_created_at(df):
    df['created_at'] = pd.to_datetime(df['created_at'], format='%a %b %d %H:%M:%S %z %Y').dt.date
    return df.sort_values(by='created_at')

In [205]:
for i in range(len(data_partai)):
    simplify_and_sort_created_at(data_partai[i])

In [206]:
for i in range(len(data_partai)):
    print(data_partai[i].head())
    print("==================================")

   created_at               id_str  \
0  2023-11-01  1719645824047788466   
1  2023-11-01  1719637674661912853   
2  2023-11-01  1719614232235475176   
3  2023-11-02  1720046042262204510   
4  2023-11-02  1720043558672159108   

                                           full_text lang location  
0  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
1  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
2  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
3  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  
4  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  
   created_at               id_str  \
0  2023-11-01  1719645824047788466   
1  2023-11-01  1719637674661912853   
2  2023-11-01  1719614232235475176   
3  2023-11-02  1720045422960595334   
4  2023-11-02  1720039369808154652   

                                           full_text lang location  
0  Sementara itu pasangan Prabowo-Gibran diusung ...   in  

## Preprocess

### Drop Null

In [207]:
# for i in range(len(data_partai)):
#     data_partai[i].dropna(inplace=True)
#     print(data_partai[i].isnull().sum())
#     print("==================================")

### Drop Duplicates

In [208]:
print("Data Gerindra: ", data_gerindra.head())

Data Gerindra:     created_at               id_str  \
0  2023-11-01  1719645824047788466   
1  2023-11-01  1719637674661912853   
2  2023-11-01  1719614232235475176   
3  2023-11-02  1720046042262204510   
4  2023-11-02  1720043558672159108   

                                           full_text lang location  
0  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
1  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
2  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
3  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  
4  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  


In [209]:
for i in range(len(data_partai)):
    print(data_partai[i].duplicated(subset=['id_str', 'full_text']).sum())
    print("==================================")

0
0
0
0
0
0
0
0


In [210]:
for i in range(len(data_partai)):
    data_partai[i] = data_partai[i].drop_duplicates(subset=['id_str', 'full_text'])

## Drop Tweet That Aren't ID

In [211]:
for i in range(len(data_partai)):
    data_partai[i] = data_partai[i][data_partai[i]['lang'] == 'in']

## Drop @ Account

In [212]:
def process_mentions(text, keep_username, replacement_username):
    words = text.split()  # Pisahkan teks menjadi kata-kata
    processed_words = []
    
    for word in words:
        # Periksa apakah kata dimulai dengan '@' dan bukan username yang ingin dipertahankan (case insensitive)
        if word.lower().startswith('@') and keep_username.lower() not in word.lower():
            processed_words.append('[user]')
        elif word.lower() == '@' + keep_username.lower():  # Jika sesuai dengan username yang ingin dipertahankan
            processed_words.append(replacement_username)  # Ganti dengan nama yang diinginkan (case-sensitif)
        else:
            processed_words.append(word)
    
    # Gabungkan kata-kata kembali menjadi satu string dan hapus [user]
    result_text = ' '.join([word for word in processed_words if word != '[user]']).strip()
    return result_text

### Partain Gerindra

In [213]:
data_gerindra['full_text'] = data_gerindra['full_text'].apply(lambda x: process_mentions(str(x), keep_username='Gerindra', replacement_username='Gerindra'))
data_partai[0] = data_gerindra

In [214]:
# data_gerindra['full_text'] = data_gerindra['full_text'].str.replace(r'@([^\s]*)(?!gerindra)', '[user]', regex=True, flags=re.IGNORECASE)
# data_gerindra['full_text'] = data_gerindra['full_text'].str.replace(r'\[user\]', '', regex=True).str.strip()
# data_gerindra['full_text'] = data_gerindra['full_text'].str.replace(r'@Gerindra', 'Gerindra', regex=True, flags=re.IGNORECASE)

### Partai Golkar

In [215]:
data_golkar['full_text'] = data_golkar['full_text'].apply(lambda x: process_mentions(str(x), keep_username='Golkar', replacement_username='Golkar'))
data_partai[1] = data_golkar

In [216]:
# data_golkar['full_text'] = data_golkar['full_text'].str.replace(r'@([^\s]*)(?!golkar)', '[user]', regex=True, flags=re.IGNORECASE)
# data_golkar['full_text'] = data_golkar['full_text'].str.replace(r'\[user\]', '', regex=True).str.strip()
# data_golkar['full_text'] = data_golkar['full_text'].str.replace(r'@Golkar', 'Golkar', regex=True, flags=re.IGNORECASE)

### PAN

In [217]:
data_pan['full_text'] = data_pan['full_text'].apply(lambda x: process_mentions(str(x), keep_username='Official_PAN', replacement_username='PAN'))
data_partai[2] = data_pan

In [218]:
# data_pan['full_text'] = data_pan['full_text'].str.replace(r'@([^\s]*)(?!Official_PAN)', '[user]', regex=True, flags=re.IGNORECASE)
# data_pan['full_text'] = data_pan['full_text'].str.replace(r'\[user\]', '', regex=True).str.strip()
# data_pan['full_text'] = data_pan['full_text'].str.replace(r'@Official_PAN', 'PAN', regex=True, flags=re.IGNORECASE)

### Partai Demokrat

In [219]:
data_demokrat['full_text'] = data_demokrat['full_text'].apply(lambda x: process_mentions(str(x), keep_username='Demokrat', replacement_username='Demokrat'))
data_partai[3] = data_demokrat

In [220]:
# data_demokrat['full_text'] = data_demokrat['full_text'].str.replace(r'@([^\s]*)(?!demokrat)', '[user]', regex=True, flags=re.IGNORECASE)
# data_demokrat['full_text'] = data_demokrat['full_text'].str.replace(r'\[user\]', '', regex=True).str.strip()
# data_demokrat['full_text'] = data_demokrat['full_text'].str.replace(r'@demokrat', 'Demokrat', regex=True, flags=re.IGNORECASE)

### PKS

In [221]:
data_pks['full_text'] = data_pks['full_text'].apply(lambda x: process_mentions(str(x), keep_username='PKSejahtera', replacement_username='PKS'))
data_partai[4] = data_pks

In [222]:
# data_pks['full_text'] = data_pks['full_text'].str.replace(r'@([^\s]*)(?!PKSejahtera)', '[user]', regex=True, flags=re.IGNORECASE)
# data_pks['full_text'] = data_pks['full_text'].str.replace(r'\[user\]', '', regex=True).str.strip()
# data_pks['full_text'] = data_pks['full_text'].str.replace(r'@PKSejahtera', 'PKS', regex=True, flags=re.IGNORECASE)


### Nasdem

In [223]:
data_nasdem['full_text'] = data_nasdem['full_text'].apply(lambda x: process_mentions(str(x), keep_username='Nasdem', replacement_username='Nasdem'))
data_partai[5] = data_nasdem

In [224]:
# data_nasdem['full_text'] = data_nasdem['full_text'].str.replace(r'@([^\s]*)(?!Nasdem)', '[user]', regex=True, flags=re.IGNORECASE)
# data_nasdem['full_text'] = data_nasdem['full_text'].str.replace(r'\[user\]', '', regex=True).str.strip()
# data_nasdem['full_text'] = data_nasdem['full_text'].str.replace(r'@Nasdem', 'Nasdem', regex=True, flags=re.IGNORECASE)

### PDIP

In [225]:
data_pdip['full_text'] = data_pdip['full_text'].apply(lambda x: process_mentions(str(x), keep_username='PDI_Perjuangan', replacement_username='PDIP'))
data_partai[6] = data_pdip

In [226]:
# data_pdip['full_text'] = data_pdip['full_text'].str.replace(r'@([^\s]*)(?!PDI_Perjuangan)', '[user]', regex=True, flags=re.IGNORECASE)
# data_pdip['full_text'] = data_pdip['full_text'].str.replace(r'\[user\]', '', regex=True).str.strip()
# data_pdip['full_text'] = data_pdip['full_text'].str.replace(r'@PDI_Perjuangan', 'PDIP', regex=True, flags=re.IGNORECASE)

### PKB

In [227]:
data_pkb['full_text'] = data_pkb['full_text'].apply(lambda x: process_mentions(str(x), keep_username='PKB', replacement_username='PKB'))
data_partai[7] = data_pkb

In [228]:
# data_pkb['full_text'] = data_pkb['full_text'].str.replace(r'@([^\s]*)(?!PKB)', '[user]', regex=True, flags=re.IGNORECASE)
# data_pkb['full_text'] = data_pkb['full_text'].str.replace(r'\[user\]', '', regex=True).str.strip()
# data_pkb['full_text'] = data_pkb['full_text'].str.replace(r'@PKB', 'PKB', regex=True, flags=re.IGNORECASE)

## Drop Link in Colum Full Text

In [229]:
def remove_links(text):
    if not isinstance(text, str):
        return text
    return re.sub(r'https?://\S+', '', text)

In [230]:
for i in range(len(data_partai)):
    data_partai[i]['full_text'] = data_partai[i]['full_text'].apply(remove_links)
    print(data_partai[i].head())

   created_at               id_str  \
0  2023-11-01  1719645824047788466   
1  2023-11-01  1719637674661912853   
2  2023-11-01  1719614232235475176   
3  2023-11-02  1720046042262204510   
4  2023-11-02  1720043558672159108   

                                           full_text lang location  
0  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
1  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
2  Sementara itu pasangan Prabowo-Gibran diusung ...   in      NaN  
3  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  
4  Kader Partai Gerakan Indonesia Raya (Gerindra)...   in      NaN  
   created_at               id_str  \
0  2023-11-01  1719645824047788466   
1  2023-11-01  1719637674661912853   
2  2023-11-01  1719614232235475176   
3  2023-11-02  1720045422960595334   
4  2023-11-02  1720039369808154652   

                                           full_text lang location  
0  Sementara itu pasangan Prabowo-Gibran diusung ...   in  

## Filter Data

In [231]:
def filter_by_keywords(df, keyword_pattern):
    if 'full_text' in df.columns:
        return df[df['full_text'].str.contains(keyword_pattern, flags=re.IGNORECASE, regex=True, na=False)]

### Gerindra

In [232]:
keyword_pattern = r'\bgerindra\b|\bgerakan indonesia raya\b|\bprabowo\b|\bgibran\b'
data_partai[0]['full_text'] = data_partai[0]['full_text'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())
data_partai[0] = filter_by_keywords(data_partai[0], keyword_pattern)

### Golkar

In [233]:
keyword_pattern = r'\bgolkar\b|\bgolongan karya\b|\bprabowo\b|\bgibran\b'
data_partai[1]['full_text'] = data_partai[1]['full_text'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())
data_partai[1] = filter_by_keywords(data_partai[1], keyword_pattern)

### PAN

In [234]:
keyword_pattern = r'\bPAN\b|\bpartai amanat nasional\b|\bprabowo\b|\bgibran\b'
data_partai[2]['full_text'] = data_partai[2]['full_text'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())
data_partai[2] = filter_by_keywords(data_partai[2], keyword_pattern)

### Demokrat

In [235]:
keyword_pattern = r'\bdemokrat\b|\bprabowo\b|\bgibran\b'
data_partai[3]['full_text'] = data_partai[3]['full_text'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())
data_partai[3] = filter_by_keywords(data_partai[3], keyword_pattern)

### PKS

In [236]:
keyword_pattern = r'\bPKS\b|\bPartai Keadilan Sejahtera\b|\banies\b|\banis\b|cak[\s\-_]?imin|\bimin\b'
data_partai[4]['full_text'] = data_partai[4]['full_text'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())
data_partai[4] = filter_by_keywords(data_partai[4], keyword_pattern)

### Nasdem

In [237]:
keyword_pattern = r'\bNasdem\b|\bPartai Nasional Demokrat\b|\banies\b|\banis\b|cak[\s\-_]?imin|\bimin\b'
data_partai[5]['full_text'] = data_partai[5]['full_text'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())
data_partai[5] = filter_by_keywords(data_partai[5], keyword_pattern)

### PDIP

In [238]:
keyword_pattern = r'\bPDIP\b|\bPartai Demokrasi Indonesia Perjuangan\b|\bganjar\b|\bmahfud\b'
data_partai[6]['full_text'] = data_partai[6]['full_text'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())
data_partai[6] = filter_by_keywords(data_partai[6], keyword_pattern)

### PKB

In [239]:
keyword_pattern = r'\bPKB\b|\bPartai Kebangkitan Bangsa\b|\banies\b|\banis\b|cak[\s\-_]?imin|\bimin\b'
data_partai[7]['full_text'] = data_partai[7]['full_text'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())
data_partai[7] = filter_by_keywords(data_partai[7], keyword_pattern)

In [240]:
for i in range(len(data_partai)):
    print(len(data_partai[i]))
    print("==================================")

1313
777
851
3282
3200
381
674
596


## Replace &amp to &

In [241]:
for i in range(len(data_partai)):
    data_partai[i]['full_text'] = data_partai[i]['full_text'].str.replace(r'&amp', '&', regex=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_partai[i]['full_text'] = data_partai[i]['full_text'].str.replace(r'&amp', '&', regex=False)


In [242]:
def normalize_comments(text):
    # Remove emojis by filtering out any character in emoji.EMOJI_DATA
    emojis = [c for c in text if c not in emoji.EMOJI_DATA]

    # Buat text menjadi lower case
    text = text.lower()
    

    # Bersihkan teks dari karakter khusus
    text = re.sub(r'[^\w\s]', ' ', text)

    # Bersihkan karakter yang berulang
    normal_regex = re.compile(r"(.)\1{1,}") # compiling regex pattern for a repeating character in a word (e.g., haiiii -> 'i' is repeated several times)
    text = normal_regex.sub(r"\1\1", text) # removing the repeating characters 
    
    # Hapus multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
       
    return text

In [243]:
for i in range(len(data_partai)):
    data_partai[i]['full_text'] = data_partai[i]['full_text'].apply(normalize_comments)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_partai[i]['full_text'] = data_partai[i]['full_text'].apply(normalize_comments)


In [244]:
for i in range(len(data_partai)):
    data_partai[i] = data_partai[i].drop_duplicates(subset=['id_str', 'full_text'])

## Standarization

In [245]:
import json

def load_dict_from_json_file(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        dictionary = json.load(file)
    return dictionary

In [246]:
my_dict = load_dict_from_json_file('combined_slang_words.txt')

In [247]:
def normalize_slang_words(text, slang_words_dict=my_dict):
    words = text.split()
    standarized_words = []
    for word in words:
        standarized_word = slang_words_dict.get(word, word) # Mengembalikan kata asli jika kata tidak ditemukan dalam dict 
        standarized_words.append(standarized_word)
    return ' '.join(standarized_words)

In [248]:
for i in range(len(data_partai)):
    data_partai[i]['full_text'] = data_partai[i]['full_text'].apply(normalize_slang_words)

In [249]:
nama_partai = ['gerindra', 'golkar', 'pan', 'demokrat', 'pks', 'nasdem', 'pdip', 'pkb']
for i in range(len(data_partai)):
    data_partai[i].to_csv(f'dataset/{nama_partai[i]}_full_context.csv', index=False)

## Removing Stop Words

In [250]:
def remove_whitespace(text):
    if not isinstance(text, str):
        return text  
    
    return ' '.join(text.split())

In [251]:
def remove_stopwords(text):
    if not isinstance(text, str):
        return text  
    text = remove_whitespace(text)
    stop_words = set(stopwords.words('indonesian')) 
    stop_words.update(['iya', 'engga', 'gak', 'enggak', 'ga', 'dn', 'klu', 'klo', 'kalo', 'klo', 'nya'])
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

In [252]:
for i in range(len(data_partai)):
    data_partai[i]['full_text'] = data_partai[i]['full_text'].apply(remove_stopwords)

## Stemming Data

In [253]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [254]:
def stemming(text):
    if not isinstance(text, str):
        return text  
    
    token = nltk.word_tokenize(text)
    stem_kalimat = []
    for k in token:
        stem_kata = stemmer.stem(k)
        stem_kalimat.append(stem_kata)

    stem_kalimat_str = ' '.join(stem_kalimat)
    return stem_kalimat_str

In [255]:
from tqdm import tqdm
tqdm.pandas()

In [256]:
for i in range(len(data_partai)):
    data_partai[i]['full_text'] = data_partai[i]['full_text'].progress_apply(stemming)

100%|██████████| 1313/1313 [05:13<00:00,  4.19it/s]
100%|██████████| 777/777 [01:51<00:00,  6.96it/s]
100%|██████████| 851/851 [01:19<00:00, 10.64it/s]
100%|██████████| 3282/3282 [05:36<00:00,  9.75it/s]
100%|██████████| 3200/3200 [02:32<00:00, 21.02it/s] 
100%|██████████| 381/381 [00:51<00:00,  7.40it/s]
100%|██████████| 674/674 [01:05<00:00, 10.35it/s]
100%|██████████| 596/596 [01:04<00:00,  9.23it/s]


In [257]:
for i in range(len(data_partai)):
    data_partai[i].to_csv(f'dataset/{nama_partai[i]}_stemmed.csv', index=False)