## Import Library

In [None]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import emoji

import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords

## Read Dataset

### 01 Party

In [None]:
# Load the four raw tweet exports for party 01, one frame per source file.
data_01_1, data_01_2, data_01_3, data_01_4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'kubu_01.csv',
        'dataset/#01.csv',
        'dataset/anies.csv',
        'dataset/cak_imin.csv',
    )
)

### 02 Party

In [None]:
# data_02_1 = pd.read_csv('kubu_02.csv')
# data_02_2 = pd.read_csv('dataset/#02.csv')
# data_02_3 = pd.read_csv('dataset/prabowo.csv')
# data_02_4 = pd.read_csv('dataset/gibran.csv')

### 03 Party

In [None]:
# Load the four raw tweet exports for party 03, one frame per source file.
data_03_1, data_03_2, data_03_3, data_03_4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'kubu_03.csv',
        'dataset/#03.csv',
        'dataset/ganjar.csv',
        'dataset/mahfud.csv',
    )
)

## Merged Dataset

In [None]:
# Stack every per-source export of a party into one frame per party.
# ignore_index=True renumbers the rows so indices from the source files do not collide.
merged_kubu_01 = pd.concat([data_01_1, data_01_2, data_01_3, data_01_4], ignore_index=True)
# merged_kubu_02 = pd.concat([data_02_1, data_02_2, data_02_3, data_02_4], ignore_index=True)
# data_03_4 is loaded alongside the other party 03 exports but was previously
# left out of the merge, silently dropping that source from all later steps.
merged_kubu_03 = pd.concat([data_03_1, data_03_2, data_03_3, data_03_4], ignore_index=True)

In [None]:
# Show (rows, columns) of the merged party 01 frame.
merged_kubu_01.shape

In [None]:
# merged_kubu_02.shape

In [None]:
# Show (rows, columns) of the merged party 03 frame.
merged_kubu_03.shape

## Drop Unnecessary Columns

In [None]:
# Columns retained for the rest of the pipeline; everything else is discarded.
column_names = [
    'created_at',
    'id_str',
    'full_text',
    'lang',
    'location',
]

In [None]:
# Keep only the columns listed in column_names (raises KeyError if one is missing).
merged_kubu_01 = merged_kubu_01[column_names]
# merged_kubu_02 = merged_kubu_02[column_names]
merged_kubu_03 = merged_kubu_03[column_names]

In [None]:
# Confirm the remaining columns of the party 01 frame.
merged_kubu_01.columns

In [None]:
# merged_kubu_02.columns

In [None]:
# Confirm the remaining columns of the party 03 frame.
merged_kubu_03.columns

In [None]:
# Persist the merged, column-trimmed frames before any cleaning is applied.
for party_frame, output_path in (
    (merged_kubu_01, 'all_kubu_01.csv'),
    # (merged_kubu_02, 'all_kubu_02.csv'),
    (merged_kubu_03, 'all_kubu_03.csv'),
):
    party_frame.to_csv(output_path, index=False)

In [None]:
# Preview the first rows of the party 01 frame.
merged_kubu_01.head()

In [None]:
# merged_kubu_02.head()

In [None]:
# Preview the first rows of the party 03 frame.
merged_kubu_03.head()

## Transform Column 'created_at' to Date

In [None]:
def simplify_and_sort_created_at(df):
    """Return a copy of *df* with 'created_at' reduced to dates, sorted by date.

    The 'created_at' column is parsed with the fixed format
    '%a %b %d %H:%M:%S %z %Y' (e.g. 'Mon Jan 01 09:30:00 +0000 2024') and
    replaced by its ``datetime.date`` part.

    Args:
        df: DataFrame containing a 'created_at' column of timestamp strings.

    Returns:
        A new DataFrame sorted ascending by 'created_at'. The input frame is
        left untouched.

    Raises:
        ValueError: if a value does not match the expected timestamp format.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect
    # (and so assigning into a filtered slice cannot trigger pandas'
    # SettingWithCopyWarning).
    result = df.copy()
    result['created_at'] = pd.to_datetime(
        result['created_at'], format='%a %b %d %H:%M:%S %z %Y'
    ).dt.date
    return result.sort_values(by='created_at')

In [None]:
# Reduce 'created_at' to calendar dates and order each party's tweets chronologically.
merged_kubu_01 = simplify_and_sort_created_at(merged_kubu_01)
# merged_kubu_02 = simplify_and_sort_created_at(merged_kubu_02)
merged_kubu_03 = simplify_and_sort_created_at(merged_kubu_03)

In [None]:
# Preview party 01 after the date conversion and sort.
merged_kubu_01.head()


In [None]:
# merged_kubu_02.head()

In [None]:
# Preview party 03 after the date conversion and sort.
merged_kubu_03.head()

# Preprocessing

## Drop Null

In [None]:
# Discard every row that has a null in any column, then display the remaining
# per-column null counts (expected to be all zero).
# NOTE(review): no subset is given, so a missing 'location' alone removes a
# tweet - confirm that is intended.
merged_kubu_01 = merged_kubu_01.dropna()
merged_kubu_01.isna().sum()

In [None]:
# merged_kubu_02.dropna(inplace=True)
# merged_kubu_02.isnull().sum()

In [None]:
# Discard every row that has a null in any column, then display the remaining
# per-column null counts (expected to be all zero).
# NOTE(review): no subset is given, so a missing 'location' alone removes a
# tweet - confirm that is intended.
merged_kubu_03 = merged_kubu_03.dropna()
merged_kubu_03.isna().sum()

## Drop Duplicates

In [None]:
# Report how many rows repeat an earlier (id_str, full_text) pair.
# A notebook cell only displays its last expression, so each count is printed
# explicitly; otherwise the party 01 figure would be computed and discarded.
print('kubu 01 duplicates:', merged_kubu_01.duplicated(subset=['id_str', 'full_text']).sum())
# print('kubu 02 duplicates:', merged_kubu_02.duplicated(subset=['id_str', 'full_text']).sum())
print('kubu 03 duplicates:', merged_kubu_03.duplicated(subset=['id_str', 'full_text']).sum())

In [None]:
# drop_duplicates returns a new frame; the result must be assigned back,
# otherwise the duplicates stay in merged_kubu_01 for every later step.
merged_kubu_01 = merged_kubu_01.drop_duplicates(subset=['id_str', 'full_text'])

In [None]:
# merged_kubu_02.drop_duplicates(subset=['id_str', 'full_text'])

In [None]:
# drop_duplicates returns a new frame; the result must be assigned back,
# otherwise the duplicates stay in merged_kubu_03 for every later step.
merged_kubu_03 = merged_kubu_03.drop_duplicates(subset=['id_str', 'full_text'])

## Drop Tweets That Aren't in Indonesian

In [None]:
# Row count of party 01 before the language filter.
print(len(merged_kubu_01))

In [None]:
# print(len(merged_kubu_02))

In [None]:
# Row count of party 03 before the language filter.
print(len(merged_kubu_03))

In [None]:
# Keep only rows whose 'lang' value is 'in'.
# .copy() makes each result an independent frame, so the column assignments
# performed by the following cleaning steps do not operate on a filtered slice
# (which triggers pandas' SettingWithCopyWarning).
merged_kubu_01 = merged_kubu_01[merged_kubu_01['lang'] == 'in'].copy()
# merged_kubu_02 = merged_kubu_02[merged_kubu_02['lang'] == 'in'].copy()
merged_kubu_03 = merged_kubu_03[merged_kubu_03['lang'] == 'in'].copy()

## Drop @Account

### Party 01

In [None]:
# Remove @mentions of unrelated accounts, then rewrite handles containing
# 'anies'/'anis' as the plain name 'anies'.
# Handles containing 'cakimin' are excluded from the removal as well: stripping
# them here would leave nothing for the later 'cak imin' rewrite to match.
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].str.replace(r'@(?!\w*(?:anies|anis|cakimin)\w*)\w+', '', regex=True).str.strip()
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].str.replace(r'@\w*(?:anies|anis)\w*', 'anies', regex=True).str.strip()

In [None]:
# Delete any @mention whose handle does not contain 'cakimin', then rewrite
# the handles that do contain it as the plain name 'cak imin'.
party_01_text = merged_kubu_01['full_text']
party_01_text = party_01_text.str.replace(r'@(?!\w*(?:cakimin)\w*)\w+', '', regex=True).str.strip()
party_01_text = party_01_text.str.replace(r'@\w*(?:cakimin)\w*', 'cak imin', regex=True).str.strip()
merged_kubu_01['full_text'] = party_01_text

### Party 02

In [None]:
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].str.replace(r'@(?!\w*(prabowo)\w*)\w+', '', regex=True).str.strip()
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].str.replace(r'@\w*(prabowo)\w*', 'prabowo',  regex=True ).str.strip()

In [None]:
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].str.replace(r'@(?!\w*(gibran)\w*)\w+', '', regex=True).str.strip()
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].str.replace(r'@\w*(gibran)\w*', 'gibran',  regex=True ).str.strip()

### Party 03

In [None]:
# Remove @mentions of unrelated accounts, then rewrite handles containing
# 'ganjar' as the plain name 'ganjar'.
# Handles containing 'mahfud' are excluded from the removal as well: stripping
# them here would leave nothing for the later 'mahfud md' rewrite to match.
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].str.replace(r'@(?!\w*(?:ganjar|mahfud)\w*)\w+', '', regex=True).str.strip()
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].str.replace(r'@\w*(?:ganjar)\w*', 'ganjar', regex=True).str.strip()

In [None]:
# Delete any @mention whose handle does not contain 'mahfud', then rewrite
# the handles that do contain it as the plain name 'mahfud md'.
party_03_text = merged_kubu_03['full_text']
party_03_text = party_03_text.str.replace(r'@(?!\w*(?:mahfud)\w*)\w+', '', regex=True).str.strip()
party_03_text = party_03_text.str.replace(r'@\w*(?:mahfud)\w*', 'mahfud md', regex=True).str.strip()
merged_kubu_03['full_text'] = party_03_text

## Drop Links in Column Full Text

In [None]:
def remove_links(text):
    """Strip http:// and https:// URLs from *text*.

    Each URL (scheme up to the next whitespace) is replaced by an empty
    string; surrounding whitespace is left as is. Non-string values are
    returned unchanged.
    """
    if isinstance(text, str):
        return re.sub(r'https?://\S+', '', text)
    return text

In [None]:
# Strip URLs from every party 01 tweet.
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].map(remove_links)

In [None]:
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].apply(remove_links)

In [None]:
# Strip URLs from every party 03 tweet.
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].map(remove_links)

## Filter Data

### Party 01

In [None]:
# Regexes for the party 01 relevance filter:
#   keyword_pattern - whole-word candidate names or the ballot number '01'
#   date_pattern    - a dd/mm/yyyy-style date whose first field is '01'
#   number_pattern  - any digit run that contains '01'
keyword_pattern, date_pattern, number_pattern = (
    r'\banies\b|\banis\b|\b01\b|\bcak imin\b|\bimin\b',
    r'\b01/\d{2}/\d{4}\b',
    r'\d*01\d*',
)

In [None]:
# One boolean mask per pattern; na=False treats missing text as "no match".
# Only the keyword match ignores case.
party_01_text = merged_kubu_01['full_text']
keyword_mask = party_01_text.str.contains(keyword_pattern, case=False, na=False)
date_mask = party_01_text.str.contains(date_pattern, na=False)
number_mask = party_01_text.str.contains(number_pattern, na=False)

In [None]:
# Keep a tweet when it matches a keyword, or when it matches neither the date
# pattern nor the number pattern.
final_mask = keyword_mask | (~date_mask & ~number_mask)

In [None]:
# Apply the relevance filter to party 01.
merged_kubu_01 = merged_kubu_01[final_mask]

In [None]:
# Row count of party 01 after the relevance filter.
print(len(merged_kubu_01))

### Party 02

In [None]:
# keyword_pattern = r'\bprabowo\b|\bgibran\b|\b02\b'
# date_pattern = r'\b02/\d{2}/\d{4}\b'
# number_pattern = r'\d*02\d*'

In [None]:
# keyword_mask = merged_kubu_02['full_text'].str.contains(keyword_pattern, case=False, na=False)
# date_mask = merged_kubu_02['full_text'].str.contains(date_pattern, na=False)
# number_mask = merged_kubu_02['full_text'].str.contains(number_pattern, na=False)

In [None]:
# final_mask = keyword_mask | ~(date_mask | number_mask)

In [None]:
# merged_kubu_02 = merged_kubu_02[final_mask]

In [None]:
# print(len(merged_kubu_02))

### Party 03

In [None]:
# Regexes for the party 03 relevance filter:
#   keyword_pattern - whole-word candidate names or the ballot number '03'
#   date_pattern    - a dd/mm/yyyy-style date whose first field is '03'
#   number_pattern  - any digit run that contains '03'
keyword_pattern, date_pattern, number_pattern = (
    r'\bganjar\b|\bmahfud\b|\b03\b',
    r'\b03/\d{2}/\d{4}\b',
    r'\d*03\d*',
)

In [None]:
# One boolean mask per pattern; na=False treats missing text as "no match".
# Only the keyword match ignores case.
party_03_text = merged_kubu_03['full_text']
keyword_mask = party_03_text.str.contains(keyword_pattern, case=False, na=False)
date_mask = party_03_text.str.contains(date_pattern, na=False)
number_mask = party_03_text.str.contains(number_pattern, na=False)

In [None]:
# Keep a tweet when it matches a keyword, or when it matches neither the date
# pattern nor the number pattern.
final_mask = keyword_mask | (~date_mask & ~number_mask)

In [None]:
# Apply the relevance filter to party 03.
merged_kubu_03 = merged_kubu_03[final_mask]

In [None]:
# Row count of party 03 after the relevance filter.
print(len(merged_kubu_03))

## Replace &amp; with &

In [None]:
# Decode the HTML entity for an ampersand. The entity is '&amp;' including the
# trailing semicolon; matching only '&amp' turned '&amp;' into '&;'.
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].str.replace('&amp;', '&', regex=False)
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].str.replace('&amp;', '&', regex=False)
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].str.replace('&amp;', '&', regex=False)

## Normalization

In [None]:
def normalize_comments(text):
    """Normalize a tweet: drop emojis and punctuation, lower-case, squeeze repeats.

    Steps, in order:
      1. replace every character found in ``emoji.EMOJI_DATA`` with a space,
      2. lower-case the text,
      3. replace every character that is neither a word character nor
         whitespace with a space,
      4. shorten any run of the same character to exactly two
         (e.g. 'haiiii' -> 'haii'),
      5. collapse whitespace runs to single spaces and trim both ends.

    Args:
        text: the tweet text. Non-string values are returned unchanged, in
            line with the other cleaning helpers in this file.

    Returns:
        The normalized string.
    """
    if not isinstance(text, str):
        return text

    # Actually apply the emoji filter. Previously the filtered characters were
    # stored in an unused list, so this step had no effect. A space is
    # substituted (rather than nothing) so words separated only by an emoji
    # are not glued together.
    text = ''.join(' ' if char in emoji.EMOJI_DATA else char for char in text)

    # Lower-case the text.
    text = text.lower()

    # Replace special characters with spaces.
    text = re.sub(r'[^\w\s]', ' ', text)

    # Squeeze runs of a repeated character down to two occurrences.
    # NOTE(review): this also applies to digits ('2000' -> '200') - confirm
    # whether numbers should be exempt.
    text = re.sub(r"(.)\1{1,}", r"\1\1", text)

    # Collapse multiple spaces and trim.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

### Party 01

In [None]:
# Normalize every party 01 tweet, then preview the result.
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].map(normalize_comments)
merged_kubu_01.head()

### Party 02

In [None]:
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].apply(normalize_comments)
# merged_kubu_02.head()

### Party 03

In [None]:
# Normalize every party 03 tweet, then preview the result.
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].map(normalize_comments)
merged_kubu_03.head()

## Standardization

In [None]:
import json

def load_dict_from_json_file(filename):
    """Parse the UTF-8 encoded JSON file at *filename* and return its content."""
    with open(filename, encoding='utf-8') as handle:
        return json.load(handle)

In [None]:
# Slang-word lookup table loaded from a JSON document (stored with a .txt extension).
my_dict = load_dict_from_json_file('combined_slang_words.txt')

In [None]:
def normalize_slang_words(text, slang_words_dict=None):
    """Replace each whitespace-separated word of *text* via a slang dictionary.

    Args:
        text: the text to standardize. Non-string values are returned
            unchanged, in line with the other cleaning helpers in this file.
        slang_words_dict: mapping of slang word -> standard word. When
            omitted, the module-level ``my_dict`` is used. It is looked up at
            call time, so reloading ``my_dict`` takes effect without
            redefining this function.

    Returns:
        The words joined by single spaces, each replaced by its dictionary
        entry when one exists and kept as is otherwise.
    """
    if not isinstance(text, str):
        return text
    if slang_words_dict is None:
        slang_words_dict = my_dict
    # dict.get falls back to the original word when it is not in the dictionary.
    return ' '.join(slang_words_dict.get(word, word) for word in text.split())

### Party 01

In [None]:
# Standardize slang in party 01 using the default dictionary, then preview.
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].map(normalize_slang_words)
merged_kubu_01.head()

### Party 02

In [None]:
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].apply(normalize_slang_words)
# merged_kubu_02.head()

### Party 03

In [None]:
# Standardize slang in party 03 using the default dictionary, then preview.
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].map(normalize_slang_words)
merged_kubu_03.head()

In [None]:
# Second slang dictionary: a header-less, latin-1 encoded CSV whose first
# column is the slang word and whose second column is its replacement.
slangwords_collection = pd.read_csv('new_kamusalay.csv', header=None, encoding='latin-1')
replacements = {
    slang: standard
    for slang, standard in zip(slangwords_collection[0], slangwords_collection[1])
}

### Party 01

In [None]:
# Second standardization pass for party 01, using the CSV-based dictionary.
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].apply(
    normalize_slang_words, args=(replacements,))
merged_kubu_01.head()

### Party 02

In [None]:
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].apply(
#     lambda text: normalize_slang_words(text, replacements))
# merged_kubu_02.head()

### Party 03

In [None]:
# Second standardization pass for party 03, using the CSV-based dictionary.
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].apply(
    normalize_slang_words, args=(replacements,))
merged_kubu_03.head()

## Removing Stop Words

In [None]:
def remove_whitespace(text):
    """Collapse all whitespace runs in *text* to single spaces and trim the ends.

    Non-string values are returned unchanged.
    """
    if isinstance(text, str):
        words = text.split()
        return ' '.join(words)
    return text

In [None]:
# Extra stop words to remove on top of NLTK's Indonesian list (none by default).
_EXTRA_STOP_WORDS = ()

# Cache for the combined stop-word set; filled on first use.
_STOP_WORDS_CACHE = None


def _indonesian_stop_words():
    """Return the combined stop-word set, building it only on the first call."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('indonesian')) | frozenset(_EXTRA_STOP_WORDS)
    return _STOP_WORDS_CACHE


def remove_stopwords(text):
    """Remove Indonesian stop words from *text*.

    The text is whitespace-normalized, tokenized with ``nltk.word_tokenize``,
    and every token whose lower-cased form is a stop word is dropped.

    Args:
        text: the text to filter. Non-string values are returned unchanged.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return text
    # The stop-word set is cached: rebuilding it for every row re-read the
    # NLTK corpus once per tweet.
    stop_words = _indonesian_stop_words()
    tokens = nltk.word_tokenize(remove_whitespace(text))
    return ' '.join(token for token in tokens if token.lower() not in stop_words)

In [None]:
# Drop stop words from every tweet of each party.
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].map(remove_stopwords)
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].map(remove_stopwords)
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].map(remove_stopwords)

## Save Processed CSV for Labeling

In [None]:
# Write the cleaned (not yet stemmed) frames to CSV, without the index column.
merged_kubu_01.to_csv('processed_merged_kubu_01.csv',index=False)
# merged_kubu_02.to_csv('processed_merged_kubu_02.csv',index=False, encoding='utf-8-sig')
merged_kubu_03.to_csv('processed_merged_kubu_03.csv',index=False)

## Stemming Data

In [None]:
# Build the Sastrawi stemmer once; it is reused for every token by stemming().
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
def stemming(text):
    """Stem every token of *text* with the module-level Sastrawi ``stemmer``.

    The text is tokenized with ``nltk.word_tokenize``; each token is stemmed
    separately and the stems are joined by single spaces. Non-string values
    are returned unchanged.
    """
    if isinstance(text, str):
        return ' '.join(stemmer.stem(word) for word in nltk.word_tokenize(text))
    return text

In [None]:
from tqdm import tqdm
# Register tqdm's pandas integration so Series.progress_apply is available.
tqdm.pandas()

### Party 01

In [None]:
# Stem every party 01 tweet, showing a progress bar.
merged_kubu_01['full_text'] = merged_kubu_01['full_text'].progress_apply(stemming)

In [None]:
# Preview party 01 after stemming.
merged_kubu_01.head()

### Party 02

In [None]:
# merged_kubu_02['full_text'] = merged_kubu_02['full_text'].progress_apply(stemming)

In [None]:
# merged_kubu_02.head()

### Party 03

In [None]:
# Stem every party 03 tweet, showing a progress bar.
merged_kubu_03['full_text'] = merged_kubu_03['full_text'].progress_apply(stemming)

In [None]:
# Preview party 03 after stemming.
merged_kubu_03.head()

## EDA

In [None]:
from wordcloud import WordCloud

### Party 01

In [None]:
# Drop rows without text, split each tweet into tokens, and rebuild the whole
# party 01 corpus as a single space-separated string.
merged_kubu_01.dropna(subset=['full_text'], inplace=True)
token_data = [tweet.split() for tweet in merged_kubu_01['full_text']]
all_words_no_stopwords = ' '.join(' '.join(tweet_tokens) for tweet_tokens in token_data)

In [None]:
# Render a word cloud from the concatenated corpus string.
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words_no_stopwords)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # Hide the axes
plt.show()

In [None]:
from collections import Counter

# Split the concatenated corpus back into individual tokens.
token_list = all_words_no_stopwords.split()

# Count how often each token occurs.
word_counts = Counter(token_list)

# Keep the 20 most frequent tokens.
most_common_words = word_counts.most_common(20)

# Separate words and counts for plotting. An empty corpus falls back to empty
# tuples instead of failing on the two-name unpacking.
words, frequencies = zip(*most_common_words) if most_common_words else ((), ())

# Horizontal bar chart of the most frequent words.
plt.figure(figsize=(10, 6))
bars = plt.barh(words, frequencies, color='lightgreen')

# Write each count just inside the end of its bar. The inset scales with the
# longest bar: a fixed offset of 100 data units pushed labels far from the bar
# end (or to negative x) whenever the counts were small.
label_inset = 0.01 * max(frequencies, default=0)
for bar, frequency in zip(bars, frequencies):
    plt.text(bar.get_width() - label_inset,  # slightly inside the bar
             bar.get_y() + bar.get_height() / 2,  # vertical centre of the bar
             f'{frequency}',  # the count to display
             va='center', ha='right', color='black', fontsize=10)

# Axis labels and title.
plt.xlabel('Frekuensi')
plt.ylabel('Kata')
plt.title('Top 20 Kata Terbanyak')

# Flip the y axis so the most frequent word is at the top.
plt.gca().invert_yaxis()

# Show the plot.
plt.show()

In [None]:
import seaborn as sns

# Convert 'created_at' to datetimes so the .dt accessor can be used below.
merged_kubu_01['created_at'] = pd.to_datetime(merged_kubu_01['created_at'])
# Number of tweets per calendar day and per calendar month.
tweets_per_day = merged_kubu_01.groupby(merged_kubu_01['created_at'].dt.date).size()
tweets_per_month = merged_kubu_01.groupby(merged_kubu_01['created_at'].dt.to_period('M')).size()

# Two stacked panels: daily counts on top, monthly counts below.
fig, axes = plt.subplots(2, 1, figsize=(12, 20))

# Per Day
sns.lineplot(ax=axes[0], x=tweets_per_day.index, y=tweets_per_day.values, marker='o', linewidth=2)
axes[0].set_title('Number of Tweets Per Day', fontsize=14)
axes[0].set_xlabel('Date', fontsize=12)
axes[0].set_ylabel('Number of Tweets', fontsize=12)
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(True)

# Per Month
# The period index is converted to strings so each month is one categorical bar.
sns.barplot(ax=axes[1], x=tweets_per_month.index.astype(str), y=tweets_per_month.values, palette="Blues_r")
axes[1].set_title('Number of Tweets Per Month', fontsize=14)
axes[1].set_xlabel('Month', fontsize=12)
axes[1].set_ylabel('Number of Tweets', fontsize=12)
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(axis='y')

### Party 02

In [None]:
# merged_kubu_02.dropna(subset=['full_text'], inplace=True)
# token_data = [row.split() for row in merged_kubu_02['full_text']]
# all_words_no_stopwords = ' '.join([' '.join(tokens) for tokens in token_data])

In [None]:
# wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words_no_stopwords)
# plt.figure(figsize=(10, 5))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')  # Hilangkan axis
# plt.show()

In [None]:
# # Pecah string panjang menjadi list of words (tokenisasi)
# token_list = all_words_no_stopwords.split()

# # Hitung frekuensi kata menggunakan Counter
# word_counts = Counter(token_list)

# # Ambil 20 kata yang paling sering muncul
# most_common_words = word_counts.most_common(20)

# # Pisahkan kata dan frekuensinya untuk plotting
# words, frequencies = zip(*most_common_words)

# # Plot Bar Chart untuk kata-kata paling sering
# plt.figure(figsize=(10, 6))
# bars = plt.barh(words, frequencies, color='lightgreen')

# # Menambahkan label frekuensi di dalam batang
# for bar, frequency in zip(bars, frequencies):
#     plt.text(bar.get_width() - 100,  # Mengatur agar teks berada sedikit di dalam batang
#              bar.get_y() + bar.get_height() / 2,  # Posisi vertikal
#              f'{frequency}',  # Nilai frekuensi yang akan ditampilkan
#              va='center', ha='right', color='black', fontsize=10)  # Posisi dan gaya teks

# # Label sumbu
# plt.xlabel('Frekuensi')
# plt.ylabel('Kata')
# plt.title('Top 20 Kata Terbanyak')

# # Membalik sumbu y agar kata dengan frekuensi tertinggi di atas
# plt.gca().invert_yaxis()

# # Tampilkan plot
# plt.show()

In [None]:
# merged_kubu_02['created_at'] = pd.to_datetime(merged_kubu_02['created_at'])
# tweets_per_day = merged_kubu_02.groupby(merged_kubu_02['created_at'].dt.date).size()
# tweets_per_month = merged_kubu_02.groupby(merged_kubu_02['created_at'].dt.to_period('M')).size()

# fig, axes = plt.subplots(2, 1, figsize=(12, 20))

# # Per Day
# sns.lineplot(ax=axes[0], x=tweets_per_day.index, y=tweets_per_day.values, marker='o', linewidth=2)
# axes[0].set_title('Number of Tweets Per Day', fontsize=14)
# axes[0].set_xlabel('Date', fontsize=12)
# axes[0].set_ylabel('Number of Tweets', fontsize=12)
# axes[0].tick_params(axis='x', rotation=45)
# axes[0].grid(True)

# # Per Month
# sns.barplot(ax=axes[1], x=tweets_per_month.index.astype(str), y=tweets_per_month.values, palette="Blues_r")
# axes[1].set_title('Number of Tweets Per Month', fontsize=14)
# axes[1].set_xlabel('Month', fontsize=12)
# axes[1].set_ylabel('Number of Tweets', fontsize=12)
# axes[1].tick_params(axis='x', rotation=45)
# axes[1].grid(axis='y')

### Party 03

In [None]:
# Drop rows without text, split each tweet into tokens, and rebuild the whole
# party 03 corpus as a single space-separated string.
merged_kubu_03.dropna(subset=['full_text'], inplace=True)
token_data = [tweet.split() for tweet in merged_kubu_03['full_text']]
all_words_no_stopwords = ' '.join(' '.join(tweet_tokens) for tweet_tokens in token_data)

In [None]:
# Render a word cloud from the concatenated corpus string.
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words_no_stopwords)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # Hide the axes
plt.show()

In [None]:
# Split the concatenated corpus back into individual tokens.
token_list = all_words_no_stopwords.split()

# Count how often each token occurs.
word_counts = Counter(token_list)

# Keep the 20 most frequent tokens.
most_common_words = word_counts.most_common(20)

# Separate words and counts for plotting. An empty corpus falls back to empty
# tuples instead of failing on the two-name unpacking.
words, frequencies = zip(*most_common_words) if most_common_words else ((), ())

# Horizontal bar chart of the most frequent words.
plt.figure(figsize=(10, 6))
bars = plt.barh(words, frequencies, color='lightgreen')

# Write each count just inside the end of its bar. The inset scales with the
# longest bar: a fixed offset of 100 data units pushed labels far from the bar
# end (or to negative x) whenever the counts were small.
label_inset = 0.01 * max(frequencies, default=0)
for bar, frequency in zip(bars, frequencies):
    plt.text(bar.get_width() - label_inset,  # slightly inside the bar
             bar.get_y() + bar.get_height() / 2,  # vertical centre of the bar
             f'{frequency}',  # the count to display
             va='center', ha='right', color='black', fontsize=10)

# Axis labels and title.
plt.xlabel('Frekuensi')
plt.ylabel('Kata')
plt.title('Top 20 Kata Terbanyak')

# Flip the y axis so the most frequent word is at the top.
plt.gca().invert_yaxis()

# Show the plot.
plt.show()

In [None]:
# Convert 'created_at' to datetimes so the .dt accessor can be used below.
merged_kubu_03['created_at'] = pd.to_datetime(merged_kubu_03['created_at'])
# Number of tweets per calendar day and per calendar month.
tweets_per_day = merged_kubu_03.groupby(merged_kubu_03['created_at'].dt.date).size()
tweets_per_month = merged_kubu_03.groupby(merged_kubu_03['created_at'].dt.to_period('M')).size()

# Two stacked panels: daily counts on top, monthly counts below.
fig, axes = plt.subplots(2, 1, figsize=(12, 20))

# Per Day
sns.lineplot(ax=axes[0], x=tweets_per_day.index, y=tweets_per_day.values, marker='o', linewidth=2)
axes[0].set_title('Number of Tweets Per Day', fontsize=14)
axes[0].set_xlabel('Date', fontsize=12)
axes[0].set_ylabel('Number of Tweets', fontsize=12)
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(True)

# Per Month
# The period index is converted to strings so each month is one categorical bar.
sns.barplot(ax=axes[1], x=tweets_per_month.index.astype(str), y=tweets_per_month.values, palette="Blues_r")
axes[1].set_title('Number of Tweets Per Month', fontsize=14)
axes[1].set_xlabel('Month', fontsize=12)
axes[1].set_ylabel('Number of Tweets', fontsize=12)
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(axis='y')