<a href="https://colab.research.google.com/github/rafiqmyura/Analytica-Sentiment-Data/blob/main/Analytica_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
!pip install ntscraper Sastrawi



In [34]:
from ntscraper import Nitter
from transformers import pipeline
from wordcloud import WordCloud
from prettytable import PrettyTable
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv, nltk, re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.svm import SVC
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [35]:
nltk.download('punkt')
nltk.download('stopwords')
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Train Data

In [37]:
buzzer = ["PakPrabowo", "DadiPresidenku", "PrabowoGibran", "GaweAyem",
          "BersamaIndonesiaMaju", "PrabowoGemoy", "KodeKita08Gemoy", "2024gantiwarna"
          "IndonesiaSentris", "02Melanjutkan", "AnakMudaIndonesiaEmas", "MenangSeputaran"]

nbuzzer, invalid, irrelevant, duplicate = 0, 0, 0, 0

keyword = "prabowo"
jumlah = 1000
start = "2023-01-01"
end = "2023-02-14"
output = "tweet.csv"

scraper = Nitter(log_level=1)
results = scraper.get_tweets(keyword, mode="term", number=jumlah, since=start)
tweets = results['tweets']

table = PrettyTable()
table.field_names = ["Date", "Username", "Text"]

for tweet in tweets:
    link = tweet['link']
    text = tweet['text']
    username = tweet['user']['username']
    date = tweet['date']

    if any(teks in text for teks in buzzer): nbuzzer += 1
    elif any(name in username for name in official): irrelevant += 1
    elif any(text in row[2] for row in table._rows): duplicate += 1
    elif text == "" or not text: invalid += 1
    else: table.add_row([date, username, text])

table.align["Text"] = "l"
table.max_width["Text"] = 100

with open(output, "w", newline="", encoding="utf-8") as outfile:
    csv_writer = csv.writer(outfile)
    csv_writer.writerow(table.field_names)
    csv_writer.writerows(table._rows)

#print(table)
print(f"\n\n [*] Berhasil menghapus {invalid} tweet kosong")
print(f" [*] Berhasil menghapus {nbuzzer} tweet yang terdeteksi sebagai buzzer")
print(f" [*] Berhasil menghapus {irrelevant} tweet yang terdeteksi akun non-relevan")
print(f" [*] Berhasil menghapus {duplicate} tweet yang terdeteksi sebagai duplikat")
print(f" [*] Data {len(table._rows)} tweet disimpan di {output}")

Testing instances: 100%|██████████| 77/77 [01:27<00:00,  1.13s/it]
INFO:root:No instance specified, using random instance https://nitter.tux.pizza
INFO:root:Current stats for prabowo: 12 tweets, 0 threads...
INFO:root:Current stats for prabowo: 21 tweets, 0 threads...
INFO:root:Current stats for prabowo: 29 tweets, 0 threads...
INFO:root:Current stats for prabowo: 39 tweets, 0 threads...
INFO:root:Current stats for prabowo: 46 tweets, 0 threads...
INFO:root:Current stats for prabowo: 53 tweets, 0 threads...
INFO:root:Current stats for prabowo: 64 tweets, 0 threads...
INFO:root:Current stats for prabowo: 72 tweets, 0 threads...
INFO:root:Current stats for prabowo: 83 tweets, 0 threads...
INFO:root:Current stats for prabowo: 93 tweets, 0 threads...
INFO:root:Current stats for prabowo: 100 tweets, 0 threads...
INFO:root:Current stats for prabowo: 108 tweets, 0 threads...
INFO:root:Current stats for prabowo: 115 tweets, 0 threads...
INFO:root:Current stats for prabowo: 123 tweets, 0 thread



 [*] Berhasil menghapus 0 tweet kosong
 [*] Berhasil menghapus 4 tweet yang terdeteksi sebagai buzzer
 [*] Berhasil menghapus 23 tweet yang terdeteksi akun non-relevan
 [*] Berhasil menghapus 12 tweet yang terdeteksi sebagai duplikat
 [*] Data 445 tweet disimpan di tweet.csv


In [38]:
sentiment_pipeline = pipeline("sentiment-analysis", model="w11wo/indonesian-roberta-base-sentiment-classifier", max_length=512, truncation=True)

input_file = 'tweet.csv'
output_file = 'sentiment_data.csv'

with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:
    csv_reader = csv.DictReader(csvfile)
    fieldnames = csv_reader.fieldnames + ['Sentiment']

    with open(output_file, 'w', newline='', encoding='utf-8') as output_csvfile:
        csv_writer = csv.DictWriter(output_csvfile, fieldnames=fieldnames)
        csv_writer.writeheader()

        for row in csv_reader:
            text = row['Text']
            result = sentiment_pipeline(text)
            sentiment = result[0]['label']

            if sentiment.lower() != 'neutral':
              print(f'[{sentiment}] {text}')
              row['Sentiment'] = sentiment
              csv_writer.writerow(row)


print(f"File {output_file} berhasil dihasilkan dengan kolom sentimen dan skor sentimen menggunakan model BERT.")


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/808k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/467k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

[negative] Kemarin TKN 02 mengingatkan bahwa Jokowi tak mempunyai kewenangan cawe-cawe dalam pemerintahan Prabowo-Gibran jika memenangkan Pilpres 2024.  Sekarang ada berita adik Prabowo garansi seseorang bakal jadi menteri prabowo.  Sy sih ga kaget, kan Prabowo sendiri yg bilang, "Dia (Hashim) adalah otak di belakang Gerindra. Saya hanyalah wayang".
[negative] Gerombolan yg sama ini dulunya adalah pendukung @prabowo di 2014 dan 2019. Begitu mereka pindah dukungan Prabowo langsung menang. Paham kan kenapa selama ini kami bilang beban @aniesbaswedan adalah pendukungnya sendiri.
[negative] Viral kan....!  Ini fakta kecurangan Tersetruktur Sistimatis dan masif 02 @KPU_ID @prabowo @jokowi
[positive] Yeehaaaa 💃🏽🕺🏽😜
[negative] We are kicking off the governance process by asking the DAO to confirm Primate (of @BakingBenjamins and @TezCapital) as our Acting Steward of Governance!  https://talk.tezos.domains/t/p-001-active-confirm-primate-tzc-tez-as-the-acting-steward-of-governance/83
[negative]

## Processing Data

In [40]:
file = 'sentiment_data.csv'
data = pd.read_csv(file)
print(data.head())

                          Date        Username  \
0   Feb 22, 2024 · 2:22 AM UTC   @ch_chotimah2   
1   Feb 22, 2024 · 3:26 AM UTC   @PartaiSocmed   
2   Feb 21, 2024 · 4:48 PM UTC  @H4T14K4LN4L42   
3  Feb 21, 2024 · 11:19 AM UTC   @HARIMAU_JAYA   
4   Oct 26, 2023 · 8:42 AM UTC   @tezosdomains   

                                                                                                                                                                                                                                                                                                                                                   Text  \
0  Kemarin TKN 02 mengingatkan bahwa Jokowi tak mempunyai kewenangan cawe-cawe dalam pemerintahan Prabowo-Gibran jika memenangkan Pilpres 2024.  Sekarang ada berita adik Prabowo garansi seseorang bakal jadi menteri prabowo.  Sy sih ga kaget, kan Prabowo sendiri yg bilang, "Dia (Hashim) adalah otak di belakang Gerindra. Saya hanyalah wayang".   
1    

In [41]:
data['Text'] = data['Text'].str.lower()
print(data['Text'].head())

0    kemarin tkn 02 mengingatkan bahwa jokowi tak mempunyai kewenangan cawe-cawe dalam pemerintahan prabowo-gibran jika memenangkan pilpres 2024.  sekarang ada berita adik prabowo garansi seseorang bakal jadi menteri prabowo.  sy sih ga kaget, kan prabowo sendiri yg bilang, "dia (hashim) adalah otak di belakang gerindra. saya hanyalah wayang".
1                                                                                                                             gerombolan yg sama ini dulunya adalah pendukung @prabowo di 2014 dan 2019. begitu mereka pindah dukungan prabowo langsung menang. paham kan kenapa selama ini kami bilang beban @aniesbaswedan adalah pendukungnya sendiri.
2                                                                                                                                                                                                                                                      viral kan....!  ini fakta kecurangan tersetruktur sistimatis 

In [42]:
def clean_text(text):
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text) # Remove https* and www*
    text = re.sub(r'@[^\s]+', ' ', text) # Remove username
    text = re.sub(r'[\s]+', ' ', text) # Remove additional white spaces
    text = re.sub(r'#([^\s]+)', ' ', text) # Remove hashtags
    text = re.sub(r'rt', ' ', text) # Remove retweet
    text = re.sub(r'[^\w\s]+', ' ', text) # Remove punctuation
    text = re.sub(r'\d', ' ', text) # Remove numbers
    return text

data['Cleaned'] = data['Text'].apply(clean_text)
print(data['Cleaned'].head())


0    kemarin tkn    mengingatkan bahwa jokowi tak mempunyai kewenangan cawe cawe dalam pemerintahan prabowo gibran jika memenangkan pilpres       sekarang ada berita adik prabowo garansi seseorang bakal jadi menteri prabowo  sy sih ga kaget  kan prabowo sendiri yg bilang   dia  hashim  adalah otak di belakang gerindra  saya hanyalah wayang 
1                                                                                                                                                  gerombolan yg sama ini dulunya adalah pendukung di      dan       begitu mereka pindah dukungan prabowo langsung menang  paham kan kenapa selama ini kami bilang beban adalah pendukungnya sendiri 
2                                                                                                                                                                                                                                                                                viral kan  ini fakta kecurangan tersetruk

In [43]:
data ['Tokenized'] = data['Cleaned'].apply(word_tokenize)
print(data['Tokenized'].head())

0    [kemarin, tkn, mengingatkan, bahwa, jokowi, tak, mempunyai, kewenangan, cawe, cawe, dalam, pemerintahan, prabowo, gibran, jika, memenangkan, pilpres, sekarang, ada, berita, adik, prabowo, garansi, seseorang, bakal, jadi, menteri, prabowo, sy, sih, ga, kaget, kan, prabowo, sendiri, yg, bilang, dia, hashim, adalah, otak, di, belakang, gerindra, saya, hanyalah, wayang]
1                                                                                                                                                                  [gerombolan, yg, sama, ini, dulunya, adalah, pendukung, di, dan, begitu, mereka, pindah, dukungan, prabowo, langsung, menang, paham, kan, kenapa, selama, ini, kami, bilang, beban, adalah, pendukungnya, sendiri]
2                                                                                                                                                                                                                                                           

In [44]:

# Normalization
def normalize_text(tokens):
    kamus = 'https://raw.githubusercontent.com/nasalsabila/kamus-alay/master/colloquial-indonesian-lexicon.csv'
    data = pd.read_csv(kamus)
    slang_to_formal = dict(zip(data['slang'], data['formal']))
    normalized_tokens = [slang_to_formal.get(token, token) for token in tokens]
    return normalized_tokens

data['Normalized'] = data['Tokenized'].apply(normalize_text)
print(data['Normalized'].head())

0    [kemarin, tkn, mengingatkan, bahwa, jokowi, tak, mempunyai, kewenangan, cawe, cawe, dalam, pemerintahan, prabowo, gibran, jika, memenangkan, pilpres, sekarang, ada, berita, adik, prabowo, garansi, seseorang, bakal, jadi, menteri, prabowo, saya, sih, enggak, kaget, kan, prabowo, sendiri, yang, bilang, dia, hashim, adalah, otak, di, belakang, gerindra, saya, hanyalah, wayang]
1                                                                                                                                                                        [gerombolan, yang, sama, ini, dulunya, adalah, pendukung, di, dan, begitu, mereka, pindah, dukungan, prabowo, langsung, menang, paham, kan, kenapa, selama, ini, kami, bilang, beban, adalah, pendukungnya, sendiri]
2                                                                                                                                                                                                                                           

In [48]:
stop_words = set(stopwords.words('indonesian'))
data['Stopwords'] = data['Normalized'].apply(lambda tokens: [token for token in tokens if token not in stop_words])
print(data['Stopwords'].head())

0    [kemarin, tkn, jokowi, kewenangan, cawe, cawe, pemerintahan, prabowo, gibran, memenangkan, pilpres, berita, adik, prabowo, garansi, menteri, prabowo, sih, kaget, prabowo, bilang, hashim, otak, gerindra, wayang]
1                                                                                                     [gerombolan, dulunya, pendukung, pindah, dukungan, prabowo, langsung, menang, paham, bilang, beban, pendukungnya]
2                                                                                                                                                           [viral, fakta, kecurangan, tersetruktur, sistimatis, masif]
3                                                                                                                                                                                                            [yeehaaaa]
4                                                                          [we, are, kicking, off, the, governance, process, by, asking,