<a href="https://colab.research.google.com/github/sinungadi/TwitterABSA/blob/master/sentiment_classification_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install xformers
!pip install numpyencoder

# 1. Import Required Libraries

In [2]:
import pandas as pd
import warnings
import random
import json
from numpyencoder import NumpyEncoder
from tqdm import tqdm

import torch
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from transformers import pipeline
import numpy as np
from scipy.special import softmax

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Do not truncate long cell values (e.g. full tweet text) when pandas displays frames.
pd.set_option('display.max_colwidth', None)

# 2. Data Collection

In [3]:
# Load the tweet table from a CSV file in the working directory.
data = pd.read_csv('data_filtered.csv')

In [4]:
# Texts to classify, taken from the `text_cleaned` column.
docs = data['text_cleaned']

# 3. Create Functions

In [5]:
def getModel(model_name):
  """Load everything needed to run a sequence-classification checkpoint.

  Args:
    model_name: Identifier passed unchanged to each `from_pretrained` call.

  Returns:
    A `(tokenizer, config, model)` tuple, loaded in that order.
  """
  return (
      AutoTokenizer.from_pretrained(model_name),
      AutoConfig.from_pretrained(model_name),
      AutoModelForSequenceClassification.from_pretrained(model_name),
  )

def process(sequence, tokenizer, config, model):
  """Score one text with a sequence-classification model.

  Args:
    sequence: Text to classify.
    tokenizer: Callable that turns `sequence` into keyword inputs for `model`
      (invoked with `return_tensors='pt'` and `truncation=True`).
    config: Object whose `id2label` maps class indices to label names.
    model: Callable whose output holds the logits at position 0, batch first.

  Returns:
    dict mapping each label name to its softmax probability, inserted in
    descending order of probability.
  """
  # Truncate so over-length texts are clipped to the tokenizer's limit rather
  # than failing inside the model.
  encoded_input = tokenizer(sequence, return_tensors='pt', truncation=True)

  # Inference only: skip autograd bookkeeping to save time and memory.
  with torch.no_grad():
    output = model(**encoded_input)

  # Logits of the single item in the batch -> probabilities.
  scores = softmax(output[0][0].detach().cpu().numpy())

  # Class indices from most to least probable.
  ranking = np.argsort(scores)[::-1]
  return {config.id2label[int(idx)]: scores[idx] for idx in ranking}

def process0Shot(sequence, labels, classifier):
  """Call `classifier` on `sequence` with the candidate `labels`.

  Returns whatever `classifier(sequence, labels)` returns, unchanged.
  """
  prediction = classifier(sequence, labels)
  return prediction

def generateTemp():
  """Return a fresh record template.

  `id` and `sequence` start as None; `result` is a new empty list on every
  call, so records never share it.
  """
  template = dict(id=None, sequence=None)
  template['result'] = []
  return template

In [6]:
# Template for one flattened result row: a single sentiment label per record
def parseResultList():
  """Return a new dict with `id`, `sequence` and `sentiment` all set to None."""
  return dict.fromkeys(('id', 'sequence', 'sentiment'))

def getSentiment(data):
  """Return the key of `data['result']` that has the largest value.

  `data['result']` is a mapping of label -> score; on ties the label that
  comes first in the mapping wins.
  """
  scores = data['result']
  best_label = max(scores, key=scores.get)
  return best_label

def getSentimentFromRoberta(data):
  """Return the first label of the first entry in `data['result']`.

  NOTE(review): this relies on the labels already being ordered best-first,
  as in the zero-shot records built in this notebook — confirm against the
  pipeline's output contract.
  """
  first_entry = data['result'][0]
  labels = first_entry['labels']
  return labels[0]

# 4. Data Modeling

## Using indonesian-roberta-base-sentiment-classifier

In [7]:
# Indonesian RoBERTa sentiment checkpoint. A plain string literal is used:
# there is nothing to interpolate, so no f-string is needed.
model_name = "w11wo/indonesian-roberta-base-sentiment-classifier"
tokenizer, config, model = getModel(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/808k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/467k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
# Classify every text in `docs` with the currently loaded model.
resultList = []

# Wrap the iterable itself (not `enumerate`) so tqdm knows the total and can
# render a real progress bar with an ETA.
for i, text in enumerate(tqdm(docs)):
  temp = generateTemp()

  temp.update({
      # `i` is a position, so look the id up positionally; plain `[i]` is a
      # label lookup and only matches when the index happens to be 0..n-1.
      'id' : data['tweet_id'].iloc[i],
      'sequence' : text,
      'result' : process(text,tokenizer,config,model)
  })

  resultList.append(temp)

23729it [57:38,  6.86it/s]


In [None]:
# Spot-check five randomly chosen records.
random.sample(resultList, 5)

[{'id': 1635252546225991681,
  'sequence': 'Kalo saja charging station di Indonesia sudah tersedia secara merata, mungkin bakal banyak orang yang mulai berlomba-lomba untuk jadi yang pertama mengelilingi Indonesia dengan kendaraan listrik',
  'result': {'positive': 0.7547986,
   'neutral': 0.13014053,
   'negative': 0.11506089}},
 {'id': 1629376734939471872,
  'sequence': 'Home Charging Produk Layanan PLN Penuhi Kebutuhan Pengisian Baterai Kendaraan Listrik di Rumah: Layanan home charging ini juga tersambung dengan Electric Vehicle Digital Services (EVDS) yang disiapkan PLN.',
  'result': {'neutral': 0.9981945,
   'positive': 0.0010955199,
   'negative': 0.00070995156}},
 {'id': 1626522623776530433,
  'sequence': 'Statemen di atas bilang mendorong penggunaan kendaraan umum dan tidak profesional kendaraan pribadi, tapi di sisi lain adanya subsidi kendaraan listrik mendorong masyarakat untuk membeli kendaraan lagi????',
  'result': {'negative': 0.9846687,
   'neutral': 0.009373622,
   'p

In [None]:
# `ensure_ascii=False` writes non-ASCII characters (e.g. emoji in tweets)
# verbatim, so open the file as UTF-8 explicitly instead of relying on the
# platform's default encoding. NumpyEncoder serialises the numpy scalars.
with open('SentClfResult_Indo-Roberta.json', 'w', encoding='utf-8') as file:
    json.dump(resultList, file, indent=4, sort_keys=False,
              separators=(', ', ': '), ensure_ascii=False,
              cls=NumpyEncoder)

In [None]:
# Reduce each record to one row: id, text and the highest-scoring label.
resultData = []

for tweet in resultList:
  resultDict = parseResultList()
  resultDict['id'] = tweet['id']
  resultDict['sequence'] = tweet['sequence']
  resultDict['sentiment'] = getSentiment(tweet)
  resultData.append(resultDict)

In [None]:
# Tabulate the flattened rows (keys id, sequence, sentiment become columns).
df = pd.DataFrame(resultData)

In [None]:
# Preview five random rows.
df.sample(5)

Unnamed: 0,id,sequence,sentiment
16781,1650534930043240451,"Bangun Pabrik Baterai Mobil Listrik di Kanada, VW Investasi Rp218,4 Triliun",neutral
20321,1660898545136959490,"Perbandingannya cukup jauh, dengan asumsi tarif listrik sebesar Rp1.699 per Kwh, hanya diperlukan sekitar Rp2.500 untuk sepeda motor listrik menempuh jarak 50 kamu dan 10 kamu untuk mobil listrik. Sedangkan, jika menggunakan BBM kamu harus menghabiskan sekitar Rp14 ribu -",positive
8958,1641060532856979458,"motor biasa saja sekali jajan ke bengkel lumayan, lah ini hype buat motor listrik, bengkel tidak jelas dimana, terus di pikir sekali jajan akan murah gitu..",negative
1811,1616541590528593920,"Bro tahu produksi listrik kita masih didominasi pakai batu bara? Pake kendaraan listrik itu tidak ngurangi polusi, cuma mindah polusi dari kota2 besar ke daerah produksi listrik. Kalo kamu pakai istilah distribusi polusi di Indonesia saya baru setuju. Singapura dapat listrik darimana?",negative
13675,1634719522119880704,"Fokus utang luar negeri, IKN, subsidi motor / mobil listrik, Mega proyek Rugi, jalan tol rugi gerus APBN untuk kesejahteraan rakyat duit pajak rakyat !;",negative


In [None]:
# Save the table as CSV, leaving out the DataFrame index.
df.to_csv('SentClfResult_Indo-Roberta.csv', index=False)

## Using distilbert-base-multilingual-cased-sentiments-student

In [8]:
# Switch to the multilingual DistilBERT sentiment checkpoint. This rebinds
# `tokenizer`, `config` and `model`, replacing the ones loaded earlier.
model_name = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"
tokenizer, config, model = getModel(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/759 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

In [18]:
# Classify every text in `docs` with the currently loaded model.
resultList = []

# Wrap the iterable itself (not `enumerate`) so tqdm knows the total and can
# render a real progress bar with an ETA.
for i, text in enumerate(tqdm(docs)):
  temp = generateTemp()

  temp.update({
      # `i` is a position, so look the id up positionally; plain `[i]` is a
      # label lookup and only matches when the index happens to be 0..n-1.
      'id' : data['tweet_id'].iloc[i],
      'sequence' : text,
      'result' : process(text,tokenizer,config,model)
  })

  resultList.append(temp)

23729it [34:11, 11.57it/s]


In [19]:
# Spot-check five randomly chosen records.
random.sample(resultList, 5)

[{'id': 1617807631061745666,
  'sequence': 'Dari Indonesia untuk G20: "Kendaraan listrik adalah solusi palsu untuk perubahan iklim"',
  'result': {'positive': 0.59068495,
   'negative': 0.2543174,
   'neutral': 0.15499766}},
 {'id': 1633309979049590784,
  'sequence': 'Kebijakan itu harusnya melihat sekala prioritas, seperti prioritas pendidikan prioritas subsidi pertanian dll, kalau mobil listrik yang bisa beli iya orang-orang kaya, lagian ini mobil listrik impor kan...🤔🤔',
  'result': {'negative': 0.5795394,
   'positive': 0.28172404,
   'neutral': 0.13873659}},
 {'id': 1628593968345944064,
  'sequence': 'Presiden Jokowi Tegaskan Indonesia Harus Segera Wujudkan Ekosistem Kendaraan Listrik',
  'result': {'positive': 0.75652283,
   'negative': 0.16298135,
   'neutral': 0.08049584}},
 {'id': 1633426635071586305,
  'sequence': 'Buat yang belum tahu yah, kenapa mobil listrik ramah lingkungan?',
  'result': {'negative': 0.39303488,
   'positive': 0.31832024,
   'neutral': 0.28864482}},
 {'i

In [20]:
# `ensure_ascii=False` writes non-ASCII characters (e.g. emoji in tweets)
# verbatim, so open the file as UTF-8 explicitly instead of relying on the
# platform's default encoding. NumpyEncoder serialises the numpy scalars.
with open('SentClfResult_Distilbert.json', 'w', encoding='utf-8') as file:
    json.dump(resultList, file, indent=4, sort_keys=False,
              separators=(', ', ': '), ensure_ascii=False,
              cls=NumpyEncoder)

In [21]:
# Reduce each record to one row: id, text and the highest-scoring label.
resultData = []

for tweet in resultList:
  resultDict = parseResultList()
  resultDict['id'] = tweet['id']
  resultDict['sequence'] = tweet['sequence']
  resultDict['sentiment'] = getSentiment(tweet)
  resultData.append(resultDict)

In [22]:
# Tabulate the flattened rows (keys id, sequence, sentiment become columns).
df1 = pd.DataFrame(resultData)

In [23]:
# Preview five random rows.
df1.sample(5)

Unnamed: 0,id,sequence,sentiment
17874,1646502785247232001,Motor listrik tidak menghasilkan emisi gas lantaran sumber tenaganya berasal dari listrik yang tersimpan di baterai,negative
13278,1635570462188244992,Tujuan subsidi EV ini apa iya bagi keberlangsungan kehidupan rakyat Indonesia? 🤦🏽‍♂️,negative
12708,1636312367138209792,"Tak Nyambung Petani kesulitan pupuk, karena mahal langka Yang disubsidi motor dan mobil listrik Oooiii siuman oooiii",negative
12322,1636550662942248960,"Wahh baru tahu ternyata motor listrik cuma butuh biaya 1.600 per kamu aja, jadi bisa lebih hemat",negative
6646,1626401619305971712,"PLN menciptakan ekosistem kendaraan listrik di tanah air dengan fokus utama membangun fasilitas pengisian energi SPKLU dan SPBKLU di setiap titik. Selain upaya ini, PLN juga telah membangun Electric Vehicle Digital Services (EVDS)",positive


In [24]:
# Save the table as CSV, leaving out the DataFrame index.
df1.to_csv('SentClfResult_Distilbert.csv', index=False)

## Using xlm-roberta-large-xnli-anli

In [19]:
# Pick device index 0 when CUDA is available, otherwise -1
# (presumably the pipeline's CPU setting — see the transformers pipeline docs).
if torch.cuda.is_available():
  device = 0
else:
  device = -1

# Zero-shot classification pipeline backed by the XLM-RoBERTa XNLI/ANLI checkpoint.
classifier = pipeline(
    "zero-shot-classification",
    model="vicgalle/xlm-roberta-large-xnli-anli",
    device=device,
)

In [17]:
# Labels offered to the zero-shot classifier for every text.
CANDIDATE_LABELS = ['Positive', 'Neutral', 'Negative']

In [21]:
# Zero-shot classify every text in `docs` against CANDIDATE_LABELS.
resultList = []

# Wrap the iterable itself (not `enumerate`) so tqdm knows the total and can
# render a real progress bar with an ETA.
for i, text in enumerate(tqdm(docs)):
  output = process0Shot(text, CANDIDATE_LABELS, classifier)

  temp = generateTemp()
  temp.update({
        # `i` is a position, so look the id up positionally; plain `[i]` is a
        # label lookup and only matches when the index happens to be 0..n-1.
        'id' : data['tweet_id'].iloc[i],
        'sequence' : text
  })

  # Keep only the labels and their scores from the classifier output.
  temp['result'].append({'labels' : output['labels'],
                         'scores' : output['scores']})

  resultList.append(temp)

23729it [28:08, 14.05it/s]


In [22]:
# Spot-check five randomly chosen records.
random.sample(resultList, 5)

[{'id': 1632071992118751232,
  'sequence': 'Apakah baterai EV juga bakal sama seperti bayre hp, lama-lama cembung bang? Klo iya nanti boilnya auto bengkok keatas juga tuh 🤣🤣',
  'result': [{'labels': ['Negative', 'Neutral', 'Positive'],
    'scores': [0.652890145778656,
     0.28720033168792725,
     0.059909552335739136]}]},
 {'id': 1632904072817680387,
  'sequence': 'Subsidi Motor Listrik untuk 200 Ribu Unit, Berlaku 20 Maret',
  'result': [{'labels': ['Positive', 'Negative', 'Neutral'],
    'scores': [0.8338955044746399,
     0.08614542335271835,
     0.07995910197496414]}]},
 {'id': 1633132058821406721,
  'sequence': 'EV cars ini kalau maaih anak charge guna sumber arang batu, lupakan je lah. Baik guna petrol. Unless, solar charge. Baru go green purely. Dah lah mahal anak mampus fast charging. Belum kira bateri dia cepat kong kalau memanjang guna FC.',
  'result': [{'labels': ['Positive', 'Negative', 'Neutral'],
    'scores': [0.5440291166305542, 0.3016342520713806, 0.1543366312980

In [23]:
# `ensure_ascii=False` writes non-ASCII characters (e.g. emoji in tweets)
# verbatim, so open the file as UTF-8 explicitly instead of relying on the
# platform's default encoding.
with open('SentClfResult_xlm-roberta.json', 'w', encoding='utf-8') as file:
    json.dump(resultList, file, indent=4, sort_keys=False,
              separators=(', ', ': '), ensure_ascii=False,
              cls=NumpyEncoder)

In [24]:
# Reduce each record to one row: id, text and the first-listed label.
resultData = []

for tweet in resultList:
  resultDict = parseResultList()
  resultDict['id'] = tweet['id']
  resultDict['sequence'] = tweet['sequence']
  resultDict['sentiment'] = getSentimentFromRoberta(tweet)
  resultData.append(resultDict)

In [25]:
# Tabulate the flattened rows (keys id, sequence, sentiment become columns).
df2 = pd.DataFrame(resultData)

In [26]:
# Preview five random rows.
df2.sample(5)

Unnamed: 0,id,sequence,sentiment
16429,1651588092762144772,"Mau liat-liat dulu jenis-jenis motor listrik impian kalian, atau mau langsung beli? Lewat aplikasi PLN mobile saja dijamin aman Bali Gabriel INDOMY POWER Badminton Asia Zayyan",Positive
20512,1660674251022798848,PLN terus berupaya mewujudkan ekosistem Molis (Mobil dan Motor Listrik) tumbuh subur di tanah air.,Positive
47,1620013660331716609,"Terkait motor listrik Gesits besutan ITS dan Garasindo yang sudah diciptakan sejak 2018, Prabowo Subianto menyebut, Pemerintah RI akan banyak memesan motor tersebut.",Positive
5507,1627832305313267713,"Untuk memastikan baterai berkinerja dengan baik dan tahan lama, Battery Management System (BMS) diperlukan.",Positive
11298,1637299120804368385,Saatnya beralih kendaraan listrik yuk semua sudah di dukung dan difasilitasi ciptakan indonesia yang lebih ramah lingkungan,Positive


In [27]:
# Save the table as CSV, leaving out the DataFrame index.
df2.to_csv('SentClfResult_xlm-roberta.csv', index=False)