<a href="https://colab.research.google.com/github/sinungadi/TwitterABSA/blob/master/sentiment_classification_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install xformers
!pip install numpyencoder

# 1. Import Required Libraries

In [49]:
import pandas as pd
import warnings
import random
import json
from numpyencoder import NumpyEncoder
from tqdm import tqdm

import torch
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from transformers import pipeline
import numpy as np
from scipy.special import softmax

# Suppress all warnings for the rest of the session.
warnings.filterwarnings(action='ignore')
# Show full cell contents instead of truncating long text columns.
pd.options.display.max_colwidth = None

# 2. Data Collection

In [4]:
# Load the input CSV from the working directory; later cells read its
# 'text_cleaned' and 'tweet_id' columns.
data = pd.read_csv('data_filtered.csv')

In [5]:
# Texts to classify: the 'text_cleaned' column of the loaded data.
docs = data['text_cleaned']

# 3. Create Functions

In [34]:
def getModel(model_name="w11wo/indonesian-roberta-base-sentiment-classifier"):
  """Load the tokenizer, config and classification model for a checkpoint.

  Args:
    model_name: Checkpoint identifier passed unchanged to each
      `from_pretrained` call. Defaults to the Indonesian RoBERTa sentiment
      classifier used throughout this notebook.

  Returns:
    Tuple `(tokenizer, config, model)`.
  """
  # The argument is used as given; it is no longer overwritten by a
  # hard-coded checkpoint name.
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  config = AutoConfig.from_pretrained(model_name)
  model = AutoModelForSequenceClassification.from_pretrained(model_name)

  return tokenizer, config, model

def process(sequence, tokenizer, config, model):
  """Classify one text and return its label probabilities.

  Args:
    sequence: Text to classify.
    tokenizer: Callable that converts `sequence` into the keyword arguments
      passed to `model`; it is called with `return_tensors='pt'` and
      `truncation=True`.
    config: Object whose `id2label` maps a class index to its label name.
    model: Callable whose output holds the logits at `output[0]`; the scores
      for the single input are read from `output[0][0]`.

  Returns:
    dict mapping label name -> softmax probability, with keys inserted in
    descending order of probability.
  """
  # truncation=True asks the tokenizer to cut over-long inputs to its maximum
  # length rather than passing them to the model at full length.
  encoded_input = tokenizer(sequence, return_tensors='pt', truncation=True)

  # Inference only: do not record operations for autograd.
  with torch.no_grad():
    output = model(**encoded_input)

  scores = softmax(output[0][0].detach().cpu().numpy())

  # Class indices ordered from most to least probable.
  ranking = np.argsort(scores)[::-1]
  return {config.id2label[int(idx)]: scores[idx] for idx in ranking}

def generateTemp():
  """Return a fresh record template: empty 'id' and 'sequence', and a new empty 'result' list."""
  return dict(id=None, sequence=None, result=[])

In [82]:
def parseResultList():
  """Return a fresh record template with 'id', 'sequence' and 'sentiment' all set to None."""
  return dict.fromkeys(('id', 'sequence', 'sentiment'))

def getSentiment(data):
  """Return the key of `data['result']` that has the largest value.

  When several keys share the largest value, the first one in the dict's
  iteration order is returned.
  """
  scores = data['result']
  best_label, _ = max(scores.items(), key=lambda pair: pair[1])
  return best_label

# 4. Data Modeling

In [85]:
# Checkpoint used for sentiment classification.
model_name = "w11wo/indonesian-roberta-base-sentiment-classifier"
# Load through the getModel helper instead of repeating its three
# from_pretrained calls here.
tokenizer, config, model = getModel(model_name)

In [None]:
resultList = []

# zip pairs each tweet id with its text by position, so the pairing does not
# rely on the DataFrame's index labels matching a 0..n-1 counter.
# total= gives tqdm the length it cannot get from a zip/enumerate object, so
# it can show a progress bar and a time estimate.
for tweet_id, text in tqdm(zip(data['tweet_id'], docs), total=len(docs)):
  temp = generateTemp()

  temp.update({
      'id' : tweet_id,
      'sequence' : text,
      'result' : process(text,tokenizer,config,model)
  })

  resultList.append(temp)

16436it [23:37, 11.39it/s]

In [43]:
# Display 5 randomly chosen results; no seed is set in this notebook, so the
# selection differs between runs.
random.sample(resultList, 5)

[{'id': 1620198759077662720,
  'sequence': 'Bahlil siap kawal rencana investasi VW bangun industri baterai EV',
  'result': {'neutral': 0.99863786,
   'positive': 0.0010155096,
   'negative': 0.00034661463}},
 {'id': 1620198166334406657,
  'sequence': 'jika melihat rekam jejaknya,justru Anies telah mendukung percepatan kendaraan listrik saat dia menjabat Gubernur DKI Jakarta.Kala itu,dia telah menekan Peraturan Gubernur (Pergub)mengenai relaksasi Bea Balik Nama Kendaraan Bermotor (BBNKB) kendaraan listrik.Pergub No.3/2020',
  'result': {'positive': 0.7415657,
   'neutral': 0.24329808,
   'negative': 0.015136156}},
 {'id': 1620194952830660609,
  'sequence': 'Lumayan buat subsidi mobil listrik ratusan juta',
  'result': {'positive': 0.7270449,
   'neutral': 0.26321247,
   'negative': 0.009742727}},
 {'id': 1620187780260446209,
  'sequence': 'Dahlah nk chargenya lama tempat charge sikit. Aku pun rasa belum sampai lagi masa untuk pakai ev ramai2. Hybrid masih lagi practical.',
  'result': 

In [50]:
# ensure_ascii=False writes non-ASCII characters as-is, so the file encoding
# is set to UTF-8 explicitly instead of relying on the platform default.
# NumpyEncoder serialises the numpy scalar values stored in the results.
with open('SentClfResult_Indo-Roberta.json', 'w', encoding='utf-8') as file:
    json.dump(resultList, file, indent=4, sort_keys=False,
              separators=(', ', ': '), ensure_ascii=False,
              cls=NumpyEncoder)

In [83]:
# Reduce each tweet's full score dict to its single top-scoring label.
resultData = [
    {
        **parseResultList(),
        'id' : tweet['id'],
        'sequence' : tweet['sequence'],
        'sentiment' : getSentiment(tweet),
    }
    for tweet in resultList
]

In [86]:
# One row per record in resultData, with columns taken from the dict keys.
df = pd.DataFrame(resultData)

In [89]:
# Display 5 randomly chosen rows; no random_state is passed, so the rows
# differ between runs.
df.sample(5)

Unnamed: 0,id,sequence,sentiment
8,1620163113671864320,"Nor did private EV subsidy. Padahal dirimu sering bilang bahwa biggest contributor polusi bukan di kendaraan... Eh tapinya. Yang diangkat awalnya terkait policy, bahkan ditimpali dengan car ownership.",negative
9,1620156264646070272,Mengisi kendaraan listrik bisa dilakukan dirumah dengan home charging services loh gaes. Jangan ragu beli kendaraan listrik,neutral
6,1620182374159106064,Api.pastinya berasap.. Ada sesegera mungkin pastinya ada api ..🤣🤣🤣 yang gilanya ..emisi mobil listrik lebih gede perkapitanya dari Bus ..yang berbahan bakar minyak ..yang isinya lebih banyak ..🤣🤣🤣🤣,positive
5,1620183244540100616,Gilak rs anak ibuk edelwuis yang di soetta punya area charging mobil listrik dan gratis pula😭 cuma bayar parkir aja🤣,positive
4,1620187492300517377,Ia berharap adanya kemudahan untuk mendapatkan bantuan pemerintah tersebut khususnya bagi driver ojek online (ojol).,neutral


In [None]:
# Write the per-tweet sentiment table to CSV without the index column.
df.to_csv('SentClfResult_Indo-Roberta.csv', index=False)