<a href="https://colab.research.google.com/github/sinungadi/TwitterABSA/blob/master/zero-shot%20model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch
!pip install transformers
!pip install sentencepiece

In [2]:
import torch
from transformers import pipeline, BertConfig, BertModel, AutoTokenizer, AutoModelForSequenceClassification

In [3]:
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)

In [4]:
# Load the pre-filtered tweet table; later cells read its 'text_cleaned' and
# 'tweet_id' columns.
data = pd.read_csv("data_filtered.csv")

In [5]:
# Column of tweet texts that every model run below iterates over and classifies.
docs = data['text_cleaned']

In [83]:
def getModel(model_name):
  """Load a tokenizer and sequence-classification model for one checkpoint.

  Args:
    model_name: checkpoint identifier handed to both `from_pretrained` calls.

  Returns:
    A `(device, tokenizer, model)` tuple. `device` is CUDA when
    `torch.cuda.is_available()` is true, otherwise CPU, and the model has
    already been moved onto it.
  """
  use_gpu = torch.cuda.is_available()
  device = torch.device("cuda") if use_gpu else torch.device("cpu")

  tokenizer = AutoTokenizer.from_pretrained(model_name)
  classifier = AutoModelForSequenceClassification.from_pretrained(model_name)
  model = classifier.to(device)

  return device, tokenizer, model

def process(sequence, label, device, tokenizer, model):
  """Zero-shot score of whether `sequence` is about `label`, via an NLI model.

  The text is used as the premise and "teks ini tentang {label}" as the
  hypothesis. The neutral logit is discarded and a softmax over the
  (entailment, contradiction) pair yields the two percentages.

  Args:
    sequence: premise text (one tweet).
    label: candidate aspect inserted into the hypothesis template.
    device: torch device the model lives on.
    tokenizer: callable returning a mapping with an "input_ids" tensor.
    model: callable returning an output indexable by "logits". When it exposes
      `config.label2id`, that mapping decides which logit is entailment and
      which is contradiction.

  Returns:
    {"True": pct, "False": pct}, where "True" is the entailment share and
    "False" the contradiction share, each rounded to one decimal.
  """
  premise = sequence
  hypothesis = f"teks ini tentang {label}"

  encoded = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")

  # NLI checkpoints do not agree on class order (some put entailment first,
  # others last), so read the positions from the model config instead of
  # hard-coding them. Fall back to the historical 0 / 2 layout when the config
  # carries no recognisable label names.
  entail_idx, contra_idx = 0, 2
  label2id = getattr(getattr(model, "config", None), "label2id", None) or {}
  for name, idx in label2id.items():
    lowered = str(name).lower()
    if lowered.startswith("entail"):
      entail_idx = int(idx)
    elif lowered.startswith("contra"):
      contra_idx = int(idx)

  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    output = model(encoded["input_ids"].to(device))  # device = "cuda:0" or "cpu"

  pair_logits = output["logits"][0][[entail_idx, contra_idx]]
  probabilities = torch.softmax(pair_logits, -1).tolist()

  label_names = ["True", "False"]
  return {name: round(float(prob) * 100, 1)
          for prob, name in zip(probabilities, label_names)}

def generateTemp():
    """Return a fresh per-tweet record: empty id/sequence and a new result list."""
    record = dict.fromkeys(('id', 'sequence'))
    # A new list on every call so records never share their 'result' entries.
    record['result'] = []
    return record

In [90]:
# Aspect labels (Indonesian) inserted into the NLI hypothesis template.
# Order matters: later cells look results up by position (0..10) in this list.
CANDIDATE_LABELS = [
    'fitur-fitur',  # features
    'jangkauan mengemudi',  # driving range
    'daya tahan baterai',  # battery endurance
    'waktu pengisian baterai',  # battery charging time
    'infrastruktur pengisian atau SPKLU',  # charging infrastructure / public charging stations
    'harga pembelian',  # purchase price
    'pajak kendaraan bermotor',  # motor-vehicle tax
    'subsidi dan insentif',  # subsidies and incentives
    'kelestarian lingkungan',  # environmental sustainability
    'industri dan bahan baku baterai',  # battery industry and raw materials
    'tampilan dan desain'  # appearance and design
]

# Using mDeBERTa-v3-base-xnli-multilingual-nli-2mil7

In [167]:
# Checkpoint for this run. getModel returns (device, tokenizer, model), which
# the inference loop passes to process().
model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
device, tokenizer, model = getModel(model_name)

In [168]:
resultData = []

# Pair ids with texts by position (independent of the DataFrame index) and
# tell tqdm the total so the bar shows percentage and ETA, not a bare counter.
for tweet_id, text in tqdm(zip(data['tweet_id'], docs), total=len(docs)):
  temp = generateTemp()

  temp.update({
        'id' : tweet_id,
        'sequence' : text
  })

  # One NLI pass per candidate aspect; entries keep CANDIDATE_LABELS order.
  for label in CANDIDATE_LABELS:
    temp['result'].append({'label' : label,
                           'probability' : process(text, label, device, tokenizer, model)})

  resultData.append(temp)

23729it [1:56:09,  3.40it/s]


In [169]:
# Preview a few records only; echoing the whole list bloats the notebook.
resultData[:3]

[{'id': 1620198759077662720,
  'sequence': 'DALANG PENGHANCUR REPUBLIK INI. Luhut Disinggung soal Subsidi Motor Listrik: Dia yang Produksi, Dia juga yang Subsidi, Pasti Makin Kaya Raya',
  'result': [{'label': 'fitur-fitur',
    'probability': {'True': 49.5, 'False': 50.5}},
   {'label': 'jangkauan mengemudi',
    'probability': {'True': 46.7, 'False': 53.3}},
   {'label': 'daya tahan baterai',
    'probability': {'True': 2.2, 'False': 97.8}},
   {'label': 'waktu pengisian baterai',
    'probability': {'True': 0.9, 'False': 99.1}},
   {'label': 'infrastruktur pengisian atau SPKLU',
    'probability': {'True': 74.5, 'False': 25.5}},
   {'label': 'harga pembelian', 'probability': {'True': 2.4, 'False': 97.6}},
   {'label': 'pajak kendaraan bermotor',
    'probability': {'True': 13.0, 'False': 87.0}},
   {'label': 'subsidi dan insentif',
    'probability': {'True': 99.1, 'False': 0.9}},
   {'label': 'kelestarian lingkungan',
    'probability': {'True': 21.0, 'False': 79.0}},
   {'label': 

In [170]:
def generateResultTemp():
  """Return a fresh flat row: 'id', 'sequence' and one slot per aspect, all None."""
  columns = (
      'id',
      'sequence',
      'fitur-fitur',
      'jangkauan mengemudi',
      'daya tahan baterai',
      'waktu pengisian baterai',
      'infrastruktur pengisian atau SPKLU',
      'harga pembelian',
      'pajak kendaraan bermotor',
      'subsidi dan insentif',
      'kelestarian lingkungan',
      'industri dan bahan baku baterai',
      'tampilan dan desain',
  )
  # dict.fromkeys keeps the column order and fills every slot with None.
  return dict.fromkeys(columns)

In [171]:
def getProbability(data, label, threshold=80.0):
  """Tell whether one aspect's 'True' percentage reaches the threshold.

  Args:
    data: per-tweet record whose 'result' list holds
      {'label': ..., 'probability': {'True': pct, 'False': pct}} entries.
    label: integer position of the aspect inside data['result'] (despite the
      name, this is an index, not the label text).
    threshold: minimum 'True' percentage counted as a positive; defaults to
      the original cut-off of 80.0.

  Returns:
    True when the 'True' percentage is >= threshold, otherwise False.
  """
  score = data['result'][label]['probability']['True']
  return bool(score >= threshold)

In [172]:
resultList = []

# Flatten each per-tweet record into one row of boolean aspect flags.
for tweet in resultData:
  resultDict = generateResultTemp()

  resultDict.update({
      'id' : tweet['id'],
      'sequence': tweet['sequence']
  })

  # Key each flag by the label stored next to its score, so a column can never
  # drift out of sync with a hand-maintained positional index.
  for position, entry in enumerate(tweet['result']):
    resultDict[entry['label']] = getProbability(tweet, position)

  resultList.append(resultDict)

In [173]:
# One row per tweet: id, text, and one True/False column per aspect.
df1 = pd.DataFrame(resultList)

In [176]:
# Spot-check five randomly drawn rows.
df1.sample(5)

Unnamed: 0,id,sequence,fitur-fitur,jangkauan mengemudi,daya tahan baterai,waktu pengisian baterai,infrastruktur pengisian atau SPKLU,harga pembelian,pajak kendaraan bermotor,subsidi dan insentif,kelestarian lingkungan,industri dan bahan baku baterai,tampilan dan desain
15116,1632942087305064449,"Salah satu motor lisrik yang mendapat subsidi, Gesits. FotoL SINDONews – Pemerintah bakal memberi insentif atau subsidi k... -",False,False,False,False,False,False,False,True,True,False,False
4592,1630136357300633600,"Diarahkan biar pakai mobil listrik biar bebas pajak, mungkin?",False,False,False,False,False,False,True,False,False,False,False
2924,1614217619019821057,Kelebihan kendaraan listrik: Kendaraan listrik bisa mengurangi polusi udara dan kebisingan. Setuju? Selain ramah lingkungan perawatan juga masuk di kantong,True,True,False,False,False,False,False,False,True,False,False
3005,1614127691443343361,"Inc telah mengurangkan harga kendaraan elektriknya secara sebanyak 20 peratus, melanjutkan usaha diskaun yang agresif dan pesaing yang mencabar selepas kehilangan anggaran penghantaran untuk 2022. Baca Lagi |",False,False,False,False,False,False,False,False,False,False,False
4683,1629817818433859590,"Jenis Motor Listrik Honda Terbaru 2023, Cek Harga di Sini!",False,False,False,False,False,False,False,False,False,False,False


In [175]:
# Persist this model's aspect flags; index=False leaves the row index out of the file.
df1.to_csv('0ShotResult_mDeBERTa.csv', index=False)

# Using bart-large-mnli

In [177]:
# Second checkpoint. Rebinds device/tokenizer/model, replacing the previously
# loaded model for the cells that follow.
model_name = "facebook/bart-large-mnli"
device, tokenizer, model = getModel(model_name)

In [None]:
resultData = []

# Pair ids with texts by position (independent of the DataFrame index) and
# tell tqdm the total so the bar shows percentage and ETA, not a bare counter.
for tweet_id, text in tqdm(zip(data['tweet_id'], docs), total=len(docs)):
  temp = generateTemp()

  temp.update({
        'id' : tweet_id,
        'sequence' : text
  })

  # One NLI pass per candidate aspect; entries keep CANDIDATE_LABELS order.
  for label in CANDIDATE_LABELS:
    temp['result'].append({'label' : label,
                           'probability' : process(text, label, device, tokenizer, model)})

  resultData.append(temp)

23038it [2:29:30,  2.84it/s]

In [None]:
# Preview a few records only; echoing the whole list bloats the notebook.
resultData[:3]

In [None]:
def generateResultTemp():
  """Return a fresh flat row: 'id', 'sequence' and one slot per aspect, all None."""
  columns = (
      'id',
      'sequence',
      'fitur-fitur',
      'jangkauan mengemudi',
      'daya tahan baterai',
      'waktu pengisian baterai',
      'infrastruktur pengisian atau SPKLU',
      'harga pembelian',
      'pajak kendaraan bermotor',
      'subsidi dan insentif',
      'kelestarian lingkungan',
      'industri dan bahan baku baterai',
      'tampilan dan desain',
  )
  # dict.fromkeys keeps the column order and fills every slot with None.
  return dict.fromkeys(columns)

In [None]:
def getProbability(data, label, threshold=80.0):
  """Tell whether one aspect's 'True' percentage reaches the threshold.

  Args:
    data: per-tweet record whose 'result' list holds
      {'label': ..., 'probability': {'True': pct, 'False': pct}} entries.
    label: integer position of the aspect inside data['result'] (despite the
      name, this is an index, not the label text).
    threshold: minimum 'True' percentage counted as a positive; defaults to
      the original cut-off of 80.0.

  Returns:
    True when the 'True' percentage is >= threshold, otherwise False.
  """
  score = data['result'][label]['probability']['True']
  return bool(score >= threshold)

In [None]:
resultList = []

# Flatten each per-tweet record into one row of boolean aspect flags.
for tweet in resultData:
  resultDict = generateResultTemp()

  resultDict.update({
      'id' : tweet['id'],
      'sequence': tweet['sequence']
  })

  # Key each flag by the label stored next to its score, so a column can never
  # drift out of sync with a hand-maintained positional index.
  for position, entry in enumerate(tweet['result']):
    resultDict[entry['label']] = getProbability(tweet, position)

  resultList.append(resultDict)

In [None]:
# One row per tweet: id, text, and one True/False column per aspect.
df2 = pd.DataFrame(resultList)

In [None]:
# Spot-check five randomly drawn rows.
df2.sample(5)

In [None]:
# Persist this model's aspect flags; index=False leaves the row index out of the file.
df2.to_csv('0ShotResult_bart.csv', index=False)

# Using xlm-roberta-large-xnli-anli

In [None]:
# Third checkpoint. Rebinds device/tokenizer/model, replacing the previously
# loaded model for the cells that follow.
model_name = "vicgalle/xlm-roberta-large-xnli-anli"
device, tokenizer, model = getModel(model_name)

In [None]:
resultData = []

# Pair ids with texts by position (independent of the DataFrame index) and
# tell tqdm the total so the bar shows percentage and ETA, not a bare counter.
for tweet_id, text in tqdm(zip(data['tweet_id'], docs), total=len(docs)):
  temp = generateTemp()

  temp.update({
        'id' : tweet_id,
        'sequence' : text
  })

  # One NLI pass per candidate aspect; entries keep CANDIDATE_LABELS order.
  for label in CANDIDATE_LABELS:
    temp['result'].append({'label' : label,
                           'probability' : process(text, label, device, tokenizer, model)})

  resultData.append(temp)

365it [02:25,  2.56it/s]

In [None]:
# Preview a few records only; echoing the whole list bloats the notebook.
resultData[:3]

In [None]:
def generateResultTemp():
  """Return a fresh flat row: 'id', 'sequence' and one slot per aspect, all None."""
  columns = (
      'id',
      'sequence',
      'fitur-fitur',
      'jangkauan mengemudi',
      'daya tahan baterai',
      'waktu pengisian baterai',
      'infrastruktur pengisian atau SPKLU',
      'harga pembelian',
      'pajak kendaraan bermotor',
      'subsidi dan insentif',
      'kelestarian lingkungan',
      'industri dan bahan baku baterai',
      'tampilan dan desain',
  )
  # dict.fromkeys keeps the column order and fills every slot with None.
  return dict.fromkeys(columns)

In [None]:
def getProbability(data, label, threshold=80.0):
  """Tell whether one aspect's 'True' percentage reaches the threshold.

  Args:
    data: per-tweet record whose 'result' list holds
      {'label': ..., 'probability': {'True': pct, 'False': pct}} entries.
    label: integer position of the aspect inside data['result'] (despite the
      name, this is an index, not the label text).
    threshold: minimum 'True' percentage counted as a positive; defaults to
      the original cut-off of 80.0.

  Returns:
    True when the 'True' percentage is >= threshold, otherwise False.
  """
  score = data['result'][label]['probability']['True']
  return bool(score >= threshold)

In [None]:
resultList = []

# Flatten each per-tweet record into one row of boolean aspect flags.
for tweet in resultData:
  resultDict = generateResultTemp()

  resultDict.update({
      'id' : tweet['id'],
      'sequence': tweet['sequence']
  })

  # Key each flag by the label stored next to its score, so a column can never
  # drift out of sync with a hand-maintained positional index.
  for position, entry in enumerate(tweet['result']):
    resultDict[entry['label']] = getProbability(tweet, position)

  resultList.append(resultDict)

In [None]:
# One row per tweet: id, text, and one True/False column per aspect.
df3 = pd.DataFrame(resultList)

In [None]:
# Spot-check five randomly drawn rows.
df3.sample(5)

In [None]:
# Persist this model's aspect flags; index=False leaves the row index out of the file.
df3.to_csv('0ShotResult_xlm-roberta.csv', index=False)