<a href="https://colab.research.google.com/github/sinungadi/TwitterABSA/blob/master/zero-shot%20model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install numpyencoder

# 1. Import Libraries

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [3]:
# Report whether PyTorch can see a CUDA GPU (getModel falls back to the CPU when it cannot).
torch.cuda.is_available()

True

In [44]:
import json
import random
import warnings

import pandas as pd
from numpyencoder import NumpyEncoder
from tqdm import tqdm

# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Do not truncate long cell values, so full tweet texts stay readable in displayed frames.
pd.set_option('display.max_colwidth', None)

# 2. Data Collection

In [5]:
# Load the tweet dataset; later cells read its 'text_cleaned' and 'tweet_id' columns.
data = pd.read_csv("data_filtered.csv")

In [6]:
# Text column that is passed, one entry at a time, to the zero-shot classifier below.
docs = data['text_cleaned']

# 3. Create Functions

In [26]:
# Load the tokenizer and sequence-classification model for a HuggingFace checkpoint
def getModel(model_name):
  """Load a checkpoint and place its model on the GPU when one is available.

  Args:
    model_name: HuggingFace model identifier passed to ``from_pretrained``.

  Returns:
    Tuple ``(device, tokenizer, model)``; ``model`` has already been moved
    to ``device``, which is CUDA if available and the CPU otherwise.
  """
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

  return device, tokenizer, model

# Names given to logit positions 0 and 2 (in that order) for each supported checkpoint.
# NOTE(review): the order differs between the two checkpoints, presumably because they
# index entailment/contradiction differently - confirm against each model's config.id2label.
_NLI_LABEL_NAMES = {
  "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7": ["True", "False"],
  "vicgalle/xlm-roberta-large-xnli-anli": ["False", "True"],
}

# Run one text/label pair through the model and return the True/False scores
def process(model_name, sequence, label, device, tokenizer, model):
  """Score whether ``sequence`` is about ``label`` with a zero-shot NLI model.

  The text is used as the premise and ``"This text is about {label}"`` as the
  hypothesis. Only logit positions 0 and 2 are kept, soft-maxed against each
  other and converted to percentages rounded to one decimal.

  Args:
    model_name: Checkpoint identifier; selects how the two kept logits are named.
    sequence: Text to classify.
    label: Candidate label inserted into the hypothesis.
    device: ``torch.device`` the model lives on.
    tokenizer: Tokenizer matching ``model``.
    model: Sequence-classification model returning ``logits``.

  Returns:
    Dict with the keys ``"True"`` and ``"False"`` mapped to percentages.

  Raises:
    ValueError: If ``model_name`` is not one of the supported checkpoints
      (previously this case silently returned ``None``).
  """
  if model_name not in _NLI_LABEL_NAMES:
    raise ValueError(
        f"Unsupported model_name {model_name!r}; expected one of {sorted(_NLI_LABEL_NAMES)}")
  label_names = _NLI_LABEL_NAMES[model_name]

  premise = sequence
  hypothesis = f"This text is about {label}"

  encoded = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")

  # Inference only: skip autograd bookkeeping so no graph is kept per call.
  with torch.no_grad():
    output = model(encoded["input_ids"].to(device))  # device = "cuda:0" or "cpu"

  scores = torch.softmax(output["logits"][:, [0, 2]][0], -1).tolist()
  return {name: round(float(score) * 100, 1) for score, name in zip(scores, label_names)}

# Blank per-tweet record that the inference loop fills in
def generateTemp():
    """Return a new record with unset 'id'/'sequence' and its own empty 'result' list."""
    record = dict.fromkeys(('id', 'sequence'))
    record['result'] = []
    return record

In [54]:
# Flat per-tweet row: the two identifying fields plus one slot per candidate label
def generateResultTemp():
  """Return a new dict whose keys are the output columns, all initialised to None."""
  columns = (
      'id',
      'sequence',
      'driving range',
      'battery life',
      'charging time',
      'charging infrastructure',
      'price value',
      'incentive policy',
      'environmental concern',
      'look and design',
  )
  return dict.fromkeys(columns)

# Turn one label's "True" score into a boolean flag
def getProbability(data, label, threshold=80.0):
  """Tell whether a label's "True" score reaches the decision threshold.

  Args:
    data: Per-tweet record whose 'result' list holds one
      ``{'label': ..., 'probability': {'True': ..., 'False': ...}}`` entry per label.
    label: Position of the label inside ``data['result']`` (an index, not a name).
    threshold: Minimum "True" percentage (same 0-100 scale as the stored
      scores) for the label to count as present. Defaults to 80.0.

  Returns:
    True when the "True" score is greater than or equal to ``threshold``, else False.
  """
  return bool(data['result'][label]['probability']['True'] >= threshold)

In [9]:
# Factors that, according to various studies, influence consumers' decision to buy
# an electric vehicle; each one is used as a candidate label for zero-shot classification.
CANDIDATE_LABELS = [
    "driving range", "battery life",
    "charging time", "charging infrastructure",
    "price value", "incentive policy",
    "environmental concern", "look and design",
]

# 4. Zero-Shot Modeling

## 1. Using mDeBERTa-v3-base-xnli-multilingual-nli-2mil7

In [10]:
# Select the first checkpoint and load its tokenizer and model onto the available device.
model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
device, tokenizer, model = getModel(model_name)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 871ea53e-b8fd-42cb-855e-bbfcccac0514)')' thrown while requesting HEAD https://huggingface.co/MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7/resolve/main/tokenizer_config.json


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [11]:
# Score every tweet against every candidate label with the currently loaded model.
resultData = []

# Wrap `docs` itself (not the enumerate object) so tqdm knows the total and can
# show a real progress bar with an ETA for this long-running loop.
for i, text in enumerate(tqdm(docs)):
  temp = generateTemp()

  temp.update({
        # .iloc looks the id up by position, matching enumerate's counter
        # however the DataFrame happens to be indexed.
        'id' : data['tweet_id'].iloc[i],
        'sequence' : text
  })

  # One model call per (tweet, label) pair.
  for label in CANDIDATE_LABELS:
    temp['result'].append({'label' : label,
                           'probability' : process(model_name, text, label, device, tokenizer, model)})

  resultData.append(temp)

23729it [1:17:20,  5.11it/s]


In [12]:
# Spot-check three random records; no seed is set in the visible cells, so the sample differs between runs.
random.sample(resultData, 3)

[{'id': 1633332092045250560,
  'sequence': 'Tidak ada jaminan mengurangi emisi karbon dan kemacetan lalulintas, Subsidi kendaraan listrik hanya menguntungkan produsen..',
  'result': [{'label': 'driving range',
    'probability': {'True': 1.9, 'False': 98.1}},
   {'label': 'battery life', 'probability': {'True': 7.7, 'False': 92.3}},
   {'label': 'charging time', 'probability': {'True': 0.7, 'False': 99.3}},
   {'label': 'charging infrastructure',
    'probability': {'True': 22.5, 'False': 77.5}},
   {'label': 'price value', 'probability': {'True': 0.2, 'False': 99.8}},
   {'label': 'incentive policy', 'probability': {'True': 99.6, 'False': 0.4}},
   {'label': 'environmental concern',
    'probability': {'True': 99.9, 'False': 0.1}},
   {'label': 'look and design', 'probability': {'True': 0.4, 'False': 99.6}}]},
 {'id': 1644101545632620544,
  'sequence': 'Lah EV kan perlu di charge yang listriknya dari coal',
  'result': [{'label': 'driving range',
    'probability': {'True': 0.1, 'Fal

In [13]:
# Flatten each scored tweet into one row of boolean flags, one per label.
resultList = []

for tweet in resultData:
  resultDict = generateResultTemp()

  resultDict.update({
      'id' : tweet['id'],
      'sequence': tweet['sequence'],
  })

  # Take the column name from the label stored next to each score instead of a
  # hard-coded position, so a flag can never end up under the wrong label.
  for position, scored in enumerate(tweet['result']):
    resultDict[scored['label']] = getProbability(tweet, position)

  resultList.append(resultDict)

In [14]:
# One row per tweet: 'id', 'sequence' and one boolean column per candidate label (mDeBERTa run).
df1 = pd.DataFrame(resultList)

In [15]:
# Spot-check five random rows (no random_state is passed, so the rows differ between runs).
df1.sample(5)

Unnamed: 0,id,sequence,driving range,battery life,charging time,charging infrastructure,price value,incentive policy,environmental concern,look and design
6978,1626201585293426689,"Ni ngombe iya Tesla, ini charging port",False,True,False,True,False,False,False,True
6684,1626401409741770753,Dengan PLN yang menciptakan ekosistem kendaraan listrik di indonesia semoga dapat membantu masyarakat dan juga lingkungan,False,False,False,False,False,True,True,False
14638,1633228457063899136,Dengan inovasi baru yang canggih yaitu kendaraan listrik kamu akan menjadi lebih irit sebagaimana penjelasan berikut,False,True,False,False,False,False,True,True
5941,1626791227533438977,Jokowi pastikan motor listrik jadi yang pertama diberi intensif,False,False,False,False,False,True,False,False
16997,1649397958813843457,solusi mobil listrik cuma mengatasi masalah polusi tapi tidak dengan kemacetan,False,False,False,False,False,False,True,False


In [16]:
# Save the mDeBERTa label flags; index=False keeps the DataFrame index out of the file.
df1.to_csv('0ShotResult_mDeBERTa.csv', index=False)

## 2. Using xlm-roberta-large-xnli-anli

In [46]:
# Switch to the second checkpoint; this rebinds device, tokenizer and model used by the cells below.
model_name = "vicgalle/xlm-roberta-large-xnli-anli"
device, tokenizer, model = getModel(model_name)

In [47]:
# Score every tweet against every candidate label with the currently loaded model.
# NOTE: this rebinds resultData, replacing the records of the previous run.
resultData = []

# Wrap `docs` itself (not the enumerate object) so tqdm knows the total and can
# show a real progress bar with an ETA for this long-running loop.
for i, text in enumerate(tqdm(docs)):
  temp = generateTemp()

  temp.update({
        # .iloc looks the id up by position, matching enumerate's counter
        # however the DataFrame happens to be indexed.
        'id' : data['tweet_id'].iloc[i],
        'sequence' : text
  })

  # One model call per (tweet, label) pair.
  for label in CANDIDATE_LABELS:
    temp['result'].append({'label' : label,
                           'probability' : process(model_name, text, label, device, tokenizer, model)})

  resultData.append(temp)

23729it [1:19:30,  4.97it/s]


In [48]:
# Spot-check three random records; no seed is set in the visible cells, so the sample differs between runs.
random.sample(resultData, 3)

[{'id': 1624202467708915714,
  'sequence': 'Menteri PANRB Azwar Anas hari ini mewakili Menteri Luhut Binsar Pandjaitan meresmikan SPKLU di Prov. Banten. Adanya SPKLU ini menjadi salah satu wujud RB yang berdampak bagi masyarakat dalam membangun infrastruktur yang ramah lingkungan melalui kendaraan listrik.',
  'result': [{'label': 'driving range',
    'probability': {'False': 100.0, 'True': 0.0}},
   {'label': 'battery life', 'probability': {'False': 99.9, 'True': 0.1}},
   {'label': 'charging time', 'probability': {'False': 100.0, 'True': 0.0}},
   {'label': 'charging infrastructure',
    'probability': {'False': 85.1, 'True': 14.9}},
   {'label': 'price value', 'probability': {'False': 100.0, 'True': 0.0}},
   {'label': 'incentive policy', 'probability': {'False': 78.3, 'True': 21.7}},
   {'label': 'environmental concern',
    'probability': {'False': 0.1, 'True': 99.9}},
   {'label': 'look and design',
    'probability': {'False': 100.0, 'True': 0.0}}]},
 {'id': 1643257674057007106,

In [49]:
# Persist the raw per-label scores. ensure_ascii=False writes the tweet text
# unescaped, so the file is opened explicitly as UTF-8 rather than with the
# platform's default encoding, which may be unable to represent some characters.
with open('0ShotResult_xlm-roberta.json', 'w', encoding='utf-8') as file:
    json.dump(resultData, file, indent=4, sort_keys=False,
              separators=(', ', ': '), ensure_ascii=False,
              cls=NumpyEncoder)

In [50]:
# Flatten each scored tweet into one row of boolean flags, one per label.
resultList = []

for tweet in resultData:
  resultDict = generateResultTemp()

  resultDict.update({
      'id' : tweet['id'],
      'sequence': tweet['sequence'],
  })

  # Take the column name from the label stored next to each score instead of a
  # hard-coded position, so a flag can never end up under the wrong label.
  for position, scored in enumerate(tweet['result']):
    resultDict[scored['label']] = getProbability(tweet, position)

  resultList.append(resultDict)

In [51]:
# One row per tweet: 'id', 'sequence' and one boolean column per candidate label (XLM-RoBERTa run).
df2 = pd.DataFrame(resultList)

In [52]:
# Spot-check five random rows (no random_state is passed, so the rows differ between runs).
df2.sample(5)

Unnamed: 0,id,sequence,driving range,battery life,charging time,charging infrastructure,price value,incentive policy,environmental concern,look and design
12179,1636555514103611394,"Sebenarnya sudah penasaran banget sih pakai kendaraan listrik gitu, soalnya hemat banget bisa irit isi dompet dong",False,False,False,False,True,False,False,False
9119,1640899633798062080,"Electrizen, PLN terus memberikan kemudahan bagi pengguna kendaraan listrik atau 𝘌𝘭𝘦𝘤𝘵𝘳𝘪𝘤 𝘝𝘦𝘩𝘪𝘤𝘭𝘦 (EV), salah satunya melalui layanan pengisian daya kendaraan listrik di rumah atau 𝘩𝘰𝘮𝘦 𝘤𝘩𝘢𝘳𝘨𝘪𝘯𝘨.",False,False,False,False,False,True,False,False
4662,1629850572118704130,"Nahhh dalam Targetkan Net Zero Emission, Pertamina Siapkan Ekosistem Kendaraan Listrik. Moga bisa capai target",False,False,False,False,False,False,True,False
21777,1659358103748616192,Kan termasuk juga motor listrik. Di desa-desa Kecamatan Ciampea dan Cigudeg banyak masyarakat menikmati subsidi motor listrik dan menggunakannya sendiri atau menyewakannya Nggak pernah ke Bogor kah,False,False,False,False,False,True,False,False
22597,1659011551175782401,"Subsidi mobil listrik | Serba-serbi MMC Alasan pemerintah bahwa hal ini untuk mengurangi emisi karbon, dianggap mengada-ngada. Aroma bisnis oligarki pun sangat tercium tajam.",False,False,False,False,False,True,True,False


In [53]:
# Save the XLM-RoBERTa label flags; index=False keeps the DataFrame index out of the file.
df2.to_csv('0ShotResult_xlm-roberta.csv', index=False)