# Libraries

In [5]:
import pandas as pd

# Step 1

In [6]:
column_names = ['text', 'label']

# Reading the TSV file
tsv__test_file = 'test.tsv'
df_test = pd.read_csv(tsv__test_file, delimiter='\t', names=column_names)

# Save as CSV for test
csv_file = 'test.csv'
df_test.to_csv(csv_file, index=False)

Preprocessing


In [7]:
def delete_hashtag_usernames(text):
  try:
    result = []
    for word in text.split():
      if word[0] not in ['@', '#']:
        result.append(word)
    return ' '.join(result)
  except:
    return ''

def delete_url(text):
  text = re.sub(r'http\S+', '', text)
  return text

def delete_ex(text):
  text = re.sub(r'\u200c', '', text)
  return text

In [8]:
import re
# Function to clean a single tweet with prepared functions
def clean_tweet(tweet):
    tweet = delete_hashtag_usernames(tweet)
    tweet = delete_url(tweet)
    tweet = delete_ex(tweet)
    return tweet


df_test['text'] = df_test['text'].apply(clean_tweet)

In [9]:
df_test

Unnamed: 0,text,label
0,اين شايد اولين عزاى عمومى واقعى است كه ياد دار...,SAD
1,دیشب بعد از ارسال تویت مربوط به آثار باستانی ت...,HAPPY
2,کدوم شعبه پول نداده، بگو الان برات آمار دقیق ب...,OTHER
3,امروز وسط یه بحث با بابا مامانم گفتم آدم باید ...,HAPPY
4,امشب گفت نامزدی دوستش که ادم روشنفکری است بهم ...,SAD
...,...,...
1146,یعنی این آهنگ مرغ سحر جوری ساخته شده و روی این...,HAPPY
1147,درود بر همه ایرانیان شریف که در این سرما در تظ...,HAPPY
1148,امروز تولدم است،عید است ولی شاد نیستم،عید و تو...,SAD
1149,لعنت به اونی که دلتنگ نگهت میداره...,SAD


# Step 2

loading the model

In [10]:
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [19]:
from transformers import AutoModelForSequenceClassification, AutoConfig
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer

model_name = 'distilbert-base-multilingual-cased'  # or 'NousResearch/Meta-Llama-3-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Step 3

In [20]:
# By using pipeline:
import torch
from transformers import pipeline

# Create a sentiment-analysis pipeline
pipe = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)

# Load the preprocessed test data (first 100 examples)
test_data = df_test.head(100)
texts = test_data['text'].tolist()

In [21]:
# Generate predictions
predictions = pipe(texts)

# Step 4

In [32]:
from sklearn.metrics import accuracy_score, f1_score

# Extract true labels and convert to model's labels
true_labels = test_data['label'].tolist()
true_label_map = {'SAD': 'LABEL_1', 'HAPPY': 'LABEL_2', 'SURPRISE': 'LABEL_3', 'HATE': 'LABEL_4', 'FEAR': 'LABEL_5', 'ANGRY': 'LABEL_6', 'OTHER': 'LABEL_7'}  # Adjust mapping as per your dataset
true_labels = [true_label_map[label] for label in true_labels]

# Calculate accuracy and F1-score
accuracy = accuracy_score(true_labels, predicted_labels)
f1_macro = f1_score(true_labels, predicted_labels, average='macro')  # Macro-averaged F1-score for multi-class

print(f'Accuracy: {accuracy:.2f}')
print(f'F1-score (macro): {f1_macro:.2f}')

Accuracy: 0.31
F1-score (macro): 0.07


In [23]:
true_labels

['SAD',
 'HAPPY',
 'OTHER',
 'HAPPY',
 'SAD',
 'HAPPY',
 'HAPPY',
 'HAPPY',
 'SURPRISE',
 'FEAR',
 'SAD',
 'HAPPY',
 'SAD',
 'OTHER',
 'SURPRISE',
 'HATE',
 'SAD',
 'OTHER',
 'HAPPY',
 'ANGRY',
 'HATE',
 'ANGRY',
 'OTHER',
 'ANGRY',
 'HAPPY',
 'FEAR',
 'SURPRISE',
 'SURPRISE',
 'ANGRY',
 'OTHER',
 'ANGRY',
 'SAD',
 'SAD',
 'SAD',
 'HAPPY',
 'HAPPY',
 'SAD',
 'HAPPY',
 'HATE',
 'ANGRY',
 'HAPPY',
 'ANGRY',
 'SURPRISE',
 'OTHER',
 'SAD',
 'SAD',
 'HAPPY',
 'SAD',
 'SAD',
 'SAD',
 'FEAR',
 'SAD',
 'ANGRY',
 'FEAR',
 'HAPPY',
 'ANGRY',
 'HAPPY',
 'SAD',
 'ANGRY',
 'SURPRISE',
 'HAPPY',
 'OTHER',
 'HAPPY',
 'HATE',
 'HAPPY',
 'FEAR',
 'SAD',
 'HAPPY',
 'OTHER',
 'OTHER',
 'HAPPY',
 'SAD',
 'SAD',
 'FEAR',
 'SAD',
 'HAPPY',
 'HATE',
 'SAD',
 'SAD',
 'SURPRISE',
 'SAD',
 'SAD',
 'FEAR',
 'SAD',
 'HAPPY',
 'SAD',
 'OTHER',
 'SAD',
 'ANGRY',
 'HAPPY',
 'HAPPY',
 'SAD',
 'ANGRY',
 'SURPRISE',
 'SAD',
 'SURPRISE',
 'ANGRY',
 'SAD',
 'OTHER',
 'SAD']

In [33]:
predicted_labels

['LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LABEL_1',
 'LA