# Activate GPU and Install Dependencies

In [2]:
import warnings,logging

warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

In [1]:
!apt-get install -qq git-lfs

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Preprocess data

In [3]:
import pandas as pd

train_df=pd.read_csv('data/train.csv')
test_df=pd.read_csv('data/test.csv')

len(train_df), len(test_df)

(21156, 2000)

In [4]:
train_df.describe(include='all').T

Unnamed: 0,Input,label
count,21156,21156.0
unique,19817,
top,Zo'r,
freq,149,
mean,,0.514842
std,,0.499791
min,,0.0
25%,,0.0
50%,,1.0
75%,,1.0


In [5]:
from transformers import AutoTokenizer
tokz = AutoTokenizer.from_pretrained("rifkat/uztext-3Gb-BPE-Roberta")

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/941k [00:00<?, ?B/s]

In [6]:
from datasets import Dataset

train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

In [7]:
def preprocess_function(examples):
    return tokz(examples["Input"], truncation=True, padding=True)

tokenized_train = train_ds.map(preprocess_function, batched=True, load_from_cache_file=False, remove_columns='Input')
tokenized_test = test_ds.map(preprocess_function, batched=True, load_from_cache_file=False, remove_columns='Input')

columns_to_return = ['input_ids', 'label', 'attention_mask']
tokenized_train.set_format(type='torch', columns=columns_to_return)

  0%|          | 0/22 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [8]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokz)

# Training the model

In [9]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("rifkat/uztext-3Gb-BPE-Roberta", num_labels=2)

Downloading:   0%|          | 0.00/319M [00:00<?, ?B/s]

In [10]:
import numpy as np
from datasets import load_metric
 
def compute_metrics(eval_pred):
    #load_precision=load_metric('precision')
    #load_recall=load_metric('recall')
    #load_f1 = load_metric("f1")
    load_accuracy = load_metric("accuracy")
  
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    #precision = load_precision.compute(predictions=predictions, references=labels)["precision"]
    #recall = load_recall.compute(predictions=predictions, references=labels)["recall"]
    #f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    # "Precision": precision, "Recall": recall, "F1": f1, 
    return {"Accuracy": accuracy}

In [11]:
dds = tokenized_train.train_test_split(0.10)
dds

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 19040
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 2116
    })
})

In [13]:
from transformers import TrainingArguments, Trainer
 
repo_name = "uzroberta-finetuned-sa"
 
training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=6,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    report_to='none'
)

trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=dds['train'],
   eval_dataset=dds['test'],
   tokenizer=tokz,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [14]:
trainer.train();

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3416,0.283263,0.889414
2,0.2522,0.273155,0.898393


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

OSError: [Errno 122] Disk quota exceeded

In [None]:
trainer.evaluate()

In [None]:
trainer.save()

# Analyzing new data with the model

In [None]:
trainer.push_to_hub()

In [2]:
from transformers import pipeline
 
sentiment_model = pipeline(model="murodbek/uzroberta-sentiment-analysis")
p = sentiment_model(["Yomon emas", "Yaxshimas"])

Downloading:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/786 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/941k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.02M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/957 [00:00<?, ?B/s]

In [7]:
test_df['preds'] = -1
for i in range(len(test_df)):
    test_df['preds'][i]=sentiment_model(test_df['Input'][i])[0]['label']

In [8]:
((test_df['preds']=='LABEL_1')==test_df['label']).mean()

0.9655