In [None]:
# Mount Google Drive inside the Colab VM at the path named below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Use the %pip magic so the packages are installed into the environment of the
# running kernel (a `!pip` subshell can resolve to a different interpreter).
# The recorded run resolved datasets 3.2.0 and evaluate 0.4.3; pin those
# versions here if the results below must be reproduced exactly.
%pip install -q transformers scipy numpy pandas datasets evaluate


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3

In [None]:
import pandas as pd
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
from datasets import load_dataset

In [None]:
# Load the game reviews collected from the Google Play Store (semicolon-separated CSV).
df = pd.read_csv("reviews_processed.csv", delimiter=";")

game_titles = df['title'].unique()
print(f"Your dataset contains : {len(game_titles)} games")

# Count fully duplicated rows once and reuse the number for the check and the message.
duplicate_rows = df.duplicated().sum()
if duplicate_rows != 0:
    print(f"we deleting {duplicate_rows} duplicates")
    df.drop_duplicates(inplace=True)
else:
    print("no duplicates")

print(df.shape)

Your dataset contains : 374 games
no duplicates
(25384, 18)


In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Baseline check: score a single review with the off-the-shelf SST-2 sentiment
# checkpoint, before any fine-tuning on the review data.
SST2_CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(SST2_CHECKPOINT)
model = DistilBertForSequenceClassification.from_pretrained(SST2_CHECKPOINT)

sample_review = "Ok, I'm hopelessly hooked. Installed just to get coins in RR3, now I'm 180+ consecutive days playing, never had time to RR3 again. You can progress without paying real money (I did spend a few bucks, mostly from Google Rewards credits, and did it happily to pay back for a lot of fun), but of course then you'll need more patience."
inputs = tokenizer(sample_review, return_tensors="pt")

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    logits = model(**inputs).logits

# A single sequence is scored, so the argmax over all logits is the class index.
predicted_class_id = logits.argmax().item()
# Last expression of the cell: the human-readable label for that class index.
model.config.id2label[predicted_class_id]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

'NEGATIVE'

Let's fine-tune DistilBERT to try to get the best accuracy!

In [None]:
# Build reproducible train / validation / test splits (80% / 10% / 10%).
# Without a fixed seed each run shuffles the rows differently, so the metrics
# reported below cannot be reproduced and rows move between train and test
# from one run to the next.
SPLIT_SEED = 42

# NOTE(review): this re-reads the CSV instead of reusing `df`, so any rows
# dropped by the duplicate check above are NOT dropped here.
dataset = load_dataset("csv", data_files="reviews_processed.csv", delimiter=";")

# Hold out 20% of the rows, then halve that hold-out into validation and test.
train_test = dataset["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED)
test_valid = train_test["test"].train_test_split(test_size=0.5, seed=SPLIT_SEED)

train_dataset = train_test["train"]
valid_dataset = test_valid["train"]
test_dataset = test_valid["test"]

In [None]:
from transformers import DistilBertTokenizer

# Tokenizer belonging to the SST-2 DistilBERT checkpoint used throughout the notebook.
SST2_CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(SST2_CHECKPOINT)

def preprocess_function(examples):
    """Tokenize a batch of reviews and attach integer sentiment labels.

    Args:
        examples: batch mapping column names to lists; the "text" and
            "sentiment" columns are read.

    Returns:
        The tokenizer output (truncated / padded to 128 tokens) with an extra
        "label" entry: 1 where the sentiment is "positive", 0 otherwise.
    """
    encoded = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
    # Return the labels as part of the result instead of writing them into the
    # input batch: new columns are meant to come from the returned mapping, and
    # relying on in-place mutation of the input depends on library internals.
    # NOTE(review): every value other than exactly "positive" becomes 0 —
    # confirm the column only holds "positive" / "negative".
    encoded["label"] = [1 if sentiment == "positive" else 0 for sentiment in examples["sentiment"]]
    return encoded

# Apply tokenization and label encoding to every split, batch by batch
# (processed in train, validation, test order).
train_dataset, valid_dataset, test_dataset = [
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, valid_dataset, test_dataset)
]


Map:   0%|          | 0/20307 [00:00<?, ? examples/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Map:   0%|          | 0/2539 [00:00<?, ? examples/s]

In [None]:
def _finalize_for_torch(split):
    """Drop the raw "text" / "sentiment" columns and switch the split to the torch format."""
    trimmed = split.remove_columns(["text", "sentiment"])
    trimmed.set_format("torch")
    return trimmed


train_dataset = _finalize_for_torch(train_dataset)
valid_dataset = _finalize_for_torch(valid_dataset)
test_dataset = _finalize_for_torch(test_dataset)


In [None]:
from transformers import DistilBertForSequenceClassification

# Reload the SST-2 DistilBERT checkpoint; this is the model instance handed to the Trainer.
SST2_CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"
model = DistilBertForSequenceClassification.from_pretrained(SST2_CHECKPOINT)


In [None]:
from transformers import Trainer, TrainingArguments
import numpy as np
import evaluate

# Metric objects used by compute_metrics: F1 and accuracy.
f1_metric, accuracy_metric = [evaluate.load(metric_name) for metric_name in ("f1", "accuracy")]

def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 for one evaluation pass.

    Args:
        eval_pred: pair of (logits, labels) as handed over by the Trainer.

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predicted = np.argmax(scores, axis=-1)
    shared = {"predictions": predicted, "references": gold}
    f1_result = f1_metric.compute(average="binary", **shared)
    accuracy_result = accuracy_metric.compute(**shared)
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

# Fine-tuning hyper-parameters.
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    # Evaluation and checkpointing both happen once per epoch, so the two
    # strategies match, as `load_best_model_at_end` needs.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    logging_steps=10,
    # Linear warm-up; the recorded run had 1905 optimizer steps in total.
    warmup_steps=600,
    # After training, reload the checkpoint with the highest validation F1
    # (the "f1" key returned by compute_metrics; higher is better).
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# Wire the model, the train / validation splits and the metric function into the Trainer.
# `processing_class` is the replacement for the deprecated `tokenizer` argument of
# `Trainer.__init__`; the stray `trainer = Trainer(` line recorded under this cell
# looks like the tail of that deprecation warning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0624,0.548461,0.870764,0.899325
2,0.0314,0.767244,0.86643,0.899137
3,0.0473,0.780386,0.870764,0.900425


TrainOutput(global_step=1905, training_loss=0.03705526620706939, metrics={'train_runtime': 662.67, 'train_samples_per_second': 91.933, 'train_steps_per_second': 2.875, 'total_flos': 2017511598380544.0, 'train_loss': 0.03705526620706939, 'epoch': 3.0})

In [None]:
from sklearn.metrics import classification_report

# Once training has finished, score the held-out test split.
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
labels = predictions.label_ids
print(len(labels))

# Display names for the class indices: 0 = negative, 1 = positive
# (the encoding produced by preprocess_function).
target_names = ["negative", "positive"]

report = classification_report(labels, preds, target_names=target_names)
print(report)


2539
              precision    recall  f1-score   support

    negative       0.81      0.85      0.83       893
    positive       0.92      0.89      0.90      1646

    accuracy                           0.88      2539
   macro avg       0.86      0.87      0.87      2539
weighted avg       0.88      0.88      0.88      2539

