In [1]:
import os
import tarfile
import tempfile

import pandas as pd
import requests

def get_imdb_dataframe():
    """
    Download the IMDB reviews archive and return its CSV contents as a pandas DataFrame.

    The IMDB dataset contains 50,000 highly polar movie reviews intended for binary
    sentiment classification: the task is to classify each review as positive or
    negative.

    The archive is streamed into a private temporary directory and the CSV is read
    straight from the archive member, so nothing is created, overwritten or deleted
    in the current working directory and no archive member is ever extracted to disk.

    Returns:
        pd.DataFrame: DataFrame parsed from 'IMDB Dataset.csv' inside the archive.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        FileNotFoundError: If the archive holds no regular file named 'IMDB Dataset.csv'.
    """
    # URL of the dataset
    url = 'https://github.com/pruhlo/data_ML/raw/master/IMDB_Dataset.tar.xz'

    # Name of the CSV file expected inside the tar.xz archive
    csv_member_name = 'IMDB Dataset.csv'

    # The temporary directory (and the archive inside it) is removed on exit,
    # including when any step below raises.
    with tempfile.TemporaryDirectory() as work_dir:
        tar_xz_path = os.path.join(work_dir, 'IMDB_Dataset.tar.xz')

        # Stream the download in chunks instead of buffering the whole body in memory.
        with requests.get(url, stream=True, timeout=60) as response:
            # Fail here on 4xx/5xx rather than handing an error page to tarfile.
            response.raise_for_status()
            with open(tar_xz_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)

        with tarfile.open(tar_xz_path, 'r:xz') as tar:
            # Locate the CSV by its normalized name (tolerates a leading './').
            csv_member = next(
                (
                    member
                    for member in tar.getmembers()
                    if member.isfile() and os.path.normpath(member.name) == csv_member_name
                ),
                None,
            )
            if csv_member is None:
                raise FileNotFoundError(
                    f"'{csv_member_name}' was not found in the archive downloaded from {url}"
                )

            # Read the member directly; avoids extractall() on a downloaded archive.
            with tar.extractfile(csv_member) as csv_file:
                df = pd.read_csv(csv_file)

    return df


In [3]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import torch

# Load the IMDB dataset
df = get_imdb_dataframe()

# Rename the columns for clarity
df.columns = ['review', 'sentiment']

# Map sentiment to binary labels (positive: 1, negative: 0)
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Series.map yields NaN for any value missing from the mapping; fail fast instead of
# letting NaN (float) labels flow into the split and the training loop.
unmapped_rows = df['label'].isna()
if unmapped_rows.any():
    unexpected_values = sorted(df.loc[unmapped_rows, 'sentiment'].astype(str).unique())
    raise ValueError(f"Unexpected sentiment values: {unexpected_values}")

# Split the dataset into training and test sets (stratified on the label)
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# Convert to Hugging Face Dataset format. preserve_index=False keeps the shuffled
# pandas index from being stored as an extra '__index_level_0__' column.
train_dataset = Dataset.from_pandas(train_df[['review', 'label']], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[['review', 'label']], preserve_index=False)

# Load the pretrained DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the dataset, truncating and padding every review to max_length=128 tokens
def tokenize_function(examples):
    """Tokenize the 'review' entries of a batch with the module-level tokenizer.

    Args:
        examples: Batch mapping passed in by Dataset.map(..., batched=True);
            only its 'review' entry is read.

    Returns:
        The tokenizer output for the batch, with every sequence truncated and
        padded to a length of 128.
    """
    reviews = examples['review']
    encoded_batch = tokenizer(
        reviews,
        truncation=True,
        max_length=128,
        padding='max_length',
    )
    return encoded_batch

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Load the DistilBERT model for binary classification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Mixed precision (fp16) needs a GPU: requesting it on a CPU-only host makes
# TrainingArguments fail, so it is enabled only when CUDA is available.
use_fp16 = torch.cuda.is_available()

# Configure the training parameters
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,  # Reduced number of epochs
    weight_decay=0.01,
    fp16=use_fp16  # Mixed precision for faster training when a GPU is present
)

# Define the evaluation metrics
def compute_metrics(p):
    """Compute classification metrics from a prediction object.

    Args:
        p: Object exposing `predictions` (per-class scores, indexed as
            [sample, class]) and `label_ids` (the true labels).

    Returns:
        dict: 'accuracy', 'precision', 'recall', 'f1' and 'roc_auc' values.
    """
    true_labels = p.label_ids
    predicted_labels = p.predictions.argmax(-1)

    metrics = {
        'accuracy': accuracy_score(true_labels, predicted_labels),
        'precision': precision_score(true_labels, predicted_labels),
        'recall': recall_score(true_labels, predicted_labels),
        'f1': f1_score(true_labels, predicted_labels),
    }

    # ROC AUC is computed from the softmax probability of class 1, not the hard labels
    class_probabilities = torch.nn.functional.softmax(torch.tensor(p.predictions), dim=1)
    positive_scores = class_probabilities[:, 1].numpy()
    metrics['roc_auc'] = roc_auc_score(true_labels, positive_scores)

    return metrics

# Create the Trainer instance.
# The test split is passed as eval_dataset, so it is the data used both for any
# evaluation run during training and for the final trainer.evaluate() call below.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model on the test split
eval_results = trainer.evaluate()

# Print the evaluation results
# NOTE(review): the ':.4f' format assumes every value in eval_results is numeric.
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40000/40000 [03:44<00:00, 178.47 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:56<00:00, 177.34 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.2986,0.278416,0.8835,0.874,0.8962,0.884961,0.95527
2,0.2041,0.303191,0.8868,0.879364,0.8966,0.887899,0.958403


eval_loss: 0.3032
eval_accuracy: 0.8868
eval_precision: 0.8794
eval_recall: 0.8966
eval_f1: 0.8879
eval_roc_auc: 0.9584
eval_runtime: 1243.2427
eval_samples_per_second: 8.0430
eval_steps_per_second: 0.5030
epoch: 2.0000
