In [1]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd

# Load the Excel file
excel_file = pd.ExcelFile('/kaggle/input/t-hsab/T-HSAB.xlsx')

# Print all sheet names in the Excel file
print(excel_file.sheet_names)


['T-HSAB_Annotated']


In [22]:
# Define your column names, e.g., 'text' and 'label'
column_names = ['text', 'label']

# Load the data with specified column names
df = pd.read_excel('/kaggle/input/t-hsab/T-HSAB.xlsx', sheet_name='T-HSAB_Annotated', header=None, names=column_names)

# Display the first few rows
print(df.head())


                                                text   label
0  اسغي ياشعب تونس تدعوا بالاسلام كفار الحمدلله ن...    hate
1  قطع يد السارق توفرت الشروط شرط الحد الأدنى قيم...  normal
2                             تلوموش لطفي لعبدلي شرف  normal
3  مستغرب شعب يسمع تفاهة شانو لى الدرجة الشعب تاف...  normal
4  هههخ غزلتني مافهمتش شمدخلها الموضوع تتنطر وحده...  normal


In [23]:
# Check for missing values
print("Missing values per column:\n", df.isnull().sum())

# Drop any rows with missing values
df = df.dropna()

# Confirm the class distribution
print("\nClass distribution:\n", df['label'].value_counts())


Missing values per column:
 text     0
label    0
dtype: int64

Class distribution:
 label
normal     3820
abusive    1126
hate       1078
Name: count, dtype: int64


In [24]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = df[df['label'] == 'normal']
df_abusive = df[df['label'] == 'abusive']
df_hate = df[df['label'] == 'hate']

# Upsample minority classes
df_abusive_upsampled = resample(df_abusive, replace=True, n_samples=len(df_majority), random_state=42)
df_hate_upsampled = resample(df_hate, replace=True, n_samples=len(df_majority), random_state=42)

# Combine majority class with upsampled minority classes
df_balanced = pd.concat([df_majority, df_abusive_upsampled, df_hate_upsampled])

# Shuffle the balanced dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Check new distribution
print(df_balanced['label'].value_counts())


label
abusive    3820
hate       3820
normal     3820
Name: count, dtype: int64


                                                text    label  encoded_label
0                                     شرف متعكم لقحب  abusive              2
1  راجل وسيدهم وديع المستويات الكلاب والطحانه يمش...  abusive              2
2  الشاب بشير كرهتوا الحلقة ظهر فارغ ثقافيا فكريا...  abusive              2
3  اخوكم الجزاءر شعب ركيك وبدون معنى الاسلام دين ...     hate              1
4  حرية مؤخرتي يتشاف يفطر قدام العباد ندخلوا بالك...  abusive              2


In [26]:
# Custom label mapping
label_map = {'normal': 0, 'hate': 1, 'abusive': 2}

# Apply the custom label mapping to the DataFrame
df_balanced['encoded_label'] = df_balanced['label'].map(label_map)

# Check the label encoding
print("Custom Label Encoding:", df_balanced[['label', 'encoded_label']])

Custom Label Encoding:          label  encoded_label
0      abusive              2
1      abusive              2
2      abusive              2
3         hate              1
4      abusive              2
...        ...            ...
11455     hate              1
11456  abusive              2
11457  abusive              2
11458   normal              0
11459  abusive              2

[11460 rows x 2 columns]


In [None]:
print(df_balanced.head())

In [28]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Initialize tokenizer for TuniBert model
tokenizer = AutoTokenizer.from_pretrained("AhmedBou/TuniBert")

# Define a Dataset class for tokenizing and converting text to tensors
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',  # Padding to max_length
            max_length=self.max_length,  # Define max sequence length
            return_tensors="pt"  # Return PyTorch tensors
        )
        # Flatten the tensors and add the label
        item = {key: val.squeeze(0) for key, val in encoding.items()}  # Remove batch dimension
        item['labels'] = torch.tensor(label, dtype=torch.long)  # Convert label to tensor
        return item





# Split data into training and validation sets (80-20 split)
texts = df_balanced['text'].tolist()
labels = df_balanced['encoded_label'].tolist()  # Use the encoded labels
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Create Dataset instances for training and validation
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)

# Create DataLoader instances to load data in batches for training and validation
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# You can now use `train_loader` and `val_loader` with a Hugging Face Trainer or manual training loop.


In [29]:
print("Custom Label Mapping:", label_map)

Custom Label Mapping: {'normal': 0, 'hate': 1, 'abusive': 2}


In [30]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("AhmedBou/TuniBert", num_labels=len(set(labels)))


In [39]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",  # Use eval_strategy instead of evaluation_strategy
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)


In [40]:
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,0.0335,0.502788
2,0.0648,0.502935
3,0.1136,0.522923
4,0.0224,0.452368
5,0.0359,0.519052
6,0.0295,0.568775
7,0.0068,0.42974
8,0.0033,0.65948
9,0.0644,0.454966
10,0.0056,0.56705


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(devic

TrainOutput(global_step=5740, training_loss=0.016004357453088237, metrics={'train_runtime': 2770.0887, 'train_samples_per_second': 66.193, 'train_steps_per_second': 2.072, 'total_flos': 1.206111906865152e+16, 'train_loss': 0.016004357453088237, 'epoch': 20.0})

In [41]:
eval_results = trainer.evaluate()
print(eval_results)


{'eval_loss': 0.6207791566848755, 'eval_runtime': 10.7781, 'eval_samples_per_second': 212.653, 'eval_steps_per_second': 6.68, 'epoch': 20.0}


In [42]:
model.save_pretrained('./fine_tuned_tunibert')
tokenizer.save_pretrained('./fine_tuned_tunibert')


('./fine_tuned_tunibert/tokenizer_config.json',
 './fine_tuned_tunibert/special_tokens_map.json',
 './fine_tuned_tunibert/vocab.txt',
 './fine_tuned_tunibert/added_tokens.json',
 './fine_tuned_tunibert/tokenizer.json')

In [165]:
from transformers import pipeline
import random

# Load the fine-tuned model and tokenizer
fine_tuned_pipe = pipeline("text-classification", model="./fine_tuned_tunibert", tokenizer=tokenizer , device=0)

# Define your label mapping for interpreting the results
label_map = {0: 'normal', 1: 'hate', 2: 'abusive'}

# Pick a random test sample (you can adjust it to use your own test data)
random_idx = random.randint(0, len(val_texts) - 1)
random_sample = val_texts[random_idx]
real_label = val_labels[random_idx]  # The real label corresponding to the sample

# Get the prediction from the fine-tuned model
results = fine_tuned_pipe(random_sample)

# Convert the model's label index to the real label
predicted_label = results[0]['label']
predicted_label_index = label_map[int(predicted_label.split('_')[1])]  # Extract label from 'LABEL_X' and map it

# Print the prediction result
print(f"Test Sentence: {random_sample}")
print(f"Real Label: {label_map[real_label]}")
print(f"Predicted Label: {predicted_label_index}, with confidence score: {results[0]['score']}")


Test Sentence: كلاب ضالة
Real Label: abusive
Predicted Label: abusive, with confidence score: 0.9999406337738037


In [141]:
from transformers import pipeline
from sklearn.metrics import accuracy_score

# Load the fine-tuned model and tokenizer
fine_tuned_pipe = pipeline("text-classification", model="./fine_tuned_tunibert", tokenizer=tokenizer, device=0)  # Use GPU if available

# Define label mapping for interpretation (if necessary)
label_map = {0: 'normal', 1: 'hate', 2: 'abusive'}

# Create a function to evaluate accuracy on a test dataset
def evaluate_accuracy(test_texts, true_labels):
    predicted_labels = []
    
    # Iterate through the test texts and get predictions
    for text in test_texts:
        # Get prediction from the model
        result = fine_tuned_pipe(text)
        
        # Extract predicted label from the model output
        # The 'label' is in the format 'LABEL_X', so we extract the index by splitting
        predicted_label = result[0]['label']
        
        # Map the label from the model's output (e.g., 'LABEL_0', 'LABEL_1', 'LABEL_2') to the numeric index
        predicted_label_index = int(predicted_label.split('_')[1])  # Extract the number after 'LABEL_'
        
        # Append the predicted label index
        predicted_labels.append(predicted_label_index)
    
    # Calculate accuracy by comparing predicted labels to true labels
    accuracy = accuracy_score(true_labels, predicted_labels)
    return accuracy

# Example usage: Evaluate accuracy on validation set or test set
accuracy = evaluate_accuracy(val_texts, val_labels)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Test Accuracy: 93.32%


In [170]:
import shutil
import os

def zip_folders(folder1, folder2, zip_name):
    # Create a temporary directory to store both folders
    temp_dir = 'temp_folder_for_zip'
    os.makedirs(temp_dir, exist_ok=True)

    # Copy both folders into the temporary directory
    shutil.copytree(folder1, os.path.join(temp_dir, os.path.basename(folder1)))
    shutil.copytree(folder2, os.path.join(temp_dir, os.path.basename(folder2)))

    # Create a zip file containing both folders
    shutil.make_archive(zip_name, 'zip', temp_dir)

    # Remove the temporary directory after zipping
    shutil.rmtree(temp_dir)
    
    print(f"Zip file '{zip_name}.zip' created successfully!")

# Example usage
zip_folders('/kaggle/working/fine_tuned_tunibert', '/kaggle/working/results/checkpoint-5740', '/kaggle/working/fine_tuned_tunibert')  # Replace 'folder1', 'folder2' with your folder paths


OSError: [Errno 28] No space left on device

In [168]:
from transformers import pipeline
from sklearn.metrics import accuracy_score

# Load the fine-tuned model and tokenizer from the checkpoint directory
checkpoint_path = "/kaggle/working/results/checkpoint-5000"  # Replace with the correct path to your checkpoint
fine_tuned_pipe = pipeline("text-classification", model=checkpoint_path, tokenizer=tokenizer, device=0)  # Use GPU if available

# Define label mapping for interpretation (if necessary)
label_map = {0: 'normal', 1: 'hate', 2: 'abusive'}

# Create a function to evaluate accuracy on a test dataset
def evaluate_accuracy(test_texts, true_labels):
    predicted_labels = []
    
    # Iterate through the test texts and get predictions
    for text in test_texts:
        # Get prediction from the model
        result = fine_tuned_pipe(text)
        
        # Extract predicted label from the model output
        predicted_label = result[0]['label']
        
        # Map the label from the model's output (e.g., 'LABEL_0', 'LABEL_1', 'LABEL_2') to the numeric index
        predicted_label_index = int(predicted_label.split('_')[1])  # Extract the number after 'LABEL_'
        
        # Append the predicted label index
        predicted_labels.append(predicted_label_index)
    
    # Calculate accuracy by comparing predicted labels to true labels
    accuracy = accuracy_score(true_labels, predicted_labels)
    return accuracy

# Example usage: Evaluate accuracy on validation set or test set
accuracy = evaluate_accuracy(val_texts, val_labels)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 93.94%


In [177]:
import shutil

# Specify the directory path you want to zip
folder_path = '/kaggle/working/results/checkpoint-5740'  # Replace with your folder path
output_zip_path = '/kaggle/working/results/checkpoint-5740'  # Replace with your desired output zip file path

# Zip the folder
shutil.make_archive(output_zip_path.replace('.zip', ''), 'zip', folder_path)

print(f"Folder {folder_path} has been zipped to {output_zip_path}.")


Folder /kaggle/working/results/checkpoint-5740 has been zipped to /kaggle/working/results/checkpoint-5740.
