run in colab

# Text Classification using Hugging Face

import modules

In [None]:
!pip install datasets
import datasets
import string
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import LabelEncoder
import torch
!pip install transformers
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer , AutoTokenizer

In [None]:
import nltk
# Download only the NLTK resources this notebook has imports for, rather than
# nltk.download('all'), which fetches every corpus and model NLTK hosts.
#   - 'stopwords' backs stopwords.words('english'), used when cleaning the text.
#   - 'punkt' / 'punkt_tab' back word_tokenize (imported below); which of the two
#     is needed presumably depends on the installed NLTK version, so request both.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Preprocessing

# dataset link: https://drive.google.com/file/d/1ceGK4_Fx8Fbma7011LQatvUffUq89ohp/view?usp=sharing

In [None]:
# Read the news CSV into a Hugging Face DatasetDict and display its summary.
# NOTE(review): decoding with 'unicode_escape' appears to mangle non-ASCII
# characters (the cleaned sample further down shows 'Â£') — confirm the file's
# real encoding before relying on the text.
csv_path = '/content/drive/MyDrive/uniacco assignment/dataset.csv'
dataset = datasets.load_dataset(
    'csv',
    data_files=csv_path,
    delimiter=',',
    encoding='unicode_escape',
)
dataset



  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['news', 'type'],
        num_rows: 2225
    })
})



*   remove punctuation
*   remove stopwords



In [None]:
# Raw article texts plus the lookup tables used while cleaning them.
news = dataset['train']['news']
punc = string.punctuation
stop_word = set(stopwords.words('english'))


def clean_text(news):
    """Strip punctuation characters and English stopwords from one article.

    Args:
        news: The article text as a single string.

    Returns:
        The surviving words joined by single spaces.
    """
    # Drop every character listed in `punc` in a single pass.
    without_punc = news.translate(str.maketrans('', '', punc))

    # Keep a word only when its lowercase form is not a stopword.
    kept_words = []
    for token in without_punc.split():
        if token.lower() in stop_word:
            continue
        kept_words.append(token)

    return ' '.join(kept_words)


clean_news = list(map(clean_text, news))

clean_news[0]

'China role Yukos splitup China lent Russia 6bn Â£32bn help Russian government renationalise key Yuganskneftegas unit oil group Yukos revealed Kremlin said Tuesday 6bn Russian state bank VEB lent stateowned Rosneft help buy Yugansk turn came Chinese banks revelation came Russian government said Rosneft signed longterm oil supply deal China deal sees Rosneft receive 6bn credits Chinas CNPC According Russian newspaper Vedomosti credits would used pay loans Rosneft received finance purchase Yugansk Reports said CNPC offered 20 Yugansk return providing finance company opted longterm oil supply deal instead Analysts said one factor might influenced Chinese decision possibility litigation Yukos Yugansks former owner CNPC become shareholder Rosneft VEB declined comment two companies Rosneft CNPC agreed prepayment longterm deliveries said Russian oil official Sergei Oganesyan nothing unusual prepayment five six years announcements help explain Rosneft mediumsized indebted relatively unknown fi



1.   encode the labels (news types)
2.   tokenize the news



In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Fit the encoder on the five expected news categories so each gets an integer id,
# then encode the 'type' column of the dataset.
encoded_label = LabelEncoder()
encoded_label = encoded_label.fit(['sport','business','politics','tech', 'entertainment'])
Y = encoded_label.transform(dataset['train']['type'])
# Get the labels from the dataset
labels = Y

# Every cleaned article must have exactly one label.
assert len(clean_news) == len(labels), "clean_news and labels are misaligned"

# Tokenize each cleaned article to a fixed length of 512 tokens and attach its label.
tokenized_news = []

for article, label in zip(clean_news, labels):
    tokenized_article = tokenizer(article, truncation=True, padding='max_length', max_length=512)
    # Store a plain Python int. The encoder returns NumPy integers, and the
    # default data collator in transformers only builds integer (torch.long)
    # label tensors when the label is a Python int; anything else is collated
    # as float, which the classification loss does not accept as class indices.
    tokenized_article['label'] = int(label)
    tokenized_news.append(tokenized_article)

In [None]:
# Display the first tokenized article (input ids, masks and label) as a sanity check
tokenized_news[0]

{'input_ids': [101, 2859, 2535, 9805, 15710, 3975, 6279, 2859, 15307, 3607, 1020, 24700, 1037, 29646, 16703, 24700, 2393, 2845, 2231, 14916, 3370, 13911, 2063, 3145, 9805, 5289, 6711, 2638, 6199, 29107, 2015, 3131, 3514, 2177, 9805, 15710, 3936, 1047, 28578, 4115, 2056, 9857, 1020, 24700, 2845, 2110, 2924, 2310, 2497, 15307, 2110, 12384, 2098, 20996, 2015, 2638, 6199, 2393, 4965, 9805, 5289, 6711, 2735, 2234, 2822, 5085, 11449, 2234, 2845, 2231, 2056, 20996, 2015, 2638, 6199, 2772, 2146, 3334, 2213, 3514, 4425, 3066, 2859, 3066, 5927, 20996, 2015, 2638, 6199, 4374, 1020, 24700, 6495, 2859, 2015, 27166, 15042, 2429, 2845, 3780, 2310, 9527, 14122, 2072, 6495, 2052, 2109, 3477, 10940, 20996, 2015, 2638, 6199, 2363, 5446, 5309, 9805, 5289, 6711, 4311, 2056, 27166, 15042, 3253, 2322, 9805, 5289, 6711, 2709, 4346, 5446, 2194, 12132, 2146, 3334, 2213, 3514, 4425, 3066, 2612, 18288, 2056, 2028, 5387, 2453, 5105, 2822, 3247, 6061, 15382, 9805, 15710, 9805, 5289, 6711, 2015, 2280, 3954, 27166, 1

split data into train and validation sets

In [None]:
# Split the tokenized articles into training and validation sets:
# 20% of the examples are held out for validation, and the fixed random_state
# keeps the split the same from run to run.
train_tokenized_news, val_tokenized_news = tts(tokenized_news, test_size=0.2, random_state=42)


# fine tune using BERT

In [None]:
# Fine-tune a pre-trained BERT model
import inspect

# Pre-trained encoder with a fresh 5-way classification head (one output per news type).
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)

# The evaluation-schedule argument is called `eval_strategy` in newer transformers
# releases and `evaluation_strategy` in older ones. The notebook installs an
# unpinned version, so use whichever name the installed TrainingArguments accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

# Evaluate and checkpoint once per epoch; the two schedules are kept identical
# because load_best_model_at_end=True reloads the best checkpoint after training.
# NOTE(review): a per-device train batch of 200 sequences padded to 512 tokens is
# large for BERT-base — confirm it fits in the memory of the target accelerator.
training_args = TrainingArguments(
    output_dir='./results',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=200,
    per_device_eval_batch_size=100,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    load_best_model_at_end=True,
    **{eval_strategy_key: 'epoch'},
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_news,
    eval_dataset=val_tokenized_news,
)
trainer.train()

# model evaluation

In [None]:
# Load the fine-tuned classifier back from disk.
# NOTE(review): 'path/to/saved/model' looks like a placeholder — point it at the
# directory the fine-tuned model was actually saved to (presumably a checkpoint
# under ./results, the training output_dir).
saved_model_dir = 'path/to/saved/model'
model = BertForSequenceClassification.from_pretrained(saved_model_dir)

In [None]:
# Gather the validation examples' input IDs, attention masks and labels,
# converting each collection straight into a tensor.
val_input_ids = torch.tensor([example['input_ids'] for example in val_tokenized_news])
val_attention_masks = torch.tensor([example['attention_mask'] for example in val_tokenized_news])
val_labels = torch.tensor([example['label'] for example in val_tokenized_news])

In [None]:
#model evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# Score the validation set in small batches. Pushing every 512-token example
# through BERT in a single forward pass needs far more memory than a batched loop.
eval_batch_size = 32
model.eval()  # make sure dropout is off while predicting
device = next(model.parameters()).device  # run on whichever device the model lives on

logit_batches = []
with torch.no_grad():
    for start in range(0, len(val_input_ids), eval_batch_size):
        stop = start + eval_batch_size
        batch_logits = model(
            val_input_ids[start:stop].to(device),
            attention_mask=val_attention_masks[start:stop].to(device),
        )[0]
        # Move results back to the CPU so they can be handed to scikit-learn.
        logit_batches.append(batch_logits.cpu())

logits = torch.cat(logit_batches)
predictions = torch.argmax(logits, dim=-1)

# Weighted averaging combines the per-class scores in proportion to class support.
accuracy = accuracy_score(val_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(val_labels, predictions, average='weighted')

print(f"Accuracy: {accuracy:.4f}")          #print accuracy
print(f"Precision: {precision:.4f}")        #print precision
print(f"Recall: {recall:.4f}")              #print recall
print(f"F1-Score: {f1:.4f}")                #print f1-score

# make prediction on sample news

In [None]:
# Load the trained model and tokenizer
# The model was fine-tuned with the PyTorch Trainer and this notebook never imports
# TensorFlow, so load it with the PyTorch class that is already imported.
# NOTE(review): 'path/to/saved/model' looks like a placeholder — replace it with the
# directory that actually holds the fine-tuned model.
model = BertForSequenceClassification.from_pretrained('path/to/saved/model')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model.eval()  # make sure dropout is off while predicting

# Define a few sample news
news = [
    "The stock market is booming!",
    "The government is introducing a new tax policy.",
    "The latest fashion trends for spring 2022.",
    "Scientists have discovered a cure for cancer.",
    "The soccer game was a tie."
]

# Apply the same cleaning that was applied to the training articles, so the
# model sees inputs in the form it was fine-tuned on.
cleaned_samples = [clean_text(item) for item in news]

# Tokenize the news items and convert them to PyTorch input tensors
inputs = tokenizer(cleaned_samples, padding=True, truncation=True, max_length=128, return_tensors="pt")

# Make predictions on the input features (no gradients needed for inference)
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted labels (integer ids as produced by the label encoder)
predicted_labels = torch.argmax(outputs.logits, dim=1).numpy()

# Print the predicted labels
for i, label in enumerate(predicted_labels):
    print(f"News {i+1}: {news[i]}")
    print(f"Predicted label: {label}\n")