## Loading dataset

In [2]:
import pandas as pd
df = pd.read_csv("/content/bbc-text.csv")
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [3]:
# ! pip install transformers==4.37.2

In [4]:
df.isnull().sum()

category    0
text        0
dtype: int64

## Balacing Classes

In [5]:
from imblearn.over_sampling import RandomOverSampler
oversampler = RandomOverSampler(random_state=42)

X = df['text'].values.reshape(-1, 1)  # Reshape to 2D array
y = df['category']
X_resampled, y_resampled = oversampler.fit_resample(X, y)

df = pd.DataFrame({'text': X_resampled.flatten(), 'category': y_resampled})

In [6]:
df['category'].value_counts()

category
tech             511
business         511
sport            511
entertainment    511
politics         511
Name: count, dtype: int64

In [7]:
df.shape

(2555, 2)

## Train and Test Set

In [8]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)


In [9]:
df_train['category'].value_counts()

category
politics         425
sport            414
business         404
entertainment    403
tech             398
Name: count, dtype: int64

## Cleaning Text

In [10]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove emails
    text = re.sub(r'\S*@\S*\s?', '', text)

    # Remove emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # Join tokens back into a single string
    text = ' '.join(tokens)

    return text


df['clean_text'] = df['text'].apply(lambda x: preprocess_text(x))

## Converting our Target column into Categorical data

In [12]:
df_test['category'].value_counts()

category
tech             113
entertainment    108
business         107
sport             97
politics          86
Name: count, dtype: int64

In [13]:
df_test.shape

(511, 2)

In [14]:
encoded_dict = {"sport":0,"business":1, 'politics':2, "entertainment":3,'tech':4}
df_train['category'] = df_train['category'].map(encoded_dict)

In [15]:
df_test['category'] = df_test['category'].map(encoded_dict)

In [16]:
from keras.utils import to_categorical
y_train = to_categorical(df_train['category'])

In [17]:
y_test = to_categorical(df_test['category'])

In [18]:
y_train.shape, y_test.shape

((2044, 5), (511, 5))

# Loading Model and Tokenizer from the transformers package

In [19]:
from transformers import AutoTokenizer,TFBertModel
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert = TFBertModel.from_pretrained('bert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [20]:
max_len_train = max(len(text) for text in df_train['text'])
max_len_test= max(len(text) for text in df_test['text'])

In [21]:
max_len_train, max_len_test

(18387, 25483)

## Tokenizing the input

In [22]:
max_len = 70
x_train = tokenizer(
    text=df_train.text.tolist(),
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)
x_test = tokenizer(
    text=df_test.text.tolist(),
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

In [23]:
input_ids = x_train['input_ids']
attention_mask = x_train['attention_mask']

In [24]:
print("input ids:",input_ids.shape)
print("attention_mask:", attention_mask.shape)

input ids: (2044, 70)
attention_mask: (2044, 70)


# Model Building

## Importing necessary libraries.



In [25]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense

In [26]:
# Define input layers
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

input_ids.shape,input_mask.shape

(TensorShape([None, 70]), TensorShape([None, 70]))

In [27]:
# ! pip install transformers==4.37.2

In [28]:
# Define model architecture (assuming 'bert' is already defined)
embeddings = bert(input_ids, attention_mask=input_mask)[0]
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)
y = Dense(5, activation='sigmoid')(out)

# Create the model
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
model.layers[2].trainable = True

## Model Compilation


In [29]:
from keras.optimizers import Adam
from keras.optimizers.schedules import ExponentialDecay
from keras.losses import CategoricalCrossentropy
from keras.metrics import CategoricalAccuracy

# Define learning rate schedule
initial_learning_rate = 5e-05
lr_schedule = ExponentialDecay(
    initial_learning_rate,
    decay_steps=10000,  # Adjust this value according to your needs
    decay_rate=0.01,    # Adjust this value according to your needs
    staircase=True)
# Define optimizer, loss, and metrics
optimizer = Adam(
    learning_rate=lr_schedule,
    epsilon=1e-08,
    clipnorm=1.0
)
loss = CategoricalCrossentropy(from_logits=True)
metric = CategoricalAccuracy(name='balanced_accuracy')

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metric
)

# Model Training


In [30]:
train_history = model.fit(
    x={'input_ids': x_train['input_ids'], 'attention_mask': x_train['attention_mask']},
    y=y_train,
    validation_data=(
        {'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']},
        y_test
    ),
    epochs=2,
    batch_size=36
)


Epoch 1/2


  output, from_logits = _get_logits(


Epoch 2/2


# Model Evaluation

Testing our model on the test data.


In [31]:
predicted_raw = model.predict({'input_ids':x_test['input_ids'],'attention_mask':x_test['attention_mask']})
predicted_raw[0]



array([0.2416537 , 0.05749706, 0.98819613, 0.09611759, 0.08824101],
      dtype=float32)

In [32]:
# Taking the index of value having maximum probability.
import numpy as np
y_predicted = np.argmax(predicted_raw, axis = 1)
y_true = df_test.category
y_true

1266    2
1749    0
2050    1
393     1
1544    1
       ..
2429    2
2462    4
2137    0
361     4
2272    3
Name: category, Length: 511, dtype: int64

In [37]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_predicted))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98        97
           1       0.99      0.93      0.96       107
           2       0.95      1.00      0.97        86
           3       0.97      0.98      0.98       108
           4       0.96      0.96      0.96       113

    accuracy                           0.97       511
   macro avg       0.97      0.97      0.97       511
weighted avg       0.97      0.97      0.97       511



## Prediction Pipeline


In [38]:
def predict(text):
    x_val = tokenizer(
        text=texts,
        add_special_tokens=True,
        max_length=70,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True
    )
    validation = model.predict({'input_ids': x_val['input_ids'], 'attention_mask': x_val['attention_mask']}) * 100

    labels = []
    scores = []

    for key, value in zip(encoded_dict.keys(), validation[0]):
        labels.append(key)
        scores.append(value)


    predicted_label = labels[scores.index(max(scores))]

    return labels, scores, predicted_label, max(scores)


In [39]:
texts="The smartphone market experienced a much-needed 6 per cent year-on-year growth in the second quarter of 2024. This growth was driven by the sale of generative AI-powered devices. According to Counterpoint Research’s smartphone 360 monthly tracker, it is the highest growth the industry has seen in the last three years.Markets like Europe and Latin America, which have witnessed double-digit growth, heavily contributed to this growth. The top five brands remain identical to Q1 2024, led by Samsung with a 20 per cent market share, followed by Apple with a 16 per cent market share, while Xiaomi, Vivo, and Oppo are in third, fourth, and fifth positions, respectively."
labels, scores, predicted_label, pred_prob = predict(texts)

for name, score in zip(labels,scores):
  print("Labels : ", name, " Scores : ", score)

print("====================================================")
print("predicted_label : ", predicted_label, " Scores : ", pred_prob)

Labels :  sport  Scores :  2.4106047
Labels :  business  Scores :  53.680714
Labels :  politics  Scores :  5.107882
Labels :  entertainment  Scores :  8.3015995
Labels :  tech  Scores :  97.56446
predicted_label :  tech  Scores :  97.56446
