## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import Adadelta
from tensorflow.keras.regularizers import l2

##  Load and Preprocess the Data

In [2]:
# Read the labelled articles (point this at your own copy of the CSV)
df = pd.read_csv("bbc_text_cls.csv")

# Map the string values in `labels` to integer class ids stored in `category`;
# the fitted encoder is kept so predictions can be mapped back to names later.
label_encoder = LabelEncoder()
df['category'] = label_encoder.fit_transform(df['labels'])
num_classes = df['category'].nunique()

# Two-stage split with a fixed seed:
#   1) hold out 15% of all rows as the test set
#   2) hold out 20% of the remaining rows as the validation set
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    df['text'],
    df['category'],
    test_size=0.15,
    random_state=42,
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_val_texts,
    train_val_labels,
    test_size=0.20,
    random_state=42,
)

## Text Tokenization and Padding
Convert the text data into sequences of word indices and apply padding to make sequences uniform in length.

In [3]:
# Tokenization / padding settings
vocab_size = 10000  # num_words limit passed to the tokenizer
max_length = 200    # every sequence is padded or cut to this many tokens

# The vocabulary is fitted on the training texts only; words it does not keep
# are mapped to the '<OOV>' token when the splits are encoded.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

# Texts -> lists of word indices, one list per document
train_sequences, val_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_texts, val_texts, test_texts)
)

# Bring every sequence to exactly `max_length` entries: short ones get padding
# appended, long ones lose their tail ('post' for both padding and truncating).
_pad_options = dict(maxlen=max_length, padding='post', truncating='post')
train_padded, val_padded, test_padded = (
    pad_sequences(sequences, **_pad_options)
    for sequences in (train_sequences, val_sequences, test_sequences)
)

## Build the CNN Model
Define a CNN architecture with an embedding layer, a convolutional layer followed by global max pooling, and dense layers for classification.

* Transfer function: rectified linear
  * ReLU: Often used in convolutional and dense layers to introduce non-linearity without suffering from the vanishing gradient problem.
  * Softmax: Used in the final layer of a multi-class classification model to output class probabilities.
* Kernel sizes: 2, 4, 5.
* Number of filters: 100.
* Dropout rate: 0.5.
* Weight regularization (L2): 3.
* Batch Size: 50.
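* Update Rule: Adadelta. (optimizer=Adadelta())

**Note:** the code below departs from this list. It uses a single `Conv1D` layer with 128 filters and kernel size 5, no L2 weight regularization, the Adam optimizer, and a batch size of 32; only the dropout rate (0.5) is the same.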

In [4]:
# Model hyperparameters
embedding_dim = 100   # length of each word vector in the Embedding layer
filter_size = 128     # number of filters in the Conv1D layer
kernel_size = 5       # width of the convolution window
dropout_rate = 0.5    # rate used by both Dropout layers
# L2 weight regularization was tried and left disabled; the notes recorded
# with each attempt were "NO GOOD, BAD" (0.01) and "NO GOOD, WORSE" (3).
#weight_regularization = 0.01
#weight_regularization = 3

# Layer stack: token ids -> embeddings -> 1D convolution -> global max pool
# -> dropout -> dense hidden layer -> dropout -> softmax over the classes.
cnn_layers = [
    InputLayer(shape=(max_length,)),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    Conv1D(filters=filter_size, kernel_size=kernel_size, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(dropout_rate),
    Dense(64, activation='relu'),
    Dropout(dropout_rate),
    Dense(num_classes, activation='softmax'),
]
model = Sequential(cnn_layers)

# Optimizer experiments and the accuracy noted for each:
#   'adam'                                 -> 93.71%  (the one compiled below)
#   Adadelta(learning_rate=1.0, rho=0.95)  -> 91.32%
#   Adadelta()  (default parameters)       -> 16.17%
#model.compile(optimizer=Adadelta(learning_rate=1.0, rho=0.95), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
#model.compile(optimizer=Adadelta(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

## Train the Model
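Train the model on the training set, monitoring loss and accuracy on the validation set after each epoch. The test set is only used in the evaluation step further below.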

In [5]:
# Training configuration. Accuracy figures noted from earlier runs:
#   batch_size = 32 -> 94.01%, 95.21%   (active setting)
#   batch_size = 50 -> 94.61%, 94.01%
#batch_size = 50
epochs = 10
batch_size = 32

# Fit on the padded training sequences and score the validation split after
# every epoch; verbose=2 prints a single summary line per epoch.
history = model.fit(
    x=train_padded,
    y=train_labels,
    validation_data=(val_padded, val_labels),
    batch_size=batch_size,
    epochs=epochs,
    verbose=2,
)

Epoch 1/10
48/48 - 5s - 101ms/step - accuracy: 0.2235 - loss: 1.6112 - val_accuracy: 0.3958 - val_loss: 1.5847
Epoch 2/10
48/48 - 4s - 92ms/step - accuracy: 0.3294 - loss: 1.5438 - val_accuracy: 0.5119 - val_loss: 1.4687
Epoch 3/10
48/48 - 5s - 105ms/step - accuracy: 0.4749 - loss: 1.3814 - val_accuracy: 0.6992 - val_loss: 1.1350
Epoch 4/10
48/48 - 7s - 139ms/step - accuracy: 0.6713 - loss: 0.9764 - val_accuracy: 0.8311 - val_loss: 0.6439
Epoch 5/10
48/48 - 4s - 74ms/step - accuracy: 0.8241 - loss: 0.5923 - val_accuracy: 0.9077 - val_loss: 0.3309
Epoch 6/10
48/48 - 3s - 59ms/step - accuracy: 0.9213 - loss: 0.3230 - val_accuracy: 0.9393 - val_loss: 0.2146
Epoch 7/10
48/48 - 3s - 58ms/step - accuracy: 0.9563 - loss: 0.1925 - val_accuracy: 0.9551 - val_loss: 0.1556
Epoch 8/10
48/48 - 5s - 109ms/step - accuracy: 0.9669 - loss: 0.1285 - val_accuracy: 0.9525 - val_loss: 0.1368
Epoch 9/10
48/48 - 3s - 58ms/step - accuracy: 0.9835 - loss: 0.0829 - val_accuracy: 0.9578 - val_loss: 0.1201
Epoch 

## Evaluate the Model
Assess the model’s performance on the test data.

In [6]:
# Score the held-out test split silently; with the single 'accuracy' metric
# compiled above, evaluate() yields the loss followed by the accuracy.
loss, accuracy = model.evaluate(
    x=test_padded,
    y=test_labels,
    verbose=0,
)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 93.71%


## Save the Model

In [7]:
import pickle

# Save the trained CNN model. The ".keras" extension writes the native Keras
# archive format, not HDF5 (HDF5 is the legacy format tied to ".h5" files).
model.save("cnn_model.keras")

# The tokenizer and the label encoder are both required at inference time
# (text -> word indices, class id -> label name), so each is pickled to its
# own file next to the model.
for artifact_path, artifact in (
    ("cnn_tokenizer.pkl", tokenizer),
    ("cnn_label_encoder.pkl", label_encoder),
):
    with open(artifact_path, "wb") as handle:
        pickle.dump(artifact, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Load the Model

In [8]:
from tensorflow.keras.models import load_model

# Restore the three artifacts written by the save step under `prod_*` names,
# so the prediction cells do not depend on the in-memory training objects.
prod_model = load_model("cnn_model.keras")

# NOTE: pickle.load can execute arbitrary code from the file it reads, so
# only unpickle files that were produced by this notebook / a trusted source.
with open("cnn_tokenizer.pkl", "rb") as tokenizer_file:
    prod_tokenizer = pickle.load(tokenizer_file)

with open("cnn_label_encoder.pkl", "rb") as encoder_file:
    prod_label_encoder = pickle.load(encoder_file)

## Making Predictions
Use the trained model to make predictions on new articles.

In [9]:
# Sample news article used to try out the reloaded model.
# Source: https://www.bbc.com/news/articles/c8jy2dpv722o
article = '''
Spain fines budget airlines including Ryanair €179m

Spain has fined five budget airlines a total of €179m (£149m) for "abusive practices" including charging for hand luggage.

Ryanair has been given the largest fine of €108m (£90m), followed by EasyJet's penalty of €29m (£24m).

Vueling, Norwegian and Volotea were issued with sanctions by Spain's Consumer Rights Ministry on Friday.

The ministry said it plans to ban practices such as charging extra for carry-on hand luggage and reserving seats for children.

The fines are the biggest sanction issued by the ministry, and follow an investigation into the budget airline industry.

The ministry said it had upheld fines that were first announced in May after dismissing appeals lodged by the companies.

Vueling, the budget arm of British Airways owner IAG, has been fined €39m (£32m), while Norwegian Airlines and Volotea have been fined €1.6m (£1.3m) and €1.2m (£1m) respectively.

The fines were issued because the airlines were found to have provided misleading information and were not transparent with prices, "which hinders consumers' ability to compare offers" and make informed decisions, the ministry said.

Ryanair was accused of violating a range of consumer rights, including charging for larger carry-on luggage, seat selection, and asking for "a disproportionate amount" to print boarding passes at terminals.

Each fine was calculated based on the "illicit profit" obtained by each airline from these practices.

Ryanair boss Michael O'Leary said the fines were "illegal" and "baseless", adding that he will appeal the case and take it to the EU courts.

"Ryanair has for many years used bag fees and airport check-in fees to change passenger behaviour and we pass on these cost savings in the form of lower fares to consumers," he said.

Easyjet and Norwegian said they would also appeal the decision.

The Spanish airline industry watchdog, ALA, plans a further appeal and has called the ministry's decision "nonsense", arguing the fine infringes EU free market rules.

But Andrés Barragán, secretary general for consumer affairs and gambling at the ministry, defended the fines, saying the government's decision was based on Spanish and EU law.

"It is an abuse to charge €20 for just printing the boarding card in the airport, [it's] something no one wants," he told the BBC's World Business Report programme.

"This is a problem consumers are facing not only in Spain but in other EU countries."

Consumer rights association Facua, which has campaigned against the fees for six years, said the decision was "historic".
'''

In [10]:
# Encode the article with the reloaded tokenizer, then pad/truncate it exactly
# as the training data was prepared. truncating='post' must be given
# explicitly: pad_sequences defaults to truncating='pre', which would keep the
# LAST max_length tokens of a long article, whereas the model was trained on
# the FIRST max_length tokens of each text.
prod_sequence = prod_tokenizer.texts_to_sequences([article])
prod_padded_sequence = pad_sequences(
    prod_sequence,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

# Predict the category: take the highest-probability class for the single
# input row and map its integer id back to the original label name.
prod_prediction = prod_model.predict(prod_padded_sequence)
category_index = prod_prediction.argmax(axis=1)[0]
category_name = prod_label_encoder.inverse_transform([category_index])[0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 482ms/step


In [11]:
# Display the label predicted for the article
category_name

'business'