In [17]:
#importing necessary libraries.

!pip install spacy
!python -m spacy download en_core_web_sm
!pip install contractions
import contractions
import spacy
import pandas as pd
import numpy as np
import pickle
import locale
import os  
import nltk
import tensorflow.keras as tf
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count
from keras.layers import TextVectorization
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
import re
from nltk.corpus import treebank
from nltk import word_tokenize,pos_tag
from keras.layers import Input, Embedding, LSTM, GRU, Dense
from keras.models import Sequential


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m72.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# The locale module in Python is used for handling cultural-specific formatting of numbers, dates, and currencies. It provides a way to adapt programs to the cultural conventions of a specific locale or region. The locale module allows you to customize the formatting of numeric and time-related information based on the user's language and region settings.

In [18]:
locale.getpreferredencoding=lambda:"UTF-8"

In [19]:
cpu_count()

4

# Loading dataset

In [20]:
data=pd.read_csv("/kaggle/input/news-classification/BBC News Train.csv")

In [21]:
data.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [22]:
data.shape

(1490, 3)

# Now we will convert the text in our data into lower form for easy understanding.

In [23]:
def normalize_text(text):
    return text.lower()

In [24]:
with ThreadPoolExecutor(max_workers=4) as pool:
    data["Text"]=list(pool.map(normalize_text,list(data["Text"])))

In [25]:
data["Text"].head()

0    worldcom ex-boss launches defence lawyers defe...
1    german business confidence slides german busin...
2    bbc poll indicates economic gloom citizens in ...
3    lifestyle  governs mobile choice  faster  bett...
4    enron bosses in $168m payout eighteen former e...
Name: Text, dtype: object

# Now we will expand the contractions of words available in data.
# example. 
don't (do not) ,
can't (cannot)

In [26]:
def expand_contractions(text):
    return contractions.fix(text)

In [27]:
with ThreadPoolExecutor(max_workers=4) as pool:

     data["Text"] = list(pool.map(expand_contractions,list(data["Text"])))



In [28]:
data["Text"].head()

0    worldcom ex-boss launches defence lawyers defe...
1    german business confidence slides german busin...
2    bbc poll indicates economic gloom citizens in ...
3    lifestyle  governs mobile choice  faster  bett...
4    enron bosses in $168m payout eighteen former e...
Name: Text, dtype: object

# Tokenization:In this we will break down a text into smaller units, known as tokens.
# Example: "Tokenization is crucial in NLP," 
result in the tokens: ["Tokenization", "is", "crucial", "in", "NLP"].

In [29]:
def tokenize_text(text):

    return word_tokenize(text)

In [30]:
with ThreadPoolExecutor(max_workers=4) as pool:

    data["Text"] = list(pool.map(tokenize_text,list(data["Text"])))

In [31]:
data["Text"].head()

0    [worldcom, ex-boss, launches, defence, lawyers...
1    [german, business, confidence, slides, german,...
2    [bbc, poll, indicates, economic, gloom, citize...
3    [lifestyle, governs, mobile, choice, faster, b...
4    [enron, bosses, in, $, 168m, payout, eighteen,...
Name: Text, dtype: object

In [32]:
en_stop_words = list(set(stopwords.words('english')).union(set(STOP_WORDS)))

# Now will remove words that are often used in text for efficiency.

In [33]:
def is_stopword(token):

    return not(token in en_stop_words)



In [34]:
def remove_stopwords(text):

    return list(filter(is_stopword,text))

In [35]:
with ThreadPoolExecutor(max_workers=4) as pool:

    data["Text"] = list(pool.map(remove_stopwords,list(data["Text"])))

In [36]:
data["Text"].head()

0    [worldcom, ex-boss, launches, defence, lawyers...
1    [german, business, confidence, slides, german,...
2    [bbc, poll, indicates, economic, gloom, citize...
3    [lifestyle, governs, mobile, choice, faster, b...
4    [enron, bosses, $, 168m, payout, eighteen, enr...
Name: Text, dtype: object

In [37]:
# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

# Lemmatization: reducing words to their base or dictionary form,lemmatization considers the context of the word and applies more sophisticated morphological analysis. 
# Example: The lemma of words like "am," "is," and "are" is "be." Lemmatization aims to reduce words to their canonical form.

In [38]:
# Lemmatization using SpaCy
def lemmatize_text(text):
    doc = nlp(" ".join(text))
    return [token.lemma_ for token in doc]

with ThreadPoolExecutor(max_workers=cpu_count()) as pool:
    data["Text"] = list(pool.map(lemmatize_text, list(data["Text"])))


# Convert the list of tokens back to a single string

In [39]:
data["Text"] = data["Text"].apply(lambda x: ' '.join(x))

In [40]:
data["Text"]

0       worldcom ex - boss launch defence lawyer defen...
1       german business confidence slide german busine...
2       bbc poll indicate economic gloom citizen major...
3       lifestyle govern mobile choice fast well funky...
4       enron boss $ 168 m payout eighteen enron direc...
                              ...                        
1485    double eviction big brother model caprice holb...
1486    dj double act revamp chart dj duo jk joel take...
1487    weak dollar hit reuters revenue media group re...
1488    apple ipod family expand market apple expand i...
1489    santy worm make unwelcome visit thousand websi...
Name: Text, Length: 1490, dtype: object

In [41]:
data.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex - boss launch defence lawyer defen...,business
1,154,german business confidence slide german busine...,business
2,1101,bbc poll indicate economic gloom citizen major...,business
3,1976,lifestyle govern mobile choice fast well funky...,tech
4,917,enron boss $ 168 m payout eighteen enron direc...,business


In [42]:
data.shape

(1490, 3)

# Feature extraction using TextVectorization layer.

In [43]:
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
#text_ds = tf.data.Dataset.from_tensor_slices(data["Text"].values).batch(128)
vectorizer.adapt(data["Text"])

In [44]:
# Vectorize the text
X = vectorizer(np.array([[s] for s in data["Text"]])).numpy()
y = pd.get_dummies(data["Category"]).values

# Now we will Split the dataset into training and testing part.

In [45]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Build a simple LSTM model.

In [57]:
''''model = Sequential([
    Input(shape=(200,)),
    Embedding(input_dim=20000, output_dim=128),
    LSTM(128, return_sequences=True),
    LSTM(64),
    Dense(32, activation='relu'),
    Dense(y.shape[1], activation='softmax')
])'''''
from tensorflow.keras.layers import BatchNormalization,Dropout


# Define the model with Batch Normalization
model = Sequential([
    Input(shape=(200,)),
    Embedding(input_dim=20000, output_dim=128),
    LSTM(128, return_sequences=True),
    Dropout(0.5),
    BatchNormalization(),
    LSTM(64),
    Dropout(0.5),
    BatchNormalization(),
    Dense(32, activation='relu'),
    Dense(y_train.shape[1], activation='softmax')
])

# Compile the model

In [58]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])



# Train the model

In [59]:
history = model.fit(X_train, y_train, epochs=25, batch_size=64, validation_data=(X_test, y_test))


Epoch 1/25
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 503ms/step - accuracy: 0.2190 - loss: 2.0230 - val_accuracy: 0.2550 - val_loss: 1.5995
Epoch 2/25
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 473ms/step - accuracy: 0.3245 - loss: 1.5833 - val_accuracy: 0.2617 - val_loss: 1.5909
Epoch 3/25
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 487ms/step - accuracy: 0.4919 - loss: 1.2478 - val_accuracy: 0.3490 - val_loss: 1.5670
Epoch 4/25
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 463ms/step - accuracy: 0.5693 - loss: 1.0059 - val_accuracy: 0.4228 - val_loss: 1.4826
Epoch 5/25
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 471ms/step - accuracy: 0.6096 - loss: 0.8925 - val_accuracy: 0.4094 - val_loss: 1.4712
Epoch 6/25
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 502ms/step - accuracy: 0.6588 - loss: 0.8420 - val_accuracy: 0.5336 - val_loss: 1.1777
Epoch 7/25
[1m19/19[0m 

# Evaluate the accuracy and Loss

In [60]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy}')

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 120ms/step - accuracy: 0.9043 - loss: 0.5227
Loss: 0.4855267405509949, Accuracy: 0.9093959927558899
