In [6]:
import pandas as pd
import numpy as np
import re

In [7]:
# Load the raw BBC news dataset; the file is parsed as tab-separated.
data = pd.read_csv(filepath_or_buffer='bbc-news-data.csv', sep='\t')

In [8]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [9]:
# Standardise the column names used by the rest of the notebook and discard
# the source file name. Written without in-place mutation so the cell can be
# re-run safely: rename() ignores keys that are no longer present, and
# errors='ignore' lets drop() skip 'filename' once it has been removed.
data = (
    data
    .rename(columns={'category': 'Label', 'title': 'Headline', 'content': 'Description'})
    .drop(columns=['filename'], errors='ignore')
)
data.head()

Unnamed: 0,Label,Headline,Description
0,business,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [10]:
import nltk

# Fetch the NLTK resources needed later: the 'punkt' tokenizer models and
# the 'stopwords' corpus.
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily populated cache of the English stop-word set (see _get_stop_words).
_STOP_WORDS = None


def _get_stop_words():
    """Return the English stop-word set, reading it from NLTK only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def clean_description(description):
    """Normalise a piece of text for downstream tokenisation.

    The text is lower-cased, stripped of ASCII punctuation and surrounding
    whitespace, tokenised with NLTK's ``word_tokenize``, filtered against the
    English stop-word list and re-joined with single spaces.

    Punctuation is removed *before* tokenisation, so apostrophes vanish and
    e.g. "BA's" becomes "bas".

    Args:
        description: The text to clean. ``None`` and NaN (as produced by
            pandas for empty cells) are treated as missing; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned text as a single space-separated string; ``''`` for
        missing input.
    """
    # NaN is the only float that is not equal to itself.
    if description is None or (isinstance(description, float) and description != description):
        return ''

    text = str(description).lower()
    text = text.translate(_PUNCTUATION_TABLE).strip()

    stop_words = _get_stop_words()
    return ' '.join(token for token in word_tokenize(text) if token not in stop_words)
# Run both free-text columns through the same cleaning routine.
for text_column in ('Description', 'Headline'):
    data[text_column] = data[text_column].apply(clean_description)


In [13]:
# Inspect the first rows after text cleaning.
data.head()

Unnamed: 0,Label,Headline,Description
0,business,ad sales boost time warner profit,quarterly profits us media giant timewarner ju...
1,business,dollar gains greenspan speech,dollar hit highest level euro almost three mon...
2,business,yukos unit buyer faces loan claim,owners embattled russian oil giant yukos ask b...
3,business,high fuel prices hit bas profits,british airways blamed high fuel prices 40 dro...
4,business,pernod takeover talk lifts domecq,shares uk drinks food firm allied domecq risen...


In [15]:
# Remove every article labelled 'entertainment'; the remaining rows keep
# their original index values.
is_entertainment = data['Label'].eq('entertainment')
rows_to_drop = data.loc[is_entertainment].index
data = data.drop(index=rows_to_drop)

In [16]:
# Persist the processed frame; the index is not written as a column.
data.to_csv(path_or_buf='./data.csv', index=False)


In [17]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [18]:
# 70 % of the rows go to training; the remaining 30 % is halved into
# validation and test sets. Fixed seeds keep the split repeatable.
train_data, holdout_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(holdout_data, test_size=0.5, random_state=42)

In [19]:
# Model input is the headline followed by the description; build it once per
# split instead of repeating the concatenation.
train_texts = train_data['Headline'] + ' ' + train_data['Description']
val_texts = val_data['Headline'] + ' ' + val_data['Description']
test_texts = test_data['Headline'] + ' ' + test_data['Description']

# The vocabulary is fitted on the training split only, so validation/test
# text does not influence it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)

max_length = 1000  # arbitrarily chosen fixed sequence length

# Convert text to integer sequences and pad them at the end up to max_length.
X_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=max_length, padding='post')
X_val = pad_sequences(tokenizer.texts_to_sequences(val_texts), maxlen=max_length, padding='post')
X_test = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=max_length, padding='post')

# Map each class name to an integer id. The mapping is built from the sorted
# labels of the whole frame so that ids do not depend on row order in the
# training split and a class missing from it cannot raise KeyError below.
label_dict = {label: idx for idx, label in enumerate(sorted(data['Label'].unique()))}
y_train = np.array([label_dict[label] for label in train_data['Label']])
y_val = np.array([label_dict[label] for label in val_data['Label']])
y_test = np.array([label_dict[label] for label in test_data['Label']])


In [20]:
# RNN
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=max_length))
model.add(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=len(label_dict), activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=100)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7ceca684a140>

In [21]:
# Import from tensorflow.keras, like the layers and model classes used below,
# so that all Keras objects come from the same package.
from tensorflow.keras.optimizers import SGD

# LSTM classifier trained with SGD (learning rate 0.01, momentum 0.9).
# mask_zero=True makes the embedding treat index 0 (the value pad_sequences
# fills with) as padding, so the LSTM skips the padded tail of each
# post-padded sequence instead of running its state through those steps.
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100,
              input_length=max_length, mask_zero=True),
    LSTM(units=64, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=len(label_dict), activation='softmax'),
])

# Targets are integer class ids, hence the sparse categorical loss.
optimizer = SGD(learning_rate=0.01, momentum=0.9)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7cec9d67f8e0>

In [19]:
# Import from tensorflow.keras, like the layers and model classes used below,
# so that all Keras objects come from the same package.
from tensorflow.keras.optimizers import SGD

# LSTM classifier trained with SGD (learning rate 0.001, momentum 0.9).
# mask_zero=True makes the embedding treat index 0 (the value pad_sequences
# fills with) as padding, so the LSTM skips the padded tail of each
# post-padded sequence instead of running its state through those steps.
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100,
              input_length=max_length, mask_zero=True),
    LSTM(units=64, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=len(label_dict), activation='softmax'),
])

# Targets are integer class ids, hence the sparse categorical loss.
optimizer = SGD(learning_rate=0.001, momentum=0.9)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78e2c0251b10>

In [22]:
# Import from tensorflow.keras, like the layers and model classes used below,
# so that all Keras objects come from the same package.
from tensorflow.keras.optimizers import SGD

# Wider LSTM (128 units, input dropout 0.25) trained with SGD
# (learning rate 0.001, momentum 0.95).
# mask_zero=True makes the embedding treat index 0 (the value pad_sequences
# fills with) as padding, so the LSTM skips the padded tail of each
# post-padded sequence instead of running its state through those steps.
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100,
              input_length=max_length, mask_zero=True),
    LSTM(units=128, dropout=0.25, recurrent_dropout=0.2),
    Dense(units=len(label_dict), activation='softmax'),
])

# Targets are integer class ids, hence the sparse categorical loss.
optimizer = SGD(learning_rate=0.001, momentum=0.95)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78e294f110f0>