In [None]:
# Download the BBC news CSV to /tmp. `unzip` only extracts local archives (it
# answered this command with its usage text) and the target is a plain CSV,
# so the download has to go through `wget`. TLS verification is left enabled.
!wget \
    https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv \
    -O /tmp/bbc-text.csv

UnZip 6.00 of 20 April 2009, by Debian. Original by Info-ZIP.

Usage: unzip [-Z] [-opts[modifiers]] file[.zip] [list] [-x xlist] [-d exdir]
  Default action is to extract files in list, except those in xlist, to exdir;
  file[.zip] may be a wildcard.  -Z => ZipInfo mode ("unzip -Z" for usage).

  -p  extract files to pipe, no messages     -l  list files (short format)
  -f  freshen existing files, create none    -t  test compressed archive data
  -u  update files, create if necessary      -z  display archive comment only
  -v  list verbosely/show version info       -T  timestamp archive to latest
  -x  exclude files that follow (in xlist)   -d  extract files into exdir
modifiers:
  -n  never overwrite existing files         -q  quiet mode (-qq => quieter)
  -o  overwrite files WITHOUT prompting      -a  auto-convert any text files
  -j  junk paths (do not make directories)   -aa treat ALL files as text
  -U  use escapes for all non-ASCII Unicode  -UU ignore any Unicode fields
  -C  mat

In [None]:
# Download the BBC news CSV to /tmp (overwrites any existing copy).
# Certificate checking stays on: the host serves a publicly trusted
# certificate, and skipping verification would accept a tampered dataset.
!wget \
    https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv \
    -O /tmp/bbc-text.csv

--2022-12-12 07:41:05--  https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.31.128, 74.125.134.128, 74.125.139.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.31.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5057493 (4.8M) [text/csv]
Saving to: ‘/tmp/bbc-text.csv’


2022-12-12 07:41:05 (134 MB/s) - ‘/tmp/bbc-text.csv’ saved [5057493/5057493]



In [None]:
import csv
import numpy as np
import pandas as pd

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional
from tensorflow.keras.optimizers import Adam

In [None]:
# NLTK resources used later in the notebook: tokenizer models for
# word_tokenize, WordNet data for WordNetLemmatizer, and the stopword lists.
# 'punkt_tab' is the tokenizer resource that newer NLTK releases look up;
# nltk.download() reports failures by returning False rather than raising,
# so requesting it does not break environments that do not need it.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords', 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Gather NLTK's English stopwords into a set and display them.
english_stopwords = stopwords.words('english')
stopword_list = set(english_stopwords)
print(stopword_list)

{'hasn', 'each', "wasn't", 'shouldn', 'but', 'was', 'weren', 'those', 'the', 'himself', "isn't", 'after', "don't", 'why', 'just', 'than', 'same', 'them', 'not', 'as', 'on', 'd', 'it', "you're", 'from', 's', 'we', 'out', 'y', 'his', 'ain', 'does', 'a', 'theirs', 'wouldn', 'who', 'to', 'yours', "should've", 'mustn', 'what', 'its', 'is', 'you', 'doing', 'll', 'he', 'below', 't', 'needn', 'if', 'now', 'down', 'during', 'wasn', 'because', "couldn't", 'haven', 'these', 'off', 'have', 'ma', 'between', "you'd", 'him', 'before', 'being', 'i', 'should', 'this', 'up', 'until', 'any', 'other', "weren't", 'there', 'their', 'when', "aren't", 'by', "shouldn't", 'didn', "you'll", 'into', 'will', 'through', 'while', 're', 'here', 'that', 'own', 'myself', 'couldn', 'mightn', 'further', "won't", 'which', 'or', 'of', 'few', 'no', "hasn't", 'for', 'hadn', 'most', 'shan', 'all', 'with', "doesn't", 'whom', 'in', "didn't", 'don', 'having', 'nor', 'over', 'above', 'such', 'only', 'once', 'me', 'an', 'under', "

In [None]:
# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(raw_text, stop_terms, lemmatizer):
    """Normalize one article body into a space-joined string of lemmas.

    Steps, in order: lowercase, replace digit runs with a space, delete ASCII
    punctuation, tokenize, drop stopwords and stray single ASCII letters,
    lemmatize the remaining tokens.

    Args:
        raw_text: Article text as read from the CSV.
        stop_terms: Collection of lowercase tokens to discard.
        lemmatizer: Object exposing ``lemmatize(token)``.

    Returns:
        The cleaned text, tokens separated by single spaces.
    """
    lowered = raw_text.lower()
    # remove numbers
    no_numbers = re.sub(pattern=r'\d+', repl=" ", string=lowered)
    # remove punctuation
    no_punct = no_numbers.translate(_PUNCT_TABLE)
    kept = []
    for token in word_tokenize(no_punct):
        # Filter per token so that *every* stopword is removed. Rebuilding the
        # string with str.replace inside a loop over the stopwords only kept
        # the result of the final iteration.
        if token in stop_terms:
            continue
        # Single letters are leftovers of stripped punctuation/numbers.
        if len(token) == 1 and token in string.ascii_lowercase:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)


news_text = []
news_topics = []

# Punctuation is stripped from the text before stopwords are filtered, so
# entries such as "don't" must also be matched in their stripped form ("dont").
stop_terms = frozenset(stopword_list) | frozenset(
    term.translate(_PUNCT_TABLE) for term in stopword_list
)
# One lemmatizer instance is enough for the whole corpus.
lemmatizer = WordNetLemmatizer()

# newline='' is what the csv module expects so quoted fields containing line
# breaks are parsed correctly; the encoding is pinned instead of relying on
# the platform default.
with open("/tmp/bbc-text.csv", 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)  # discard the first row (presumably the column headers)
    for row in reader:
        if not row:
            # csv.reader yields an empty list for blank lines
            continue
        news_topics.append(row[0])
        news_text.append(clean_text(row[1], stop_terms, lemmatizer))

In [None]:
# Pair each cleaned article with its topic label in a two-column frame.
df = pd.DataFrame({'Text': news_text, 'Topic': news_topics})
df.head()

Unnamed: 0,Text,Topic
0,tv future in the hand of viewer with home thea...,tech
1,worldcom bos left book alone former worldcom b...,business
2,tiger wary of farrell gamble leicester say the...,sport
3,yeading face newcastle in fa cup premiership s...,sport
4,ocean twelve raid box office ocean twelve the ...,entertainment


In [None]:
# Number of articles per topic.
topic_counts = df['Topic'].value_counts()
topic_counts

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: Topic, dtype: int64

In [None]:
# Hold out 20% of the articles; stratifying on the topic keeps the class
# proportions the same in both splits, and the fixed seed makes it repeatable.
news_train, news_test, topic_train, topic_test = train_test_split(
    df['Text'],
    df['Topic'],
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=df['Topic'],
)

In [None]:
# Word-level tokenizer fitted on the training split only. num_words caps how
# many of the most frequent words are used when encoding; every other word is
# mapped to the '<OOV>' token.
text_tokenizer = Tokenizer(num_words = 5000, oov_token='<OOV>')
text_tokenizer.fit_on_texts(news_train)
# word_index lists every word seen while fitting, so printing it in full
# floods the cell output. Show its size and the first entries instead.
print("vocabulary size:", len(text_tokenizer.word_index))
print(list(text_tokenizer.word_index.items())[:20])



In [None]:
# Encode both splits with the tokenizer fitted on the training text, then
# pad or cut every sequence at its end so all rows have length 200.
pad_options = dict(maxlen=200, padding='post', truncating='post')

train_sequences = text_tokenizer.texts_to_sequences(news_train)
train_padded = pad_sequences(train_sequences, **pad_options)

validation_sequences = text_tokenizer.texts_to_sequences(news_test)
validation_padded = pad_sequences(validation_sequences, **pad_options)

In [None]:
# Turn topic names into integer ids with a second tokenizer fitted on the
# full label list, then encode the labels of each split as a NumPy array.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(news_topics)
training_label_seq, validation_label_seq = (
    np.array(label_tokenizer.texts_to_sequences(labels))
    for labels in (topic_train, topic_test)
)

In [None]:
# Show the topic -> integer id mapping learned by label_tokenizer.
print(label_tokenizer.word_index)

{'sport': 1, 'business': 2, 'politics': 3, 'tech': 4, 'entertainment': 5}


In [None]:
from tensorflow.keras.optimizers.schedules import InverseTimeDecay

# Bidirectional-LSTM text classifier over the padded word-id sequences.
# - Embedding input size 5000 matches the text tokenizer's num_words.
# - The output layer has 6 units although there are 5 topics: the label ids
#   produced by label_tokenizer start at 1, so index 0 exists but is unused.
model = Sequential([Embedding(5000,16),
                    Dropout(0.5),  
                    Bidirectional(LSTM(64)),
                    Dense(128, activation='relu'),
                    Dropout(0.5), 
                    Dense(64, activation='relu'),
                    Dense(6, activation='softmax')])
# The optimizer's `decay` argument is rejected by current Keras optimizers.
# An inverse-time schedule with decay_steps=1 gives the same per-step rule:
# lr = 0.001 / (1 + 1e-6 * step).
lr_schedule = InverseTimeDecay(initial_learning_rate=0.001, decay_steps=1, decay_rate=1e-6)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=lr_schedule), metrics=['accuracy'])
# Keep the History object so the per-epoch metrics can be inspected afterwards.
history = model.fit(train_padded, training_label_seq, epochs=7, validation_data=(validation_padded, validation_label_seq), verbose=1)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7f3cbb6d90d0>