In [None]:
import pandas as pd
# Read the bbc-text.csv dataset into a DataFrame.
csv_path = "../input/bbc-fulltext-and-category/bbc-text.csv"
df = pd.read_csv(csv_path)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Fit the label encoder on the raw category column, then overwrite that
# column with the encoded values.
# NOTE: because the column is overwritten, re-running this cell alone would
# re-fit the encoder on the already-encoded values.
encoder = LabelEncoder().fit(df["category"])
df["category"] = encoder.transform(df["category"])

In [None]:
# Features are the article text, targets are the encoded category column.
X, y = df["text"], df["category"]

In [None]:
# Display the distinct label values present in the target column.
y.unique()

In [None]:
# Preview the first few raw texts.
X.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=60,
)

In [None]:
import re

def normalize(data):
    """Lower-case and clean an iterable of raw text strings.

    For every string, in this order: lower-case it, remove URLs, delete
    apostrophes that sit between two letters ("don't" -> "dont"), turn every
    remaining non-word character into a space, collapse runs of spaces and
    trim spaces from both ends.

    Args:
        data: iterable of str.

    Returns:
        list of str: one cleaned string per input item, in input order.
    """
    # Raw strings so that regex escapes such as \S and \W are not parsed as
    # (invalid) Python string escapes. Compiled once, outside the loop.
    url_re = re.compile(r'https?://\S+|www\.\S+')
    inner_apostrophe_re = re.compile(r"(?<=[a-z])'(?=[a-z])")
    non_word_re = re.compile(r'\W')
    spaces_re = re.compile(r' +')

    normalized = []

    for text in data:
        text = text.lower()
        # get rid of urls
        text = url_re.sub('', text)
        # Join contractions. This has to happen before the \W pass below,
        # which would otherwise replace the apostrophe with a space and
        # split the word in two.
        text = inner_apostrophe_re.sub('', text)
        # get rid of non words; \W also matches newlines and tabs, so no
        # separate newline handling is needed
        text = non_word_re.sub(' ', text)
        # collapse extra spaces and trim the ends
        text = spaces_re.sub(' ', text).strip(' ')
        normalized.append(text)

    return normalized

In [None]:
# Clean both splits with the same normalizer; each becomes a plain list of strings.
X_train, X_test = normalize(X_train), normalize(X_test)

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
# Vocabulary size cap handed to the Tokenizer as num_words.
max_vocab = 10_000

In [None]:
# Word-level tokenizer whose vocabulary is capped by max_vocab.
tokenizer = Tokenizer(num_words=max_vocab)

In [None]:
# Fit the tokenizer on the training texts only; the test texts are not used here.
tokenizer.fit_on_texts(X_train)

In [None]:
# Convert both splits to sequences with the fitted tokenizer.
# NOTE: X_train / X_test are rebound here, so this cell must not be re-run
# without first re-running the normalization cell.
X_train, X_test = (
    tokenizer.texts_to_sequences(X_train),
    tokenizer.texts_to_sequences(X_test),
)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Bring every sequence to the same length, padding after the tokens.
# NOTE(review): sequences longer than max_len are cut using pad_sequences'
# default truncating mode - confirm that it keeps the intended end of the text.
max_len = 256
X_train = pad_sequences(X_train, maxlen=max_len, padding="post")
X_test = pad_sequences(X_test, maxlen=max_len, padding="post")

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Bidirectional, Embedding, Dense, Dropout, SpatialDropout1D

In [None]:
# Text classifier: embedding -> two stacked bidirectional LSTMs -> dense head.

# One output unit per class seen by the label encoder, rather than a
# hard-coded count.
num_classes = len(encoder.classes_)

model = Sequential()

# The embedding table is sized from max_vocab, the same cap given to the
# Tokenizer, so it has no rows that can never be looked up.
model.add(Embedding(max_vocab, 300))
model.add(SpatialDropout1D(0.2))
# The first recurrent layer returns the full sequence so that a second
# recurrent layer can be stacked on top of it.
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation="softmax"))

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
from keras.optimizers import SGD
# NOTE: `opt` is NOT passed to compile() below - training uses Adam. It is
# kept so that any later cell referring to `opt` still works.
# `learning_rate` replaces the deprecated `lr` keyword.
opt = SGD(learning_rate=0.01)

# Sparse categorical cross-entropy takes the integer labels directly, so no
# one-hot encoding is needed. `metrics` is passed as a list.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Stop when val_loss has failed to improve by at least 1e-4 for 3 epochs in a
# row. restore_best_weights rolls the model back to the best epoch when early
# stopping triggers, instead of keeping the later, worse weights.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=3,
    min_delta=0.0001,
    restore_best_weights=True,
)

# Keep the returned History object so the training curves can be inspected
# after the run.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=3,
    validation_split=0.2,
    shuffle=True,
    callbacks=[early_stopping],
)