In [4]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

# load stopwords from the nltk library
STOPWORDS = set(stopwords.words('english'))

# load data
df = pd.read_csv('train.csv')

articles = df['text']
labels = df['label']

# remove stopwords using above loaded stopwords
for article in articles :
    for word in STOPWORDS :
        token = ' ' + word + ' '
        article = article.replace(token, ' ')
        article = article.replace(' ', ' ')

# split into train and test sets
train_df, test_df = df[:int(0.9*len(df))], df[int(0.9*len(df)):]

# tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['text'])
word_index = tokenizer.word_index

# pad sequences
train_sequences = tokenizer.texts_to_sequences(train_df['text'])
train_padded = pad_sequences(train_sequences, maxlen=840, padding='post', truncating='post')
test_sequences = tokenizer.texts_to_sequences(test_df['text'])
test_padded = pad_sequences(test_sequences, maxlen=840, padding='post', truncating='post')

# convert labels to one-hot encoding
train_labels = pd.get_dummies(train_df['label']).values
test_labels = pd.get_dummies(test_df['label']).values

# build CNN model
model = Sequential()
model.add(Embedding(len(word_index) + 1, 100, input_length=840))
model.add(Conv1D(filters=96, kernel_size=5, activation='relu'))
model.add(Dropout(0.5))
model.add(GlobalMaxPooling1D())
model.add(Dense(84, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(84, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(5, activation='softmax'))

# early stopping callback to prevent overfitting
earlystop_callback = EarlyStopping(monitor='val_accuracy', patience=3)

# optimizer function
adam = Adam(learning_rate=0.0075, beta_1=0.9175, beta_2=0.999, epsilon=3.75e-07)

# compile model
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

# create checkpoint for best accuracy weights
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

# train model
h1 = model.fit(train_padded, train_labels, epochs=20, batch_size=32, validation_split=0.2, callbacks=[earlystop_callback, checkpoint])
h2 = model.fit(train_padded, train_labels, epochs=20, batch_size=32, validation_split=0.2, callbacks=[earlystop_callback, checkpoint])
# evaluate model on test set
print("Evaluating the model: \n\n")
test_loss, test_acc = model.evaluate(test_padded, test_labels, verbose=1)
print('Test accuracy:', test_acc)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Evaluating the model: 


Test accuracy: 0.875


In [5]:
# load best accuracy weights
model.load_weights('best_model.h5')

# evaluate test loss and accuracy
test_loss, test_acc = model.evaluate(test_padded, test_labels, verbose=1)
print('Test Accuracy:', test_acc)

Test Accuracy: 0.9083333611488342


In [10]:
# create a dictionary to map integer labels to string labels
label_map = { num: dpt for num, dpt in enumerate(list(set(labels)))}
 
# load test set
test_df = pd.read_csv('test.csv')

# tokenize and pad test data
test_sequences = tokenizer.texts_to_sequences(test_df['text'])
test_padded = pad_sequences(test_sequences, maxlen=840, padding='post', truncating='post')

# make predictions on test set
predictions = model.predict(test_padded)
predicted_labels = [label_map[np.argmax(pred)] for pred in predictions]

# create submission dataframe
submission_df = pd.DataFrame({'index': test_df['index'], 'label': predicted_labels})

# save submission dataframe to CSV file
submission_df.to_csv('submission.csv', index=False)

