0. Import required packages

In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import tensorflow as tf
import keras.preprocessing.sequence as sequence
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import TrainingArguments, Trainer
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel

1. Load and process data

In [3]:
train_data = pd.read_csv('NLP_exam_real_case_dataset/train.csv')
test_data = pd.read_csv('NLP_exam_real_case_dataset/test.csv')
complete_data = pd.read_csv('NLP_exam_real_case_dataset/complete.csv')

X_train = train_data['article_title'].values
y_train = train_data['is_ecology'].values

X_test = test_data['article_title'].values
y_test = test_data['is_ecology'].values

max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_seq_length = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_seq_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_seq_length)

X_train_pad, X_val_pad, y_train, y_val = train_test_split(X_train_pad, y_train, test_size=0.2, random_state=42)

2. Fully Connected Neural Network Model

In [4]:
model_fcnn = Sequential()
model_fcnn.add(Dense(128, activation='relu', input_shape=(max_seq_length,)))
model_fcnn.add(Dense(64, activation='relu'))
model_fcnn.add(Dense(1, activation='sigmoid'))

model_fcnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_fcnn.summary()

model_fcnn.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_val_pad, y_val))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               12928     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 21249 (83.00 KB)
Trainable params: 21249 (83.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1247ebba770>

3. Recurrent Neural Network (LSTM) Model

In [5]:
model_lstm = Sequential()
model_lstm.add(Embedding(max_words, 100, input_length=max_seq_length))
model_lstm.add(LSTM(64))
model_lstm.add(Dense(1, activation='sigmoid'))

model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_lstm.summary()

model_lstm.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_val_pad, y_val))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          1000000   
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1042305 (3.98 MB)
Trainable params: 1042305 (3.98 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1240005fa30>

4. BERT model

In [9]:
X_train = train_data['article_title'].values
y_train = train_data['is_ecology'].values

X_test = test_data['article_title'].values
y_test = test_data['is_ecology'].values

max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_seq_length = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_seq_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_seq_length)

X_train_pad, X_val_pad, y_train, y_val = train_test_split(X_train_pad, y_train, test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
X_train_encodings = tokenizer(list(X_train), padding=True, truncation=True, max_length=max_seq_length, return_tensors='pt')
X_test_encodings = tokenizer(list(X_test), padding=True, truncation=True, max_length=max_seq_length, return_tensors='pt')
X_train_encodings, X_val_encodings, y_train, y_val = train_test_split(X_train_encodings, y_train, test_size=0.2, random_state=42)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=X_train_encodings,
    eval_dataset=X_val_encodings,
)

trainer.train()

# Evaluate BERT model on the test set
predictions = trainer.predict(X_test_encodings)
bert_accuracy = np.mean((predictions.predictions.squeeze() > 0.5) == y_test)

print(f'BERT Accuracy: {bert_accuracy}')


ValueError: Found input variables with inconsistent numbers of samples: [3, 1772]

5. Evaluate all models

In [8]:
_, fcnn_accuracy = model_fcnn.evaluate(X_test_pad, y_test)
_, lstm_accuracy = model_lstm.evaluate(X_test_pad, y_test)

print(f'Fully Connected Neural Network Accuracy: {fcnn_accuracy}')
print(f'LSTM Accuracy: {lstm_accuracy}')


best_model = model_lstm if lstm_accuracy > fcnn_accuracy and lstm_accuracy > bert_accuracy else model_fcnn if fcnn_accuracy > bert_accuracy else model_bert
_, test_accuracy = best_model.evaluate(X_test_pad, y_test)
print(f'Best Model (based on validation accuracy) Test Accuracy: {test_accuracy}')

Fully Connected Neural Network Accuracy: 0.819727897644043
LSTM Accuracy: 0.8265306353569031


NameError: name 'bert_accuracy' is not defined