In [1]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm


# Text Preprocessing and NLP
import nltk

# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords

# Tokenizing sentences/words
from nltk.tokenize import word_tokenize

# Part-of-speech tagging
from nltk import pos_tag

# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer


# Model Imports
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
from gensim.models import Word2Vec

2025-02-20 09:58:15.111718: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-20 09:58:15.270272: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740016695.327868  317357 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740016695.344215  317357 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-20 09:58:15.498355: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
import os
import sys
from pathlib import Path

if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)
os.chdir(workding_dir)
sys.path.append(workding_dir)
print("working dir:", workding_dir)

working dir: /home/inflaton/code/CrediNews


### Load datasets

In [3]:
from datasets import load_dataset, concatenate_datasets, Dataset

datasets = load_dataset(
    "csv",
    data_files={
        "train": [
            "dataset/train_data_1.csv",
            "dataset/train_data_2.csv",
            "dataset/train_data_3.csv",
            "dataset/train_data_4.csv",
        ],
        "test": "dataset/test_data.csv",
        "rewritten_train": [
            "dataset/rewritten_train_data_1.csv",
            "dataset/rewritten_train_data_2.csv",
            "dataset/rewritten_train_data_3.csv",
            "dataset/rewritten_train_data_4.csv",
        ],
        "rewritten_test": "dataset/rewritten_test_data.csv",
    },
)
datasets

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 54441
    })
    test: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 6050
    })
    rewritten_train: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 54441
    })
    rewritten_test: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 6050
    })
})

### LSTM with Word2Vec (best model)

In [4]:
# Set seeds for reproducibility
import tensorflow as tf
import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)


# Train and process Word2Vec embeddings
def train_word2vec_embeddings(
    train_texts, word_index, max_words=10000, embedding_dim=100
):
    """Tokenizes the text, trains Word2Vec, and creates an embedding matrix."""

    # Tokenizing text into word lists
    sentences = [text.split() for text in train_texts]

    # ✅ Train Word2Vec model using processed sentences
    word2vec = Word2Vec(
        sentences=sentences, vector_size=embedding_dim, window=5, min_count=1, workers=4
    )

    # ✅ Ensure consistent vocab size (only include words within max_words limit)
    vocab_size = min(max_words, len(word_index) + 1)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    for word, i in word_index.items():
        if i < vocab_size and word in word2vec.wv:
            embedding_matrix[i] = word2vec.wv[word]

    return embedding_matrix, vocab_size  # ✅ Returns both embedding matrix & vocab size


def create_lstm_model(
    vocab_size,
    embedding_matrix,
    lstm_units=128,
    dropout_rate=0.3,
    learning_rate=0.001,
    l2_lambda=0.01,
):
    """Creates an LSTM model with Dropout and L2 regularization."""

    model = Sequential(
        [
            Embedding(
                vocab_size,
                embedding_matrix.shape[1],
                weights=[embedding_matrix],
                trainable=True,
            ),
            LSTM(units=lstm_units, return_sequences=False, dropout=dropout_rate),
            Dropout(dropout_rate),
            Dense(1, activation="sigmoid", kernel_regularizer=l2(l2_lambda)),
        ]
    )

    # ✅ Use Adam optimizer with a defined learning rate
    model.compile(
        loss="binary_crossentropy", optimizer=Adam(learning_rate), metrics=["accuracy"]
    )

    return model


# ✅ Early stopping callback
early_stopping = EarlyStopping(
    monitor="val_loss", patience=3, restore_best_weights=True
)

### Define function to train the model

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def train_model(
    train_data,
    val_data,
    epochs=10,
    batch_size=64,
    lstm_units=64,
    dropout_rate=0.3,
    grid_search=False,
):
    """Trains an LSTM model using Word2Vec embeddings while ensuring tokenizer consistency across datasets."""

    # ✅ Set random seeds for reproducibility
    seed = 42
    tf.random.set_seed(seed)
    np.random.seed(seed)

    max_words = 10000  # Maximum vocabulary size
    max_sequence_length = 300  # Max length of tokenized sequences
    embedding_dim = 100  # Embedding dimension

    print(
        f"\n🚀 Training LSTM with lstm_units={lstm_units}, dropout_rate={dropout_rate}"
    )

    # ✅ Extract texts and labels
    train_texts = train_data["processed_full_content"]
    val_texts = val_data["processed_full_content"]
    y_train = train_data["label"]
    y_val = val_data["label"]

    # ✅ Tokenizer is refitted on updated `train_data` for new dataset compatibility
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train_texts)

    # ✅ Convert texts to sequences and pad them
    X_train = pad_sequences(
        tokenizer.texts_to_sequences(train_texts), maxlen=max_sequence_length
    )
    X_val = pad_sequences(
        tokenizer.texts_to_sequences(val_texts), maxlen=max_sequence_length
    )

    # ✅ Ensure vocab size consistency (Prevents index mismatches)
    vocab_size = min(max_words, len(tokenizer.word_index) + 1)

    # ✅ Train Word2Vec and get the embedding matrix
    embedding_matrix, _ = train_word2vec_embeddings(
        train_texts, tokenizer.word_index, max_words, embedding_dim
    )

    # ✅ Create and compile the LSTM model
    model = create_lstm_model(
        vocab_size=vocab_size,
        embedding_matrix=embedding_matrix,
        lstm_units=lstm_units,
        dropout_rate=dropout_rate,
    )

    # ✅ Train the model with early stopping
    history = model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping],
        verbose=1,
    )

    # ✅ Evaluate model performance on validation data
    y_pred = (model.predict(X_val) > 0.5).astype(int).reshape(-1)

    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    # ✅ Print evaluation metrics
    result = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
    }
    print("\n🏆 Training Results:")
    for key, value in result.items():
        print(f"🔹 {key.capitalize()}: {value:.4f}")

    return result if grid_search else model

### Define Evaluate model function

In [6]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import pandas as pd
from tqdm import tqdm


def evaluate_model(model, train_data, val_data):
    print("Evaluating Model")

    max_words = 10000
    max_sequence_length = 300

    train_texts = train_data["processed_full_content"]
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train_texts)

    y_val = val_data["label"]
    val_texts = val_data["processed_full_content"]

    X_val = pad_sequences(
        tokenizer.texts_to_sequences(val_texts), maxlen=max_sequence_length
    )
    y_pred = (model.predict(X_val) > 0.5).astype(int)

    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1: {f1:.4f}")

### Define GridSearch Function

In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score


def do_grid_search(data):
    """Performs Grid Search to find the best LSTM hyperparameters using cross-validation."""
    param_grid = {"lstm_units": [64, 128], "dropout_rate": [0.2, 0.3]}
    results = []
    best_score = 0
    best_params = None

    if data.empty:
        raise ValueError("The dataset is empty. Please provide valid data.")

    for lstm_unit in param_grid["lstm_units"]:
        for dropout_rate in param_grid["dropout_rate"]:
            print(f"\n Testing lstm_units={lstm_unit}, dropout_rate={dropout_rate}")

            kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            fold_scores = []

            for fold, (train_idx, val_idx) in enumerate(
                kfold.split(data["processed_full_content"], data["label"].to_numpy()), 1
            ):
                print(f"Fold {fold}")

                try:
                    train_data = data.iloc[train_idx]
                    val_data = data.iloc[val_idx]

                    result = train_model(
                        train_data=train_data,
                        val_data=val_data,
                        lstm_units=lstm_unit,
                        dropout_rate=dropout_rate,
                        grid_search=True,
                    )

                    fold_score = result["f1_score"]
                    fold_scores.append(fold_score)
                    print(f"✔️ Fold {fold} F1-score: {fold_score:.4f}")

                except Exception as e:
                    print(f"Fold {fold} failed due to error: {e}")
                    continue

            avg_score = np.mean(fold_scores) if fold_scores else 0
            print(f"Average F1-score: {avg_score:.4f}")

            results.append(
                {
                    "lstm_units": lstm_unit,
                    "dropout_rate": dropout_rate,
                    "avg_f1_score": avg_score,
                }
            )

            if avg_score > best_score:
                best_score = avg_score
                best_params = {"lstm_units": lstm_unit, "dropout_rate": dropout_rate}

    # ✅ Edge case: If all folds fail
    if best_params is not None:
        print("Best Parameters Found:")
        print(f"LSTM Units: {best_params['lstm_units']}")
        print(f"Dropout Rate: {best_params['dropout_rate']}")
        print(f"Best F1-Score: {best_score:.4f}")
    else:
        print("Grid search failed—no valid results.")

    return best_params

### Set training and validation data

In [8]:
train_data = datasets["train"].to_pandas()
val_data = datasets["test"].to_pandas()
data = pd.concat([train_data, val_data], ignore_index=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60491 entries, 0 to 60490
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   label                   60491 non-null  int64 
 1   full_content            60491 non-null  object
 2   processed_full_content  60491 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


### Perform grid search for optimal hyperparameters

In [9]:
%%time

best_params = do_grid_search(data)
best_params


 Testing lstm_units=64, dropout_rate=0.2
Fold 1

🚀 Training LSTM with lstm_units=64, dropout_rate=0.2


I0000 00:00:1740016726.789000  317357 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 45689 MB memory:  -> device: 0, name: NVIDIA RTX 6000 Ada Generation, pci bus id: 0000:01:00.0, compute capability: 8.9


Epoch 1/10


I0000 00:00:1740016729.411588  317572 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 43ms/step - accuracy: 0.8794 - loss: 0.2995 - val_accuracy: 0.9609 - val_loss: 0.1293
Epoch 2/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 41ms/step - accuracy: 0.9597 - loss: 0.1258 - val_accuracy: 0.9682 - val_loss: 0.0991
Epoch 3/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 40ms/step - accuracy: 0.9731 - loss: 0.0900 - val_accuracy: 0.9740 - val_loss: 0.0841
Epoch 4/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 48ms/step - accuracy: 0.9783 - loss: 0.0755 - val_accuracy: 0.9755 - val_loss: 0.0833
Epoch 5/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 44ms/step - accuracy: 0.9812 - loss: 0.0686 - val_accuracy: 0.9771 - val_loss: 0.0815
Epoch 6/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 41ms/step - accuracy: 0.9842 - loss: 0.0594 - val_accuracy: 0.9775 - val_loss: 0.0751
Epoch 7/10
[1m757/757[0m 

{'lstm_units': 128, 'dropout_rate': 0.3}

### Train the model

In [10]:
%%time

model = train_model(
    train_data,
    val_data,
    lstm_units=best_params["lstm_units"],
    dropout_rate=best_params["dropout_rate"],
)
model.summary()


🚀 Training LSTM with lstm_units=128, dropout_rate=0.3
Epoch 1/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 34ms/step - accuracy: 0.8868 - loss: 0.2733 - val_accuracy: 0.9413 - val_loss: 0.1625
Epoch 2/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 36ms/step - accuracy: 0.9577 - loss: 0.1241 - val_accuracy: 0.9689 - val_loss: 0.0939
Epoch 3/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 35ms/step - accuracy: 0.9727 - loss: 0.0838 - val_accuracy: 0.9291 - val_loss: 0.1750
Epoch 4/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 33ms/step - accuracy: 0.9730 - loss: 0.0852 - val_accuracy: 0.9740 - val_loss: 0.0827
Epoch 5/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 36ms/step - accuracy: 0.9805 - loss: 0.0639 - val_accuracy: 0.9701 - val_loss: 0.0843
Epoch 6/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 35ms/step - accuracy: 0.9837 - loss: 0.0538 - v

CPU times: user 3min 36s, sys: 2min 10s, total: 5min 47s
Wall time: 5min 28s


### Save the model in .keras

In [11]:
model.save("results/LSTM_model_original.keras")

### Load the model again to see if results are the same

In [12]:
# load model
from tensorflow.keras.models import load_model

model2 = load_model("results/LSTM_model_original.keras")
model2.summary()

### Model evaluation

In [13]:
%%time

evaluate_model(model, train_data, val_data)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step
Accuracy: 0.9800
Precision: 0.9813
Recall: 0.9728
F1: 0.9770
CPU times: user 7.96 s, sys: 1.65 s, total: 9.61 s
Wall time: 9.43 s


In [14]:
%%time

evaluate_model(model2, train_data, val_data)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step
Accuracy: 0.9800
Precision: 0.9835
Recall: 0.9705
F1: 0.9770
CPU times: user 7.96 s, sys: 1.52 s, total: 9.47 s
Wall time: 9.31 s


### Load LLM-rewritten data

In [15]:
val_data_rewritten = datasets["rewritten_test"].to_pandas()
train_data_rewritten = datasets["rewritten_train"].to_pandas()
data_combined = pd.concat(
    [train_data, train_data_rewritten, val_data, val_data_rewritten], ignore_index=True
)
data_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120982 entries, 0 to 120981
Data columns (total 3 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   label                   120982 non-null  int64 
 1   full_content            120982 non-null  object
 2   processed_full_content  120982 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.8+ MB


### Model evaluation on rewritten data, using original training data

In [16]:
%%time

evaluate_model(model, train_data, val_data_rewritten)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step
Accuracy: 0.8137
Precision: 0.9077
Recall: 0.6392
F1: 0.7502
CPU times: user 7.71 s, sys: 1.87 s, total: 9.58 s
Wall time: 9.39 s


In [17]:
%%time

evaluate_model(model2, train_data, val_data_rewritten)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step
Accuracy: 0.8159
Precision: 0.9106
Recall: 0.6422
F1: 0.7532
CPU times: user 7.97 s, sys: 1.52 s, total: 9.49 s
Wall time: 9.31 s


### Rerun grid search with rewritten data

In [18]:
%%time

best_params_combined = do_grid_search(data_combined)
best_params_combined


 Testing lstm_units=64, dropout_rate=0.2
Fold 1

🚀 Training LSTM with lstm_units=64, dropout_rate=0.2
Epoch 1/10
[1m1513/1513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 42ms/step - accuracy: 0.8478 - loss: 0.3537 - val_accuracy: 0.9294 - val_loss: 0.1927
Epoch 2/10
[1m1513/1513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 42ms/step - accuracy: 0.9286 - loss: 0.1912 - val_accuracy: 0.9383 - val_loss: 0.1658
Epoch 3/10
[1m1513/1513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 41ms/step - accuracy: 0.9397 - loss: 0.1624 - val_accuracy: 0.9437 - val_loss: 0.1531
Epoch 4/10
[1m1513/1513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 42ms/step - accuracy: 0.9475 - loss: 0.1425 - val_accuracy: 0.9459 - val_loss: 0.1462
Epoch 5/10
[1m1513/1513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 45ms/step - accuracy: 0.9545 - loss: 0.1287 - val_accuracy: 0.9473 - val_loss: 0.1432
Epoch 6/10
[1m1513/1513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

{'lstm_units': 128, 'dropout_rate': 0.3}

### Retrain the model with rewritten data

In [19]:
%%time

train_data_combined = pd.concat([train_data, train_data_rewritten], ignore_index=True)
val_data_combined = pd.concat([val_data, val_data_rewritten], ignore_index=True)
model_combined = train_model(
    train_data_combined,
    val_data_combined,
    lstm_units=best_params_combined["lstm_units"],
    dropout_rate=best_params_combined["dropout_rate"],
)
model_combined.save("results/LSTM_model_combined.keras")


🚀 Training LSTM with lstm_units=128, dropout_rate=0.3
Epoch 1/10
[1m1702/1702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 34ms/step - accuracy: 0.8554 - loss: 0.3312 - val_accuracy: 0.9283 - val_loss: 0.1807
Epoch 2/10
[1m1702/1702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 34ms/step - accuracy: 0.9305 - loss: 0.1751 - val_accuracy: 0.9391 - val_loss: 0.1533
Epoch 3/10
[1m1702/1702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 34ms/step - accuracy: 0.9416 - loss: 0.1477 - val_accuracy: 0.9436 - val_loss: 0.1479
Epoch 4/10
[1m1702/1702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 36ms/step - accuracy: 0.9489 - loss: 0.1331 - val_accuracy: 0.9463 - val_loss: 0.1366
Epoch 5/10
[1m1702/1702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 30ms/step - accuracy: 0.9548 - loss: 0.1213 - val_accuracy: 0.9477 - val_loss: 0.1365
Epoch 6/10
[1m1702/1702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 34ms/step - accuracy: 0.9592 - loss

In [20]:
evaluate_model(model_combined, train_data_combined, val_data_combined)

Evaluating Model
[1m379/379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step
Accuracy: 0.9518
Precision: 0.9455
Recall: 0.9443
F1: 0.9449


In [21]:
evaluate_model(model_combined, train_data_combined, val_data)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step
Accuracy: 0.9792
Precision: 0.9773
Recall: 0.9751
F1: 0.9762


In [22]:
evaluate_model(model_combined, train_data_combined, val_data_rewritten)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step
Accuracy: 0.9233
Precision: 0.9130
Recall: 0.9116
F1: 0.9123


In [23]:
model_combined2 = load_model("results/LSTM_model_combined.keras")
model_combined2.summary()

In [24]:
evaluate_model(model_combined2, train_data_combined, val_data_combined)

Evaluating Model
[1m379/379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step
Accuracy: 0.9493
Precision: 0.9437
Recall: 0.9403
F1: 0.9420


In [25]:
evaluate_model(model_combined2, train_data_combined, val_data)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step
Accuracy: 0.9787
Precision: 0.9773
Recall: 0.9739
F1: 0.9756


In [26]:
evaluate_model(model_combined2, train_data_combined, val_data_rewritten)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step
Accuracy: 0.9255
Precision: 0.9147
Recall: 0.9150
F1: 0.9148


In [28]:
data_rewritten = pd.concat(
    [train_data_rewritten, val_data_rewritten], ignore_index=True
)
data_rewritten.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60491 entries, 0 to 60490
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   label                   60491 non-null  int64 
 1   full_content            60491 non-null  object
 2   processed_full_content  60491 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [29]:
%%time

best_params_rewritten = do_grid_search(data_rewritten)
best_params_rewritten


 Testing lstm_units=64, dropout_rate=0.2
Fold 1

🚀 Training LSTM with lstm_units=64, dropout_rate=0.2
Epoch 1/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 40ms/step - accuracy: 0.7681 - loss: 0.4889 - val_accuracy: 0.8649 - val_loss: 0.3133
Epoch 2/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 42ms/step - accuracy: 0.8882 - loss: 0.2752 - val_accuracy: 0.8892 - val_loss: 0.2676
Epoch 3/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 43ms/step - accuracy: 0.9052 - loss: 0.2340 - val_accuracy: 0.9061 - val_loss: 0.2308
Epoch 4/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 40ms/step - accuracy: 0.9193 - loss: 0.2042 - val_accuracy: 0.9041 - val_loss: 0.2319
Epoch 5/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 41ms/step - accuracy: 0.9287 - loss: 0.1867 - val_accuracy: 0.8991 - val_loss: 0.2510
Epoch 6/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m

{'lstm_units': 128, 'dropout_rate': 0.3}

In [32]:
%%time


model_rewritten = train_model(
    train_data_rewritten,
    val_data_rewritten,
    lstm_units=best_params_rewritten["lstm_units"],
    dropout_rate=best_params_rewritten["dropout_rate"],
)
model_rewritten.save("results/LSTM_model_rewritten.keras")


🚀 Training LSTM with lstm_units=128, dropout_rate=0.3
Epoch 1/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 37ms/step - accuracy: 0.7823 - loss: 0.4740 - val_accuracy: 0.8802 - val_loss: 0.2874
Epoch 2/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 37ms/step - accuracy: 0.8872 - loss: 0.2741 - val_accuracy: 0.8949 - val_loss: 0.2554
Epoch 3/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 35ms/step - accuracy: 0.9047 - loss: 0.2337 - val_accuracy: 0.9098 - val_loss: 0.2213
Epoch 4/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 38ms/step - accuracy: 0.9153 - loss: 0.2075 - val_accuracy: 0.9117 - val_loss: 0.2133
Epoch 5/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 36ms/step - accuracy: 0.9252 - loss: 0.1884 - val_accuracy: 0.9147 - val_loss: 0.2119
Epoch 6/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 34ms/step - accuracy: 0.9316 - loss: 0.1755 - v

In [33]:
evaluate_model(model_rewritten, train_data_rewritten, val_data_combined)

Evaluating Model
[1m379/379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step
Accuracy: 0.9347
Precision: 0.9484
Recall: 0.8997
F1: 0.9234


In [34]:
evaluate_model(model_rewritten, train_data_rewritten, val_data_rewritten)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step
Accuracy: 0.9210
Precision: 0.9215
Recall: 0.8957
F1: 0.9084


In [35]:
evaluate_model(model_rewritten, train_data_rewritten, val_data)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step
Accuracy: 0.9481
Precision: 0.9787
Recall: 0.9010
F1: 0.9382
