In [34]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm


# Text Preprocessing and NLP
import nltk

# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords

# Tokenizing sentences/words
from nltk.tokenize import word_tokenize

# Part-of-speech tagging
from nltk import pos_tag

# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer

In [35]:
import os
import sys
from pathlib import Path

# Resolve the parent of the notebook's starting directory once per kernel
# session. The guard matters because this cell changes the process cwd:
# recomputing on a re-run would climb one directory higher each time.
if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)
os.chdir(workding_dir)
# Put that directory on the import path; skip the append on re-runs so
# sys.path does not accumulate duplicate entries.
if workding_dir not in sys.path:
    sys.path.append(workding_dir)
print("working dir:", workding_dir)

working dir: /home/inflaton/code/CrediNews


# Data Preparation (Loading CSV)

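Load the processed `csv` files into a Hugging Face `DatasetDict` with four splits: `train`, `test`, `rewritten_train` and `rewritten_test`.
- Each split is read from one or more `csv` files under `dataset/`; the splits are converted to pandas DataFrames further down. (Stemming has already been performed on the data to reduce processing time.)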

In [36]:
from datasets import load_dataset, concatenate_datasets, Dataset

# Split name -> CSV file(s) backing that split. The two training splits are
# each stored as four shards; the two test splits are single files.
csv_files_by_split = {
    "train": [
        "dataset/train_data_1.csv",
        "dataset/train_data_2.csv",
        "dataset/train_data_3.csv",
        "dataset/train_data_4.csv",
    ],
    "test": "dataset/test_data.csv",
    "rewritten_train": [
        "dataset/rewritten_train_data_1.csv",
        "dataset/rewritten_train_data_2.csv",
        "dataset/rewritten_train_data_3.csv",
        "dataset/rewritten_train_data_4.csv",
    ],
    "rewritten_test": "dataset/rewritten_test_data.csv",
}
datasets = load_dataset("csv", data_files=csv_files_by_split)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 54441
    })
    test: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 6050
    })
    rewritten_train: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 54441
    })
    rewritten_test: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 6050
    })
})

### Convolutional Neural network + Custom-trained word2vec word embeddings + 5-Fold Cross Validation + L2 Regularization + GridSearchCV


In [37]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Embedding,
    Conv1D,
    GlobalMaxPooling1D,
    Dense,
    Dropout,
    Input,
)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from gensim.models import Word2Vec
from tensorflow.keras.regularizers import l2

# Fix the NumPy and TensorFlow global seeds up front for reproducibility
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)


def train_word2vec_and_create_embeddings(
    train_texts, word_index, max_words, embedding_dim=100
):
    """Fit Word2Vec on the training texts and build an embedding matrix from it.

    Args:
        train_texts: Iterable of strings; each is split on whitespace to form
            one Word2Vec training sentence.
        word_index: Mapping of word -> integer row index.
        max_words: Upper bound on the number of rows in the matrix.
        embedding_dim: Size of each word vector (default 100).

    Returns:
        numpy.ndarray of shape ``(min(max_words, len(word_index) + 1),
        embedding_dim)``. A row holds the Word2Vec vector of its word when the
        trained model knows the word, otherwise a standard-normal random
        vector. Rows that no word maps to stay all-zero.
    """
    # Only the training texts are used, so nothing from validation leaks in
    tokenized_corpus = [document.split() for document in train_texts]
    w2v = Word2Vec(
        sentences=tokenized_corpus,
        vector_size=embedding_dim,
        window=5,
        min_count=2,
        workers=4,
    )
    vectors = w2v.wv

    num_rows = min(max_words, len(word_index) + 1)
    matrix = np.zeros((num_rows, embedding_dim))

    for token, row in word_index.items():
        # Indices beyond the row limit have no slot in the matrix
        if row >= num_rows:
            continue
        matrix[row] = (
            vectors[token]
            if token in vectors
            else np.random.normal(size=(embedding_dim,))
        )

    return matrix


def create_model(
    max_sequence_length,
    vocab_size,
    embedding_dim,
    embedding_matrix,
    filters,
    dropout_rate,
):
    """Build and compile the CNN binary text classifier.

    Layer stack: Embedding (initialised from ``embedding_matrix`` and left
    trainable) -> Conv1D (kernel size 5, ReLU) -> GlobalMaxPooling1D ->
    Dense(64, ReLU) -> Dropout -> Dense(1, sigmoid). The convolution and both
    dense layers carry an L2(0.01) kernel regulariser.

    Args:
        max_sequence_length: Length of each input sequence.
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Width of each embedding vector.
        embedding_matrix: Initial embedding weights.
        filters: Number of convolution filters.
        dropout_rate: Rate passed to the Dropout layer.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer, binary
        cross-entropy loss and the accuracy metric.
    """
    token_ids = Input(shape=(max_sequence_length,))

    embedding = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=True,
    )
    embedded = embedding(token_ids)

    convolution = Conv1D(
        filters=filters,
        kernel_size=5,
        activation="relu",
        kernel_regularizer=l2(0.01),
    )
    features = convolution(embedded)
    features = GlobalMaxPooling1D()(features)

    hidden = Dense(64, activation="relu", kernel_regularizer=l2(0.01))(features)
    hidden = Dropout(dropout_rate)(hidden)
    probability = Dense(1, activation="sigmoid", kernel_regularizer=l2(0.01))(hidden)

    classifier = Model(inputs=token_ids, outputs=probability)
    classifier.compile(
        optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]
    )
    return classifier

In [38]:
def train_model(
    train_data,
    val_data,
    epochs=10,
    batch_size=64,
    filters=64,
    dropout_rate=0.3,
    grid_search=False,
    max_words=10000,
    max_sequence_length=300,
    embedding_dim=100,
):
    """Train the CNN classifier on ``train_data`` and score it on ``val_data``.

    The tokenizer and the Word2Vec embeddings are fitted on ``train_data``
    only. After training, accuracy, precision, recall and F1 on ``val_data``
    are printed.

    Args:
        train_data: DataFrame with ``processed_full_content`` and ``label``.
        val_data: DataFrame with the same two columns.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        filters: Number of Conv1D filters.
        dropout_rate: Dropout rate.
        grid_search: If True return the metrics dict, otherwise the model.
        max_words: Vocabulary cap given to the tokenizer and the embedding
            table.
        max_sequence_length: Length every sequence is padded/truncated to.
        embedding_dim: Word-vector size.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall`` and ``f1_score`` when
        ``grid_search`` is True; the trained Keras model otherwise.

    Note:
        The fitted tokenizer is not returned. Any code that later re-tokenizes
        text for this model must rebuild it from the same ``train_data`` with
        the same ``max_words`` and ``max_sequence_length``.
    """
    # Re-seed on every call so each run starts from the same RNG state
    seed = 42
    tf.random.set_seed(seed)
    np.random.seed(seed)

    print(f"\nTraining with paras: filters={filters}, dropout_rate={dropout_rate}")

    train_texts = train_data["processed_full_content"]
    val_texts = val_data["processed_full_content"]
    y_train = train_data["label"]
    y_val = val_data["label"]

    # Fit tokenizer on training data only
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train_texts)

    # Convert texts to fixed-length integer sequences
    X_train = pad_sequences(
        tokenizer.texts_to_sequences(train_texts), maxlen=max_sequence_length
    )
    X_val = pad_sequences(
        tokenizer.texts_to_sequences(val_texts), maxlen=max_sequence_length
    )

    # Same formula as the embedding-matrix row count, so the Embedding layer's
    # input_dim matches the matrix it is initialised from
    vocab_size = min(max_words, len(tokenizer.word_index) + 1)

    # Create embedding matrix using training data only
    embedding_matrix = train_word2vec_and_create_embeddings(
        train_texts, tokenizer.word_index, max_words, embedding_dim
    )

    model = create_model(
        max_sequence_length=max_sequence_length,
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        embedding_matrix=embedding_matrix,
        filters=filters,
        dropout_rate=dropout_rate,
    )

    # The returned History object is not used, so it is not kept
    model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        verbose=1,
    )

    # Threshold the sigmoid outputs at 0.5 and flatten the (n, 1) column to a
    # 1-D label vector before computing the metrics
    y_pred = (model.predict(X_val) > 0.5).astype(int).ravel()

    result = {
        "accuracy": accuracy_score(y_val, y_pred),
        "precision": precision_score(y_val, y_pred),
        "recall": recall_score(y_val, y_pred),
        "f1_score": f1_score(y_val, y_pred),
    }

    print("\nResult:")
    for key, value in result.items():
        print(f"\t{key}: {value}")

    return result if grid_search else model

In [39]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import pandas as pd
from tqdm import tqdm


def evaluate_model(
    model, train_data, val_data, max_words=10000, max_sequence_length=300
):
    """Print accuracy, precision, recall and F1 of ``model`` on ``val_data``.

    The tokenizer is not loaded from anywhere: it is rebuilt by fitting on
    ``train_data``. ``train_data``, ``max_words`` and ``max_sequence_length``
    must therefore be the ones used when ``model`` was trained, otherwise the
    word -> index mapping fed to the model will not match its embedding table.

    Args:
        model: Object with a ``predict`` method returning scores that are
            thresholded at 0.5.
        train_data: DataFrame whose ``processed_full_content`` column is used
            to rebuild the vocabulary.
        val_data: DataFrame with ``processed_full_content`` and ``label``.
        max_words: Vocabulary cap given to the tokenizer.
        max_sequence_length: Length every sequence is padded/truncated to.

    Returns:
        None. The metrics are only printed.
    """
    print("Evaluating Model")

    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train_data["processed_full_content"])

    X_val = pad_sequences(
        tokenizer.texts_to_sequences(val_data["processed_full_content"]),
        maxlen=max_sequence_length,
    )
    y_val = val_data["label"]

    # Flatten the (n, 1) prediction column to a 1-D label vector
    y_pred = (model.predict(X_val) > 0.5).astype(int).ravel()

    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1: {f1:.4f}")
In [40]:
def do_grid_search(data, param_grid=None, n_splits=5):
    """Select CNN hyperparameters by grid search with stratified k-fold CV.

    Every (filters, dropout_rate) pair in the grid is trained once per fold
    through ``train_model(..., grid_search=True)`` and ranked by the mean of
    its per-fold F1-scores. Progress, a per-combination summary and a sorted
    results table are printed.

    Args:
        data: DataFrame with ``processed_full_content`` and ``label`` columns.
        param_grid: Optional dict with ``"filters"`` and ``"dropout_rate"``
            lists. Defaults to filters [64, 128] and dropout rates
            [0.2, 0.3, 0.4, 0.5].
        n_splits: Number of stratified folds (default 5).

    Returns:
        dict ``{"filters": ..., "dropout_rate": ...}`` of the combination with
        the highest mean F1-score; the first one evaluated wins ties.

    Raises:
        ValueError: If no combination produced a usable score.
    """
    if param_grid is None:
        param_grid = {"filters": [64, 128], "dropout_rate": [0.2, 0.3, 0.4, 0.5]}

    # The folds depend only on the data and the fixed random_state, so they
    # are computed once and every combination is scored on the same splits.
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    folds = list(kfold.split(data["processed_full_content"], data["label"]))

    results = []
    # Start below any attainable score so that a combination is still selected
    # when every mean F1 is 0.
    best_score = float("-inf")
    best_params = None

    for filters in param_grid["filters"]:
        for dropout_rate in param_grid["dropout_rate"]:
            print(f"\nTesting filters={filters}, dropout_rate={dropout_rate}")

            fold_scores = []

            for fold, (train_idx, val_idx) in enumerate(folds, 1):
                print(f"\nFold {fold}")

                train_data = data.iloc[train_idx]
                val_data = data.iloc[val_idx]

                result = train_model(
                    train_data,
                    val_data,
                    filters=filters,
                    dropout_rate=dropout_rate,
                    grid_search=True,
                )

                fold_score = result["f1_score"]
                fold_scores.append(fold_score)

                print(f"Fold {fold} F1-score: {fold_score:.4f}")

            # Calculate average score for this parameter combination
            avg_score = np.mean(fold_scores)
            print(f"Average F1-score: {avg_score:.4f}")

            results.append(
                {
                    "filters": filters,
                    "dropout_rate": dropout_rate,
                    "avg_f1_score": avg_score,
                    "fold_scores": fold_scores,
                }
            )

            # Strict ">" keeps the earliest combination on ties
            if avg_score > best_score:
                best_score = avg_score
                best_params = {"filters": filters, "dropout_rate": dropout_rate}

    if best_params is None:
        raise ValueError("grid search produced no parameter combination to select")

    # Print final results
    print("\nGrid Search Results:")
    for result in results:
        print(
            f"Filters: {result['filters']}, Dropout: {result['dropout_rate']}, "
            f"F1-score: {result['avg_f1_score']:.4f}"
        )

    print("\nBest Parameters:")
    print(f"Filters: {best_params['filters']}")
    print(f"Dropout Rate: {best_params['dropout_rate']}")
    print(f"Best F1-Score: {best_score:.4f}")

    results_df = pd.DataFrame(results)
    print("\nResults Summary:")
    print(results_df.sort_values("avg_f1_score", ascending=False))

    return best_params

In [41]:
# Materialise the original train/test splits as DataFrames and stack them
# into a single frame that is handed to the cross-validated grid search.
# NOTE(review): `data` contains the rows of the "test" split, so the
# hyperparameters selected on it have already seen the rows later used as
# `val_data` for the final evaluation — confirm this is intentional.
train_data = datasets["train"].to_pandas()
val_data = datasets["test"].to_pandas()
data = pd.concat([train_data, val_data], ignore_index=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60491 entries, 0 to 60490
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   label                   60491 non-null  int64 
 1   full_content            60491 non-null  object
 2   processed_full_content  60491 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [42]:
%%time

# Grid search over the stacked original train + test frame; the returned dict
# holds the selected "filters" and "dropout_rate".
best_params = do_grid_search(data)
best_params


Testing filters=64, dropout_rate=0.2

Fold 1

Training with paras: filters=64, dropout_rate=0.2
Epoch 1/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8584 - loss: 1.1456 - val_accuracy: 0.9650 - val_loss: 0.2937
Epoch 2/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9612 - loss: 0.2726 - val_accuracy: 0.9697 - val_loss: 0.1895
Epoch 3/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9688 - loss: 0.1895 - val_accuracy: 0.9693 - val_loss: 0.1647
Epoch 4/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9729 - loss: 0.1616 - val_accuracy: 0.9719 - val_loss: 0.1528
Epoch 5/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9758 - loss: 0.1486 - val_accuracy: 0.9714 - val_loss: 0.1513
Epoch 6/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accur

{'filters': 128, 'dropout_rate': 0.4}

In [43]:
%%time

# Fit the final model on the original train split with the selected
# hyperparameters. The test split is passed as validation data, so the metrics
# train_model prints are test-split metrics. grid_search is left at its
# default (False), so the trained model itself is returned.
model = train_model(
    train_data,
    val_data,
    filters=best_params["filters"],
    dropout_rate=best_params["dropout_rate"],
)
model


Training with paras: filters=128, dropout_rate=0.4
Epoch 1/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.8707 - loss: 1.3105 - val_accuracy: 0.9620 - val_loss: 0.2796
Epoch 2/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9611 - loss: 0.2611 - val_accuracy: 0.9669 - val_loss: 0.1873
Epoch 3/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9674 - loss: 0.1890 - val_accuracy: 0.9683 - val_loss: 0.1682
Epoch 4/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9723 - loss: 0.1643 - val_accuracy: 0.9736 - val_loss: 0.1540
Epoch 5/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9757 - loss: 0.1497 - val_accuracy: 0.9754 - val_loss: 0.1455
Epoch 6/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9797 - loss: 0.1392 - val_accuracy: 0.

<Functional name=functional_122, built=True>

In [44]:
# Make sure the output folder exists so the save cannot fail on a missing
# directory (the path is relative to the working dir set at the top).
os.makedirs("results", exist_ok=True)
model.save("results/CNN_model_original_CUDA.keras")

In [45]:
# load model
from tensorflow.keras.models import load_model

# Read the just-saved model back from disk as `model2`; it is evaluated below
# side by side with the in-memory `model`.
model2 = load_model("results/CNN_model_original_CUDA.keras")
model2.summary()

In [46]:
%%time

# In-memory model on the original test split (tokenizer rebuilt from
# train_data inside evaluate_model).
evaluate_model(model, train_data, val_data)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.9775
Precision: 0.9776
Recall: 0.9709
F1: 0.9742
CPU times: user 6.33 s, sys: 87.1 ms, total: 6.42 s
Wall time: 6.46 s


In [47]:
%%time

# Reloaded copy (model2) on the original test split.
evaluate_model(model2, train_data, val_data)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Accuracy: 0.9775
Precision: 0.9776
Recall: 0.9709
F1: 0.9742
CPU times: user 6.43 s, sys: 96.2 ms, total: 6.52 s
Wall time: 6.56 s


In [49]:
# NOTE(review): "results/CNN_model.keras" is not written by any cell visible
# in this notebook, so it must already exist on disk (presumably from an
# earlier run) — confirm its provenance and that it was trained on
# `train_data`, since evaluate_model rebuilds the tokenizer from that frame.
model3 = load_model("results/CNN_model.keras")
model3.summary()

In [53]:
%%time

# Model loaded from "results/CNN_model.keras" on the original test split.
evaluate_model(model3, train_data, val_data)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.9762
Precision: 0.9750
Recall: 0.9705
F1: 0.9727
CPU times: user 6.27 s, sys: 143 ms, total: 6.41 s
Wall time: 6.43 s


In [58]:
# Materialise the rewritten splits and stack all four frames (original and
# rewritten, train and test) into one frame for the combined grid search.
# NOTE(review): `data_combined` contains both test splits, so a grid search
# run on it sees rows that are later used for evaluation — confirm this is
# intentional.
val_data_rewritten = datasets["rewritten_test"].to_pandas()
train_data_rewritten = datasets["rewritten_train"].to_pandas()
data_combined = pd.concat(
    [train_data, train_data_rewritten, val_data, val_data_rewritten], ignore_index=True
)
data_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120982 entries, 0 to 120981
Data columns (total 3 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   label                   120982 non-null  int64 
 1   full_content            120982 non-null  object
 2   processed_full_content  120982 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.8+ MB


In [50]:
%%time

# Model trained on the original data, scored on the rewritten test split;
# the tokenizer is still rebuilt from the original train_data.
evaluate_model(model, train_data, val_data_rewritten)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Accuracy: 0.8519
Precision: 0.9159
Recall: 0.7284
F1: 0.8114
CPU times: user 6.32 s, sys: 59.1 ms, total: 6.38 s
Wall time: 6.34 s


In [51]:
%%time

# Reloaded copy (model2) on the rewritten test split.
evaluate_model(model2, train_data, val_data_rewritten)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.8519
Precision: 0.9159
Recall: 0.7284
F1: 0.8114
CPU times: user 6.41 s, sys: 80.7 ms, total: 6.49 s
Wall time: 6.52 s


In [52]:
%%time

# Model loaded from "results/CNN_model.keras" on the rewritten test split.
evaluate_model(model3, train_data, val_data_rewritten)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Accuracy: 0.8453
Precision: 0.9147
Recall: 0.7129
F1: 0.8013
CPU times: user 6.3 s, sys: 131 ms, total: 6.43 s
Wall time: 6.47 s


In [59]:
%%time

# Grid search over the stacked original + rewritten frame (all four splits).
best_params_combined = do_grid_search(data_combined)
best_params_combined


Testing filters=64, dropout_rate=0.2

Fold 1

Training with paras: filters=64, dropout_rate=0.2
Epoch 1/10
[1m1513/1513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.8521 - loss: 0.9658 - val_accuracy: 0.9307 - val_loss: 0.2897
Epoch 2/10
[1m1513/1513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9271 - loss: 0.2802 - val_accuracy: 0.9360 - val_loss: 0.2388
Epoch 3/10
[1m1513/1513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9336 - loss: 0.2402 - val_accuracy: 0.9384 - val_loss: 0.2270
Epoch 4/10
[1m1513/1513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9402 - loss: 0.2235 - val_accuracy: 0.9412 - val_loss: 0.2188
Epoch 5/10
[1m1513/1513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9441 - loss: 0.2128 - val_accuracy: 0.9427 - val_loss: 0.2150
Epoch 6/10
[1m1513/1513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/

{'filters': 128, 'dropout_rate': 0.2}

In [60]:
%%time

train_data_combined = pd.concat([train_data, train_data_rewritten], ignore_index=True)
val_data_combined = pd.concat([val_data, val_data_rewritten], ignore_index=True)
model_combined = train_model(
    train_data_combined,
    val_data_combined,
    filters=best_params_combined["filters"],
    dropout_rate=best_params_combined["dropout_rate"],
)
model_combined.save("results/CNN_model_combined_CUDA.keras")


Training with paras: filters=128, dropout_rate=0.2
Epoch 1/10
[1m1702/1702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8699 - loss: 1.1130 - val_accuracy: 0.9234 - val_loss: 0.2993
Epoch 2/10
[1m1702/1702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9281 - loss: 0.2800 - val_accuracy: 0.9300 - val_loss: 0.2499
Epoch 3/10
[1m1702/1702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9356 - loss: 0.2415 - val_accuracy: 0.9367 - val_loss: 0.2328
Epoch 4/10
[1m1702/1702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9418 - loss: 0.2236 - val_accuracy: 0.9412 - val_loss: 0.2172
Epoch 5/10
[1m1702/1702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9462 - loss: 0.2108 - val_accuracy: 0.9419 - val_loss: 0.2124
Epoch 6/10
[1m1702/1702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9482 - loss: 0.2030 - val_

In [61]:
# Combined-data model on the combined (original + rewritten) test frame.
evaluate_model(model_combined, train_data_combined, val_data_combined)

Evaluating Model
[1m379/379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Accuracy: 0.9389
Precision: 0.9304
Recall: 0.9299
F1: 0.9302


In [62]:
# Combined-data model on the original test split only.
evaluate_model(model_combined, train_data_combined, val_data)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.9707
Precision: 0.9682
Recall: 0.9649
F1: 0.9665


In [63]:
# Combined-data model on the rewritten test split only.
evaluate_model(model_combined, train_data_combined, val_data_rewritten)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Accuracy: 0.9071
Precision: 0.8930
Recall: 0.8950
F1: 0.8940


In [64]:
# Read the saved combined-data model back from disk as `model_combined2`;
# the cells below evaluate it on the same three test frames as
# `model_combined`.
model_combined2 = load_model("results/CNN_model_combined_CUDA.keras")
model_combined2.summary()

In [65]:
# Reloaded combined-data model on the combined test frame.
evaluate_model(model_combined2, train_data_combined, val_data_combined)

Evaluating Model
[1m379/379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Accuracy: 0.9389
Precision: 0.9304
Recall: 0.9299
F1: 0.9302


In [66]:
# Reloaded combined-data model on the original test split only.
evaluate_model(model_combined2, train_data_combined, val_data)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.9707
Precision: 0.9682
Recall: 0.9649
F1: 0.9665


In [67]:
# Reloaded combined-data model on the rewritten test split only.
evaluate_model(model_combined2, train_data_combined, val_data_rewritten)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Accuracy: 0.9071
Precision: 0.8930
Recall: 0.8950
F1: 0.8940


In [68]:
# Stack the rewritten train and test splits into one frame for the
# rewritten-only grid search.
# NOTE(review): `data_rewritten` contains the rewritten test split, so a grid
# search run on it sees rows that are later used for evaluation — confirm
# this is intentional.
data_rewritten = pd.concat(
    [train_data_rewritten, val_data_rewritten], ignore_index=True
)
data_rewritten.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60491 entries, 0 to 60490
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   label                   60491 non-null  int64 
 1   full_content            60491 non-null  object
 2   processed_full_content  60491 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [69]:
%%time

# Grid search over the stacked rewritten train + test frame.
best_params_rewritten = do_grid_search(data_rewritten)
best_params_rewritten


Testing filters=64, dropout_rate=0.2

Fold 1

Training with paras: filters=64, dropout_rate=0.2
Epoch 1/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.7805 - loss: 1.3784 - val_accuracy: 0.8886 - val_loss: 0.4969
Epoch 2/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8893 - loss: 0.4554 - val_accuracy: 0.8862 - val_loss: 0.3845
Epoch 3/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8969 - loss: 0.3523 - val_accuracy: 0.8858 - val_loss: 0.3544
Epoch 4/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9048 - loss: 0.3149 - val_accuracy: 0.8907 - val_loss: 0.3347
Epoch 5/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9114 - loss: 0.2919 - val_accuracy: 0.8960 - val_loss: 0.3196
Epoch 6/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accur

{'filters': 128, 'dropout_rate': 0.5}

In [70]:
%%time


model_rewritten = train_model(
    train_data_rewritten,
    val_data_rewritten,
    filters=best_params_rewritten["filters"],
    dropout_rate=best_params_rewritten["dropout_rate"],
)
model_rewritten.save("results/CNN_model_rewritten_CUDA.keras")


Training with paras: filters=128, dropout_rate=0.5
Epoch 1/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.7903 - loss: 1.5395 - val_accuracy: 0.8802 - val_loss: 0.4677
Epoch 2/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8856 - loss: 0.4354 - val_accuracy: 0.8937 - val_loss: 0.3497
Epoch 3/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8968 - loss: 0.3459 - val_accuracy: 0.8934 - val_loss: 0.3313
Epoch 4/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9050 - loss: 0.3182 - val_accuracy: 0.8901 - val_loss: 0.3313
Epoch 5/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9098 - loss: 0.3046 - val_accuracy: 0.9043 - val_loss: 0.3058
Epoch 6/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9162 - loss: 0.2893 - val_accuracy: 0.

In [71]:
# Rewritten-data model on the combined (original + rewritten) test frame.
evaluate_model(model_rewritten, train_data_rewritten, val_data_combined)

Evaluating Model
[1m379/379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Accuracy: 0.9194
Precision: 0.9337
Recall: 0.8782
F1: 0.9051


In [72]:
# Rewritten-data model on the rewritten test split only.
evaluate_model(model_rewritten, train_data_rewritten, val_data_rewritten)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.9069
Precision: 0.8962
Recall: 0.8904
F1: 0.8933


In [73]:
# Rewritten-data model on the original test split only.
evaluate_model(model_rewritten, train_data_rewritten, val_data)

Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.9319
Precision: 0.9757
Recall: 0.8659
F1: 0.9175
