In [1]:
!pip install -q tensorflow scikit-learn gensim

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, regularizers, Model
from tensorflow.keras.optimizers import Adam
from gensim.models import KeyedVectors # Used for loading external word vectors

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from google.colab import drive
drive.mount('/content/drive')

# --- Data Loading ---
# **Ensure your dataset is accessible at this path**
df = pd.read_csv("/content/drive/MyDrive/bayyin_dataset.csv")
# Expecting columns: "Sentence" for text and "Readability_Level" for label
texts = df["Sentence"].astype(str).tolist()
labels = df["Readability_Level"].astype(str).tolist()
print("Dataset loaded. First 5 rows:")
print(df.head())

Mounted at /content/drive
Dataset loaded. First 5 rows:
            ID                                           Sentence  Word_Count  \
0  10102480006                                    انظر يا فهمان..           5   
1  30400010076  تشجع الدولة الديمقراطية القائمة على المشاركة ع...          11   
2  30500230021                 رسولُ اللّهِ والحياةُ الاجتماعيّةُ           4   
3  20200330009  س: ما هي عاصمة ألبانيا؟  (أ) تيرانا  (ب) ساوتو...          19   
4  20101750010  وظلت مأهولة لمدة 700 سنة أخرى على الأقل، كجماع...          18   

                                                Word  \
0                                  انظر يا فهمان . .   
1  تشجع الدولة الديمقراطية القائمة على المشاركة ع...   
2                       رسول الله والحياة الاجتماعية   
3  س : ما هي عاصمة ألبانيا ؟ ( أ ) تيرانا ( ب ) س...   
4  وظلت مأهولة لمدة 700 سنة أخرى على الأقل , كجما...   

                                                 Lex  \
0                                   نظر يا فهمان . .   
1  شجع د

In [3]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)
print("Classes:", label_encoder.classes_)
print("Number of classes =", num_classes)

# --- Hyperparameters ---
MAX_NUM_WORDS = 165647   # Vocab size limit
MAX_SEQUENCE_LENGTH = 128   # Max sentence length
EMBEDDING_DIM = 300        # **Must match your external embedding dimension**

Classes: ['1' '2' '3' '4' '5' '6']
Number of classes = 6


In [4]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print("Unique tokens in vocab:", len(word_index))

X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post")

Unique tokens in vocab: 165647


In [5]:
EXTERNAL_EMBEDDING_PATH = "/CAMeLBERT-mix_Bayyin (1).ipynb" # <--- **UPDATE THIS PATH**

# Load the external pre-trained word vectors
print(f"Loading embeddings from {EXTERNAL_EMBEDDING_PATH}...")
try:
    # Assumes a word2vec text format (word followed by space-separated vector values)
    external_embeddings_wv = KeyedVectors.load_word2vec_format(
        EXTERNAL_EMBEDDING_PATH,
        binary=False
    )
    print("External embeddings loaded successfully.")
except Exception as e:
    print(f"Error loading external embeddings: {e}. Check the path and format.")
    external_embeddings_wv = None


# Create the embedding matrix for our model
vocab_size = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
words_found = 0

if external_embeddings_wv:
    for word, idx in word_index.items():
        if idx >= vocab_size:
            continue
        # Check if the word in our vocabulary is present in the external embeddings
        if word in external_embeddings_wv:
            embedding_matrix[idx] = external_embeddings_wv[word]
            words_found += 1

print(f"Found {words_found} word vectors out of {vocab_size-1} vocabulary words.")

Loading embeddings from /CAMeLBERT-mix_Bayyin (1).ipynb...
Error loading external embeddings: invalid literal for int() with base 10: '{"metadata":{"kernelspec":{"language":"python","display_name":"Python'. Check the path and format.
Found 0 word vectors out of 165646 vocabulary words.


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
def build_textcnn_model(vocab_size,
                        max_len,
                        num_classes,
                        embedding_dim=300,
                        filter_sizes=(3, 4, 5),
                        num_filters=100,
                        dropout_rate=0.5,
                        l2_reg=0.0,
                        embedding_matrix=None,
                        embedding_trainable=True):
    # 1) Input layer
    inputs = layers.Input(shape=(max_len,), dtype="int32")

    # 2) Embedding layer
    if embedding_matrix is not None:
        # Use pre-trained embeddings
        embedding = layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            weights=[embedding_matrix],
            trainable=embedding_trainable # True means the weights will be fine-tuned
        )(inputs)
    else:
        # Random trainable embeddings (fallback)
        embedding = layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim
        )(inputs)

    # 3) Parallel conv + max-pool for each kernel size
    conv_outputs = []
    for k in filter_sizes:
        conv = layers.Conv1D(
            filters=num_filters,
            kernel_size=k,
            activation="relu",
            padding="valid"
        )(embedding)
        pooled = layers.GlobalMaxPooling1D()(conv)
        conv_outputs.append(pooled)

    # 4) Concatenate
    if len(conv_outputs) > 1:
        x = layers.concatenate(conv_outputs, axis=-1)
    else:
        x = conv_outputs[0]

    # 5) Dropout
    x = layers.Dropout(dropout_rate)(x)

    # 6) Output layer
    outputs = layers.Dense(
        num_classes,
        activation="softmax",
        kernel_regularizer=regularizers.l2(l2_reg) if l2_reg > 0 else None
    )(x)

    model = Model(inputs=inputs, outputs=outputs)
    return model

In [8]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training set shape: {X_train.shape}, Validation set shape: {X_val.shape}")

# Build the model using the loaded embedding matrix
model = build_textcnn_model(
    vocab_size=vocab_size,
    max_len=MAX_SEQUENCE_LENGTH,
    num_classes=num_classes,
    embedding_dim=EMBEDDING_DIM,
    embedding_matrix=embedding_matrix,
    embedding_trainable=True # Fine-tuning the pre-trained embeddings
)

Training set shape: (37128, 128), Validation set shape: (9282, 128)


In [9]:
model.compile(
    loss="sparse_categorical_crossentropy",   # y are integer labels
    optimizer=Adam(learning_rate=0.001),
    metrics=["accuracy"]
)

model.summary()

In [10]:
BATCH_SIZE = 64
EPOCHS = 10

history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
    verbose=1
)

Epoch 1/10
[1m581/581[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m643s[0m 1s/step - accuracy: 0.1602 - loss: 1.7919 - val_accuracy: 0.1667 - val_loss: 1.7918
Epoch 2/10
[1m581/581[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m657s[0m 1s/step - accuracy: 0.1672 - loss: 1.7918 - val_accuracy: 0.1667 - val_loss: 1.7918
Epoch 3/10
[1m334/581[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m4:22[0m 1s/step - accuracy: 0.1722 - loss: 1.7917

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import (
    cohen_kappa_score,
    classification_report,
    confusion_matrix,
    accuracy_score
)
import numpy as np

y_val_proba = model.predict(X_val)
y_val_pred = np.argmax(y_val_proba, axis=1)

qwk = cohen_kappa_score(y_val, y_val_pred, weights='quadratic')
print("Quadratic Weighted Kappa (QWK):", qwk)

acc = accuracy_score(y_val, y_val_pred)
print("Accuracy:", acc)

print("\nClassification report:")
# Use label_encoder.classes_ to get the original readability levels
print(classification_report(
    y_val,
    y_val_pred,
    target_names=label_encoder.classes_
))

cm = confusion_matrix(y_val, y_val_pred)
print("\nConfusion matrix (rows = true, cols = predicted):")
print(cm)