<a href="https://colab.research.google.com/github/selfproclaimedgenius1706/Alzheimer-s-Disease-Detection/blob/main/AlBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!python3 -m venv venv
!source venv/bin/activate
!pip install tensorflow transformers


In [None]:
import pandas as pd
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForSequenceClassification

# TOKENIZATION !!!

In [None]:
# Read the training CSV, then drop every row whose 'Seri' value is on the exclusion list.
raw_df = pd.read_csv('dataToPadding.csv')
excluded_series = ['s004', 's007', 's024', 's096', 's084', 's085']
keep_mask = ~raw_df['Seri'].isin(excluded_series)
df = raw_df.loc[keep_mask]

In [None]:
def construct_encodings(data, tokenizer, max_len, truncation=True, padding=True):
    """Tokenize `data` and report any sequence longer than `max_len`.

    Args:
        data: Text input handed straight to `tokenizer`.
        tokenizer: Callable accepting `(data, max_length=, truncation=, padding=)`
            and returning a mapping that has an 'input_ids' entry.
        max_len: Value forwarded as `max_length`; also the threshold for the warning.
        truncation: Forwarded unchanged to `tokenizer`.
        padding: Forwarded unchanged to `tokenizer`.

    Returns:
        Whatever `tokenizer` returned, unmodified.

    NOTE(review): with truncation enabled the tokenizer presumably already cuts
    sequences to `max_len`, so the warning is only expected when truncation is off.
    """
    encodings = tokenizer(data, max_length=max_len, truncation=truncation, padding=padding)

    # Warn about every sequence that is longer than max_len.
    too_long = [pos for pos, ids in enumerate(encodings['input_ids']) if len(ids) > max_len]
    for pos in too_long:
        print(f"Warning: Sequence at index {pos} is longer than {max_len} tokens.")

    return encodings


In [None]:
!pip install sentencepiece
from transformers import AlbertTokenizer

x = df['PreprocessedContent'].tolist()
y = df['Label'].tolist()

tkzr = AlbertTokenizer.from_pretrained("albert-base-v2")
encodings = construct_encodings(x, tokenizer=tkzr, max_len=20)




Downloading (…)ve/main/spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

In [None]:
def construct_tfdataset(encodings, y=None):
    """Wrap tokenizer output (and optionally labels) in a `tf.data.Dataset`.

    Args:
        encodings: Mapping of feature name to per-sample values; converted with `dict()`.
        y: Optional labels, one per sample. May be a list or an array.
            `None` or an empty sequence yields a features-only dataset.

    Returns:
        A dataset of `(features, label)` pairs when labels are supplied, otherwise a
        dataset of feature dicts (used for predictions on unseen samples after training).
    """
    features = dict(encodings)

    # An explicit None/length check replaces `if y:`, which raises
    # "truth value of an array is ambiguous" for numpy label arrays.
    has_labels = y is not None and len(y) > 0
    if has_labels:
        return tf.data.Dataset.from_tensor_slices((features, y))

    # this case is used when making predictions on unseen samples after training
    return tf.data.Dataset.from_tensor_slices(features)

tfdataset = construct_tfdataset(encodings, y)

In [None]:
# Quick look at the first encoded sample: its token ids, then its attention mask.
for field in ('input_ids', 'attention_mask'):
    print(encodings[field][0])

[2, 134, 80, 18, 21, 449, 1288, 80, 10863, 10863, 14877, 14, 11217, 17, 10863, 14, 1936, 368, 17, 3]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


## APPROACH 1 : USING HIGH-LEVEL API TO TRAIN + SPLIT 7-3 (TRAIN-TEST)

In [None]:
# 70% for train, 30% for test
TEST_SPLIT = 0.3
BATCH_SIZE = 4
SPLIT_SEED = 42  # fixed seed so the split is reproducible across runs
train_size = int(len(x) * (1-TEST_SPLIT))

# Shuffle ONCE before splitting. By default `shuffle` reshuffles on every iteration,
# which makes `take`/`skip` return different samples each epoch and lets test samples
# leak into training. `reshuffle_each_iteration=False` keeps the order (and therefore
# the train/test membership) fixed.
tfdataset = tfdataset.shuffle(len(x), seed=SPLIT_SEED, reshuffle_each_iteration=False)
tfdataset_train = tfdataset.take(train_size)
tfdataset_test = tfdataset.skip(train_size)

# Per-epoch reshuffling is still wanted for training, but only within the training split.
tfdataset_train = tfdataset_train.shuffle(train_size).batch(BATCH_SIZE)
tfdataset_test = tfdataset_test.batch(BATCH_SIZE)

In [None]:
from sklearn.model_selection import KFold
from tensorflow.keras.optimizers import Adam
import numpy as np
from tensorflow.keras import activations, optimizers, losses


In [None]:
# Fine-tune a pretrained ALBERT sequence classifier with the Keras high-level API.
N_EPOCHS = 10
MODEL_NAME = 'albert-base-v2'
model = TFAlbertForSequenceClassification.from_pretrained(MODEL_NAME)
optimizer = optimizers.Adam(learning_rate=1e-5)
# from_logits=True: the loss is given the model's raw logits, not probabilities.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])
# `tfdataset_train` is already batched, so `batch_size` must not be passed to fit()
# (Keras documents that it should not be specified for dataset inputs).
model.fit(tfdataset_train, epochs=N_EPOCHS)

All PyTorch model weights were used when initializing TFAlbertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFAlbertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7c924d4d1b70>

In [None]:
# Check accuracy on the held-out split.
# `tfdataset_test` is already batched, so `batch_size` is not passed to evaluate().
benchmarks = model.evaluate(tfdataset_test, return_dict=True)
accuracy = benchmarks['accuracy']
print(f'The accuracy of the model is {round(accuracy*100)}%')

The accuracy of the model is 97%


In [None]:
import torch

# Load the separate evaluation CSV and pull out its text and label columns.
testDF = pd.read_csv('preprocessed_data2020.csv')
test_text = testDF['Preprocessed_Content']
test_label = testDF['label']

# Print the shape of the extracted arrays
print("test_text shape:", test_text.shape)
print("test_label shape:", test_label.shape)

# Tokenize every text to exactly 40 tokens (pad short ones, truncate long ones).
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
# NOTE(review): the training data was encoded with max_len=20 — confirm 40 is intended here.
tokens_test = tkzr.batch_encode_plus(
    test_text.tolist(),
    max_length = 40,
    padding='max_length',
    truncation=True
)

# Kept as torch tensors because the following cell converts them via `.numpy()`.
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_label.tolist())

test_text shape: (48,)
test_label shape: (48,)




In [None]:
from sklearn.metrics import classification_report
import tensorflow as tf

# Move the token ids and attention masks from PyTorch to TensorFlow (via numpy).
test_seq_tf = tf.convert_to_tensor(test_seq.numpy())
test_mask_tf = tf.convert_to_tensor(test_mask.numpy())

# Prefer the first GPU when TensorFlow reports one, otherwise stay on the CPU.
gpu_available = tf.config.list_physical_devices('GPU')
if gpu_available:
    device = "/GPU:0"
else:
    device = "/CPU:0"

# Single forward pass over the whole evaluation set.
# NOTE(review): the list input is presumably mapped positionally to
# (input_ids, attention_mask) — confirm against the model's call signature.
with tf.device(device):
    preds = model([test_seq_tf, test_mask_tf])

# Take the highest-scoring class per sample and compare it with the true labels.
logits = preds.logits.numpy()
preds = logits.argmax(axis=1)
report = classification_report(test_y, preds)
print(report)


              precision    recall  f1-score   support

           0       0.70      0.88      0.78        24
           1       0.83      0.62      0.71        24

    accuracy                           0.75        48
   macro avg       0.77      0.75      0.75        48
weighted avg       0.77      0.75      0.75        48



In [None]:
# Mount Google Drive inside the Colab runtime so the model can be written to it.
from google.colab import drive
drive.mount('/content/drive')


# Save whichever model is currently bound to `model` into this Drive folder.
# The mount above must have succeeded before this path is writable.
model_directory = "/content/drive/MyDrive/AlBERT/Accuracy77"
model.save(model_directory)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## APPROACH 2 : TRAINING LOOP + USING K-FOLD CROSS VALIDATION

In [None]:
# Initialize KFold cross-validator
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
BATCH_SIZE = 16
N_EPOCHS = 5
# Same fine-tuning learning rate as Approach 1. The previous 2e-4 was 20x higher, and
# the recorded run showed the loss jumping to ~2.9 before flattening at ln(2).
LEARNING_RATE = 1e-5

# Initialize lists to store metrics across folds (one per-epoch list per fold)
fold_train_losses = []
fold_train_accuracies = []
fold_val_losses = []
fold_val_accuracies = []

# Convert every tokenizer output (input ids, attention mask, ...) and the labels to
# NumPy arrays so they can be indexed with the fold indices. Separate names are used
# so the raw texts in `x` and the labels in `y` are not overwritten.
fold_features = {name: np.array(values) for name, values in dict(encodings).items()}
fold_labels = np.array(y)

# TRAINING LOOP
for fold, (train_idx, val_idx) in enumerate(kf.split(fold_labels)):
    print(f"Fold {fold + 1}/{N_FOLDS}")

    # Feed the full feature dict, not only input_ids, so padding positions are
    # masked out exactly as in Approach 1.
    x_train = {name: values[train_idx] for name, values in fold_features.items()}
    y_train = fold_labels[train_idx]
    x_val = {name: values[val_idx] for name, values in fold_features.items()}
    y_val = fold_labels[val_idx]

    # Create and compile a fresh model for this fold.
    # NOTE: after the loop, `model` refers to the model trained on the LAST fold.
    model = TFAlbertForSequenceClassification.from_pretrained(MODEL_NAME)
    optimizer = Adam(learning_rate=LEARNING_RATE)
    # The model returns raw logits, so the loss must be told so. The string alias
    # 'sparse_categorical_crossentropy' assumes probabilities, which pinned the loss
    # at ln(2) ~= 0.6931 in the earlier run.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

    # Create TensorFlow Datasets
    tfdataset_train = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    tfdataset_train = tfdataset_train.shuffle(len(y_train)).batch(BATCH_SIZE)

    tfdataset_val = tf.data.Dataset.from_tensor_slices((x_val, y_val))
    tfdataset_val = tfdataset_val.batch(BATCH_SIZE)

    # Train the model (the explicit optimizer.build call was dropped; fit() prepares
    # the optimizer itself).
    history = model.fit(tfdataset_train, epochs=N_EPOCHS, validation_data=tfdataset_val)

    # Store metrics for this fold
    fold_train_losses.append(history.history['loss'])
    fold_train_accuracies.append(history.history['accuracy'])
    fold_val_losses.append(history.history['val_loss'])
    fold_val_accuracies.append(history.history['val_accuracy'])

# Calculate average metrics across all folds (axis=0 -> one average per epoch)
avg_train_loss = np.mean(fold_train_losses, axis=0)
avg_train_accuracy = np.mean(fold_train_accuracies, axis=0)
avg_val_loss = np.mean(fold_val_losses, axis=0)
avg_val_accuracy = np.mean(fold_val_accuracies, axis=0)

print("Average Training Loss:", avg_train_loss)
print("Average Training Accuracy:", avg_train_accuracy)
print("Average Validation Loss:", avg_val_loss)
print("Average Validation Accuracy:", avg_val_accuracy)



Fold 1/5


All PyTorch model weights were used when initializing TFAlbertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFAlbertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold 2/5


All PyTorch model weights were used when initializing TFAlbertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFAlbertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold 3/5


All PyTorch model weights were used when initializing TFAlbertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFAlbertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold 4/5


All PyTorch model weights were used when initializing TFAlbertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFAlbertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold 5/5


All PyTorch model weights were used when initializing TFAlbertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFAlbertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Average Training Loss: [2.9036983  0.83031548 0.69314718 0.71915703 0.69314718]
Average Training Accuracy: [0.50240963 0.48066413 0.48551279 0.49030268 0.50984426]
Average Validation Loss: [0.68621572 0.69314718 0.69314718 0.69314718 0.69314718]
Average Validation Accuracy: [0.58238096 0.53476191 0.54428572 0.54428572 0.45857143]


## FUNCTIONS TO TEST ON REAL DATA (COMPLETELY NEW DATA)

In [None]:
# Function used to test a single data sample (one transcript)
def create_predictor(model, model_name, max_len):
    """Build a closure that scores one text with `model`.

    Args:
        model: Trained classifier; its `predict` output must expose `.logits`.
        model_name: Checkpoint name used to load the matching ALBERT tokenizer.
        max_len: Maximum token length forwarded to `construct_encodings`.

    Returns:
        `predict_proba(text)`, which returns the softmax probability at class index 0.
        NOTE(review): callers treat this value as the Alzheimer probability — confirm
        that label 0 really is the Alzheimer class in the training data.
    """
    # The model is ALBERT, so the ALBERT tokenizer is required. The previous
    # DistilBertTokenizer was never imported and would raise NameError.
    tkzr = AlbertTokenizer.from_pretrained(model_name)

    def predict_proba(text):
        """Return the class-0 probability for a single `text`."""
        x = [text]

        encodings = construct_encodings(x, tkzr, max_len=max_len)
        tfdataset = construct_tfdataset(encodings)
        tfdataset = tfdataset.batch(1)

        preds = model.predict(tfdataset).logits
        preds = activations.softmax(tf.convert_to_tensor(preds)).numpy()
        # First (only) sample, class index 0.
        return preds[0][0]

    return predict_proba

In [None]:
# USAGE EXAMPLE

# Build a predictor limited to 20 tokens and score one sample text.
clf = create_predictor(model, MODEL_NAME, 20)
proba = clf('''Hello
''')

# Scores above 0.5 are reported as Alzheimer, everything else as healthy.
# NOTE(review): `proba` is the probability at class index 0 — confirm that index 0
# is the Alzheimer class.
if proba > 0.5:
    report_lines = [
        'Model prediction : Alzheimer',
        f'The percentage of Alzheimer is : {round(proba*100)}%',
    ]
else:
    report_lines = [
        "Model prediction : Healthy",
        f'The percentage of healthy is : {round(100-proba*100)}%',
    ]
for line in report_lines:
    print(line)


In [None]:
# Predictor factory used to score a whole dataset (many transcripts), one text at a time.
# The usage cell that applies it also downloads the resulting dataset to the local machine.

def create_predictor(model, model_name, max_len):
    """Return a function that maps one text to its class-0 softmax probability.

    Args:
        model: Classifier whose `predict` output exposes `.logits`.
        model_name: Checkpoint name passed to `AlbertTokenizer.from_pretrained`.
        max_len: Maximum token length forwarded to `construct_encodings`.
    """
    tokenizer = AlbertTokenizer.from_pretrained(model_name)

    def predict_proba(text):
        """Score a single `text` and return the probability at class index 0."""
        # Encode the text as a one-element batch without labels.
        single_encoding = construct_encodings([text], tokenizer, max_len=max_len)
        single_batch = construct_tfdataset(single_encoding).batch(1)

        logits = model.predict(single_batch).logits
        probabilities = activations.softmax(tf.convert_to_tensor(logits)).numpy()
        first_sample = probabilities[0]
        return first_sample[0]

    return predict_proba


In [None]:
# USAGE EXAMPLE

# Load the new DataFrame
new_df = pd.read_csv("./preprocessed_data2020.csv")

# Define the maximum sequence length
max_len = 50

# Create the predictor function, reusing the length defined above
predict_proba = create_predictor(model, "albert-base-v2", max_len)

# Score every transcript and store the result in a new "Predictions" column
predictions = new_df["Preprocessed_Content"].apply(predict_proba)
new_df["Predictions"] = predictions

# Write the scored table to disk, then offer it as a browser download (Colab only)
csv_filename = "test_data2020(2).csv"
new_df.to_csv(csv_filename, index=False)

from google.colab import files
files.download(csv_filename)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>