<a href="https://colab.research.google.com/github/selfproclaimedgenius1706/Alzheimer-s-Disease-Detection/blob/main/RoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import torch

In [None]:
%%capture
!python3 -m venv venv
!source venv/bin/activate
!pip install tensorflow transformers


In [None]:
import tensorflow as tf
from tensorflow.keras import activations, optimizers, losses
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification  # Import RoBERTa components

import pickle


In [None]:
# Training corpus, read from a CSV that must sit in the working directory.
TRAIN_CSV_PATH = 'dataToPadding.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Tokenization
def construct_encodings(data, tokenizer, max_len, truncation=True, padding=True):
    """Tokenize ``data`` and warn about samples that exceed ``max_len`` tokens.

    Args:
        data: Batch of texts handed to ``tokenizer`` as its first argument.
        tokenizer: Callable invoked as
            ``tokenizer(data, max_length=..., truncation=..., padding=...)``
            whose result exposes ``'input_ids'`` (and, optionally,
            ``'attention_mask'``) as per-sample sequences.
        max_len: Value forwarded as ``max_length`` and used as the warning
            threshold.
        truncation: Forwarded unchanged to the tokenizer.
        padding: Forwarded unchanged to the tokenizer.

    Returns:
        The object returned by ``tokenizer``, unmodified.
    """
    encodings = tokenizer(data, max_length=max_len, truncation=truncation, padding=padding)

    # Measure the real (unpadded) token count of every sample. Padding brings
    # all rows to a common length, so looking at len(input_ids) would flag
    # every sample as soon as a single one is too long. The attention mask has
    # a 1 for each real token, so its sum is the true length.
    if 'attention_mask' in encodings:
        token_counts = [sum(mask) for mask in encodings['attention_mask']]
    else:
        # No mask available: fall back to the raw row length.
        token_counts = [len(input_ids) for input_ids in encodings['input_ids']]

    # Alert if any sample is longer than max_len
    for idx, n_tokens in enumerate(token_counts):
        if n_tokens > max_len:
            print(f"Warning: Sequence at index {idx} is longer than {max_len} tokens.")

    return encodings



In [None]:
# Split the frame into x (feature: text) and y (label).
x = df['PreprocessedContent'].tolist()
y = df['Label'].tolist()

# Load the pretrained tokenizer for the chosen checkpoint.
MODEL_NAME = 'roberta-base'
tkzr = RobertaTokenizer.from_pretrained(MODEL_NAME)

# Tokenize every training text with the maximum sequence length below.
MAX_LEN = 40
encodings = construct_encodings(x, tkzr, MAX_LEN)


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# Combine x and y into a single dataset for train/test
def construct_tfdataset(encodings, y=None):
    """Wrap tokenizer output (and optional labels) in a ``tf.data.Dataset``.

    Args:
        encodings: Mapping-like tokenizer output; it is converted with
            ``dict(encodings)`` before slicing.
        y: Optional labels, one per sample. ``None`` means "no labels".

    Returns:
        A dataset of ``(features_dict, label)`` pairs when ``y`` is given,
        otherwise a dataset of ``features_dict`` only.
    """
    # Compare against None instead of relying on truthiness: `if y:` raises
    # for multi-element arrays (ambiguous truth value) and silently discards
    # labels for falsy containers such as np.array([0]).
    if y is not None:
        return tf.data.Dataset.from_tensor_slices((dict(encodings), y))

    # this case is used when making predictions on unseen samples after training
    return tf.data.Dataset.from_tensor_slices(dict(encodings))

# Pair every tokenized sample with its label in one dataset.
tfdataset = construct_tfdataset(encodings, y=y)


In [None]:
# 70% for train, 30% for test
TEST_SPLIT = 0.3
train_size = int(len(x) * (1 - TEST_SPLIT))

# Set batch size
BATCH_SIZE = 2

# Fixed seed so the split is identical on every Restart & Run All.
SEED = 42

# Shuffle ONCE before splitting. With the default
# reshuffle_each_iteration=True the order is redrawn every time the dataset is
# iterated (each epoch, each evaluation pass), so take()/skip() would pick
# different samples each time and test samples would leak into training.
tfdataset = tfdataset.shuffle(len(x), seed=SEED, reshuffle_each_iteration=False)

# Training split: the first train_size samples, reshuffled among themselves
# every epoch, then batched. max(..., 1) keeps the buffer size valid when the
# split is empty.
tfdataset_train = (
    tfdataset
    .take(train_size)
    .shuffle(max(train_size, 1))
    .batch(BATCH_SIZE)
)

# Test split: everything after the training samples, batched in a fixed order.
tfdataset_test = tfdataset.skip(train_size).batch(BATCH_SIZE)

In [None]:
from sklearn.model_selection import KFold
from tensorflow.keras.optimizers import Adam
import numpy as np


In [None]:
N_EPOCHS = 20

# Pretrained RoBERTa encoder with a sequence-classification head on top.
model = TFRobertaForSequenceClassification.from_pretrained(MODEL_NAME)
optimizer = optimizers.Adam(learning_rate=1e-5)
# The model's outputs are read as logits later on, hence from_logits=True.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# tfdataset_train is already batched with BATCH_SIZE; Keras documents that
# batch_size must not be passed when the input is a tf.data.Dataset.
# Keeping the History object makes the per-epoch metrics available afterwards.
history = model.fit(tfdataset_train, epochs=N_EPOCHS)

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7d4b6edb6b60>

In [None]:
# Load the separate evaluation CSV and pull out its text and label columns.
testDF = pd.read_csv('preprocessed_data2020.csv')
test_text = testDF['Preprocessed_Content']
test_label = testDF['label']

# Print the shape of the extracted arrays
print("test_text shape:", test_text.shape)
print("test_label shape:", test_label.shape)

# Tokenize the evaluation texts, padding/truncating every row to 40 tokens.
# padding='max_length' is the supported spelling of the deprecated
# pad_to_max_length=True.
tokens_test = tkzr.batch_encode_plus(
    test_text.tolist(),
    max_length=40,
    padding='max_length',
    truncation=True
)

# Torch tensors; a later cell converts them to TensorFlow via .numpy().
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_label.tolist())

test_text shape: (48,)
test_label shape: (48,)




In [None]:
from sklearn.metrics import classification_report
import tensorflow as tf

# Bridge the PyTorch evaluation tensors over to TensorFlow through NumPy.
test_seq_tf = tf.convert_to_tensor(test_seq.numpy())
test_mask_tf = tf.convert_to_tensor(test_mask.numpy())

# Pick the first GPU when TensorFlow reports one; otherwise stay on the CPU.
gpu_available = tf.config.list_physical_devices('GPU')
if gpu_available:
    device = "/GPU:0"
else:
    device = "/CPU:0"

# One forward pass over the whole evaluation set; the inputs are passed
# positionally as a list (token ids first, attention mask second).
with tf.device(device):
    outputs = model([test_seq_tf, test_mask_tf])

# The highest-scoring class in each row of logits is the predicted label.
logits = outputs.logits.numpy()
preds = logits.argmax(axis=1)
print(classification_report(test_y, preds))


              precision    recall  f1-score   support

           0       0.67      0.75      0.71        24
           1       0.71      0.62      0.67        24

    accuracy                           0.69        48
   macro avg       0.69      0.69      0.69        48
weighted avg       0.69      0.69      0.69        48



In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Write the fine-tuned model into the mounted Drive folder.
model_directory = "/content/drive/MyDrive/RoBERTa"
model.save_pretrained(model_directory)
# Store the tokenizer files next to the weights so the directory can be
# reloaded for inference on its own, without re-downloading the tokenizer.
tkzr.save_pretrained(model_directory)


Mounted at /content/drive
