In [None]:
# Remove whatever TensorFlow build is already installed so that the pinned
# version can be installed by the following cell (-y skips the confirmation prompt).
!pip uninstall tensorflow -y

Found existing installation: tensorflow 2.15.0
Uninstalling tensorflow-2.15.0:
  Successfully uninstalled tensorflow-2.15.0


In [None]:
# Install the TensorFlow release this notebook is pinned to.
# NOTE(review): the recorded output of this cell reports that the installed
# tf-keras 2.17.0 requires tensorflow>=2.17,<2.18 - confirm the mismatch is harmless.
!pip install tensorflow==2.15.0

Collecting tensorflow==2.15.0
  Using cached tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Using cached tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)
Installing collected packages: tensorflow
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tf-keras 2.17.0 requires tensorflow<2.18,>=2.17, but you have tensorflow 2.15.0 which is incompatible.[0m[31m
[0mSuccessfully installed tensorflow-2.15.0


In [None]:
# Confirm which TensorFlow version the kernel actually picked up after the reinstall.
import tensorflow
print(tensorflow.__version__)

2.15.0


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dropout, Layer
from tensorflow.keras.layers import Embedding, Input, GlobalAveragePooling1D, Dense, Flatten
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential, Model
from keras.callbacks import ModelCheckpoint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

In [None]:
class TransformerBlock(Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    The output of each sub-layer goes through dropout, is added back to that
    sub-layer's input and is then layer-normalized.

    Args:
        embed_dim: Width of the last input dimension. It is also used as the
            `key_dim` of the attention layer and as the output width of the
            feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden Dense layer of the feed-forward network.
        rate: Dropout rate used after attention and after the feed-forward
            network.
        **kwargs: Standard Keras `Layer` arguments (e.g. `name`, `dtype`,
            `trainable`). Accepting them lets the layer be rebuilt from its
            config when a saved model is loaded.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [Dense(ff_dim, activation="relu"),
             Dense(embed_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs`.

        Args:
            inputs: Tensor used as both query and value of the attention layer.
            training: Forwarded to the two dropout layers. Defaults to None so
                the layer can be called without the flag.

        Returns:
            The layer-normalized sum of the first sub-layer's output and the
            (dropout-regularized) feed-forward output.
        """
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [None]:
class TokenAndPositionEmbedding(Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embeddings.
        **kwargs: Standard Keras `Layer` arguments (e.g. `name`, `dtype`,
            `trainable`). Accepting them lets the layer be rebuilt from its
            config when a saved model is loaded.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids `x` and add the embedding of each position index.

        Positions are 0 .. (size of the last axis of `x`) - 1, looked up in
        the position table and added to the token embeddings.
        """
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [None]:
# Load the labeled tweet dataset; the recorded output shows two columns,
# `text` (object) and `cluster` (int64).
# NOTE(review): the absolute /content/... path ties this cell to one
# environment - consider a configurable data directory.
df = pd.read_csv('/content/labeled_twitter_DEENT_km2_Exp-50w_remove-7words_SHAP-LIME_improve.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123177 entries, 0 to 123176
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   text     123177 non-null  object
 1   cluster  123177 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.9+ MB


In [None]:
# Data-quality sanity checks: total count of null cells (computed twice, via
# isnull() and via isna()) and the number of fully duplicated rows.
print(df.isnull().sum().sum())
print(df.isna().sum().sum())
print(df.duplicated().sum().sum())

0
0
0


# DEEN-Transformer Classification Model

In [None]:
from sklearn.preprocessing import LabelEncoder

# Features are the raw tweet strings; targets are the `cluster` column passed
# through a LabelEncoder so the labels become consecutive integer codes.
X = df['text']
encoder = LabelEncoder()
y = encoder.fit_transform(df['cluster'])
print("shape of input data: ", X.shape)
print("shape of target variable: ", y.shape)

shape of input data:  (123177,)
shape of target variable:  (123177,)


In [None]:
# Hold out 20% of the rows as the test split. The split is stratified on y
# and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras import regularizers

# Text-vectorization hyperparameters.
max_words = 5000     # passed to Tokenizer(num_words=...): caps the token ids it emits
max_len = 200        # every tweet is padded / truncated to this many token ids
embedding_dim = 128  # width of the token and position embeddings

# Fit the vocabulary on the training split only, then encode both splits
# with that same vocabulary.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')
# word_index lists every word seen, but texts_to_sequences only emits ids
# below num_words. Sizing the embedding table from len(word_index) would
# allocate rows that can never be looked up, so cap it at max_words.
vocab_size = min(max_words, len(word_index) + 1)

X_train_tweets = pad_sequences(X_train_sequences, maxlen=max_len)
X_test_tweets = pad_sequences(X_test_sequences, maxlen=max_len)

Found 77203 unique tokens.


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Oversample the training split only (the test split is left untouched).
# sampling_strategy=1 asks for a 1:1 class ratio; the recorded output below
# shows both classes at 55979 rows after resampling.
# NOTE(review): the features given to SMOTE are padded token-id sequences.
# SMOTE builds synthetic rows from existing feature vectors, so the synthetic
# rows may not correspond to any real text - confirm this is intended, or
# consider random oversampling / class weights instead.
oversample = SMOTE(sampling_strategy=1, random_state=42)
over_X_train, over_y_train = oversample.fit_resample(X_train_tweets, y_train)

In [None]:
from collections import Counter

# Class distribution of the training labels before and after oversampling.
print(f'Original dataset shape {Counter(y_train)}')
print(f'Resampled dataset shape {Counter(over_y_train)}')

Original dataset shape Counter({0: 55979, 1: 42562})
Resampled dataset shape Counter({1: 55979, 0: 55979})


In [None]:
# Per-class row counts (label 0, then label 1) for the resampled training
# labels ...
print(np.count_nonzero(over_y_train == 0))
print(np.count_nonzero(over_y_train == 1))

# ... and for the test labels, which were not resampled.
print(np.count_nonzero(y_test == 0))
print(np.count_nonzero(y_test == 1))

55979
55979
13995
10641


In [None]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall computed on the tensors passed in.

    True positives are counted as the rounded, [0, 1]-clipped product
    `y_true * y_pred`; actual positives as the rounded, clipped `y_true`.
    `K.epsilon()` in the denominator avoids division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision computed on the tensors passed in.

    True positives are counted as the rounded, [0, 1]-clipped product
    `y_true * y_pred`; predicted positives as the rounded, clipped `y_pred`.
    `K.epsilon()` in the denominator avoids division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of `precision_m` and `recall_m`.

    `K.epsilon()` is added to the denominator so the result is defined when
    both precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

# matthews_correlation_coefficient
def mcc_m(y_true, y_pred):
    """Matthews correlation coefficient computed on the tensors passed in.

    The four confusion-matrix cells are each counted as the sum of a rounded,
    [0, 1]-clipped product of `y_true` / `1 - y_true` with `y_pred` /
    `1 - y_pred`. `K.epsilon()` is added under the square root so the
    denominator is never exactly zero.
    """
    def _rounded_count(product):
        # Clip to [0, 1], round to 0/1, then count the ones.
        return K.sum(K.round(K.clip(product, 0, 1)))

    neg_true = 1 - y_true
    neg_pred = 1 - y_pred

    tp = _rounded_count(y_true * y_pred)
    tn = _rounded_count(neg_true * neg_pred)
    fp = _rounded_count(neg_true * y_pred)
    fn = _rounded_count(y_true * neg_pred)

    numerator = tp * tn - fp * fn
    denominator = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    return numerator / K.sqrt(denominator + K.epsilon())

def balanced_acc_m(y_true, y_pred):
    """Balanced accuracy: the mean of sensitivity and specificity.

    The four confusion-matrix cells are each counted as the sum of a rounded,
    [0, 1]-clipped product of `y_true` / `1 - y_true` with `y_pred` /
    `1 - y_pred`.

    `K.epsilon()` is added to both denominators so that input containing only
    one class (tp + fn == 0 or fp + tn == 0) yields 0 for the undefined rate
    instead of NaN.
    """
    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    tn = K.sum(K.round(K.clip((1 - y_true) * (1 - y_pred), 0, 1)))
    fp = K.sum(K.round(K.clip((1 - y_true) * y_pred, 0, 1)))
    fn = K.sum(K.round(K.clip(y_true * (1 - y_pred), 0, 1)))

    # Guard the divisions themselves; the constant 2 needs no epsilon.
    sensitivity = tp / (tp + fn + K.epsilon())
    specificity = tn / (fp + tn + K.epsilon())
    return (sensitivity + specificity) / 2

In [None]:
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

# Functional model: token ids -> token+position embedding -> one Transformer
# block -> average pooling -> small tanh MLP -> sigmoid probability.
inputs = Input(shape=(max_len,))
embedding_layer = TokenAndPositionEmbedding(max_len, vocab_size, embedding_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embedding_dim, num_heads, ff_dim)
# Do NOT pass training=True here: a literal flag is baked into the graph and
# would keep the block's dropout active during validation and prediction.
# Called without it, Keras supplies the training flag itself (on during fit,
# off during evaluate/predict).
x = transformer_block(x)
x = GlobalAveragePooling1D()(x)
# Flatten is retained from the original graph; presumably a no-op after the
# pooling layer - TODO confirm before removing.
x = Flatten()(x)
x = Dropout(0.1)(x)
x = Dense(32, activation="tanh")(x)
x = Dense(8, activation="tanh")(x)
x = Dropout(0.1)(x)
# Single sigmoid unit for the binary label, with L2 weight regularization.
outputs = Dense(1, activation="sigmoid", kernel_regularizer=regularizers.l2(0.1))(x)

model = Model(inputs=inputs, outputs=outputs)

#model.summary()

In [None]:
# Binary cross-entropy with Adam; the custom metric functions are reported
# alongside the built-in accuracy and AUC.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss="binary_crossentropy",
    metrics=['accuracy', balanced_acc_m, precision_m, recall_m, f1_m, mcc_m, keras.metrics.AUC()],
)

# Keep only the best full model (by validation accuracy), checked once per
# epoch. `save_freq='epoch'` replaces the deprecated `period=1` argument.
checkpoint = ModelCheckpoint(
    "best_epoch_transformer_model_improve_smote.h5",
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    mode='auto',
    save_freq='epoch',
    save_weights_only=False,
)

# Stop once validation accuracy has not improved for 10 consecutive epochs.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=10)

# NOTE(review): the held-out test split is passed as validation_data, and both
# callbacks above select on it, so test metrics from the chosen checkpoint are
# optimistically biased - consider carving a validation split out of the
# training data instead.
history = model.fit(
    over_X_train,
    over_y_train,
    batch_size=64,
    epochs=50,
    validation_data=(X_test_tweets, y_test),
    callbacks=[checkpoint, stop_early],
)



Epoch 1/50
Epoch 1: val_accuracy improved from -inf to 0.79920, saving model to best_epoch_transformer_model_improve_smote.h5


  saving_api.save_model(


Epoch 2/50
Epoch 2: val_accuracy improved from 0.79920 to 0.81819, saving model to best_epoch_transformer_model_improve_smote.h5
Epoch 3/50
Epoch 3: val_accuracy improved from 0.81819 to 0.82071, saving model to best_epoch_transformer_model_improve_smote.h5
Epoch 4/50
Epoch 4: val_accuracy did not improve from 0.82071
Epoch 5/50
Epoch 5: val_accuracy did not improve from 0.82071
Epoch 6/50
Epoch 6: val_accuracy did not improve from 0.82071
Epoch 7/50
Epoch 7: val_accuracy did not improve from 0.82071
Epoch 8/50
Epoch 8: val_accuracy did not improve from 0.82071
Epoch 9/50
Epoch 9: val_accuracy did not improve from 0.82071
Epoch 10/50
Epoch 10: val_accuracy did not improve from 0.82071
Epoch 11/50
Epoch 11: val_accuracy did not improve from 0.82071
Epoch 12/50
Epoch 12: val_accuracy did not improve from 0.82071
Epoch 13/50
Epoch 13: val_accuracy did not improve from 0.82071


In [None]:
import json

# Serialize the per-epoch metrics recorded by model.fit (history.history) to a
# JSON string ...
results_transformer = json.dumps(history.history)

# ... and write it to disk so the training curves can be inspected later.
with open('results_transf_full_clean_smote-train_depressive.json', 'w') as archivo:
    archivo.write(results_transformer)