# Step 1: Install necessary dependencies

In [1]:
!pip install transformers --quiet
!pip install seaborn --quiet
!pip install sklearn --quiet


[notice] A new release of pip is available: 24.1.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.1.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip
  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issu

In [2]:
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm





# Step 2: Load and Prepare the BERT Model

In [3]:
def load_bert_model(trainable=False):
    """
    Load the pre-trained BERT model with the option to freeze or unfreeze its layers.
    """
    model = TFBertModel.from_pretrained("bert-base-uncased")
    model.bert.trainable = trainable
    return model

# Step 3: Build the Model Architecture

In [4]:
def build_classification_model(num_classes, bert_model, max_length=140, learning_rate=2e-5):
    """
    Build a classification model with BERT as the base and a Dense layer for classification.
    """
    # Input layers
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')
    
    # BERT output
    outputs = bert_model(input_ids, attention_mask=attention_mask)
    pooled_output = outputs.pooler_output
    
    # Dropout and Dense layer for classification
    x = tf.keras.layers.Dropout(0.2)(pooled_output)
    output_layer = tf.keras.layers.Dense(num_classes, activation='softmax')(x)
    
    # Build and compile the model
    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output_layer)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),  
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model

# Step 4: Load and Prepare the Data

In [5]:
def load_data(filepath):
    return pd.read_csv(filepath)

def preprocess_data(df, target_col, num_classes):
    y = tf.keras.utils.to_categorical(df[target_col].values, num_classes=num_classes)
    sentences = df['content'].values    
    return sentences, y

def tokenize_data(tokenizer, sentences, max_length=140):
    tokens = tokenizer(sentences, max_length=max_length, padding=True, truncation=True, return_tensors='tf')
    return tokens['input_ids'], tokens['attention_mask']

# Step 5: Train and Evaluate the Model

In [6]:
def train_and_evaluate_model(model, X_train, y_train, X_val, y_val, epochs=5):
    history = model.fit(X_train, y_train, epochs=epochs, validation_data=(X_val, y_val), batch_size=32)
    return history

def evaluate_model(model, X_test, y_test, class_names):
    y_pred = model.predict(X_test)
    y_pred = np.argmax(y_pred, axis=1)
    y_test = np.argmax(y_test, axis=1)
    print(classification_report(y_test, y_pred, target_names=class_names))

# Excecution

# 1. Load the BERT model

In [7]:
bert_model = load_bert_model(trainable=False)




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

# 2. Load and prepare the data

In [8]:
df = load_data('reviews.csv')

# 3. Process data for 3 classes

In [9]:
df['sentiment_3_classes'] = df.score.apply(lambda x: 0 if x <= 2 else 1 if x == 3 else 2)
class_names_3 = ['negative', 'neutral', 'positive']
sentences_3, y_3_classes = preprocess_data(df, 'sentiment_3_classes', num_classes=3)

# 4. Split and tokenize the data

In [10]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
X_train_val, X_test, y_train_val, y_test_3_classes = train_test_split(sentences_3, y_3_classes, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

X_train = X_train.tolist()
X_val = X_val.tolist()
X_test = X_test.tolist()

X_train_tokens = tokenize_data(tokenizer, X_train)
X_val_tokens = tokenize_data(tokenizer, X_val)
X_test_tokens = tokenize_data(tokenizer, X_test)



# 5. Build and train the model for 3 classes

In [11]:
learning_rate = 2e-3
model_3_classes = build_classification_model(num_classes=3, bert_model=bert_model, learning_rate=learning_rate)
history_3_classes = train_and_evaluate_model(model_3_classes, [X_train_tokens[0], X_train_tokens[1]], y_train, [X_val_tokens[0], X_val_tokens[1]], y_val)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# 6. Evaluate the model for 3 classes

In [13]:
evaluate_model(model_3_classes, [X_test_tokens[0], X_test_tokens[1]], y_test_3_classes, class_names_3)

              precision    recall  f1-score   support

    negative       0.52      0.81      0.64       997
     neutral       0.53      0.08      0.14       979
    positive       0.64      0.79      0.70      1174

    accuracy                           0.58      3150
   macro avg       0.56      0.56      0.49      3150
weighted avg       0.57      0.58      0.51      3150



# 7. Process data for 5 classes

In [14]:
df['sentiment_5_classes'] = df.score.apply(lambda x: x - 1)
class_names_5 = ['1', '2', '3', '4', '5']
sentences_5, y_5_classes = preprocess_data(df, 'sentiment_5_classes', num_classes=5)

# 8. Split and tokenize the data

In [15]:
X_train_val, X_test, y_train_val, y_test_5_classes = train_test_split(sentences_5, y_5_classes, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

X_train = X_train.tolist()
X_val = X_val.tolist()
X_test = X_test.tolist()

X_train_tokens = tokenize_data(tokenizer, X_train)
X_val_tokens = tokenize_data(tokenizer, X_val)
X_test_tokens = tokenize_data(tokenizer, X_test)

# 9. Build and train the model for 5 classes

In [16]:
model_5_classes = build_classification_model(num_classes=5, bert_model=bert_model)
history_5_classes = train_and_evaluate_model(model_5_classes, [X_train_tokens[0], X_train_tokens[1]], y_train, [X_val_tokens[0], X_val_tokens[1]], y_val)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# 10. Evaluate the model for 5 classes

In [17]:
evaluate_model(model_5_classes, [X_test_tokens[0], X_test_tokens[1]], y_test_5_classes, class_names_5)

              precision    recall  f1-score   support

           1       0.50      0.00      0.01       515
           2       0.00      0.00      0.00       482
           3       0.31      0.99      0.48       979
           4       0.17      0.00      0.00       591
           5       0.24      0.01      0.01       583

    accuracy                           0.31      3150
   macro avg       0.24      0.20      0.10      3150
weighted avg       0.25      0.31      0.15      3150



# 11.

In [18]:
# Tokenizar los datos
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
max_length = 140

def tokenize(sentences):
    return bert_tokenizer(sentences, max_length=max_length, padding=True, truncation=True, return_tensors='tf')

def analyze_sentence(text, model_3_classes, model_5_classes, class_names_3, class_names_5):
    # Tokenizar la frase
    tokens = tokenize([text])
    
    # Predecir la clase para la frase con el modelo de 3 clases
    prediction_3_classes = model_3_classes.predict([tokens['input_ids'], tokens['attention_mask']])
    predicted_class_3 = np.argmax(prediction_3_classes, axis=1)[0]
    
    # Predecir la clase para la frase con el modelo de 5 clases
    prediction_5_classes = model_5_classes.predict([tokens['input_ids'], tokens['attention_mask']])
    predicted_class_5 = np.argmax(prediction_5_classes, axis=1)[0]
    
    # Obtener las etiquetas de las clases predichas
    pred_class_3 = class_names_3[predicted_class_3]
    pred_class_5 = class_names_5[predicted_class_5]
    
    print(f"Frase: {text}")
    print(f"Predicción (3 clases): {pred_class_3}")
    print(f"Predicción (5 clases): {pred_class_5}")
    print()
    
    return pred_class_3, pred_class_5



In [19]:
# Lista de frases para analizar
texts = [
    "This app is amazing, I use it every day!",
    "Terrible experience, the app crashes all the time.",
    "It's a decent app, but it has some bugs.",
    "The app is okay, but there are better alternatives.",
    "I hate the new update, it ruined everything.",
    "This app works perfectly."
]

# Analizar cada frase
for text in texts:
    analyze_sentence(text, model_3_classes, model_5_classes, class_names_3, class_names_5)

Frase: This app is amazing, I use it every day!
Predicción (3 clases): positive
Predicción (5 clases): 3

Frase: Terrible experience, the app crashes all the time.
Predicción (3 clases): negative
Predicción (5 clases): 3

Frase: It's a decent app, but it has some bugs.
Predicción (3 clases): negative
Predicción (5 clases): 3

Frase: The app is okay, but there are better alternatives.
Predicción (3 clases): negative
Predicción (5 clases): 3

Frase: I hate the new update, it ruined everything.
Predicción (3 clases): negative
Predicción (5 clases): 3

Frase: This app works perfectly.
Predicción (3 clases): positive
Predicción (5 clases): 3

