Step 1: Install Required Libraries

In [None]:
!pip install tensorflow tensorflow-text tensorflow-hub scikit-learn pandas numpy matplotlib seaborn

Step 2: Import Libraries

In [None]:
# Core
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text  # Required for BERT preprocessing

# Data handling
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Utilities
import re
import string

# Report the library versions this run is using
print(f"TensorFlow version: {tf.__version__}")
print(f"TF-Hub version: {hub.__version__}")

Step 3: Load and Prepare Sample Data

In [None]:
# Synthetic demo corpus: eight short messages, alternating 'normal' / 'anomaly'
# (replace with real logs/feedback in practice).
data = {
    'text': [
        "The system is running smoothly with no issues.",
        "Critical error in database connection!",
        "Response time is excellent.",
        # \u2014 is an em dash; written as an escape so the sentence survives
        # a round trip through a mis-detected file encoding.
        "CPU usage spiked to 95% \u2014 investigate immediately.",
        "User interface is intuitive and fast.",
        "Application crashed twice today.",
        "Backup completed successfully.",
        "Network latency causing timeouts."
    ],
    'label': ['normal', 'anomaly', 'normal', 'anomaly', 'normal', 'anomaly', 'normal', 'anomaly']
}

# One row per message, with columns 'text' and 'label'
df = pd.DataFrame(data)

# For real use: df = pd.read_csv('your_logs.csv')
print("Dataset shape:", df.shape)
print("\nClass distribution:")
print(df['label'].value_counts())

Step 4: Text Preprocessing (Basic Cleaning)

In [None]:
def clean_text(text):
    """Normalize a raw log/feedback message.

    Steps, in order: lowercase; remove bracketed log tags, URLs and HTML tags;
    strip ASCII punctuation; remove any token containing a digit; collapse all
    whitespace runs (including newlines) to single spaces.

    Args:
        text: The raw message. ``None`` and float NaN (the missing-value
            marker pandas produces when reading files) are treated as empty.

    Returns:
        The cleaned string, with words separated by exactly one space and no
        leading or trailing whitespace. Empty string for missing input.
    """
    # Missing values would otherwise fail on .lower(); NaN is the only float != itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)  # Remove log tags [INFO], [ERROR]
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words with numbers
    # The removals above leave gaps ("to 95% today" -> "to  today"); splitting on
    # any whitespace and re-joining also takes care of newlines and tabs.
    return ' '.join(text.split())

# Run every raw message through clean_text and keep the result next to the original
df['cleaned_text'] = df['text'].map(clean_text)

# Show the first two rows, raw vs. cleaned
print("\nSample cleaned text:")
print(df.loc[:, ['text', 'cleaned_text']].head(2))

Step 5: Encode Labels

In [None]:
# Fit a LabelEncoder on the string labels and store the integer codes in a new column
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['label'])

# Class name -> integer code lookup, for interpreting model outputs
label_map = {
    class_name: class_code
    for class_name, class_code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}
print(f"\nLabel mapping: {label_map}")

Step 6: Split Data

In [None]:
# Features are the cleaned strings; targets are the integer-encoded labels
X = df['cleaned_text'].values
y = df['encoded_label'].values

# Hold out 25% for testing; stratify on y so both splits keep the class ratio,
# and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.25,
)

print("\nTrain size: {}, Test size: {}".format(len(X_train), len(X_test)))

Step 7: Load Pre-trained BERT Model (via TensorFlow Hub)

In [None]:
# TF-Hub handles for the uncased English BERT preprocessing model and encoder
bert_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
bert_encoder = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

# Model input: one raw string per example
text_input = tf.keras.layers.Input(name='text', dtype=tf.string, shape=())

# The hub preprocessing layer turns the raw strings into the encoder's inputs
preprocessor = hub.KerasLayer(bert_preprocess, name='preprocessing')
encoder_inputs = preprocessor(text_input)

# Encoder is loaded frozen (trainable=False), so its weights are not updated in training
encoder = hub.KerasLayer(bert_encoder, name='BERT_encoder', trainable=False)
outputs = encoder(encoder_inputs)
pooled_output = outputs['pooled_output']  # [batch_size, 768]

print(f"BERT embedding dimension: {pooled_output.shape[-1]}")

Step 8: Build Classification Head

In [None]:
# Classification head on the pooled BERT output:
# dropout -> 64-unit ReLU layer -> softmax with one unit per entry in label_map
x = tf.keras.layers.Dropout(rate=0.3, name='dropout')(pooled_output)
x = tf.keras.layers.Dense(units=64, activation='relu', name='dense_hidden')(x)
outputs = tf.keras.layers.Dense(
    units=len(label_map),
    activation='softmax',
    name='classifier',
)(x)

# End-to-end model: raw text in, class probabilities out
model = tf.keras.Model(inputs=text_input, outputs=outputs)

# Integer targets go with the sparse categorical cross-entropy loss.
# NOTE(review): 2e-5 is a small step size and the encoder was loaded with
# trainable=False, so only this head is trained - confirm the rate is intended.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    metrics=['accuracy'],
)

model.summary()

Step 9: Train the Model

In [None]:
# Fit on the training split for 3 epochs in batches of 8; Keras holds out
# 20% of the training data for validation. The returned History is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=8,
    epochs=3,
    validation_split=0.2,
    verbose=1,
)

Step 10: Evaluate Performance

In [None]:
# Predict class probabilities on the test set and take the most likely class per row
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# Pin the full set of class ids. Without `labels`, scikit-learn infers the classes
# from y_test/y_pred_classes only; if one class is absent from both, the report
# no longer matches `target_names` (and raises) and the confusion matrix no longer
# lines up with the tick labels below.
class_ids = np.arange(len(label_encoder.classes_))

# Classification report
print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred_classes,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,  # a class that is never predicted scores 0 without a warning
))

# Confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred_classes, labels=class_ids)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

Step 11: Make Predictions on New Text

In [None]:
def predict_text(text):
    """Predict the class of a single new message.

    The message is cleaned with ``clean_text`` (the same cleaning applied to the
    training data) and passed through ``model``.

    Args:
        text: The raw message to classify.

    Returns:
        A ``(pred_class, confidence)`` tuple: the predicted label decoded by
        ``label_encoder``, and the model's probability for that label as a
        Python float.
    """
    cleaned = clean_text(text)
    # Pass an explicit 1-element string tensor: a bare Python list can be read by
    # Keras as "a list of model inputs" rather than a batch of one example.
    probs = model.predict(tf.constant([cleaned]))[0]
    best_idx = int(np.argmax(probs))
    pred_class = label_encoder.inverse_transform([best_idx])[0]
    confidence = float(probs[best_idx])
    return pred_class, confidence

# Try the helper on a message the model has not seen
test_text = "Database connection failed repeatedly."
pred, conf = predict_text(test_text)
print("\nText: '{}'".format(test_text))
print("Prediction: {} (Confidence: {:.2f})".format(pred, conf))