# CNN News

* **Dataset**: https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail

## Importar dependencias

In [1]:
import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import logging

2025-02-04 00:18:56.120735: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738624736.136427  224959 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738624736.141201  224959 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-04 00:18:56.157988: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Configuración

In [2]:
logging.basicConfig(level=logging.INFO)
tf.random.set_seed(42)
np.random.seed(42)
data = '../datasets/cnn/train-1.csv'

## Explorando datos

In [3]:
news_df = pd.read_csv(data)
news_df.shape

(50000, 3)

### Detectando columnas vacías

In [4]:
print(news_df.isnull().sum())

id            0
article       0
highlights    0
dtype: int64


### Definiendo columna objetivo

In [5]:
texts = news_df['highlights'].values

## Estableciendo el modelo

In [6]:
def initialize_bert_model():
    bert_preprocess = hub.load('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
    bert_encoder = hub.load('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4')
    
    return bert_preprocess, bert_encoder

In [7]:
bert_preprocess, bert_encoder = initialize_bert_model()

INFO:absl:Using /tmp/tfhub_modules to cache modules.
I0000 00:00:1738624747.787662  224959 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1709 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1060, pci bus id: 0000:01:00.0, compute capability: 6.1
INFO:absl:Fingerprint not found. Saved model loading will continue.
INFO:absl:path_and_singleprint metric could not be logged. Saved model loading will continue.
INFO:absl:Fingerprint not found. Saved model loading will continue.
INFO:absl:path_and_singleprint metric could not be logged. Saved model loading will continue.


## Preparando embeddings

In [8]:
def generate_embeddings(texts, preprocess, encoder, batch_size=64):
    embeddings = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        encoder_inputs = preprocess(batch)
        outputs = encoder(encoder_inputs)
        embeddings.append(outputs['pooled_output'])
        
    return tf.concat(embeddings, axis=0)

In [9]:
embeddings = generate_embeddings(texts, bert_preprocess, bert_encoder)

## Identificando tópicos y palabras representativas

In [10]:
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def extract_pca_topics(texts, embeddings, n_components=5, n_keywords=10):
    if tf.is_tensor(embeddings):
        embeddings = embeddings.numpy()
    
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(embeddings)
    
    vectorizer = TfidfVectorizer(max_features=768, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    
    topics_keywords = {}
    for topic_idx in range(n_components):
        weights = pca.components_[topic_idx]
        top_idx = np.abs(weights).argsort()[-n_keywords:][::-1]
        topics_keywords[topic_idx] = [feature_names[i] for i in top_idx]
    
    return topics_keywords, pca_result, pca

In [11]:
topics_keywords, pca_result, pca_model = extract_pca_topics(texts, embeddings)
topics_keywords

{0: ['different',
  'striker',
  'faces',
  'team',
  'training',
  'taken',
  'safety',
  'meet',
  'judge',
  'goal'],
 1: ['different',
  'team',
  'news',
  'judge',
  'training',
  'don',
  'manager',
  '2013',
  'drug',
  'old'],
 2: ['right',
  'face',
  'evidence',
  'victim',
  'church',
  'year',
  'august',
  'military',
  'news',
  'research'],
 3: ['different',
  'church',
  'victim',
  'manager',
  'evidence',
  'homes',
  'face',
  'research',
  'taking',
  'based'],
 4: ['working',
  'premier',
  'south',
  'based',
  'taking',
  'road',
  'worked',
  'church',
  'face',
  'claimed']}

## Clasificando información en base a los tópicos encontrados

In [38]:
def classify_topic(embedding, pca, topics_keywords, threshold=0.7):
    if tf.is_tensor(embedding):
        embedding = embedding.numpy()
    
    topic_vector = pca.transform(embedding.reshape(1, -1))[0]
    component_magnitudes = np.abs(topic_vector)
    top_topic_idx = np.argmax(component_magnitudes)
    confidence = float(component_magnitudes[top_topic_idx] / np.sum(component_magnitudes))
    
    if confidence < threshold:
        return None
        
    return {
        'topic_id': int(top_topic_idx),
        'confidence': confidence,
        'keywords': topics_keywords[top_topic_idx]
    }

In [39]:
topics = []
for embedding in embeddings:
    result = classify_topic(embedding, pca_model, topics_keywords)
    results.append(result)

In [40]:
topics = list(filter(None, results))

In [61]:
def predict_topic(text, bert_preprocess, bert_encoder, pca, topics_keywords, threshold=0.5):
    encoder_inputs = bert_preprocess([text])
    outputs = bert_encoder(encoder_inputs)
    embedding = outputs['pooled_output']
    
    if tf.is_tensor(embedding):
        embedding = embedding.numpy()
    
    print(f"Original embedding shape: {embedding.shape}")
    
    if len(embedding.shape) == 2:
        embedding_2d = embedding
    else:
        embedding_2d = embedding.reshape(1, -1)
    
    print(f"Reshaped embedding: {embedding_2d.shape}")
    
    topic_vector = pca.transform(embedding_2d)[0]
    print(f"PCA transformed shape: {topic_vector.shape}")
    
    magnitudes = np.abs(topic_vector)
    print(f"Magnitudes: {magnitudes}")
    
    top_topic_idx = np.argmax(magnitudes)
    confidence = float(magnitudes[top_topic_idx] / np.sum(magnitudes))
    
    print(f"Top topic: {top_topic_idx}")
    print(f"Confidence: {confidence}")
    
    if confidence < threshold:
        return None
        
    return {
        'topic_id': int(top_topic_idx),
        'confidence': confidence,
        'keywords': topics_keywords[top_topic_idx]}

In [62]:
new_text = "Las Vegas police update policy on working with ICE."
result = predict_topic(new_text, bert_preprocess, bert_encoder, pca_model, topics_keywords)

if result:
    print(f"Topic: {result['topic_id']}")
    print(f"Confidence: {result['confidence']:.2f}")
    print(f"Keywords: {', '.join(result['keywords'])}")

Original embedding shape: (1, 768)
Reshaped embedding: (1, 768)
PCA transformed shape: (5,)
Magnitudes: [4.9501734  2.02143    1.5693636  0.29183888 0.1684444 ]
Top topic: 0
Confidence: 0.5499428510665894
Topic: 0
Confidence: 0.55
Keywords: different, striker, faces, team, training, taken, safety, meet, judge, goal
