In [None]:
import os

import google.generativeai as genai
from IPython.display import HTML, Markdown, display


# Prefer a key supplied through the GOOGLE_API_KEY environment variable so a real
# credential never has to be typed into (and committed with) the notebook.
# The original placeholder is kept as the fallback for backward compatibility.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "GEMINI KEY")
genai.configure(api_key=GOOGLE_API_KEY)

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Fetch both predefined splits of the 20 Newsgroups dataset, train first.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ("train", "test")
)

# The bare last expression displays the dataset's class names.
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [3]:
# Print the first training post as raw text (headers and body) to see what needs cleaning.
print(newsgroups_train.data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [4]:
import email
import re

import pandas as pd


def preprocess_newsgroup_row(data):
    """Reduce one raw newsgroup post to its subject line and body text.

    Args:
        data: The raw post as a single string: e-mail style headers, a blank
            line, then the body.

    Returns:
        The subject, a blank line, then the body, with e-mail addresses removed
        and the result truncated to 5,000 characters. A post without a Subject
        header gets an empty subject instead of the literal text "None".
    """
    msg = email.message_from_string(data)

    # msg["Subject"] is None when the header is absent; formatting that directly
    # would inject the word "None" into the text that later gets embedded.
    subject = msg["Subject"]
    if subject is None:
        subject = ""

    if msg.is_multipart():
        # For multipart messages get_payload() returns a list of sub-messages,
        # not text. Collect the text of every leaf part instead of formatting
        # the list (which would produce object reprs).
        body = "\n".join(
            part.get_payload() for part in msg.walk() if not part.is_multipart()
        )
    else:
        body = msg.get_payload()

    text = f"{subject}\n\n{body}"
    # Strip any remaining email addresses
    text = re.sub(r"[\w\.-]+@[\w\.-]+", "", text)
    # Truncate each entry to 5,000 characters
    return text[:5000]


def preprocess_newsgroup_data(newsgroup_dataset):
    """Turn a newsgroups dataset object into a cleaned DataFrame.

    Args:
        newsgroup_dataset: Object exposing ``data`` (raw posts), ``target``
            (integer labels) and ``target_names`` (label index -> class name).

    Returns:
        A DataFrame with columns ``Text`` (cleaned by
        ``preprocess_newsgroup_row``), ``Label`` and ``Class Name``.
    """

    def name_for(label):
        # The integer label is an index into the dataset's class-name list.
        return newsgroup_dataset.target_names[label]

    raw = pd.DataFrame(
        {"Text": newsgroup_dataset.data, "Label": newsgroup_dataset.target}
    )
    cleaned_text = raw["Text"].apply(preprocess_newsgroup_row)
    class_names = raw["Label"].map(name_for)

    # Replace the raw text in place (column order stays Text, Label) and
    # append the human-readable class name as the last column.
    return raw.assign(**{"Text": cleaned_text, "Class Name": class_names})

In [5]:
# Run the same cleaning pipeline over both splits, train first.
df_train, df_test = (
    preprocess_newsgroup_data(split) for split in (newsgroups_train, newsgroups_test)
)

# Preview the first few cleaned training rows.
df_train.head()

Unnamed: 0,Text,Label,Class Name
0,WHAT car is this!?\n\n I was wondering if anyo...,7,rec.autos
1,SI Clock Poll - Final Call\n\nA fair number of...,4,comp.sys.mac.hardware
2,"PB questions...\n\nwell folks, my mac plus fin...",4,comp.sys.mac.hardware
3,Re: Weitek P9000 ?\n\nRobert J.C. Kyanko () wr...,1,comp.graphics
4,Re: Shuttle Launch Question\n\nFrom article <>...,14,sci.space


In [6]:
def sample_data(df, num_samples, classes_to_keep, random_state=None):
    """Draw a per-class sample and keep only the classes of interest.

    Args:
        df: DataFrame with at least ``Label`` and ``Class Name`` columns.
        num_samples: Number of rows to draw (without replacement) for each
            distinct ``Label``.
        classes_to_keep: Pattern handed to ``Series.str.contains``; rows whose
            ``Class Name`` matches it are kept.
        random_state: Optional seed forwarded to the sampler so a run can be
            reproduced. ``None`` (the default) keeps the sampling unseeded.

    Returns:
        A new DataFrame (the input is not modified) holding the sampled rows of
        the kept classes, with ``Class Name`` converted to a categorical and an
        ``Encoded Label`` column containing the category codes.
    """
    # Sample rows, selecting num_samples of each Label.
    sampled = (
        df.groupby("Label")
        .sample(n=num_samples, random_state=random_state)
        .reset_index(drop=True)
    )

    keep_mask = sampled["Class Name"].str.contains(classes_to_keep)
    # Take an explicit copy: assigning columns on a boolean-filtered slice is
    # what triggers pandas' SettingWithCopyWarning.
    kept = sampled[keep_mask].copy()

    # We have fewer categories now, so re-calibrate the label encoding.
    kept["Class Name"] = kept["Class Name"].astype("category")
    kept["Encoded Label"] = kept["Class Name"].cat.codes

    return kept

In [7]:
# Number of rows drawn per class for the training and test samples.
TRAIN_NUM_SAMPLES = 100
TEST_NUM_SAMPLES = 25
CLASSES_TO_KEEP = "sci"  # Class name should contain 'sci' to keep science categories

# NOTE: these assignments overwrite df_train/df_test, so re-running this cell
# samples from the already-reduced frames rather than from the full datasets.
df_train = sample_data(df_train, TRAIN_NUM_SAMPLES, CLASSES_TO_KEEP)
df_test = sample_data(df_test, TEST_NUM_SAMPLES, CLASSES_TO_KEEP)

In [8]:
# Count the sampled training rows per kept class.
df_train.value_counts("Class Name")

Class Name
sci.crypt          100
sci.electronics    100
sci.med            100
sci.space          100
Name: count, dtype: int64

In [9]:
# Count the sampled test rows per kept class.
df_test.value_counts("Class Name")

Class Name
sci.crypt          25
sci.electronics    25
sci.med            25
sci.space          25
Name: count, dtype: int64

In [10]:
from google.api_core import retry
from tqdm.rich import tqdm


# Register tqdm's `progress_apply` on pandas objects; create_embeddings relies on it.
tqdm.pandas()

@retry.Retry(timeout=300.0)
def embed_fn(text: str) -> list[float]:
    """Embed one piece of text with the `models/text-embedding-004` model.

    The call is wrapped in `google.api_core`'s `Retry` with a 300-second
    timeout.

    Args:
        text: The text to embed.

    Returns:
        The value stored under the "embedding" key of the API response.
    """
    # The vectors feed a classifier, hence the "classification" task type.
    request = {
        "model": "models/text-embedding-004",
        "content": text,
        "task_type": "classification",
    }
    return genai.embed_content(**request)["embedding"]


def create_embeddings(df):
    """Embed every row's ``Text`` sequentially, showing a progress bar.

    The result is written to an ``Embeddings`` column on ``df`` itself (the
    frame is modified in place) and the same frame is returned.
    """
    # progress_apply is provided by the earlier tqdm.pandas() call.
    vectors = df["Text"].progress_apply(embed_fn)
    df["Embeddings"] = vectors
    return df

In [11]:
# Embed both splits sequentially (one embed_fn call per row), train first.
df_train, df_test = (create_embeddings(frame) for frame in (df_train, df_test))

# Preview the training frame, which now carries an "Embeddings" column.
df_train.head()

Output()

  t = cls(total=total, **tqdm_kwargs)


Output()

  t = cls(total=total, **tqdm_kwargs)


Unnamed: 0,Text,Label,Class Name,Encoded Label,Embeddings
1100,Re: Estimating Wiretap Costs/Benefits\n\n (Rob...,11,sci.crypt,0,"[-0.0077073346, 0.010468528, -0.040350024, 0.0..."
1101,Is there ANY security in the Clipper?\n\nIt se...,11,sci.crypt,0,"[0.009000731, 0.03144947, -0.04750469, 0.01482..."
1102,DOS 6.0 compression API: partial answer\n\nFor...,11,sci.crypt,0,"[-0.005675148, 0.006596165, -0.035962675, 0.04..."
1103,"Re: Once tapped, your code is no good any more...",11,sci.crypt,0,"[0.0021466878, 0.012645809, -0.050081693, 0.02..."
1104,"Re: Once tapped, your code is no good any more...",11,sci.crypt,0,"[-0.0087635005, 0.016608795, -0.047265254, 0.0..."


Generating embeddings in parallel or asynchronously

In [12]:
import concurrent.futures
from google.api_core import retry
from tqdm.rich import tqdm
import google.generativeai as genai

# Registers tqdm's pandas integration (`progress_apply`). The parallel helper
# below wraps its iterator with tqdm directly and does not depend on this call.
tqdm.pandas()

# Embed a single text. Each call is synchronous; any parallelism comes from the
# caller (see the thread pool in create_embeddings_parallel).
# NOTE: this shadows the same-named embed_fn defined earlier in the notebook.
@retry.Retry(timeout=300.0)
def embed_fn(text: str) -> list[float]:
    """Return the embedding of ``text`` from `models/text-embedding-004`.

    Uses the "classification" task type; the call is wrapped in a `Retry`
    decorator with a 300-second timeout.
    """
    embedding_response = genai.embed_content(
        content=text,
        model="models/text-embedding-004",
        task_type="classification",
    )
    embedding = embedding_response["embedding"]
    return embedding

# Apply the embedding function to every row using a pool of worker threads.
def create_embeddings_parallel(df, max_workers=None):
    """Embed every row's ``Text`` concurrently and store the vectors on ``df``.

    Args:
        df: DataFrame with a ``Text`` column. It is modified in place: an
            ``Embeddings`` column is added (or overwritten).
        max_workers: Upper bound on concurrent embedding requests, forwarded to
            ``ThreadPoolExecutor``. ``None`` (the default) keeps the executor's
            own default, as before; pass a small number to stay under API rate
            limits.

    Returns:
        The same ``df`` object, with the ``Embeddings`` column set.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in input order, so the list lines up
        # with the rows of df even though requests finish out of order.
        embeddings = list(
            tqdm(
                executor.map(embed_fn, df["Text"]),
                total=len(df),
                desc="Generating embeddings",
            )
        )
    df["Embeddings"] = embeddings
    return df


In [13]:
# Use the thread-pool variant defined in the previous cell; the original code
# called the sequential create_embeddings again, so the parallel helper was
# never exercised. This overwrites any existing "Embeddings" column.
df_train = create_embeddings_parallel(df_train)
df_test = create_embeddings_parallel(df_test)
df_train.head()

Output()

  t = cls(total=total, **tqdm_kwargs)


Output()

Unnamed: 0,Text,Label,Class Name,Encoded Label,Embeddings
1100,Re: Estimating Wiretap Costs/Benefits\n\n (Rob...,11,sci.crypt,0,"[-0.0077073346, 0.010468528, -0.040350024, 0.0..."
1101,Is there ANY security in the Clipper?\n\nIt se...,11,sci.crypt,0,"[0.009000731, 0.03144947, -0.04750469, 0.01482..."
1102,DOS 6.0 compression API: partial answer\n\nFor...,11,sci.crypt,0,"[-0.005675148, 0.006596165, -0.035962675, 0.04..."
1103,"Re: Once tapped, your code is no good any more...",11,sci.crypt,0,"[0.0021466878, 0.012645809, -0.050081693, 0.02..."
1104,"Re: Once tapped, your code is no good any more...",11,sci.crypt,0,"[-0.0087635005, 0.016608795, -0.047265254, 0.0..."


In [14]:
import keras
from keras import layers


def build_classification_model(input_size: int, num_classes: int) -> keras.Model:
    """Build a small feed-forward classifier over fixed-size embeddings.

    Args:
        input_size: Length of each input embedding vector; also used as the
            width of the hidden layer.
        num_classes: Number of output classes (softmax units).

    Returns:
        An uncompiled ``keras.Sequential`` model: input -> ReLU hidden layer ->
        softmax output.
    """
    embedding_inputs = layers.Input([input_size], name="embedding_inputs")
    hidden = layers.Dense(input_size, activation="relu", name="hidden")
    output_probs = layers.Dense(num_classes, activation="softmax", name="output_probs")

    return keras.Sequential([embedding_inputs, hidden, output_probs])

In [15]:
# Read the embedding width off the first training row instead of hard-coding it.
# (`embed_content` also takes an `output_dimensionality` parameter if a smaller
# vector is wanted.)
embedding_size = len(df_train["Embeddings"].iloc[0])

# One output unit per distinct class present in the training frame.
classifier = build_classification_model(
    input_size=embedding_size,
    num_classes=len(df_train["Class Name"].unique()),
)
classifier.summary()

# Targets are integer class codes, hence the sparse categorical cross-entropy loss.
classifier.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 hidden (Dense)              (None, 768)               590592    
                                                                 
 output_probs (Dense)        (None, 4)                 3076      
                                                                 
Total params: 593668 (2.26 MB)
Trainable params: 593668 (2.26 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [16]:
import numpy as np


NUM_EPOCHS = 20
BATCH_SIZE = 32

# Features are the stacked embedding vectors; targets are the re-encoded class ids.
x_train, y_train = np.stack(df_train["Embeddings"]), df_train["Encoded Label"]
# The test sample doubles as the validation set during training.
x_val, y_val = np.stack(df_test["Embeddings"]), df_test["Encoded Label"]

# Allow training to stop early once accuracy stabilises. Note that the monitored
# metric is "accuracy" (training set), not "val_accuracy".
early_stop = keras.callbacks.EarlyStopping(monitor="accuracy", patience=3)

# Fit for at most NUM_EPOCHS epochs, keeping the per-epoch history.
history = classifier.fit(
    x=x_train,
    y=y_train,
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(x_val, y_val),
    callbacks=[early_stop],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [17]:
# Evaluate on the validation arrays built from df_test; return_dict=True gives
# the metrics as a {name: value} dict.
classifier.evaluate(x=x_val, y=y_val, return_dict=True)



{'loss': 0.18808385729789734, 'accuracy': 0.9300000071525574}

In [18]:
# This example avoids any space-specific terminology to see if the model avoids
# biases towards specific jargon.
new_text = """
First-timer looking to get out of here.

Hi, I'm writing about my interest in travelling to the outer limits!

What kind of craft can I buy? What is easiest to access from this 3rd rock?

Let me know how to do that please.
"""
# Embed the unseen post with the same embed_fn used for the training data so
# the vector is comparable to the classifier's training inputs.
embedded = embed_fn(new_text)

In [19]:
# The classifier expects a batch of embeddings, so wrap the single vector in a
# list to form a batch of one, then unpack the single row of probabilities.
inp = np.array([embedded])
[result] = classifier.predict(inp)

# Print one probability per class. This assumes df_test's categories are in the
# same order as the integer codes the model was trained on.
for idx, category in enumerate(df_test["Class Name"].cat.categories):
    percent = result[idx] * 100
    print(f"{category}: {percent:0.2f}%")

sci.crypt: 0.01%
sci.electronics: 0.05%
sci.med: 0.01%
sci.space: 99.94%
