In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
import ast
import keras as keras

# Download the NLTK resources used below ('stopwords' and 'wordnet') if you do not have them yet
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:


def categorizar_articulo_subtemas_en(texto, palabras_clave, vectorizer, umbrales, top_n=3, verbose=False):
    """Assign up to ``top_n`` subtopics to an English text via TF-IDF cosine similarity.

    The text is lower-cased, stripped of punctuation and English stop words,
    lemmatized with WordNet and turned into a vector with ``vectorizer``. Each
    subtopic's keyword list is joined into a single string, vectorized the same
    way, and compared against the text vector with cosine similarity.

    Args:
        texto: Raw text to categorize.
        palabras_clave: Mapping of subtopic name -> list of keyword strings.
        vectorizer: Object exposing ``transform``; only ``transform`` is called
            here, so it must already be fitted by the caller.
        umbrales: Mapping of subtopic name -> minimum similarity required for
            that subtopic. Subtopics missing from the mapping use 0.1.
        top_n: Maximum number of subtopics returned.
        verbose: When True, print the full ranked (subtopic, similarity) list.
            Defaults to False so that applying the function over a whole
            DataFrame does not emit one line per row.

    Returns:
        A list of subtopic names ordered by decreasing similarity and truncated
        to ``top_n``. ``["No specific subtopic"]`` when no subtopic reaches its
        threshold, or ``["Error"]`` when any exception is raised while
        processing (the exception message is printed).
    """
    try:
        # Normalize: lower-case and drop everything that is not a word char or whitespace.
        texto = re.sub(r'[^\w\s]', '', texto.lower())

        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        palabras = [
            lemmatizer.lemmatize(palabra)
            for palabra in texto.split()
            if palabra not in stop_words
        ]

        texto_limpio = " ".join(palabras)
        vector_texto = vectorizer.transform([texto_limpio])

        # Use distinct loop names: the original rebound ``palabras_clave`` inside
        # the loop, shadowing the very mapping being iterated.
        similitudes = {}
        for subtema, claves in palabras_clave.items():
            vector_claves = vectorizer.transform([" ".join(claves)])
            similitudes[subtema] = cosine_similarity(vector_texto, vector_claves)[0][0]

        # Rank subtopics from most to least similar.
        categorias_ordenadas = sorted(similitudes.items(), key=lambda par: par[1], reverse=True)
        if verbose:
            print(categorias_ordenadas)

        # Keep only subtopics that reach their own threshold (0.1 when unspecified).
        top_categorias = [
            categoria
            for categoria, similitud in categorias_ordenadas
            if similitud >= umbrales.get(categoria, 0.1)
        ]

        if not top_categorias:
            top_categorias.append("No specific subtopic")

        return top_categorias[:top_n]

    except Exception as e:
        # Deliberate best-effort: a bad row (e.g. a non-string value) is reported
        # and labelled "Error" instead of aborting the whole run.
        print(f"Error al procesar el texto: {e}")
        return ["Error"]


# Load the input CSV (adapt the path!).
# Try UTF-8 first; if the file is not valid UTF-8, retry the SAME file as
# Latin-1. The original fallback silently loaded a different file
# ("other_data.csv"), so an encoding problem would have swapped the dataset.
try:
    df = pd.read_csv("clean_data.csv", encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv("clean_data.csv", encoding='latin-1')  # or 'ISO-8859-1'

# Keyword lists for each subtopic (adapt this to your data!).
#
# Only two subtopics are active. The keyword lists of the previously separate
# subtopics "Malware", "Ransomware" and "Attacks" are folded into "Phishing",
# and those of "Privacy", "tips" and "Software" into "Vulnerabilities".
# Repeated entries are intentional leftovers of that merge and are kept as-is:
# each list is joined into one string before vectorization, so dropping a
# repetition would change that string.
subtemas_palabras_clave = {
    "Phishing": [
        # Core phishing terms
        "phishing", "spoofing", "scam", "email", "malicious link",
        "credential theft", "fraudulent",
        # Former "Malware" subtopic
        "virus", "worm", "trojan", "rootkit", "spyware", "adware",
        "keylogger", "malicious code",
        # Former "Ransomware" subtopic
        "ransomware", "kidnapping", "ransom", "cryptolocker", "decrypt",
        "encryption",
        # Former "Attacks" subtopic
        "attack", "hacker", "denial of service", "DDoS", "intrusion",
        "breach", "cyberattack", "cybercrime",
        # Extended phishing vocabulary
        "account", "password", "login", "verify", "suspicious",
        "security", "alert", "warning", "sensitive", "update",
        "confirm", "unexpected", "urgent", "fraudulent", "unauthorized",
        "suspicious activity", "click here", "safeguard", "scam", "risk",
        "free", "limited time", "offer", "act now", "bank account",
        "transfer", "secure your account", "immediately", "suspended", "blocked",
        "unusual activity", "click this link", "incorrect", "password reset",
        "customer support", "technical support", "update your info", "danger",
        "incomplete", "unclaimed", "recover", "payment issue", "unpaid",
        "compromise", "authenticate", "validation", "claim now", "prize",
        "recovery", "security breach", "inbox", "suspicious email",
        "confirm your identity", "personal information", "phishing", "breach",
        "click to confirm",
    ],
    "Vulnerabilities": [
        # Core vulnerability terms
        "vulnerability", "exploit", "patch", "zero-day", "CVE", "bug", "flaw",
        # Former "Privacy" subtopic
        "privacy", "personal data", "GDPR", "data protection", "consent",
        "tracking", "surveillance",
        # Former "tips" subtopic
        "tip", "recommendation", "guide", "tutorial", "best practices",
        "security awareness",
        # Former "Software" subtopic
        "software", "program", "application", "operating system", "code",
        "script", "firmware",
        # Extended vulnerability vocabulary
        "exploit", "vulnerability", "zero-day", "bug", "patch",
        "weakness", "attack", "breach", "compromise", "buffer overflow",
        "SQL injection", "cross-site scripting", "XSS", "remote code execution",
        "RCE", "privilege escalation", "denial of service", "DoS",
        "distributed denial of service", "DDoS", "man-in-the-middle", "MITM",
        "malware", "trojan", "ransomware", "rootkit", "backdoor",
        "access control", "credential stuffing", "password cracking",
        "phishing attack", "social engineering", "brute force", "exploit code",
        "shellshock", "heartbleed", "log4j", "security hole",
        "patch management", "unauthorized access", "security flaw",
        "code injection", "memory corruption", "session hijacking",
        "insider threat", "CVE", "CVSS", "security misconfiguration", "flaw",
        "unpatched", "unsecured", "insecure", "vulnerable", "hacker",
        "malicious", "exploit kit", "spoofing", "root access",
        "backdoor access", "cyberattack", "XSRF", "CSRF",
        "unauthorized privilege", "unauthenticated", "exposed port", "leak",
        "security breach", "unverified", "patch bypass", "security loophole",
        "signature bypass",
    ],
}

# Per-subtopic similarity thresholds (adapt this to your data!).
# A lower threshold makes a subtopic more sensitive, a higher one more demanding.
# Thresholds that were used for the now-disabled subtopics, kept for reference:
#   Malware 0.01, Ransomware 0.01, Attacks 0.04, Privacy 0.01, tips 0.01, Software 0.03
umbrales_personalizados = dict(
    Phishing=0.001,        # lowest threshold (most sensitive)
    Vulnerabilities=0.03,
)

# Preprocessing and TF-IDF vectorization: fit the vocabulary on every article text.
vectorizer = TfidfVectorizer()
corpus = df["text"].tolist()  # list with all the texts
vectorizer.fit(corpus)

# Apply the categorization function to each article; every row gets a list of subtopics.
df["label"] = df["text"].apply(lambda x: categorizar_articulo_subtemas_en(x, subtemas_palabras_clave, vectorizer, umbrales_personalizados, top_n=3))

# Save the DataFrame together with its labels (adapt the path!).
df.to_csv("labelled_data.csv", index=False, encoding='utf-8')

# The message now names the file that is actually written above; it previously
# mentioned a different file name.
print("Categorización completada. Archivo guardado como labelled_data.csv")

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
[('Phishing', 0.05548701882684261), ('Vulnerabilities', 0.022630000290094772)]
[('Phishing', 0.028499841379568763), ('Vulnerabilities', 0.01904388752839756)]
[('Vulnerabilities', 0.03673035809224013), ('Phishing', 0.017069698095729793)]
[('Phishing', 0.09108779529293494), ('Vulnerabilities', 0.044253405833900146)]
[('Vulnerabilities', 0.016914093839808887), ('Phishing', 0.01102125296193269)]
[('Vulnerabilities', 0.06529127699966823), ('Phishing', 0.052071555202966566)]
[('Vulnerabilities', 0.023474129568101094), ('Phishing', 0.004493272379431402)]
[('Vulnerabilities', 0.044126991533321454), ('Phishing', 0.014408655252238702)]
[('Phishing', 0.09030789759639042), ('Vulnerabilities', 0.03725837387055452)]
[('Vulnerabilities', 0.02308655886323687), ('Phishing', 0.0226240313241067)]
[('Phishing', 0.06714171379192257), ('Vulnerabilities', 0.041863553539746655)]
[('Phishing', 0.054682038928502576), ('Vulnerabilities', 

In [None]:


# Reset Keras' global state before building a new model in this cell.
keras.backend.clear_session()

# Reload the labelled articles. The "label" column is parsed with
# ast.literal_eval, turning each cell's text back into a Python object.
df = pd.read_csv('labelled_data.csv')
df["label"] = df["label"].apply(ast.literal_eval)

# Binarize the labels: one 0/1 column per distinct label.
mlb = MultiLabelBinarizer()
etiquetas_binarias = mlb.fit_transform(df['label'])
etiquetas_binarias_df = pd.DataFrame(data=etiquetas_binarias, columns=mlb.classes_)
etiquetas_binarias_df.to_csv('binarias.csv', index=False)

# Column sums of the binarized labels, i.e. how many rows carry each label.
posi = etiquetas_binarias_df.sum()
print(posi)

# Features (X) are the raw texts, targets (y) the binarized labels.
X, y = df['text'], etiquetas_binarias_df

# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Tokenizer matching the pretrained checkpoint used for the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(texts, max_length=128):
    """Tokenize a batch of texts with the module-level ``tokenizer``.

    Args:
        texts: A pandas Series (or any object exposing ``tolist()``), a plain
            iterable of strings, or a single string.
        max_length: Maximum sequence length; longer inputs are truncated.
            Defaults to 128, the value that was previously hard-coded.

    Returns:
        The tokenizer's output for the batch, padded to a common length and
        requested as TensorFlow tensors (``return_tensors='tf'``).
    """
    if isinstance(texts, str):
        # A bare string would otherwise be split into characters by list().
        batch = [texts]
    elif hasattr(texts, "tolist"):
        batch = texts.tolist()
    else:
        batch = list(texts)
    return tokenizer(batch, padding=True, truncation=True, max_length=max_length, return_tensors='tf')

# Tokenize the training and held-out texts.
train_encodings = tokenize_function(x_train)
val_encodings = tokenize_function(x_test)

# Load the pretrained BERT model with one output per binarized label column.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=y.shape[1])

# Build the TensorFlow datasets. Labels are passed as float32 arrays rather than
# DataFrames so their dtype matches the model's floating-point outputs.
train_labels = y_train.to_numpy(dtype="float32")
val_labels = y_test.to_numpy(dtype="float32")
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels)).shuffle(1000).batch(16)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels)).batch(16)

# Compile for multi-label classification:
# - Binary cross-entropy treats every label column as an independent yes/no
#   decision, which matches the multi-hot targets produced by
#   MultiLabelBinarizer. `from_logits=True` because the classification head
#   outputs raw logits, not probabilities. (The previous
#   'categorical_crossentropy' assumes a single class per row and probability
#   inputs.)
# - 3e-5 is in the usual range for fine-tuning BERT; the previous 0.01 is
#   orders of magnitude larger and typically makes fine-tuning diverge.
# NOTE(review): the 'accuracy' string is resolved by Keras and may not be a
# per-label accuracy on logits - confirm it reports what you expect.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Early-stopping callback, taken from tf.keras like the optimizer above.
# NOTE(review): it only takes effect if passed to `model.fit` via `callbacks=`.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)



In [None]:
# Train for 5 epochs, validating on the held-out dataset after each epoch.
# `validation_split` was dropped: it is overridden whenever `validation_data`
# is given and is not applicable to `tf.data.Dataset` inputs anyway.
model.fit(train_dataset, validation_data=val_dataset,
                epochs=5)

In [None]:
def export_other_data(input_path='clean_data.csv', output_path='other_data.csv',
                      category='Other', categorizer=None):
    """Export the rows whose computed label contains ``category`` to a CSV file.

    Reads ``input_path``, computes a ``label`` column by applying
    ``categorizer`` to every value of the ``text`` column, keeps the rows for
    which ``category in label`` is true and writes them to ``output_path``
    without the index.

    Args:
        input_path: CSV file to read; it must have a ``text`` column.
        output_path: CSV file to write the filtered rows to.
        category: Label value a row must contain to be exported.
        categorizer: Callable applied to each text to produce its label(s).
            When None, the notebook-level ``categorize`` function is used.
            NOTE(review): ``categorize`` is not defined in the visible part of
            this notebook - make sure it exists before calling with the
            default, otherwise a NameError is raised.

    Returns:
        None. The result is the file written to ``output_path``.
    """
    if categorizer is None:
        categorizer = categorize  # resolved at call time from the notebook namespace

    # Read the dataset from the CSV file.
    df = pd.read_csv(input_path)

    # Compute the label(s) for every text.
    df['label'] = df['text'].apply(categorizer)

    # Keep only the rows belonging to the requested category.
    df_other = df[df['label'].apply(lambda x: category in x)]

    # Write the filtered rows to the output CSV.
    df_other.to_csv(output_path, index=False)

    print(f"Datos de la categoría '{category}' exportados a '{output_path}'")

# Call the function to export the data
export_other_data()


Datos de la categoría 'Other' exportados a 'other_data.csv'


In [None]:
# Load 'labelled_data.csv' and print the resulting DataFrame for a quick visual check.
dfdf = pd.read_csv('labelled_data.csv')
print(dfdf)