In [5]:
import pandas as pd
import io
from langdetect import detect
import nltk
import re
import unicodedata
import os

In [None]:

def load_word_lists():
    """Build the bilingual keyword dictionaries used to score text.

    Returns:
        dict: maps each category name to a ``{"es": [...], "en": [...]}``
        dict holding the Spanish and English terms (single words or short
        phrases) for that category. A fresh dict is built on every call.
    """
    # --- Register / tone categories -------------------------------------
    polite = {
        "es": ["gracias", "por favor", "buen dia", "seria tan amable",
               "le agradezco", "disculpe", "permítame"],
        "en": ["please", "thank you", "good morning", "kindly",
               "appreciate", "sorry", "excuse me"],
    }
    rude = {
        "es": ["maldito", "idiota", "estúpido", "imbécil", "pendejo",
               "culero", "pinche"],
        "en": ["stupid", "idiot", "dumb", "useless", "shit", "terrible",
               "lazy", "damn"],
    }
    technical = {
        "es": ["sistema", "base de datos", "protocolo", "infraestructura",
               "algoritmo", "interfaz", "servidor"],
        "en": ["system", "database", "protocol", "infrastructure",
               "algorithm", "interface", "server"],
    }
    toxic = {
        "es": ["tonto", "idiota", "estúpido", "imbécil", "maldito",
               "horrible", "perezoso"],
        "en": ["stupid", "idiot", "dumb", "useless", "shit", "terrible",
               "lazy"],
    }

    # --- Emotion categories ----------------------------------------------
    displeasure = {
        "es": ["no me gusta", "malo", "pésimo", "desagradable", "asco",
               "horror", "feo", "fatal", "terrible", "decepcionante",
               "guácala", "fuchi", "repugnante", "rechazo"],
        "en": ["dislike", "bad", "terrible", "gross", "disgusting", "awful",
               "nasty", "poor quality", "yuck", "revolting", "hate it",
               "displeased", "unpleasant"],
    }
    frustration = {
        "es": ["intentar", "otra vez", "de nuevo", "no funciona",
               "bloqueado", "harto", "impotencia", "siempre lo mismo",
               "fallo", "error", "espera", "inútil", "incapaz"],
        "en": ["tried", "again", "still not", "stuck", "fed up", "failure",
               "broken", "useless", "waiting", "blocked", "annoyed",
               "pointless", "keep trying"],
    }
    gratitude = {
        "es": ["gracias", "agradezco", "buenísimo", "excelente", "amable",
               "bendiciones", "genial", "ayudó mucho", "recomiendo",
               "valoro", "perfecto"],
        "en": ["thanks", "thank you", "grateful", "appreciate", "kind",
               "helpful", "awesome", "perfect", "blessed",
               "highly recommend", "supportive"],
    }
    indifference = {
        "es": ["da igual", "equis", "me da lo mismo", "como sea",
               "no importa", "ni fu ni fa", "meh", "ok", "está bien",
               "sin opinión"],
        "en": ["whatever", "doesn't matter", "don't care", "anyway", "meh",
               "ok", "fine", "neutral", "indifferent", "as you wish",
               "regardless"],
    }
    satisfaction = {
        "es": ["contento", "feliz", "valió la pena", "logré", "funciona",
               "bien", "satisfecho", "gusto", "maravilla", "increíble",
               "justo lo que quería"],
        "en": ["happy", "satisfied", "pleased", "works", "worth it", "glad",
               "great", "fulfilled", "exactly", "success", "delighted"],
    }
    anger = {
        "es": ["enojado", "furia", "basura", "estafa", "robo",
               "insoportable", "maldito", "odio", "indignado", "insulto",
               "inservible", "peor", "asco de"],
        "en": ["angry", "mad", "hate", "scam", "shameful", "outraged",
               "furious", "garbage", "trash", "disgrace", "infuriating",
               "pissed", "worst"],
    }
    threats = {
        "es": ["demandar", "denuncia", "PROFECO", "legal", "abogado",
               "ir de la competencia", "cancelar", "baja", "quemar",
               "nunca más", "redes sociales", "última vez"],
        "en": ["sue", "legal action", "lawyer", "reporting", "cancel",
               "switching to", "never again", "last warning", "publicly",
               "court", "quit"],
    }

    # Insertion order here is the order callers see when iterating the result.
    return {
        'polite_words': polite,
        'rude_words': rude,
        'technical_terms': technical,
        'toxic_words': toxic,
        'desagrado': displeasure,
        'frustracion': frustration,
        'gratitud': gratitude,
        'indiferencia': indifference,
        'satisfaccion': satisfaction,
        'rabia_ira': anger,
        'amenazas': threats,
    }

def normalize_text(text):
    """Return ``text`` lower-cased with its accents removed.

    The string is decomposed (NFD) so accents become separate combining
    characters, which are then dropped (Unicode category ``Mn``).
    Non-string input yields an empty string.
    """
    if isinstance(text, str):
        decomposed = unicodedata.normalize("NFD", text)
        base_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
        return "".join(base_chars).lower()
    return ""

def detect_language_safe(text):
    """Detect whether ``text`` is Spanish or English, defaulting to Spanish.

    Args:
        text: value to inspect; anything that is not a string (``None``,
            ``NaN``, numbers, ...) is treated as undetectable.

    Returns:
        str: ``'en'`` or ``'es'``. ``'es'`` is returned when the input is not
        a string, has fewer than 3 non-whitespace-trimmed characters, the
        detector reports any other language, or the detector fails.

    NOTE(review): langdetect results can vary between runs on short or
    ambiguous text unless its detector seed is fixed -- confirm whether
    reproducibility matters for this analysis.
    """
    # Non-strings (including None/NaN) and near-empty strings cannot be
    # classified reliably, so skip the detector entirely.
    if not isinstance(text, str) or len(text.strip()) < 3:
        return 'es'

    try:
        lang = detect(text)
    except Exception:
        # Best-effort fallback for detector failures only. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate so a
        # long run can still be interrupted.
        return 'es'

    return lang if lang in ('es', 'en') else 'es'

def count_words_in_text(text, word_list, lang):
    """Count how many entries of a word list occur in ``text``.

    Each entry is counted at most once, no matter how often it repeats in
    the text. Matching is done on accent-stripped, lower-cased text and
    requires word boundaries on both sides of the entry.

    Args:
        text: text to scan; missing or non-string values count as 0.
        word_list: dict mapping a language code to a list of terms.
        lang: language code selecting which list to use; an unknown code
            behaves like an empty list.

    Returns:
        int: number of distinct list entries found in ``text``.
    """
    is_missing = pd.isna(text)
    if is_missing or not isinstance(text, str):
        return 0

    haystack = normalize_text(text)
    candidates = word_list.get(lang, [])

    # Terms are normalised the same way as the text so accents never block a match.
    return sum(
        1
        for term in candidates
        if re.search(r"\b" + re.escape(normalize_text(term)) + r"\b", haystack, re.IGNORECASE)
    )

def analyze_sentiment_dataframe(df, text_column, word_lists=None):
    """Add one keyword-count column per word-list category to a DataFrame.

    For every row, the language of the text is detected once and each
    category's word list for that language is matched against the text.

    Args:
        df: input DataFrame; it is not modified.
        text_column: name of the column holding the text to analyse.
        word_lists: dict mapping category name to ``{"es": [...], "en": [...]}``.
            Defaults to ``load_word_lists()`` when omitted. One output column
            is created per key, in the dict's iteration order.

    Returns:
        pandas.DataFrame: a copy of ``df`` with an ``int64`` column per
        category. Rows whose text is missing or not a string get 0 in every
        category column. An existing column with a category's name is
        overwritten in the copy.
    """
    # The signature advertises word_lists as optional, so honour that
    # instead of failing on None.
    if word_lists is None:
        word_lists = load_word_lists()

    df_result = df.copy()
    texts = df_result[text_column].tolist()

    # Detect the language once per row and reuse it for every category.
    langs = [
        detect_language_safe(text) if isinstance(text, str) else 'es'
        for text in texts
    ]

    for category, words_by_lang in word_lists.items():
        counts = [
            count_words_in_text(text, words_by_lang, lang) if isinstance(text, str) else 0
            for text, lang in zip(texts, langs)
        ]
        # Assign positionally (plain array, no index alignment) so frames with
        # duplicate or non-integer index labels are handled correctly; the
        # explicit dtype keeps the column integer even for an empty frame.
        df_result[category] = pd.Series(counts, dtype="int64").to_numpy()

    return df_result

In [7]:
# Read ./twitter_training.csv (path is relative to the working directory)
# and display its first five rows.
df = pd.read_csv('./twitter_training.csv')
df.head(5)

Unnamed: 0,ID,XD,Sentiment
0,2401,Borderlands,im getting on borderlands and i will murder yo...
1,2401,Borderlands,I am coming to the borders and I will kill you...
2,2401,Borderlands,im getting on borderlands and i will kill you ...
3,2401,Borderlands,im coming on borderlands and i will murder you...
4,2401,Borderlands,im getting on borderlands 2 and i will murder ...


In [8]:
# Build the per-category Spanish/English keyword dictionaries.
word_lists = load_word_lists()

In [9]:
# Score every row: adds one keyword-count column per category to a copy of df.
# NOTE(review): in the preview of df, the 'Sentiment' column appears to hold
# the message text itself -- confirm the CSV header matches the data.
df_resultado = analyze_sentiment_dataframe(
    df, 
    text_column='Sentiment',
    word_lists=word_lists)

In [10]:
# Display the first five rows of the result, including the added count columns.
df_resultado.head(5)

Unnamed: 0,ID,XD,Sentiment,polite_words,rude_words,technical_terms,toxic_words,desagrado,frustracion,gratitud,indiferencia,satisfaccion,rabia_ira,amenazas
0,2401,Borderlands,im getting on borderlands and i will murder yo...,0,0,0,0,0,0,0,0,0,0,0
1,2401,Borderlands,I am coming to the borders and I will kill you...,0,0,0,0,0,0,0,0,0,0,0
2,2401,Borderlands,im getting on borderlands and i will kill you ...,0,0,0,0,0,0,0,0,0,0,0
3,2401,Borderlands,im coming on borderlands and i will murder you...,0,0,0,0,0,0,0,0,0,0,0
4,2401,Borderlands,im getting on borderlands 2 and i will murder ...,0,0,0,0,0,0,0,0,0,0,0
