In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from datasets import load_dataset
import nltk
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from sklearn.preprocessing import FunctionTransformer

### Cargamos datos

In [2]:


# Download the AG News corpus from the central dataset repository.
ds = load_dataset("ag_news")

# Learn a bag-of-words vocabulary from the raw training texts.
# To also keep one-character tokens (such as the word "y") in the vocabulary,
# the token_pattern parameter has to be changed when instantiating the
# vectorizer, e.g.:
#   CountVectorizer(token_pattern=r'(?u)\b\w+\b')
vect = CountVectorizer().fit(ds["train"]["text"])

In [3]:
#matriz=vect.transform(ds["train"]["text"]).todense()

In [4]:
# Class names of the "label" feature, indexed by integer label id.
id2label = ds["train"].features["label"].names

# Convert the train and test splits to pandas DataFrames.
df_train, df_test = (ds[particion].to_pandas() for particion in ("train", "test"))

In [5]:
# Work on a small random fraction of the training set so that the following
# steps are computationally cheaper. The seed is fixed so the same rows are
# drawn on every run.
muestra_fraccion = 0.01
df_train_muestra = df_train.sample(
    random_state=42,
    frac=muestra_fraccion,
)


In [6]:

# Vocabulary (feature names) learned by the fitted CountVectorizer.
vocabulario = vect.get_feature_names_out()

# Preview the first 50 entries, then report the total vocabulary size.
primeras_palabras = vocabulario[:50]
print(primeras_palabras)
print("\nTamaño total del vocabulario:", len(vocabulario))

['00' '000' '000th' '01' '02' '037' '038' '04' '05' '06' '08' '09' '10'
 '100' '100m' '100th' '101' '1010' '1040' '105' '106' '108' '10k' '10m'
 '10th' '11' '110' '114' '116' '118' '11th' '12' '123' '128mb' '12th' '13'
 '130' '130b' '133' '135' '137th' '139m' '13b' '14' '140' '14012' '141'
 '142' '146' '147']

Tamaño total del vocabulario: 8560


In [7]:
# Print a handful of training texts to get a feel for the raw data.
# random_state is fixed so the same examples are shown on every re-run
# (an unseeded sample changes the cell output each time).
separador = '-' * 80
for text in df_train_muestra['text'].sample(10, random_state=42):
    print(text)
    print(separador)

UN Mission Sends Military Team to Investigate Fighting in Eastern &lt;b&gt;...&lt;/b&gt; Following reports of fighting between different factions of former members of the Armed Forces of the Democratic Republic of the Congo (FARDC) and weapons distribution to civilians in North Kivu province, the United Nations mission today said it sent a 
--------------------------------------------------------------------------------
Jaguar's Coventry plant to close Jaguar's historic Brown's Lane factory in Coventry is to close, Ford has confirmed, with 400 jobs expected to be lost.
--------------------------------------------------------------------------------
Brown Says He May Never Get Over Athens (AP) AP - Larry Brown didn't have much time to enjoy his first NBA title. After coaching the Detroit Pistons to a surprising victory over the Los Angeles Lakers, Brown focused all his energy on getting the U.S. basketball team ready for the Olympics. Much to his dismay, the Americans came home with jus

In [13]:
# To lemmatise we define a callable class that is later passed as the
# `tokenizer` parameter of CountVectorizer, following
# https://scikit-learn.org/stable/modules/feature_extraction.html
#
# That tokenizer needs these NLTK resources:
#   - 'wordnet': lexical database used by WordNetLemmatizer.
#   - 'punkt' / 'punkt_tab': pretrained tokenizer models used by word_tokenize
#     (older NLTK releases look for 'punkt', recent ones for 'punkt_tab').
# Previously only 'wordnet' was downloaded, so word_tokenize failed with a
# LookupError on a machine without the tokenizer models.
for recurso in ('wordnet', 'punkt', 'punkt_tab'):
    nltk.download(recurso)


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/agustineckerdt/nltk_data...


True

In [16]:

class LemmaTokenizer:
    """Callable that tokenizes a document with NLTK and lemmatises every token.

    Instances are meant to be passed as the ``tokenizer`` argument of
    CountVectorizer.
    """

    def __init__(self):
        # A single lemmatizer instance is reused for all documents.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmas for the tokens of ``doc``, in order."""
        tokens = word_tokenize(doc)
        return list(map(self.wnl.lemmatize, tokens))


['!' '#' '$' '&' "'" "''" "'african" "'batman" "'better" "'d"
 "'encouraged" "'extreme" "'fear" "'final" "'fuzzy" "'google" "'important"
 "'in" "'ll" "'m" "'massive" "'may" "'park" "'phishing" "'plant" "'re"
 "'s" "'spam" "'unit" "'ve" "'war" "'worst" "'wrongdoers" '(' ')' ',' '-'
 '--' '-2' '-as' '-ave' '-bisoft' '-fed' '-iled' '-scientists' '-source'
 '-study' '-the' '-verheugen' '.']


In [22]:
def filtro_alfabeticos(documento):
    """Keep only the purely alphabetic words of each document.

    Every document is split on whitespace and a word is kept only when
    ``str.isalpha()`` is true for it. Words containing digits or attached
    punctuation (for example ``"close,"``) are therefore dropped entirely.

    Args:
        documento: iterable of strings, one per document.

    Returns:
        A list with one string per input document, made of the surviving
        words joined by single spaces (an empty string if none survive).
    """
    documentos_filtrados = []
    for doc in documento:
        palabras = [word for word in doc.split() if word.isalpha()]
        documentos_filtrados.append(' '.join(palabras))
    return documentos_filtrados
# Wrap the filter function so it can be used as a scikit-learn transformer step.
alfab_filtro = FunctionTransformer(filtro_alfabeticos, validate=False)

# Preprocessing pipeline: alphabetic-word filter first, then a bag-of-words
# vectorizer that tokenizes and lemmatises with LemmaTokenizer.
pasos = [
    ('alfab_filtro', alfab_filtro),
    ('vectorizador', CountVectorizer(tokenizer=LemmaTokenizer(), strip_accents="ascii", analyzer='word')),
]
pipeline = Pipeline(pasos)

# Fit the pipeline on the sampled training texts and build the document-term matrix.
X_train = pipeline.fit_transform(df_train_muestra['text'])

# Show the first 50 terms of the resulting vocabulary.
vectorizador_ajustado = pipeline.named_steps['vectorizador']
vocabulario_alfabetico = vectorizador_ajustado.get_feature_names_out()
print(vocabulario_alfabetico[:50])


['a' 'abandon' 'abandoned' 'abbey' 'abc' 'abducted' 'abduction' 'able'
 'abn' 'aboard' 'aboriginal' 'abounds' 'about' 'above' 'absence' 'abtahi'
 'abu' 'abuse' 'abusing' 'academic' 'accelerates' 'accept' 'accepted'
 'accepts' 'access' 'accessible' 'acclaim' 'acclaimed' 'accomplished'
 'accord' 'according' 'account' 'accounting' 'accusation' 'accuse'
 'accused' 'accuser' 'accuses' 'accusing' 'ace' 'aceh' 'acer' 'achieve'
 'acquire' 'acquired' 'acquisition' 'acrobat' 'across' 'act' 'action']




In [None]:


# CountVectorizer ignores `token_pattern` whenever a custom `tokenizer` is
# supplied (it only emits a warning), so the original regex never excluded
# numbers or punctuation. The alphabetic-only filter therefore has to live
# inside the tokenizer itself.
_lematizador = LemmaTokenizer()


def tokenizar_solo_alfabeticos(doc):
    """Tokenize and lemmatise ``doc``, keeping only purely alphabetic tokens.

    The vectorizer preprocesses the text before calling the tokenizer:
    lowercase=True is the CountVectorizer default and strip_accents="ascii" is
    set below, so ``doc`` arrives lower-cased and ASCII-only. On such input
    ``str.isalpha()`` accepts the same tokens as the regex ``[a-zA-Z]+``.
    """
    return [token for token in _lematizador(doc) if token.isalpha()]


vect = CountVectorizer(
    tokenizer=tokenizar_solo_alfabeticos,
    token_pattern=None,  # unused with a custom tokenizer; None silences the warning
    strip_accents="ascii",
    analyzer='word',
)
X_train = vect.fit_transform(df_train_muestra['text'])

# Retrieve the vocabulary and show its first 50 terms.
vocabulario_alfabetico = vect.get_feature_names_out()

print(vocabulario_alfabetico[:50])

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Target labels for the sampled rows. X_train was built from the 'text' column
# of the same df_train_muestra rows, so features and labels are aligned.
# (y_train was previously never defined, so this cell raised a NameError on a
# fresh kernel.)
y_train = df_train_muestra['label']

# Fixed seed so the fitted forest is reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)