# Aplicar BOW y TF-IDF

Descarga los datos de los artículos de NIPS del siguiente enlace: https://metatext.io/datasets/nips-papers

In [62]:
import pandas as pd
import numpy as np
import re
import string
import spacy
import gensim

pd.options.display.max_colwidth = None

In [63]:
corpus = pd.read_csv("papers.csv")
corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7241 entries, 0 to 7240
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          7241 non-null   int64 
 1   year        7241 non-null   int64 
 2   title       7241 non-null   object
 3   event_type  2422 non-null   object
 4   pdf_name    7241 non-null   object
 5   abstract    7241 non-null   object
 6   paper_text  7241 non-null   object
dtypes: int64(2), object(5)
memory usage: 396.1+ KB


In [64]:
# Quitamos columnas no relevantes para nuestro ejercicio
corpus.drop(columns=['id', 'event_type', 'pdf_name','abstract','paper_text','year'], inplace=True)

In [65]:
corpus.head(10)

Unnamed: 0,title
0,Self-Organization of Associative Database and Its Applications
1,A Mean Field Theory of Layer IV of Visual Cortex and Its Application to Artificial Neural Networks
2,Storing Covariance by the Associative Long-Term Potentiation and Depression of Synaptic Strengths in the Hippocampus
3,Bayesian Query Construction for Neural Network Models
4,"Neural Network Ensembles, Cross Validation, and Active Learning"
5,Using a neural net to instantiate a deformable model
6,Plasticity-Mediated Competitive Learning
7,ICEG Morphology Classification using an Analogue VLSI Neural Network
8,Real-Time Control of a Tokamak Plasma Using Neural Networks
9,Pulsestream Synapses with Non-Volatile Analogue Amorphous-Silicon Memories


In [66]:
corpus = corpus.iloc[:,0].tolist()



In [67]:
type(corpus)

list

## BOW

Utiliza 3/4 del corpus para crear el modelo y el resto para aplicarlo.

In [68]:
length = len(corpus)

middle_index = 3*length // 4

rest = corpus[middle_index:]
corpus2 = corpus[:middle_index]


In [69]:
nlp = spacy.load("en_core_web_md")
def normalizar_doc(doc):
    '''Función que normaliza un texto cogiendo sólo
    las palabras en minúsculas mayores de 3 caracteres'''
    # separamos en tokens
    tokens = nlp(doc)
    # filtramos stopwords
    filtered_tokens = [t.lower_ for t in tokens if
                       len(t.text)>3 and # cojemos las palabras q tienen más de 3 carácteres
                       not t.is_space and
                       not t.is_punct]
    # juntamos de nuevo en una cadena
    doc = ' '.join(filtered_tokens)
    return doc

In [70]:
#aplicamos a todo el corpus
norm_corpus = [normalizar_doc(doc) for doc in corpus2]
norm_corpus

['self organization associative database applications',
 'mean field theory layer visual cortex application artificial neural networks',
 'storing covariance associative long term potentiation depression synaptic strengths hippocampus',
 'bayesian query construction neural network models',
 'neural network ensembles cross validation active learning',
 'using neural instantiate deformable model',
 'plasticity mediated competitive learning',
 'iceg morphology classification using analogue vlsi neural network',
 'real time control tokamak plasma using neural networks',
 'pulsestream synapses with volatile analogue amorphous silicon memories',
 'learning play game chess',
 'multidimensional scaling data clustering',
 'experimental comparison recurrent neural networks',
 'training multilayer perceptrons with extended kalman algorithm',
 'interference learning internal models inverse dynamics humans',
 'active learning with statistical models',
 'rapid graph based method arbitrary transforma

In [71]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
cv.fit(norm_corpus) #también funcionaría cv.fit(map(normalizar_doc, corpus))

CountVectorizer()

In [72]:
cv_matrix = cv.transform(norm_corpus)
cv_matrix.shape

(5430, 4491)

In [73]:
cv_matrix = cv_matrix.toarray()
cv_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [74]:
# obtenemos palabras únicas en el corpus
vocab = cv.get_feature_names()
# mostramos vectores de características BoW del corpus
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,2003,2d,3d,_2,abandoned,abilities,ability,about,absence,absolute,...,wrong,yang,yield,ying,yosida,young,your,zero,zeta,zype
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5425,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5427,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5428,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
cv.vocabulary_  # cada palabra tiene un indice

{'self': 3591,
 'organization': 2787,
 'associative': 241,
 'database': 977,
 'applications': 187,
 'mean': 2360,
 'field': 1499,
 'theory': 4107,
 'layer': 2152,
 'visual': 4387,
 'cortex': 903,
 'application': 186,
 'artificial': 225,
 'neural': 2626,
 'networks': 2625,
 'storing': 3884,
 'covariance': 920,
 'long': 2260,
 'term': 4084,
 'potentiation': 3035,
 'depression': 1048,
 'synaptic': 4014,
 'strengths': 3897,
 'hippocampus': 1790,
 'bayesian': 353,
 'query': 3197,
 'construction': 830,
 'network': 2623,
 'models': 2484,
 'ensembles': 1323,
 'cross': 939,
 'validation': 4335,
 'active': 33,
 'learning': 2166,
 'using': 4328,
 'instantiate': 1973,
 'deformable': 1019,
 'model': 2481,
 'plasticity': 2983,
 'mediated': 2376,
 'competitive': 735,
 'iceg': 1843,
 'morphology': 2515,
 'classification': 629,
 'analogue': 140,
 'vlsi': 4395,
 'real': 3245,
 'time': 4140,
 'control': 852,
 'tokamak': 4148,
 'plasma': 2981,
 'pulsestream': 3171,
 'synapses': 4013,
 'with': 4463,
 'vola

### Aplicando el modelo a nuevos documentos
Cuando calculamos el vector BoW de un texto nuevo con el modelo no hay que volver a ajustar el vocabulario, por lo que los términos nuevos no se tendrán en cuenta:

In [76]:
cv_matrix_nueva = cv.transform(map(normalizar_doc,rest))
cv_matrix_nueva

<1811x4491 sparse matrix of type '<class 'numpy.int64'>'
	with 10709 stored elements in Compressed Sparse Row format>

In [77]:
pd.DataFrame(cv_matrix_nueva.toarray(), columns=vocab)

Unnamed: 0,2003,2d,3d,_2,abandoned,abilities,ability,about,absence,absolute,...,wrong,yang,yield,ying,yosida,young,your,zero,zeta,zype
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1806,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1807,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1808,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1809,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TF-IDF

Utiliza 3/4 del corpus para crear el modelo y el resto para aplicarlo.