In [1]:
import math
import numpy as np
import pandas as pd
import dataframe_image as dfi
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# Dados

In [2]:
df = pd.read_csv('../var/output/dense_dataset_v1.csv',
                 sep='|', parse_dates=['classif_date'],
                 dtype={'rpi_registro': str,
                        'area': str})
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

temp = df[df['title'].str.len() < 15]
temp['title'] = temp['title'].str.lower()
temp = temp[temp['title'].str.contains('aparelho')]

length = 10
corpus = [
    ' '.join(temp.iloc[0]['abstract'].split()[0:length]).lower(),
    ' '.join(temp.iloc[1]['abstract'].split()[0:length]).lower(),
    ' '.join(temp.iloc[2]['abstract'].split()[0:length]).lower(),
]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['title'] = temp['title'].str.lower()


In [3]:
corpus

['a presente invenção se refere a um aparelho para balancear',
 'patente de invenção aparelho trata se de um sistema de',
 'patente de invenção aparelho um aparelho fornece energia a uma']

# Bag-of-Words (BoW)

In [4]:
boolean_vec = CountVectorizer(binary=True)
X = boolean_vec.fit_transform(corpus)
boolean_vec.get_feature_names_out()

array(['aparelho', 'balancear', 'de', 'energia', 'fornece', 'invenção',
       'para', 'patente', 'presente', 'refere', 'se', 'sistema', 'trata',
       'um', 'uma'], dtype=object)

## Binary Vectorizing

In [5]:
# Binary Vactorizing
boolean_vec = CountVectorizer(binary=True)
X = boolean_vec.fit_transform(corpus)
df = pd.DataFrame(X.toarray(), columns=boolean_vec.get_feature_names_out())

dfi.export(df, 'var/cache/table_binary_vectorizer.png')
df

objc[99947]: Class WebSwapCGLLayer is implemented in both /System/Library/Frameworks/WebKit.framework/Versions/A/Frameworks/WebCore.framework/Versions/A/Frameworks/libANGLE-shared.dylib (0x7ffb417df948) and /Applications/Google Chrome.app/Contents/Frameworks/Google Chrome Framework.framework/Versions/102.0.5005.61/Libraries/libGLESv2.dylib (0x11ee6f290). One of the two will be used. Which one is undefined.
[0604/195520.958554:INFO:headless_shell.cc(660)] Written to file /var/folders/zv/9wbwxp0d7nn5nsnkb6zsqtl80000gn/T/tmpv9_xtw3f/temp.png.


Unnamed: 0,aparelho,balancear,de,energia,fornece,invenção,para,patente,presente,refere,se,sistema,trata,um,uma
0,1,1,0,0,0,1,1,0,1,1,1,0,0,1,0
1,1,0,1,0,0,1,0,1,0,0,1,1,1,1,0
2,1,0,1,1,1,1,0,1,0,0,0,0,0,1,1


## Term Frequency

In [6]:
# Term Frequency
freq_vec = CountVectorizer()
X = freq_vec.fit_transform(corpus)
df = pd.DataFrame(X.toarray(), columns=freq_vec.get_feature_names_out())
counts = df.to_numpy()

dfi.export(df, 'var/cache/table_count_vectorizer.png')
df

objc[99960]: Class WebSwapCGLLayer is implemented in both /System/Library/Frameworks/WebKit.framework/Versions/A/Frameworks/WebCore.framework/Versions/A/Frameworks/libANGLE-shared.dylib (0x7ffb417df948) and /Applications/Google Chrome.app/Contents/Frameworks/Google Chrome Framework.framework/Versions/102.0.5005.61/Libraries/libGLESv2.dylib (0x10c931290). One of the two will be used. Which one is undefined.
[0604/195522.084775:INFO:headless_shell.cc(660)] Written to file /var/folders/zv/9wbwxp0d7nn5nsnkb6zsqtl80000gn/T/tmp7zgb5h16/temp.png.


Unnamed: 0,aparelho,balancear,de,energia,fornece,invenção,para,patente,presente,refere,se,sistema,trata,um,uma
0,1,1,0,0,0,1,1,0,1,1,1,0,0,1,0
1,1,0,3,0,0,1,0,1,0,0,1,1,1,1,0
2,2,0,1,1,1,1,0,1,0,0,0,0,0,1,1


## TF-IDF Term Frequence - Inverse Document Frequency

\begin{equation}\label{eq:fundamental}
    tfidf(t, d, D) = tf(t, d) \times idf(t, D) \\
    onde:\\
    \\
    t, \text{é um termo qualquer do vocabuláro} \\
    d, \text{é um documento $ d \in D$}\\ 
    D, \text{é o conjunto de todos os documentos}\\
    \\
    tf(t, d) = \text{parcela baseada na quantidade de vezes que o termo $t$ aparece no documento $d$}\\
    idf(t, D) = \text{parcela baseada na quantidade de documentos, no conjunto $D$, onde o termo $t$ aparece} 
    \\ \text{Perceba que ambos os termos podem ser calculados de diversas formas!} 
\end{equation}

https://monkeylearn.com/blog/what-is-tf-idf/

**term frequency**

A maneira mais simples de obter o **tf** de um termo é através do número absoluto de vezes que a palavra aparece em um documento. E depois, pode-se normalizar pela quantidade total de palavras em um documento; ou ainda pela maior quantidade de vezes dentre todas as palavras do documento.

**inverse document frequency**

É calculado para uma palavra. Quanto mais próximo de 0, mais comum é a palavra. Pode ser calculada pegando o número total de documentos, dividido pelo número de documentos onde a palavra aparece, e calculando-se o logaritmo.


\begin{equation}\label{eq:monkeyleran}
    tf(t, d) = log(1 + freq(t, d)) \\
    idf(t, D) = log \frac{ N }{count(d \in D : t \in d)}
\end{equation}

https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089

TF is normalized to have values between \[0, 1\]

DF is also normalized to keep consistency. Because of the explosion with big number of documents, to prevent division by 0, we +1 to DF. 

\begin{equation}\label{eq:towarddatascience}
    tf(t, d) = freq(t, d) / length(d) \\
    idf(t, D) = log \frac{ N }{df +1}
\end{equation}


<!-- reference = """
@article{scikit-learn,
 title={Scikit-learn: Machine Learning in {P}ython},
 author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
         and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
         and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
         Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
 journal={Journal of Machine Learning Research},
 volume={12},
 pages={2825--2830},
 year={2011}
}
"""

sklearn.feature_extraction.text.TfidfVectorizer
(norm='l2', use_idf=True, smooth_idf=False, sublinear_tf=False)
\begin{equation}\label{eq:20}
    idf(t) = log \frac{ n }{df(t)} + 1
\end{equation}


sklearn.feature_extraction.text.TfidfVectorizer
(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
\begin{equation}\label{eq:30}
    idf(t) = log \frac{ 1+n }{1+df(t)} + 1
\end{equation}
 -->

In [7]:
def get_df_vec(freq_matrix: np.array):
    '''calculate document frequency of all terms t in a frequency matrix'''
    num_features = len(freq_matrix[0])
    terms_df_vec = [0] * num_features
    for doc in freq_matrix:
        for feature in range(0, num_features):
            if doc[feature] > 0:
                terms_df_vec[feature] += 1
    return terms_df_vec


def get_doc_len(doc: np.array):
    '''normalize tf by dividing tf by number of word in the document'''
    doc_len = 0
    for feature in doc:
        if feature > 0:
            doc_len += 1
    return doc_len


def tfidf_naive(freq_matrix: np.array):
    '''calculates tfidf from a frequency matrix
        
        Using naive intuition
        
        tf = the number of times the term t appears on document d
        
        idf = inverse of the number of documents where term t appears

    '''
    df_ = get_df_vec(freq_matrix)
    tfidf = []
    for doc in freq_matrix:
        tfidf_ = []
        for feature in range(0, len(df_)):
            tf_ = doc[feature]
            idf_ = 1 / df_[feature]
            tfidf_.append(tf_ * idf_)
        tfidf.append(tfidf_)
    return np.array(tfidf)


def tfidf_classic(freq_matrix: np.array):
    '''calculates tfidf from a frequency matrix
        
        Using text book definition:
        
        tf = tf / doc_len
        # normalized version
        
        idf = ln [ N / (df + 1) ]
        # + 1 prevent division by zero, means there is a doc that contain all terms exactly once
        # ln to scale down the valeu because we would like the number of documents to be very big
        
    '''
    df_ = get_df_vec(freq_matrix)
    tfidf = []
    N = len(freq_matrix)
    for doc in freq_matrix:
        tfidf_ = []
        doc_len = get_doc_len(doc)
        for feature in range(0, len(df_)):
            tf_ = doc[feature] / doc_len
            idf_ = math.log(N / df_[feature] + 1)
            tfidf_.append(tf_ * idf_)
        tfidf.append(tfidf_)
    return np.array(tfidf)


def tf_only(freq_matrix: np.array):
    '''calculates tfidf from a frequency matrix
        using naive intuition'''
    df_ = get_df_vec(freq_matrix)
    tfidf = []
    N = len(freq_matrix)
    for doc in freq_matrix:
        tfidf_ = []
        for feature in range(0, len(df_)):
            tf_ = doc[feature]
            idf_ = 1
            tfidf_.append(tf_ * idf_)
        tfidf.append(tfidf_)
    return np.array(tfidf)


def idf_only(freq_matrix: np.array):
    '''calculates tfidf from a frequency matrix
        using naive intuition'''
    df_ = get_df_vec(freq_matrix)
    tfidf = []
    N = len(freq_matrix)
    for doc in freq_matrix:
        tfidf_ = []
        for feature in range(0, len(df_)):
            tf_ = 1
            idf_ = 1 / df_[feature]
            tfidf_.append(tf_ * idf_)
        tfidf.append(tfidf_)
    return np.array(tfidf)


def tfidf_test2(freq_matrix: np.array):
    '''calculates tfidf from a frequency matrix
        using naive intuition'''
    df_ = get_df_vec(freq_matrix)
    tfidf = []
    N = len(freq_matrix)
    for doc in freq_matrix:
        tfidf_ = []
        for feature in range(0, len(df_)):
            tf_ = doc[feature]
            idf_ = N / df_[feature]
            tfidf_.append(tf_ * idf_)
        tfidf.append(tfidf_)
    return np.array(tfidf)


def idf_by_N(freq_matrix: np.array):
    '''calculates tfidf from a frequency matrix
        using naive intuition'''
    df_ = get_df_vec(freq_matrix)
    tfidf = []
    N = len(freq_matrix)
    for doc in freq_matrix:
        tfidf_ = []
        for feature in range(0, len(df_)):
            tf_ = 1
            idf_ = N / df_[feature]
            tfidf_.append(tf_ * idf_)
        tfidf.append(tfidf_)
    return np.array(tfidf)


### TF-IDF ingênuo

\begin{equation}\label{eq:ingenua}
    tf(t, d) = freq(t, d) \rightarrow \text{frequência do termo $t$ no documento $d$}\\
    idf(t, D) = count( d \in D : t \in d) \rightarrow \text{quantidade de documentos em $D$ onde o termo $t$ aparece} 
\end{equation}

**O problema:** perceba que o termo "aparelho" aparece em todos os documentos, portanto não é um bom termo para diferenciar os documentos entre si, entretanto é o termo com o maior peso em todos os documentos, como se seu valor discriminante fosse elevado. Isso é indesejado. O que fazer?

In [8]:
# TF-IDF (ingênuo)
tfidf = tfidf_naive(counts)
df = pd.DataFrame(tfidf, columns=freq_vec.get_feature_names_out())
df

Unnamed: 0,aparelho,balancear,de,energia,fornece,invenção,para,patente,presente,refere,se,sistema,trata,um,uma
0,0.333333,1.0,0.0,0.0,0.0,0.333333,1.0,0.0,1.0,1.0,0.5,0.0,0.0,0.333333,0.0
1,0.333333,0.0,1.5,0.0,0.0,0.333333,0.0,0.5,0.0,0.0,0.5,1.0,1.0,0.333333,0.0
2,0.666667,0.0,0.5,1.0,1.0,0.333333,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.333333,1.0


### TF-IDF clássico (do livro)
> juntando o que estudei [aqui](https://monkeylearn.com/blog/what-is-tf-idf/) e [aqui](https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089)

Para tratar a desigualdade no comprimento dos textos:
- Normalizamos a parcela TF
- O que produz valores entre 0 e 1;
- Para evitar desequilibrio entre as parcelas, normalizamos também a parcela DF

A normalização do IDF provoca um outro problema: quando N é muito grande o valor final do TF-IDF tende a ser muito grande também, causando problema de escala. Então utilizamos o logaritmo para reduzir os valores, causando um efeito colateral, uma vez não se pode calcular o logaritmo de zero, por isso o + 1 na fórmula. Assim, temos:

\begin{equation}\label{eq:classica_pesquisada}\
    tf(t, d) = log ( N / tf + 1 ) \\
    idf(t, D) = log \frac{ n }{df(t) + 1} 
\end{equation}

> Não está clara a fórmula do TF, aparentemente, para o texto da [sklearn](https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction) não há normalização na fórmula clássica.

In [9]:
# TF-IDF (clássico)
name_id = f'TF-IDF clássico | log no tf e no idf'
filename = name_id.replace('\n', '').replace(' | ', '_').replace(' ', '_').lower()
filename = f'table_{filename}.png'
print(f'\n\n{filename}')
tfidf = tfidf_classic(counts)
df = pd.DataFrame(tfidf, columns=freq_vec.get_feature_names_out())
df



table_tf-idf_clássico_log_no_tf_e_no_idf.png


Unnamed: 0,aparelho,balancear,de,energia,fornece,invenção,para,patente,presente,refere,se,sistema,trata,um,uma
0,0.086643,0.173287,0.0,0.0,0.0,0.086643,0.173287,0.0,0.173287,0.173287,0.114536,0.0,0.0,0.086643,0.0
1,0.086643,0.0,0.343609,0.0,0.0,0.086643,0.0,0.114536,0.0,0.0,0.114536,0.173287,0.173287,0.086643,0.0
2,0.173287,0.0,0.114536,0.173287,0.173287,0.086643,0.0,0.114536,0.0,0.0,0.0,0.0,0.0,0.086643,0.173287


### Comparando a versão ingênua com a versão clássica

In [10]:
name_id = f'Term Frequency'
filename = name_id.replace('\n', '').replace(' | ', '_').replace(' ', '_').lower()
filename = f'table_{filename}.png'
print(f'\n\n{filename}')
tfidf = tf_only(counts)
df = pd.DataFrame(tfidf, columns=freq_vec.get_feature_names_out())
df

name_id = f'Inverse Document Frequency | ingênua'
filename = name_id.replace('\n', '').replace(' | ', '_').replace(' ', '_').lower()
filename = f'table_{filename}.png'
print(f'\n\n{filename}')
tfidf = idf_only(counts)
df = pd.DataFrame(tfidf, columns=freq_vec.get_feature_names_out())
df

name_id = f'TF-IDF | efetivamente calculando o tf x idf | ingenuamente'
filename = name_id.replace('\n', '').replace(' | ', '_').replace(' ', '_').lower()
filename = f'table_{filename}.png'
print(f'\n\n{filename}')
tfidf = tfidf_naive(counts)
df = pd.DataFrame(tfidf, columns=freq_vec.get_feature_names_out())
df

name_id = f'Inverse Document Frequency | multiplicado por N'
filename = name_id.replace('\n', '').replace(' | ', '_').replace(' ', '_').lower()
filename = f'table_{filename}.png'
print(f'\n\n{filename}')
tfidf = idf_by_N(counts)
df = pd.DataFrame(tfidf, columns=freq_vec.get_feature_names_out())
df

name_id = f'TF-IDF | efetivamente calculando o tf x idf | multiplicando o idf por N'
filename = name_id.replace('\n', '').replace(' | ', '_').replace(' ', '_').lower()
filename = f'table_{filename}.png'
print(f'\n\n{filename}')
tfidf = tfidf_test2(counts)
df = pd.DataFrame(tfidf, columns=freq_vec.get_feature_names_out())
df



table_term_frequency.png


Unnamed: 0,aparelho,balancear,de,energia,fornece,invenção,para,patente,presente,refere,se,sistema,trata,um,uma
0,1,1,0,0,0,1,1,0,1,1,1,0,0,1,0
1,1,0,3,0,0,1,0,1,0,0,1,1,1,1,0
2,2,0,1,1,1,1,0,1,0,0,0,0,0,1,1




table_inverse_document_frequency_ingênua.png


Unnamed: 0,aparelho,balancear,de,energia,fornece,invenção,para,patente,presente,refere,se,sistema,trata,um,uma
0,0.333333,1.0,0.5,1.0,1.0,0.333333,1.0,0.5,1.0,1.0,0.5,1.0,1.0,0.333333,1.0
1,0.333333,1.0,0.5,1.0,1.0,0.333333,1.0,0.5,1.0,1.0,0.5,1.0,1.0,0.333333,1.0
2,0.333333,1.0,0.5,1.0,1.0,0.333333,1.0,0.5,1.0,1.0,0.5,1.0,1.0,0.333333,1.0




table_tf-idf_efetivamente_calculando_o_tf_x_idf_ingenuamente.png


Unnamed: 0,aparelho,balancear,de,energia,fornece,invenção,para,patente,presente,refere,se,sistema,trata,um,uma
0,0.333333,1.0,0.0,0.0,0.0,0.333333,1.0,0.0,1.0,1.0,0.5,0.0,0.0,0.333333,0.0
1,0.333333,0.0,1.5,0.0,0.0,0.333333,0.0,0.5,0.0,0.0,0.5,1.0,1.0,0.333333,0.0
2,0.666667,0.0,0.5,1.0,1.0,0.333333,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.333333,1.0




table_inverse_document_frequency_multiplicado_por_n.png


Unnamed: 0,aparelho,balancear,de,energia,fornece,invenção,para,patente,presente,refere,se,sistema,trata,um,uma
0,1.0,3.0,1.5,3.0,3.0,1.0,3.0,1.5,3.0,3.0,1.5,3.0,3.0,1.0,3.0
1,1.0,3.0,1.5,3.0,3.0,1.0,3.0,1.5,3.0,3.0,1.5,3.0,3.0,1.0,3.0
2,1.0,3.0,1.5,3.0,3.0,1.0,3.0,1.5,3.0,3.0,1.5,3.0,3.0,1.0,3.0




table_tf-idf_efetivamente_calculando_o_tf_x_idf_multiplicando_o_idf_por_n.png


Unnamed: 0,aparelho,balancear,de,energia,fornece,invenção,para,patente,presente,refere,se,sistema,trata,um,uma
0,1.0,3.0,0.0,0.0,0.0,1.0,3.0,0.0,3.0,3.0,1.5,0.0,0.0,1.0,0.0
1,1.0,0.0,4.5,0.0,0.0,1.0,0.0,1.5,0.0,0.0,1.5,3.0,3.0,1.0,0.0
2,2.0,0.0,1.5,3.0,3.0,1.0,0.0,1.5,0.0,0.0,0.0,0.0,0.0,1.0,3.0


### Sklearn | TfidfVectorizer(norm=None, use_idf=False, smooth_idf=False, sublinear_tf=False)

In [11]:
# TF-IDF text book
tfidf_vec = TfidfVectorizer(norm=None, use_idf=False, smooth_idf=False, sublinear_tf=False)
X = tfidf_vec.fit_transform(corpus)
df = pd.DataFrame(X.toarray(), columns=tfidf_vec.get_feature_names_out())
df

# # aparelho no primeiro documento
# n = 3
# tf_aparelho_0 = 1
# df_aparelho = 3
# idf_aparelho = math.log(n/df_aparelho) + 1
# tfidf_aparelho_0 = tf_aparelho_0 * idf_aparelho

# print(f'O idf do termo aparelho no primeiro texto é: {idf_aparelho}')
# print(f'O tfidf do termo aparelho no primeiro texto é: {tfidf_aparelho_0}')

Unnamed: 0,aparelho,balancear,de,energia,fornece,invenção,para,patente,presente,refere,se,sistema,trata,um,uma
0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
1,1.0,0.0,3.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
2,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


### Sklearn | TfidfVectorizer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

In [12]:
# TF-IDF smooth_idf
tfidf_vec = TfidfVectorizer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
X = tfidf_vec.fit_transform(corpus)
df = pd.DataFrame(X.toarray(), columns=tfidf_vec.get_feature_names_out())
df

# # aparelho
# tf = 1
# idf = math.log(1+3/(1+3)) + 1
# print(f'O idf do termo aparelho no primeiro texto é: {idf}')
# print(f'O tfidf do termo aparelho no primeiro texto é: {tf * idf}')

Unnamed: 0,aparelho,balancear,de,energia,fornece,invenção,para,patente,presente,refere,se,sistema,trata,um,uma
0,0.249028,0.421641,0.0,0.0,0.0,0.249028,0.421641,0.0,0.421641,0.421641,0.320669,0.0,0.0,0.249028,0.0
1,0.192547,0.0,0.743817,0.0,0.0,0.192547,0.0,0.247939,0.0,0.0,0.247939,0.32601,0.32601,0.192547,0.0
2,0.472502,0.0,0.304216,0.400008,0.400008,0.236251,0.0,0.304216,0.0,0.0,0.0,0.0,0.0,0.236251,0.400008


### Sklearn | TfidfVectorizer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True)

In [13]:
# TF-IDF smooth_idf + sublinear
tfidf_sub_vec = TfidfVectorizer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True)
X = tfidf_sub_vec.fit_transform(corpus)
df = pd.DataFrame(X.toarray(), columns=tfidf_sub_vec.get_feature_names_out())
df

# aparelho
tf = 1 + math.log(1)
idf = math.log(1+3/(1+3)) + 1
print(f'O idf do termo aparelho no primeiro texto é: {idf}')
print(f'O tfidf do termo aparelho no primeiro texto é: {tf * idf}')

Unnamed: 0,aparelho,balancear,de,energia,fornece,invenção,para,patente,presente,refere,se,sistema,trata,um,uma
0,0.249028,0.421641,0.0,0.0,0.0,0.249028,0.421641,0.0,0.421641,0.421641,0.320669,0.0,0.0,0.249028,0.0
1,0.227317,0.0,0.61429,0.0,0.0,0.227317,0.0,0.292712,0.0,0.0,0.292712,0.384882,0.384882,0.227317,0.0
2,0.413292,0.0,0.314319,0.413292,0.413292,0.244097,0.0,0.314319,0.0,0.0,0.0,0.0,0.0,0.244097,0.413292


O idf do termo aparelho no primeiro texto é: 1.5596157879354227
O tfidf do termo aparelho no primeiro texto é: 1.5596157879354227


# Embeddings

## Word2Vec

## BERT