In [75]:
import ast
import configparser
import csv
from collections import Counter

import nbimporter
import numpy as np
from tqdm import tqdm

### Lendo arquivo de configuração

In [76]:
# Read the run settings from INDEX.cfg (DEFAULT section).
config = configparser.ConfigParser()
config.read('INDEX.cfg')

# Path of the CSV file that holds the inverted index.
li_path = config.get('DEFAULT', 'LEIA')

# Inverted index: term -> value parsed from the second CSV column.
li = {}
with open(li_path, 'r', newline='') as file:
    reader = csv.reader(file, delimiter=';')

    # Each row is "<term>;<Python literal>".
    for row in reader:
        # csv.reader yields an empty list for blank lines; skip them instead
        # of failing on row[0].
        if not row:
            continue
        key = row[0].strip()
        # ast.literal_eval only accepts Python literals, so content of the
        # index file can never be executed as code (unlike eval()).
        value = ast.literal_eval(row[1].strip())
        li[key] = value

# Term-frequency mode requested in the configuration file.
type_of_tf = config.get('DEFAULT', 'FREQUENCIA')

### Criando a matriz termo-documento

In [77]:
def term_document_matrix(inverted_index):
    """Map every term of the inverted index to the size of its posting list.

    Parameters
    ----------
    inverted_index : mapping
        Term -> iterable of record numbers.

    Returns
    -------
    dict
        Term -> number of entries stored for that term. Repeated entries
        are counted each time they appear.
    """
    return {
        term: sum(1 for _ in inverted_index[term])
        for term in inverted_index.keys()
    }

In [78]:
# Count the posting-list entries of every term in the loaded inverted index.
tdm = term_document_matrix(li)
# Display the full term -> count mapping.
tdm

{'SIGNIFICANCE': 26,
 'PSEUDOMONAS': 124,
 'AERUGINOSA': 194,
 'INFECTION': 225,
 'RESPIRATORY': 149,
 'TRACT': 73,
 'CYSTIC': 2042,
 'FIBROSIS': 2058,
 'PATIENT': 2168,
 'STUDIED': 153,
 'MEAN': 218,
 'IMMUNOELECTROPHORETICAL': 1,
 'ANALYSIS': 96,
 'SERUM': 778,
 'NUMBER': 150,
 'PRECIPITIN': 86,
 'CONCENTRATION': 316,
 'PROTEIN': 290,
 'ADDITION': 65,
 'CLINICAL': 274,
 'RADIOGRAPHICAL': 1,
 'STATUS': 42,
 'LUNG': 249,
 'EVALUATED': 43,
 'USING': 124,
 'SCORING': 5,
 'SYSTEM': 110,
 'DEMONSTRATED': 81,
 'MAXIMUM': 27,
 'ONE': 330,
 'SIGNIFICANTLY': 177,
 'CHANGED': 7,
 'COMPARED': 192,
 'MATCHED': 40,
 'CONTROL': 568,
 'PERSON': 33,
 'NOTABLY': 5,
 'IGG': 86,
 'IGA': 37,
 'ELEVATED': 90,
 'ACUTE': 62,
 'PHASE': 45,
 'LATTER': 25,
 'SUGGESTING': 24,
 'ACTIVE': 42,
 'TISSUE': 83,
 'DAMAGE': 26,
 'HAPTOGLOBIN': 5,
 'CORRELATED': 52,
 'MANY': 94,
 'ACCOMPANIED': 14,
 'RESULT': 332,
 'INDICATE': 54,
 'PROTECTIVE': 7,
 'VALUE': 248,
 'SALIVARY': 75,
 'AMYLASE': 94,
 'LEVEL': 337,
 'DETERMI

### Pesos TF-IDF

In [79]:
# Restore `n_words` from IPython's %store database (it is not defined in the
# cells shown here, so it has to have been stored by an earlier session).
%store -r n_words
# N = number of keys in n_words; compute_tfidf uses it as the numerator of the IDF.
N = len(n_words.keys())

In [80]:
# Small sample list containing repeated values
lst = [1, 2, 2, 3, 3, 3, 3, 4, 5, 6, 6]

# A set keeps a single copy of every value
distinct_numbers = {number for number in lst}

# List copy of the distinct values
distinct_numbers_list = [*distinct_numbers]

# Show how many distinct values there are
print(len(distinct_numbers))


6


In [None]:
for word, list_of_docs in li.items():
    # Occurrences of the term per document.
    word_occurrence = Counter(list_of_docs)
    # Document frequency: number of distinct documents listed for the term.
    df = len({doc for doc in li[word]})
    print(df)

In [96]:
# Inspect the restored mapping (document id -> integer; presumably the number
# of words in each document — confirm against the notebook that stored it).
n_words

{'00001 ': 93,
 '00002 ': 86,
 '00003 ': 73,
 '00004 ': 95,
 '00005 ': 69,
 '00006 ': 116,
 '00007 ': 136,
 '00008 ': 180,
 '00009 ': 27,
 '00010 ': 20,
 '00011 ': 78,
 '00012 ': 55,
 '00013 ': 43,
 '00014 ': 185,
 '00015 ': 45,
 '00016 ': 35,
 '00017 ': 53,
 '00018 ': 146,
 '00019 ': 148,
 '00020 ': 48,
 '00021 ': 41,
 '00022 ': 79,
 '00023 ': 92,
 '00024 ': 134,
 '00025 ': 34,
 '00026 ': 53,
 '00027 ': 50,
 '00028 ': 28,
 '00029 ': 48,
 '00030 ': 99,
 '00031 ': 97,
 '00032 ': 35,
 '00033 ': 53,
 '00034 ': 89,
 '00035 ': 28,
 '00036 ': 1,
 '00037 ': 116,
 '00038 ': 50,
 '00039 ': 61,
 '00040 ': 65,
 '00041 ': 62,
 '00042 ': 22,
 '00043 ': 128,
 '00044 ': 31,
 '00045 ': 49,
 '00046 ': 108,
 '00047 ': 103,
 '00048 ': 97,
 '00049 ': 56,
 '00050 ': 45,
 '00051 ': 42,
 '00052 ': 69,
 '00053 ': 54,
 '00054 ': 34,
 '00055 ': 135,
 '00056 ': 85,
 '00057 ': 24,
 '00058 ': 99,
 '00059 ': 67,
 '00060 ': 81,
 '00061 ': 111,
 '00062 ': 109,
 '00063 ': 113,
 '00064 ': 46,
 '00065 ': 108,
 '00066 ':

In [100]:
# Spot check: number of distinct entries listed for the term 'SIGNIFICANCE'.
len(set(li['SIGNIFICANCE']))

26

In [102]:
# Hand-computed relative TF-IDF for one (document, term) pair, to compare with
# the output of compute_tfidf: 93 is the n_words value shown for '00001 ',
# 26 is the distinct-document count of 'SIGNIFICANCE', and 1239 is presumably
# N (the number of documents) — confirm.
(1/93) * np.log(1239/26)


0.04154799294201786

In [93]:
def compute_tfidf(li, type_of_tf='relative', doc_lengths=None, n_docs=None):
    """Compute TF-IDF weights for every (document, term) pair of an inverted index.

    Parameters
    ----------
    li : mapping
        Inverted index: term -> list of document ids, with one entry per
        occurrence of the term (a document id may repeat).
    type_of_tf : {'relative', 'absolute'}
        'absolute' uses the raw occurrence count as term frequency;
        'relative' divides that count by the document's length.
    doc_lengths : mapping, optional
        Document id -> length used as the divisor in 'relative' mode.
        Defaults to the notebook-level ``n_words`` mapping.
    n_docs : int, optional
        Collection size used in the IDF. Defaults to the notebook-level ``N``.

    Returns
    -------
    dict
        ``(doc, term) -> tf * log(n_docs / df)``, where ``df`` is the number
        of distinct documents that contain the term.

    Raises
    ------
    ValueError
        If ``type_of_tf`` is not one of the supported modes (previously an
        unknown mode silently produced an empty result).
    """
    if type_of_tf not in ('relative', 'absolute'):
        raise ValueError(
            "type_of_tf must be 'relative' or 'absolute', got %r" % (type_of_tf,)
        )
    relative = type_of_tf == 'relative'

    # Fall back to the notebook globals so existing calls keep working.
    # The length mapping is only needed (and only looked up) in relative mode.
    if relative and doc_lengths is None:
        doc_lengths = n_words
    if n_docs is None:
        n_docs = N

    tfidf = {}
    for word, list_of_docs in li.items():
        word_occurrence = Counter(list_of_docs)
        df = len(word_occurrence)
        if df == 0:
            # A term with no postings has no (doc, term) pairs and would
            # otherwise divide by zero in the IDF.
            continue
        idf = np.log(n_docs / df)
        # Visit each distinct document once; Counter keeps first-seen order,
        # so the resulting key order matches a pass over the raw postings.
        for doc, count in word_occurrence.items():
            tf = count / doc_lengths[doc] if relative else count
            tfidf[doc, word] = tf * idf
    return tfidf

In [95]:
# Compute the weights using the function's default term-frequency mode.
# NOTE(review): `type_of_tf`, read from the config file in an earlier cell, is
# not passed here, so the FREQUENCIA setting has no effect on this call —
# confirm whether that is intended.
tfidf = compute_tfidf(li)
# Display the whole mapping; keys are (document id, term) tuples.
tfidf

{('00001 ', 'SIGNIFICANCE'): 0.04154799294201786,
 ('00074 ', 'SIGNIFICANCE'): 0.08781734871835592,
 ('00078 ', 'SIGNIFICANCE'): 0.07576398712956198,
 ('00121 ', 'SIGNIFICANCE'): 0.034194365872634164,
 ('00147 ', 'SIGNIFICANCE'): 0.04067329835376485,
 ('00157 ', 'SIGNIFICANCE'): 0.14310975346695037,
 ('00179', 'SIGNIFICANCE'): 0.034194365872634164,
 ('00185', 'SIGNIFICANCE'): 0.028621950693390078,
 ('00195', 'SIGNIFICANCE'): 0.0536661575501064,
 ('00205', 'SIGNIFICANCE'): 0.03018721362193485,
 ('00223', 'SIGNIFICANCE'): 0.058544899145570616,
 ('00258', 'SIGNIFICANCE'): 0.019514966381856874,
 ('00319', 'SIGNIFICANCE'): 0.052215720859562983,
 ('00355', 'SIGNIFICANCE'): 0.04246113564404023,
 ('00402 ', 'SIGNIFICANCE'): 0.12879877812025536,
 ('00411 ', 'SIGNIFICANCE'): 0.031933581352129425,
 ('00412 ', 'SIGNIFICANCE'): 0.08221198603420554,
 ('00485 ', 'SIGNIFICANCE'): 0.0633436613706174,
 ('00526 ', 'SIGNIFICANCE'): 0.0330253277231424,
 ('00555 ', 'SIGNIFICANCE'): 0.06899934542156537,
 ('0