## Tokenization

In this tool, tokenization means splitting a document into word chunks, i.e. ignoring punctuation, spaces, and numeric characters.

In [60]:
def is_symbol(character: str) -> bool:
    """Return True if *character* is a punctuation mark or symbol.

    ASCII characters are classified with the explicit code-point ranges
    33-47, 58-64, 91-96 and 123-126. Non-ASCII characters are classified
    by their Unicode general category (punctuation ``P*`` or symbol
    ``S*``), so marks such as ``¿``, ``¡`` or ``«`` are also treated as
    symbols instead of staying attached to words.

    Args:
        character: A string of exactly one character.

    Returns:
        True for punctuation/symbol characters, False for letters, digits,
        spaces and anything else.

    Raises:
        TypeError: If *character* is not a single-character string.
    """
    # Imported locally so this cell does not depend on a separate import cell.
    import unicodedata

    decimal = ord(character)
    if decimal < 128:
        # ASCII: the four printable ranges that are neither letters nor digits.
        return (
            (33 <= decimal <= 47)
            or (58 <= decimal <= 64)
            or (91 <= decimal <= 96)
            or (123 <= decimal <= 126)
        )
    # Non-ASCII: the first letter of the category is the major class.
    return unicodedata.category(character)[0] in ("P", "S")

In [61]:
def is_number_char(ascii) -> bool:
    """Tell whether a character code is one of the ASCII digits.

    Args:
        ascii: Numeric character code (e.g. the result of ``ord``).

    Returns:
        True when the code lies between 48 and 57 inclusive.
    """
    lowest_digit, highest_digit = 48, 57  # inclusive bounds of the accepted range
    return ascii >= lowest_digit and ascii <= highest_digit

In [62]:
def is_number_word(word) -> bool:
    """Tell whether *word* is made up only of digit characters.

    Every character is checked with ``is_number_char``. An empty *word*
    yields True because no character fails the check.

    Args:
        word: The string to inspect.

    Returns:
        True if no character of *word* is a non-digit, False otherwise.
    """
    # A list (not a generator) is built so every character is examined.
    digit_checks = [is_number_char(ord(symbol)) for symbol in word]
    return all(digit_checks)

In [63]:
def number_remove(word) -> str:
    """Strip ASCII digits from both ends of *word*.

    All leading and trailing digits are removed (``"2hola33"`` becomes
    ``"hola"``). Digits inside the word are left alone. A word made up
    only of digits, or an empty word, is returned unchanged.

    Args:
        word: The token to clean.

    Returns:
        The token without leading/trailing digits, or the original token
        when it is purely numeric or empty.
    """
    trimmed = word.strip("0123456789")
    # Nothing left means the word was all digits (or empty): keep it as is.
    return trimmed if trimmed else word

In [64]:
def tokenize(document: str) -> list:
    """Split *document* into word tokens.

    Whitespace and any character for which ``is_symbol`` returns True act
    as delimiters. Each resulting chunk is passed through
    ``number_remove`` and empty results are dropped.

    Args:
        document: The text to tokenize.

    Returns:
        A list of non-empty token strings in document order. An empty
        document yields an empty list.
    """
    chunks = []
    start = 0  # index where the chunk currently being scanned begins

    for position, letter in enumerate(document):
        if letter.isspace() or is_symbol(letter):
            # A delimiter closes the current chunk. Consecutive delimiters
            # would give an empty chunk, which is skipped.
            if position > start:
                chunks.append(document[start:position])
            start = position + 1

    # Text after the last delimiter (or the whole document if there was none).
    if start < len(document):
        chunks.append(document[start:])

    # Clean EVERY chunk, including the last one, then discard anything
    # that ended up empty.
    cleaned = [number_remove(chunk) for chunk in chunks]
    return [token for token in cleaned if token]

## Utils

In [65]:
def append(array, element):
    """Build a new list with *element* placed after the items of *array*.

    The input sequence is read by index and is not modified.

    Args:
        array: A sequence supporting ``len`` and integer indexing.
        element: The value to place at the end.

    Returns:
        A new list one item longer than *array*.
    """
    size = len(array)
    extended = [array[position] for position in range(size)]
    extended += [element]
    return extended

## Remove Stopwords

In [66]:
def is_stop_word(word: str) -> bool:
    """Tell whether *word* is one of the hard-coded stop words.

    The comparison is exact, so it is case-sensitive.

    Args:
        word: The token to look up.

    Returns:
        True if *word* appears in the stop-word collection.
    """
    known_stopwords = (
        "el", "y", "eso", "la", "los", "las",
        "de", "en", "a", "un", "uno", "una",
        "es", "con", "para", "al", "del",
    )
    return word in known_stopwords

In [67]:
def remove_stopwords(document: list) -> list:
    """Return the tokens of *document* that are not stop words.

    Args:
        document: An iterable of tokens, e.g. the list produced by a
            tokenizer.

    Returns:
        A new list with the tokens for which ``is_stop_word`` is False,
        in their original order.
    """
    return [word for word in document if not is_stop_word(word)]

## Lower case

In [68]:
def to_lower(text: str) -> str:
    """Return *text* converted to lower case.

    Uses ``str.lower`` so accented and other non-ASCII capitals
    (``Á``, ``Ñ``, ...) are lowered as well as ``A``-``Z``.

    Args:
        text: The string to convert.

    Returns:
        The lower-cased string; an empty string stays empty.
    """
    return text.lower()

## BoW

In [69]:
def get_vocabulary(corpus: list) -> list:
    """Collect the distinct words of *corpus* in first-seen order.

    Args:
        corpus: An iterable of documents, each an iterable of hashable
            tokens (strings).

    Returns:
        A list with every distinct token once, ordered by first
        appearance while scanning the documents in order.
    """
    # Dict keys keep insertion order and discard repeats in linear time.
    unique_words = dict.fromkeys(
        word for document in corpus for word in document
    )
    return list(unique_words)


## Test

In [70]:
# Read three sentences from the user, lower-casing each one before storing it.
corpus = list()
for i in range(3):
    corpus = append(corpus, to_lower(input(f'Ingresa la oracion {i+1}: ')))

# One entry per sentence: its tokens with the stop words removed.
tokens = list()

for document in corpus:
    # `document` is rebound from the raw sentence to its list of tokens.
    document = tokenize(document)
    tokens = append(tokens, remove_stopwords(document))

In [None]:
# Bare expression: shows the value of `tokens` as the cell output in a notebook.
tokens

In [72]:
# Distinct words found across all tokenized documents.
vocabulary = get_vocabulary(tokens)

In [73]:
def bag_of_words_sentence(vocabulary: list, corpus: list) -> list:
    """Build one word-count vector per document.

    Prints the vocabulary, every token as it is counted, and each finished
    vector.

    Args:
        vocabulary: List of known words; position defines the vector column.
        corpus: Iterable of documents, each an iterable of tokens.

    Returns:
        A list with one vector per document, where entry ``k`` is how many
        times ``vocabulary[k]`` occurs in that document.

    Raises:
        ValueError: If a token is not present in *vocabulary*.
    """
    print(f'Vocabulario-------{vocabulary}')
    vectors = []

    for document in corpus:
        counts = [0] * len(vocabulary)
        for term in document:
            print(term)
            column = vocabulary.index(term)
            counts[column] = counts[column] + 1
        print(counts)
        vectors.append(counts)

    return vectors

In [None]:
# Per-document count vectors; the returned list is the cell output.
bag_of_words_sentence(vocabulary, tokens)

In [109]:
def bag_of_words_w(vocabulary: list, corpus: list) -> list:
    """Build one vector per vocabulary word with its document frequency.

    For each word a zero vector of ``len(vocabulary)`` is created, and the
    entry at the word's own position is incremented once for every
    document in *corpus* that contains the word. Prints the vocabulary and
    each resulting row.

    NOTE(review): the original comments described each cell as the count
    of the word in a document, but the code counts the number of documents
    containing the word. Confirm which one is intended.

    Args:
        vocabulary: List of words; position defines both row and column.
        corpus: Iterable of documents supporting ``in`` membership tests.

    Returns:
        A list with one row per vocabulary word.
    """
    print(f'Vocabulario-------{vocabulary}')
    bag = []

    for word in vocabulary:
        # One row per vocabulary word; each column is a vocabulary word.
        vocabulary_vector_token = [0] * len(vocabulary)
        # The column does not depend on the document, so look it up once.
        column = vocabulary.index(word)
        for doc in corpus:
            if word in doc:
                vocabulary_vector_token[column] += 1
        bag += [vocabulary_vector_token]
        print(f'Conteo para la palabra "{word}": {vocabulary_vector_token}')

    return bag

In [None]:
# Per-word rows built from the vocabulary; the returned list is the cell output.
bag_of_words_w(vocabulary, tokens)