In [None]:
import re
import string
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en import English
from spacy.vocab import Vocab
from spacy.language import Tokenizer, TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from sklearn.feature_extraction.text import TfidfVectorizer

from settings.enums import NaturalLanguage
from utils import re_patterns as repat

In [None]:
# TOKENIZER_PREFIXES

In [None]:
# Load the text samples. The preview is left as a bare last expression so the
# notebook renders it with the rich DataFrame display instead of print()'s
# plain-text dump.
df = pd.read_parquet(path='../data/text_samples.parquet')
df.head()

In [None]:
# Select one sample by row label and display its 'pp_art_text' value;
# `text` is the input passed to the TopicModel calls in the cells below.
index = 0
text = df.loc[index, 'pp_art_text']
text

In [None]:
# Note: Breaking up sentences
# English() pipeline with the 'sentencizer' component added.
# NOTE(review): `nlp_en_sent` is not referenced by any later cell shown here.
nlp_en_sent =  English()
nlp_en_sent.add_pipe('sentencizer')

In [None]:
# Note: Breaking up words

class TopicModel:
    def __init__(self, language: NaturalLanguage):
        match language:
            case NaturalLanguage.DE:
                self.nlp = spacy.load('de_dep_news_trf')
            case NaturalLanguage.EN:
                self.nlp = spacy.load('en_core_web_trf')
        self.word_separators = self.nlp.Defaults.infixes

    def word_splitter(self, text: str, 
                      return_word_separators: bool = False, 
                      custom_tokenizer: bool = True) -> list:
        # Note: these params can be adjusted: infix, prefix, suffix
        if custom_tokenizer:
            def custom_tokenizer(nlp: NaturalLanguage):
                # ToDo: Create patterns
                prefix_re = re.compile(r"")
                infix_re = re.compile(r"[-]")
                suffix_re = re.compile(r"")
                return Tokenizer(vocab=nlp.vocab, 
                                 # prefix_search=prefix_re.search,
                                 infix_finditer=infix_re.finditer,
                                 # suffix_search=suffix_re.search,
                                 )
            self.nlp.tokenizer = custom_tokenizer(self.nlp)
        doc = self.nlp(text)
        words_raw = [tok.text for tok in doc]
        if return_word_separators:
            words = words_raw
        else:
            words = [exp for exp in words_raw if exp not in string.punctuation]
        return words
    
    def tfidf_vectorizer(self, 
                         text: str,
                         min_pct_of_docs_word_must_appear_in:float = 0.00,
                         max_pct_of_docs_word_can_appear_in:float = 1.00,
                         ) -> pd.DataFrame:
        # Note: min: higher number -> less common words remain in text
        # Note: max: higher number -> more common words remain in text
        tfidf = TfidfVectorizer(min_df=min_pct_of_docs_word_must_appear_in, max_df=max_pct_of_docs_word_can_appear_in)
        # ToDo: The text must be ALL training text to get the features that are used for prediction
        tfidf_vectors = tfidf.fit_transform([text])
        df_tfidf = pd.DataFrame(tfidf_vectors.toarray(), columns=tfidf.get_feature_names_out())
        return df_tfidf
    
    def similarity_matrix(self, fit_transformed_model: np.ndarray):
        pass
        

In [None]:
# Build the model for German; the constructor loads the 'de_dep_news_trf' spaCy pipeline.
tm = TopicModel(language=NaturalLanguage.DE)

In [None]:
# Tokenise the sample with the custom tokenizer; tokens filtered as punctuation are left out.
res = tm.word_splitter(text=text, return_word_separators=False, custom_tokenizer=True)
res

In [None]:
# TF-IDF weights for the single sample text.
# NOTE(review): `res` is rebound here from the token list to a DataFrame.
res = tm.tfidf_vectorizer(text=text)
res

In [None]:
# Toy frame: four entries, each with a 'Yes'/'No' conflict flag.
df1 = pd.DataFrame(
    {
        'Entries': ['man', 'guy', 'boy', 'girl'],
        'Conflict': ['Yes', 'Yes', 'Yes', 'No'],
    }
)

def funcA(d):
    """Return ``d`` with ``'aaa'`` appended via ``+``."""
    return d + 'aaa'
def funcB(d):
    """Return ``d`` with ``'bbb'`` appended via ``+``."""
    return d + 'bbb'

# Row by row: suffix the entry via funcA where 'Conflict' is 'Yes', otherwise via funcB.
df1['Entries'] = df1.apply(
    lambda row: funcA(row['Entries']) if row['Conflict'] == 'Yes' else funcB(row['Entries']),
    axis=1,
)
df1