In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as stop_words_en
from spacy.lang.de.stop_words import STOP_WORDS as stop_words_de

from utils.text_cleaner import sub_fancy_quot_marks

In [None]:
# Display configuration: raise pandas' row/column/width limits so the demo frames
# are shown in full, and render floats with two decimals.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# Toy corpus used by every vectorizer demo in this notebook.
sentences = [
    "it was the best of times particularly",
    "it was the worst of times",
    "it was the age of wisdom",
    "it was the age of foolishness",
    "it is an orange",
    "it is an apple",
]

# Whitespace tokenisation: one list of tokens per sentence.
tokenized_sentences = list(map(str.split, sentences))
tokenized_sentences

In [None]:
# Distinct tokens across all sentences (a set, so its iteration order is arbitrary).
vocabulary = {token for tokens in tokenized_sentences for token in tokens}
vocabulary

In [None]:
# Vocabulary as a list. `vocabulary` is already a set, so wrapping it in another
# set() is redundant. Sorting makes the order reproducible: plain set iteration
# order depends on the per-process string hash seed and can change between
# kernel restarts.
words = sorted(vocabulary)
words

In [None]:
# [word, index] pairs. Enumerate the *sorted* vocabulary so each word receives
# the same index on every run (enumerating the raw set yields an arbitrary order).
[[word, idx] for idx, word in enumerate(sorted(vocabulary))]

In [None]:
# One row per example sentence, held in a single 'sentence' column.
df_data = pd.DataFrame({'sentence': sentences})
df_data

#### MultiLabelBinarizer

In [None]:
# Note: MultiLabelBinarizer only records WHETHER a word occurs in a sentence (0/1),
#       not how often it occurs.
mlb = MultiLabelBinarizer().fit([vocabulary])  # the whole vocabulary is the label set
class_word_map = list(mlb.classes_)            # column labels, in the binarizer's class order
data = mlb.transform(tokenized_sentences)
df_mlb = pd.DataFrame(data=data, columns=class_word_map)
df_mlb

#### CountVectorizer

In [None]:
# Note: unlike the MultiLabelBinarizer above, CountVectorizer records not only whether
#       a word appears in a text but also how many times it appears.
cv = CountVectorizer(
    analyzer='word',
    binary=False,
    decode_error='strict',
    ngram_range=(1, 1),
    stop_words='english',
)
# Learn the vocabulary and build the document-term count matrix in one step.
tcv = cv.fit_transform(sentences)
df_cv = pd.DataFrame(data=tcv.toarray(), columns=cv.get_feature_names_out())
df_cv

In [None]:
# Note: cosine similarity between sentences, based on how often each word appears in them.
labels = [f'sent_{n}' for n in range(1, len(sentences) + 1)]
# With a single argument, cosine_similarity compares every row of tcv with every other row.
df_sim = pd.DataFrame(cosine_similarity(tcv), index=labels, columns=labels)
df_sim

#### TF-IDF

In [None]:
# TF-IDF with default settings: fit on the corpus, then transform the same corpus.
tfidf = TfidfVectorizer().fit(sentences)
tid = tfidf.transform(sentences)
df_tfid = pd.DataFrame(data=tid.toarray(), columns=tfidf.get_feature_names_out())
df_tfid

In [None]:
# Dense copy of the sparse TF-IDF matrix: print its type, then show its shape.
frame = tid.toarray()
print(frame.__class__)
np.shape(frame)

In [None]:
# Note: rank words by their TF-IDF weight summed over all texts and show the top five.
tfidf_most_common_words_all = [(col, sum(df_tfid[col])) for col in df_tfid]
sorted(tfidf_most_common_words_all, key=lambda pair: pair[1], reverse=True)[:5]

In [None]:
# Note: Get the most heavily weighted words in each text.
# Result: one (sentence label, [(word, tf-idf weight), ...]) entry per sentence,
# with the words ordered from highest to lowest weight.
top_n_words_per_text = 3
tfidf_most_common_words_in_text = list()
for ind, row in df_tfid.iterrows():
    # Only words that actually occur in this sentence have a weight > 0.
    top_words = row[row > 0].nlargest(top_n_words_per_text)
    # df_tfid has the default 0-based integer index; label sentences 1-based
    # ('sent_1', 'sent_2', ...) like the similarity tables do.
    tfidf_most_common_words_in_text.append((f'sent_{ind + 1}', list(top_words.items())))

tfidf_most_common_words_in_text

In [None]:
# Number of TF-IDF features (one column per vocabulary word) before any reduction.
len(df_tfid.columns)

#### TF-IDF: Reducing feature dimensions with:

#### stopwords

In [None]:
# Note: Make choice
# Note: the stop words contain strange characters that must be cleaned first
#       (presumably typographic quotation marks, judging by the helper's name - TODO confirm).

def _clean_stop_words(raw_stop_words) -> list:
    """Join the words into one string, pass it through sub_fancy_quot_marks,
    strip plain apostrophes, and split the result back into a list of words."""
    joined = ' '.join(raw_stop_words)
    return sub_fancy_quot_marks(text=joined).replace("'", "").split()

stop_words_de:list = _clean_stop_words(stop_words_de)
stop_words_en:list = _clean_stop_words(stop_words_en)
# Union of both languages, de-duplicated.
stop_words:list = list(set(stop_words_en) | set(stop_words_de))

#### minimum and maximum share of documents a word may appear in to remain in the vocabulary

In [None]:
# Note: A float for "min_df" / "max_df" is a share of documents: a word that appears in
#       fewer than min_df or in more than max_df of the documents is removed from the vocabulary.
min_pct_of_docs_word_must_appear_in:float = 0.00    # higher number -> more rare words are removed
max_pct_of_docs_word_can_appear_in:float = 0.30    # higher number -> more common words remain in text
# Also apply the cleaned stop-word list built in the stop-words cell; without this
# argument that list is never used and the "_sw" vectorizer filters no stop words.
tfidf_sw = TfidfVectorizer(stop_words=stop_words,
                           min_df=min_pct_of_docs_word_must_appear_in,
                           max_df=max_pct_of_docs_word_can_appear_in)
tid_sw = tfidf_sw.fit_transform(sentences)
df_tfid_sw = pd.DataFrame(tid_sw.toarray(), columns=tfidf_sw.get_feature_names_out())
df_tfid_sw

In [None]:
# Pairwise cosine similarity of the sentences in the reduced TF-IDF space.
# With a single argument, cosine_similarity compares tid_sw's rows with each other.
df_sim = pd.DataFrame(cosine_similarity(tid_sw), index=labels, columns=labels)
df_sim

#### linguistic features

In [None]:
# Load the spaCy transformer pipelines (only the English one is used in this cell).
nlp_en = spacy.load('en_core_web_trf')
nlp_de = spacy.load('de_dep_news_trf')

# Part-of-speech tags kept in the filtered column.
# NOTE(review): the list contains "VERB" (and "ADV"), so the column name "not_a_verb"
# is a misnomer; the name is kept because the TF-IDF cell selects the column by name.
nouns_adjectives_verbs = ["NOUN", "PROPN", "ADJ", "ADV", "VERB"]

# Parse all sentences in one batched pass instead of calling the pipeline once per row.
docs = list(nlp_en.pipe(df_data["sentence"].map(str)))

# Add both derived columns in a single assignment. This avoids creating columns
# cell-by-cell through `.at` and gives the same result when the cell is re-run.
df_data = df_data.assign(
    # every token's lemma, space-joined
    lemmas=[" ".join(token.lemma_ for token in doc) for doc in docs],
    # lemmas of tokens whose POS tag is in the list above
    not_a_verb=[
        " ".join(token.lemma_ for token in doc if token.pos_ in nouns_adjectives_verbs)
        for doc in docs
    ],
)

df_data

In [None]:
# Note: TF-IDF on a spaCy-derived text column of df_data instead of the raw sentences.
#       'not_a_verb' holds the POS-filtered lemmas; 'lemmas' holds the lemmas of all tokens.
use = 'not_a_verb'  # alternative: 'lemmas'
texts_lf = df_data[use].map(str)
tfidf_lf = TfidfVectorizer()
tid_lf = tfidf_lf.fit_transform(texts_lf)
df_tfid_lf = pd.DataFrame(data=tid_lf.toarray(), columns=tfidf_lf.get_feature_names_out())
df_tfid_lf