In [1]:
import pandas as pd
from natasha import (
    Doc,
    MorphVocab,
    NamesExtractor,
    NewsEmbedding,
    NewsMorphTagger,
    NewsNERTagger,
    NewsSyntaxParser,
    Segmenter,
)
from tqdm import tqdm

# Natasha components used by get_company_ids() further down.
segmenter = Segmenter()
morph_vocab = MorphVocab()

# A single embedding instance is shared by the morphology, syntax and NER taggers.
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)

# NOTE(review): names_extractor is not referenced anywhere else in the visible
# notebook — confirm whether it is still needed.
names_extractor = NamesExtractor(morph_vocab)

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer


def mean_pooling(
    model_output: tuple[torch.Tensor, ...], attention_mask: torch.Tensor
) -> torch.Tensor:
    """Average token embeddings over dimension 1, weighted by the attention mask.

    Positions whose mask value is 0 contribute neither to the sum nor to the
    count, so they do not dilute the average.

    Args:
        model_output: Indexable model output whose element ``0`` is the tensor
            of token embeddings. It is annotated as a tuple, but any object
            supporting ``model_output[0]`` works.
        attention_mask: Mask with one dimension fewer than the token
            embeddings; it is unsqueezed on the last axis and expanded to the
            embeddings' shape.
            NOTE(review): presumably (batch, seq_len) with 0/1 values and
            embeddings of shape (batch, seq_len, hidden) — confirm.

    Returns:
        The masked sum over dimension 1 divided by the per-position mask count.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, 1)
    # Clamp the denominator so a fully masked row yields zeros instead of NaN.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts


def get_embs(text: str, model: AutoModel, tokenizer: AutoTokenizer):
    """Return the mean-pooled embedding of ``text`` computed by ``model``.

    The text is tokenized with padding and truncation enabled, moved to the
    model's device, run through the model without gradient tracking, and the
    token embeddings are averaged with ``mean_pooling`` using the attention mask.
    """
    batch = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(
        model.device
    )

    # Gradients are not needed for feature extraction.
    with torch.no_grad():
        outputs = model(**batch)

    return mean_pooling(outputs, batch["attention_mask"])


# Load the sentence-embedding model and its tokenizer from the HuggingFace Hub.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA. get_embs() reads model.device, so it follows along.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)

In [None]:
import pickle

# SECURITY: pickle.load executes arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
# NOTE(review): the loop that consumes `b` treats it as an iterable of dicts
# with "corpus" and "entities" keys — confirm against the producer of this file.
with open("labeled_data_v0.1.pickle", "rb") as labeled_file:
    b = pickle.load(labeled_file)

In [None]:
# Raw sentiment rows; later cells read the MessageID, ChannelID, issuerid,
# SentimentScore and MessageText columns from this frame.
# SECURITY: read_pickle unpickles the file — only use trusted inputs.
sentiment_texts = pd.read_pickle("../data/sentiment_texts.pickle")

In [None]:
# Duplicate the key columns so that, after grouping on the *_copy columns, the
# originals are still available as regular (list-aggregated) columns.
sentiment_texts["MessageID_copy"] = sentiment_texts["MessageID"].copy()
sentiment_texts["ChannelID_copy"] = sentiment_texts["ChannelID"].copy()
aggregated_df = sentiment_texts.groupby(["MessageID_copy", "ChannelID_copy"]).agg(list)

# Explicit .copy(): the frame is modified below, and assigning into a
# column-sliced frame without it triggers SettingWithCopyWarning.
only_needed_df = aggregated_df[
    ["MessageID", "ChannelID", "issuerid", "SentimentScore", "MessageText"]
].copy()

# Collapse these columns from per-group lists to their first element.
# MessageID / ChannelID are the grouping keys, so every list element is equal.
# NOTE(review): assumes MessageText is also identical within a group — confirm.
# Series.map is used per column because DataFrame.applymap is deprecated in
# pandas >= 2.1.
for column in ["MessageID", "ChannelID", "MessageText"]:
    only_needed_df[column] = only_needed_df[column].map(lambda values: values[0])

In [None]:
def get_issuer_map(path="../data/names and synonyms.xlsx"):
    """Build a lookup Series mapping each issuer id to one lower-cased search string.

    Args:
        path: Excel file to read. Defaults to the file the notebook has always
            used, so calling ``get_issuer_map()`` behaves as before.
            NOTE(review): the sheet is assumed to have an "issuerid" column and
            to keep names/synonyms from the third column onwards — confirm.

    Returns:
        A Series named "one_string", indexed by "issuerid". Each value is the
        space-joined, lower-cased concatenation of the non-null cells from the
        third column onwards.
    """
    names_n_synonyms_df = pd.read_excel(path)
    # str() guards against non-string cells (e.g. numeric codes), which would
    # make " ".join raise TypeError.
    names_n_synonyms_df["one_string"] = names_n_synonyms_df.iloc[:, 2:].apply(
        lambda row: " ".join(str(value) for value in row.dropna()).lower(), axis=1
    )
    mapper = names_n_synonyms_df.set_index("issuerid")["one_string"]
    return mapper


def get_company_ids(text, mapper):
    """Return the issuer ids whose search string contains an entity found in ``text``.

    The text is run through the Natasha pipeline configured at the top of the
    notebook (segmentation, morphology, lemmatization, syntax, NER) and every
    NER span is normalized. Each distinct normalized span is lower-cased and
    searched for as a literal substring in ``mapper``.

    Args:
        text: Raw text to analyse.
        mapper: Series of lower-cased search strings indexed by issuer id, as
            returned by ``get_issuer_map``.

    Returns:
        A list of issuer ids in order of first span occurrence. An id appears
        once per distinct span that matched it, so duplicates are possible.
    """
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)

    for span in doc.spans:
        span.normalize(morph_vocab)

    # dict.fromkeys removes duplicates like a set but keeps first-seen order,
    # so the result is deterministic across runs.
    distinct_names = dict.fromkeys(span.normal for span in doc.spans)

    companies = []
    for name in distinct_names:
        # An empty needle is contained in every string and would match all issuers.
        if not name:
            continue
        # regex=False: entity names are plain text; metacharacters such as
        # "(", "+" or "." must not be interpreted as a pattern.
        matches = mapper.str.contains(name.lower(), regex=False)
        companies.extend(mapper[matches].index.tolist())
    return companies

In [None]:
# Series of lower-cased name/synonym strings indexed by issuerid.
mapper = get_issuer_map()

In [None]:
errs = 0

embeddings = []
issuer_ids = []
scores = []

# Index the labelled records by text once, so each message is matched with a
# dict lookup instead of a linear scan over `b`. setdefault keeps the FIRST
# record per text, which mirrors the original scan-and-break behaviour.
# NOTE(review): assumes each record is a dict with a hashable "corpus" text and
# an "entities" list of dicts holding "company" and "score" — confirm.
labeled_by_text = {}
for record in b:
    try:
        labeled_by_text.setdefault(record["corpus"], record)
    except (KeyError, TypeError) as e:
        errs += 1
        print(e)

for ind, row in tqdm(only_needed_df.iterrows(), total=len(only_needed_df)):
    text = row["MessageText"]
    # Defaults guarantee exactly one entry per row in each output list even if
    # something below fails; otherwise the lists drift out of alignment with
    # the DataFrame rows and the column assignment that follows breaks.
    text_emb = None
    iids = []
    sc = []
    try:
        text_emb = get_embs(text, model, tokenizer).cpu().detach().numpy()
        record = labeled_by_text.get(text)
        entities = record["entities"] if record is not None else []
        for entity in entities:
            try:
                company = entity["company"].lower()
                # An empty needle is contained in every string and would match
                # every issuer.
                if not company.strip():
                    continue
                # regex=False: company names are literal text, not patterns.
                found_issuer_ids = mapper[
                    mapper.str.contains(company, regex=False)
                ].index.tolist()
                if found_issuer_ids:
                    target = int(entity["score"])
                    iids.extend(found_issuer_ids)
                    sc.extend([target] * len(found_issuer_ids))
            except Exception as e:
                # Best effort: one bad entity must not discard the whole message.
                errs += 1
                print(f"row {ind!r}: {e}")
    except Exception as e:
        # Best effort: keep going, the placeholders above are stored for this row.
        errs += 1
        print(f"row {ind!r}: {e}")
    embeddings.append(text_emb)
    issuer_ids.append(iids)
    scores.append(sc)
print(errs)

In [None]:
# Attach the per-row results collected by the labelling loop. Each list must
# contain exactly one entry per row of only_needed_df, in row order.
only_needed_df["embedding"] = embeddings
only_needed_df["llm_issuerid"] = issuer_ids
only_needed_df["llm_SentimentScore"] = scores

In [None]:
# Merge the original labels with the LLM-derived ones: start from the original
# issuer ids / scores and append every LLM issuer id that is not present yet,
# together with its score.
new_x = []
new_xs = []
for x, xs, y, ys in zip(
    only_needed_df["issuerid"],
    only_needed_df["SentimentScore"],
    only_needed_df["llm_issuerid"],
    only_needed_df["llm_SentimentScore"],
    strict=True,
):
    # Work on copies: appending to `x` / `xs` directly would also modify the
    # lists stored in the "issuerid" and "SentimentScore" columns.
    merged_ids = list(x)
    merged_scores = list(xs)
    for _y, _ys in zip(y, ys, strict=True):
        # Checking the growing merged list also drops repeated LLM ids.
        if _y not in merged_ids:
            merged_ids.append(_y)
            merged_scores.append(_ys)
    new_x.append(merged_ids)
    new_xs.append(merged_scores)

In [None]:
# Store the merged (original + LLM) issuer ids and their scores per message.
only_needed_df["final_ids"] = new_x
only_needed_df["final_scores"] = new_xs

In [None]:
# Persist the assembled frame (embeddings, LLM labels, merged labels) to disk.
only_needed_df.to_pickle("../data/training_needed_df.pkl")