In [1]:
import fasttext

# Load the pretrained fastText model from a local binary file. The file name
# suggests Czech 300-dimensional vectors — TODO confirm against the download source.
ft = fasttext.load_model("/Users/sladkydrevo/opt/baka/fasttext/cc.cs.300.bin")

In [2]:
import spacy_udpipe

# Load the spacy_udpipe pipeline for the "cs" language code; preprocess_text
# calls it through this global `nlp` name.
nlp = spacy_udpipe.load("cs")

In [3]:
import os

def load_texts(folder_path):
    """Loads every .txt file found directly inside the given directory.

    Files are visited in sorted file-name order and decoded as UTF-8, so the
    result does not depend on the platform's default encoding.
    Args:
        folder_path (str): path to the directory with the text files
    Returns:
        list: list of dictionaries with "filename" (file name without the
            ".txt" extension) and "text" (whole file content) keys, one per file
    """
    texts = []
    for entry in sorted(os.listdir(folder_path)):
        if not entry.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, entry)
        # Explicit encoding: without it open() falls back to the locale's
        # default, which can mis-decode non-ASCII (e.g. Czech) characters.
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
        texts.append({"filename": os.path.splitext(entry)[0], "text": content})

    return texts

In [4]:
def chunk_texts(texts, chunk_size, overlap):
    """Splits every text into overlapping chunks of whitespace-separated words.
    Args:
        texts (list): dictionaries with "filename" and "text" keys
        chunk_size (int): maximum count of words in every text chunk
        overlap (int): count of words shared by the end of one chunk and the beginning of the next one
    Returns:
        dictionary: chunk_name as key (filename + _ + order rank of chunk for given text, starting at 1) and text chunk as value for every text
    Raises:
        ValueError: if chunk_size is not positive or overlap is outside 0 <= overlap < chunk_size
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # A step of zero makes range() fail and a negative step silently yields
    # no chunks at all, so reject such parameters up front.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    text_chunks = {}
    for text_data in texts:
        filename = text_data["filename"]
        words = text_data["text"].split()

        # NOTE: the last chunk of a text may be shorter than chunk_size and can
        # consist only of words already present at the end of the previous chunk.
        for chunk_id, start in enumerate(range(0, len(words), step), start=1):
            text_chunks[f"{filename}_{chunk_id}"] = " ".join(words[start : start + chunk_size])

    return text_chunks

In [5]:
def split_dict_data(data):
    """Splits a dictionary into two parallel lists.
    Args:
        data (dict): dictionary to be split
    Returns:
        tuple: (list of keys, list of values), both in the dictionary's iteration order
    """
    keys, values = [], []
    for key, value in data.items():
        keys.append(key)
        values.append(value)
    return keys, values

In [6]:
def convert_questions_dict(questions):
    """Re-keys loaded questions by their file name.
    Args:
        questions (list): dictionaries with "filename" and "text" keys
    Returns:
        dict: "filename" value as key and "text" value as value; a later entry
            with the same file name replaces an earlier one
    """
    return {item["filename"]: item["text"] for item in questions}

In [7]:
def preprocess_text(texts):
    """Runs every text (or chunk) through the global `nlp` pipeline and keeps the
    lowercased lemma of each token that is alphabetic and is not a stopword.
    Args:
        texts (list): accepts list of strings (texts, chunks)
    Returns:
        list: list of lists of lowercased lemmas, one inner list per input text
    """
    return [
        [
            token.lemma_.lower()
            for token in nlp(text)
            if token.is_alpha and not token.is_stop
        ]
        for text in texts
    ]

In [8]:
import numpy

def embed_texts(preprocessed_texts, model):
    """Embeds every tokenized text as the mean of its token vectors.
    Args:
        preprocessed_texts (list): list of lists of tokens
        model: object with a get_word_vector(token) method; if it also has
            get_dimension(), that value sets the size of the zero vector
            used for texts without tokens (300 is assumed otherwise)
    Returns:
        numpy.ndarray: one row per text; a text without tokens becomes a
            zero vector; an empty input gives an array of shape (0, dimension)
    """
    # Ask the model for its vector size instead of hard-coding 300, so zero
    # vectors always line up with the real embeddings.
    get_dimension = getattr(model, "get_dimension", None)
    dimension = get_dimension() if callable(get_dimension) else 300

    embeddings = []
    for tokens in preprocessed_texts:
        token_vectors = [model.get_word_vector(token) for token in tokens]
        if token_vectors:
            embeddings.append(numpy.mean(token_vectors, axis=0))
        else:
            embeddings.append(numpy.zeros(dimension))

    if not embeddings:
        # Keep the result two-dimensional even when there is nothing to embed.
        return numpy.zeros((0, dimension))
    return numpy.array(embeddings)

In [9]:
import pandas as pd

def make_similarity_table(chunks_data, questions, similarity_matrix):
    """Wraps a similarity matrix in a labelled DataFrame.
    Args:
        chunks_data (dict): its keys become the row labels
        questions: passed to pandas as the column labels
        similarity_matrix: two-dimensional data with one row per chunk and one column per question
    Returns:
        pandas.DataFrame: the labelled similarity table
    """
    chunk_labels = list(chunks_data.keys())
    return pd.DataFrame(similarity_matrix, index=chunk_labels, columns=questions)

In [10]:
def mask_similarity_table(df, n):
    """Turns a similarity table into a table of top-n ranks.

    Every column is ranked in descending order (rank 1 = highest value, ties
    get ranks in order of appearance); ranks greater than n are replaced by 0.
    Args:
        df (pandas.DataFrame): table of similarity scores
        n (int): count of best ranks to keep in every column
    Returns:
        pandas.DataFrame: integer ranks 1..n where kept, 0 everywhere else
    """
    positions = df.rank(method="first", ascending=False).astype(int)
    within_top_n = positions <= n
    return positions.where(within_top_n, other=0)

In [11]:
def get_top_n_answers(chunks_data, questions, df, n):
    """Collects the n best-ranked chunks for every question.
    Args:
        chunks_data (dict): chunk name as key and chunk text as value
        questions (dict): question name as key and question text as value
        df (pandas.DataFrame): similarity table, chunks as rows and questions as columns
        n (int): count of answers returned for every question
    Returns:
        list: one dictionary per question with "question_name", "question" and
            "returned_answers" keys; "returned_answers" is a list of dictionaries
            with "rank", "chunk_name" and "chunk_text" keys ordered by rank
    """
    ranks_by_question = mask_similarity_table(df, n).to_dict()

    results = []
    for q_name, ranks_by_chunk in ranks_by_question.items():
        # Rank 0 marks a chunk that did not make it into the top n.
        hits = [
            {
                "rank" : position,
                "chunk_name" : name,
                "chunk_text" : chunks_data[name]
            }
            for name, position in ranks_by_chunk.items()
            if position != 0
        ]
        results.append({
            "question_name" : q_name,
            "question" : questions[q_name],
            "returned_answers" : sorted(hits, key=lambda hit: hit["rank"])
        })

    return results

In [12]:
import json

def save_json(data, path):
    """Writes data as JSON into a UTF-8 text file, replacing any existing file.

    Non-ASCII characters are stored unescaped, nested levels are indented by
    five spaces, and values json cannot encode itself are stored as str(value).
    Args:
        data: object to be serialized
        path (str): path of the output file
    """
    with open(path, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=5, ensure_ascii=False, default=str)

In [13]:
def read_txt(path):
    """Reads a whole UTF-8 text file.
    Args:
        path (str): path of the file to be read
    Returns:
        str: content of the file
    """
    with open(path, encoding="utf-8") as handle:
        return handle.read()

In [14]:
def load_right_answers(path):
    """Loads the expected answer names, one per line, numbered from 1.

    Surrounding whitespace (including a stray carriage return) is removed from
    every name and blank lines at the end of the file are ignored, so a file
    that ends with a newline does not produce an extra empty answer.
    Args:
        path (str): path to the text file with one answer name per line
    Returns:
        dict: 1-based line number as key and answer name as value
    """
    answer_names = [line.strip() for line in read_txt(path).split("\n")]
    # Only trailing blanks are dropped: a blank line in the middle keeps its
    # number so the remaining answers stay aligned with their line numbers.
    while answer_names and not answer_names[-1]:
        answer_names.pop()
    return dict(enumerate(answer_names, start=1))

In [15]:
def get_rank_table(answers, right_answers):
    """Builds a table of returned chunk names per question and rank.
    Args:
        answers (list): dictionaries with a "returned_answers" key holding a
            rank-ordered list of dictionaries with a "chunk_name" key
        right_answers (dict): its values (expected chunk names) become the
            row labels, so it needs one entry per item of answers
    Returns:
        pandas.DataFrame: one row per question labelled with the expected chunk
            name; columns are numbered from 1 up to the largest count of
            returned answers (1-5 when answers is empty)
    """
    rows = [
        [answer["chunk_name"] for answer in entry["returned_answers"]]
        for entry in answers
    ]
    # Take the width from the data instead of assuming five answers per
    # question; the previous five-column layout is kept for an empty input.
    width = max((len(row) for row in rows), default=5)
    return pd.DataFrame(
        rows,
        index=list(right_answers.values()),
        columns=range(1, width + 1),
    )

In [52]:
def get_match_count(table):
    """Counts, for every column, the rows whose value equals the row's index label.
    Args:
        table (pandas.DataFrame): table whose index holds the expected value of every row
    Returns:
        dict: column label as key and count of matching rows as value
    """
    # Column vector of the index labels, broadcast against every table column.
    expected = numpy.array(table.index)[:, None]
    matches = table == expected
    return dict(matches.sum(axis=0))

In [17]:
def get_top_accuracies(results, questions):
    """Computes the top-1, top-3 and top-5 hit rates.
    Args:
        results (dict): count of matches per rank; the values are expected in
            rank order (rank 1 first)
        questions: collection of all questions, only its length is used
    Returns:
        list: three values - the sum of the first 1, 3 and 5 counts, each
            divided by the count of questions
    """
    hits_per_rank = [*results.values()]
    return [
        sum(hits_per_rank[:cutoff]) / len(questions)
        for cutoff in (1, 3, 5)
    ]

In [49]:
import csv

def write_to_csv_top_5(path, new=False, results=None, model_name=None):
    """Stores the accuracy scores of one model as a row of a CSV file.
    Args:
        path (str): path of the CSV file
        new (bool): when True the file is (re)created with a header row first,
            which discards any rows it already contained
        results (iterable): scores in the column order TOP 1, TOP 3, TOP 5;
            when None no data row is written (useful with new=True to only
            create the file with its header)
        model_name (str): value for the MODEL NAME column
    """
    if new:
        # newline="" hands line endings over to the csv module (otherwise
        # some platforms end up with blank lines between the rows).
        with open(path, "w", newline="", encoding="utf-8") as f:
            csv.writer(f, delimiter=",").writerow(["MODEL NAME", "TOP 1", "TOP 3", "TOP 5"])

    if results is None:
        return

    # Build a new list so the caller's results object is never modified.
    row = [model_name, *results]
    with open(path, "a", newline="", encoding="utf-8") as f:
        csv.writer(f, delimiter=",").writerow(row)
    print(f"Results inserted to csv file. Model name: {model_name} Results: {row}")

In [19]:
# Directory with the source documents; load_texts reads every .txt file in it.
TEXTS_FOLDER_PATH = "/Users/sladkydrevo/opt/baka/dataset/texts"
texts = load_texts(TEXTS_FOLDER_PATH)

In [20]:
# Split every document into chunks of up to 128 words; consecutive chunks share 10 words.
chunk_data = chunk_texts(texts, chunk_size=128, overlap=10)

In [21]:
# Persist all chunks to disk.
# NOTE(review): save_json writes JSON, although the file name ends in .txt.
path_to_all_chunks = "/Users/sladkydrevo/opt/baka/all_chunks.txt"
save_json(chunk_data, path_to_all_chunks)

In [22]:
# Parallel lists: chunk names and chunk texts in the same order.
chunk_names, text_chunks = split_dict_data(chunk_data) 

In [23]:
# Questions are read from .txt files with the same loader as the documents.
QUESTIONS_FOLDER_PATH = "/Users/sladkydrevo/opt/baka/dataset/questions"
questions_data = load_texts(QUESTIONS_FOLDER_PATH)

In [24]:
# Re-key the loaded questions as {question file name: question text}.
questions = convert_questions_dict(questions_data)

In [25]:
# Parallel lists: question names and question texts in the same order.
question_names, question_texts = split_dict_data(questions) 

In [26]:
# Lemmatize and filter every chunk; the `nlp` pipeline runs once per chunk.
preprocessed_texts = preprocess_text(text_chunks)

In [27]:
# Questions get exactly the same preprocessing as the chunks.
preprocessed_questions = preprocess_text(question_texts)

In [28]:
# One vector per chunk: the mean of the fastText vectors of its tokens.
text_embeddings = embed_texts(preprocessed_texts, model=ft)

In [29]:
# One vector per question, built the same way as the chunk vectors.
question_embeddings = embed_texts(preprocessed_questions, model=ft)

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of every chunk (rows) with every question (columns).
similarity_matrix = cosine_similarity(text_embeddings, question_embeddings)

In [31]:
# Label the matrix: chunk names as the row index, question names as the columns.
similarities_df = make_similarity_table(chunk_data, questions, similarity_matrix)

In [32]:
# Keep only the ranks 1-5 in every question column; all other cells become 0.
masked = mask_similarity_table(similarities_df, n=5)

In [33]:
# Export the masked rank table as a CSV file.
masked.to_csv("/Users/sladkydrevo/opt/baka/matrix.csv")

In [34]:
# Collect the 5 best-ranked chunks (with their texts) for every question;
# the ranking is recomputed from the similarity table inside the function.
answers = get_top_n_answers(chunk_data, questions, similarities_df, n=5)

In [36]:
# Store the retrieved answers of every question as JSON.
RESULTS_PATH = "/Users/sladkydrevo/opt/baka/pt1_baseline_results3.json"
save_json(answers, RESULTS_PATH)

In [37]:
# Expected chunk names, read from a text file with one name per line and
# numbered from 1 in line order.
path_to_right_answers = "/Users/sladkydrevo/opt/baka/right_answers.txt"
right_answers = load_right_answers(path_to_right_answers)

In [38]:
# Rows are labelled with the expected chunk of each question; columns 1-5 hold
# the chunk returned at that rank. The bare name on the last line displays the table.
rank_table_df = get_rank_table(answers, right_answers)
rank_table_df

Unnamed: 0,1,2,3,4,5
01VN_Neptun_2,10VN_Kus_zdi_2,01VN_Neptun_3,03VN_Podzemni_ocean_2,01VN_Neptun_2,06VN_Exosystem_2
02VN_Tajemny_signal_3,02VN_Tajemny_signal_4,04VN_Zablesky_2,02VN_Tajemny_signal_3,02VN_Tajemny_signal_1,02VN_Tajemny_signal_2
03VN_Podzemni_ocean_1,03VN_Podzemni_ocean_1,03VN_Podzemni_ocean_4,01VN_Neptun_3,10VN_Kus_zdi_2,09VN_Orion_2
04VN_Zablesky_4,04VN_Zablesky_1,08VN_Galaxie_T57_1,08VN_Galaxie_T57_2,04VN_Zablesky_4,01VN_Neptun_1
05VN_Podzemni_jezera_5,05VN_Podzemni_jezera_1,05VN_Podzemni_jezera_6,05VN_Podzemni_jezera_5,05VN_Podzemni_jezera_4,01VN_Neptun_4
06VN_Exosystem_3,06VN_Exosystem_4,06VN_Exosystem_2,06VN_Exosystem_1,06VN_Exosystem_3,08VN_Galaxie_T57_1
07VN_Temna_hmota_4,07VN_Temna_hmota_3,07VN_Temna_hmota_4,07VN_Temna_hmota_2,07VN_Temna_hmota_1,07VN_Temna_hmota_6
08VN_Galaxie_T57_2,04VN_Zablesky_1,08VN_Galaxie_T57_2,08VN_Galaxie_T57_1,04VN_Zablesky_4,04VN_Zablesky_3
09VN_Orion_4,09VN_Orion_3,09VN_Orion_1,09VN_Orion_5,09VN_Orion_4,07VN_Temna_hmota_3
10VN_Kus_zdi_3,10VN_Kus_zdi_1,10VN_Kus_zdi_4,10VN_Kus_zdi_3,10VN_Kus_zdi_2,01VN_Neptun_3


In [53]:
# For every rank column, count the questions whose expected chunk was returned at exactly that rank.
match_results = get_match_count(rank_table_df)

In [40]:
# Cumulative hit rates for top-1, top-3 and top-5, relative to the count of questions.
tops = get_top_accuracies(match_results, questions)

In [47]:
# CSV file that collects the scores of the evaluated models.
path_to_models_results = "/Users/sladkydrevo/opt/baka/MODELS_RESULTS.csv"

In [50]:
# NOTE: new=True recreates the CSV (header plus this row), so rows written by
# earlier runs are discarded.
write_to_csv_top_5(path_to_models_results, new=True, results=tops, model_name="fastText")

Results inserted to csv file. Model name: fastText Results: ['fastText', 0.1, 0.45, 0.65]
