In [75]:
# --- Imports ------------------------------------------------------------------
import spacy
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

# --- Model setup --------------------------------------------------------------
# spaCy pipeline loaded from the `el_core_news_sm` package.
nlp = spacy.load("el_core_news_sm")

# Sentence-embedding model used to encode queries, and the tokenizer loaded
# from the same checkpoint name (used below to inspect how a query is split).
model = SentenceTransformer('lighteternal/stsb-xlm-r-greek-transfer')
tokenizer = AutoTokenizer.from_pretrained('lighteternal/stsb-xlm-r-greek-transfer')


In [77]:
from sklearn.preprocessing import normalize
import numpy as np

def round_decimal_points(float_list, decimal_points=4):
    """Round every value of an iterable to a fixed number of decimal places.

    Args:
        float_list: Iterable of numbers to round.
        decimal_points: Number of decimal places passed to ``round``
            (defaults to 4).

    Returns:
        list: A new list with each value rounded, in the original order.
    """
    rounded = []
    for value in float_list:
        rounded.append(round(value, decimal_points))
    return rounded

def normalize_l2(vector):
    """L2-normalise a vector and round its components to 4 decimals.

    Args:
        vector: Sequence of numbers (anything ``np.array`` accepts).

    Returns:
        list: The normalised components, each rounded to 4 decimal places.
    """
    # sklearn's ``normalize`` operates row-wise on 2-D input, so the vector is
    # presented as a single-row matrix and the one resulting row is taken back.
    single_row = np.array(vector).reshape(1, -1)
    unit_row = normalize(single_row, norm='l2')[0]
    return round_decimal_points(unit_row.tolist(), 4)

In [105]:
def generate_embeddings(str):
    """Embed a text and return its L2-normalised, rounded vector.

    The text is encoded with the module-level ``model`` and the result is
    passed through ``normalize_l2`` (L2 normalisation, then rounding to 4
    decimals).

    Args:
        str: The text to embed (a single string is assumed). The name shadows
            the built-in ``str`` inside this function; it is kept unchanged so
            existing keyword callers keep working.

    Returns:
        list: The normalised, rounded embedding.
    """
    # Encode the argument itself. Previously the global ``query_text`` was
    # encoded here and the argument was silently ignored, so calling this with
    # any other text returned the wrong embedding.
    text = str
    sentence_embeddings = model.encode(text).tolist()
    return normalize_l2(sentence_embeddings)

def generate_embeddings_euc(str):
    """Embed a text and return the rounded vector without normalisation.

    The text is encoded with the module-level ``model`` and each component is
    rounded via ``round_decimal_points`` (default precision); the vector's
    length is left as produced by the model.

    Args:
        str: The text to embed (a single string is assumed). The name shadows
            the built-in ``str`` inside this function; it is kept unchanged so
            existing keyword callers keep working.

    Returns:
        list: The rounded, un-normalised embedding.
    """
    # Encode the argument itself. Previously the global ``query_text`` was
    # encoded here and the argument was silently ignored, so calling this with
    # any other text returned the wrong embedding.
    text = str
    sentence_embeddings = model.encode(text).tolist()
    return round_decimal_points(sentence_embeddings)


In [106]:
import pysolr
# Solr client used by every search cell below; points at the `lnc` endpoint
# of a Solr instance on localhost:8983.
solr = pysolr.Solr('http://localhost:8983/solr/lnc')

In [107]:
# Query string used by the tokenizer check, the embedding calls and the
# keyword (edismax) search in the cells that follow.
query_text = "Προβάλινθος Μάτι"

In [108]:
# Show how the model's tokenizer splits the query into subword pieces.
tokens = tokenizer.tokenize(query_text)

print(tokens)

['▁Προ', 'βάλ', 'ιν', 'θος', '▁Μά', 'τι']


In [109]:
# L2-normalised embedding, rounded to 4 decimals (see generate_embeddings).
embeddings = generate_embeddings(query_text)
# Un-normalised embedding, only rounded (see generate_embeddings_euc).
embeddings_euc = generate_embeddings_euc(query_text)

In [110]:
# Vector (kNN) search on the `par_embeddings_dp` field using the L2-normalised
# query embedding: topK=1000 candidates, 25 rows returned, score included.
# NOTE(review): the `_dp` suffix presumably means a dot-product similarity
# field -- confirm against the Solr schema.
query_params = dict(
    q='{!knn f=par_embeddings_dp topK=1000}' + str(embeddings),
    rows=25,
    fl="*,score",
)

print(query_params)

# Run the search.
results = solr.search(**query_params)

# One line per hit; documents without a title print an empty title.
for result in results:
    print(f"ID: {result['id']}, Score: {result['score']}, Title: {result.get('title', '')}, Published: {result['published_at']}")

{'q': '{!knn f=par_embeddings_dp topK=1000}[-0.0153, -0.0057, -0.0071, -0.0482, 0.0146, 0.0525, 0.0253, 0.0335, -0.0175, -0.0345, 0.0216, 0.0019, 0.0229, -0.0536, 0.0119, -0.0244, -0.0016, -0.0396, -0.0372, 0.0416, -0.0279, -0.0082, 0.0292, -0.0172, -0.0771, 0.0005, 0.0778, 0.019, 0.004, 0.019, 0.0174, -0.0058, -0.0457, 0.0098, 0.0272, 0.0328, 0.0129, -0.0383, 0.0521, -0.0205, -0.0597, 0.0252, -0.0437, 0.0076, 0.0148, 0.0494, -0.0705, 0.0278, 0.012, -0.0133, 0.0039, -0.0134, -0.006, -0.0072, -0.0072, 0.0499, 0.0323, -0.0389, -0.0064, -0.0336, 0.0413, -0.0013, 0.0468, -0.0264, 0.036, -0.0024, -0.0277, 0.0007, 0.018, -0.0298, 0.0737, -0.0411, -0.0252, -0.0427, -0.0357, 0.0403, 0.0442, -0.0847, -0.0048, 0.0019, 0.035, -0.0066, -0.0467, -0.0039, -0.0266, -0.0536, -0.0305, 0.0402, -0.0228, 0.0215, 0.0521, 0.0089, 0.0453, -0.0347, 0.0343, 0.0197, 0.0149, -0.006, 0.0278, 0.0047, -0.039, 0.0428, 0.0468, -0.029, 0.0372, -0.013, 0.0284, 0.0178, -0.0067, -0.1106, 0.0106, -0.0116, -0.0568, -0.0024

In [94]:
# Keyword baseline: edismax search of the raw query text over title (boosted
# x2), content and summary, with the same fields used for phrase boosting.
query_params = {
    'q': query_text,
    'defType': 'edismax',
    'qf': 'title^2 content summary',
    'pf': 'title^2 content summary',
    'rows': 10
}

# Execute the query
results = solr.search(**query_params)

# Print the results
for result in results:
    # A hit can match on content/summary only; fall back to an empty title
    # instead of raising KeyError, as the vector-search cells already do.
    print(f"ID: {result['id']}, Title: {result.get('title', '')}, Published: {result['published_at']}")

ID: 157347, Title: Στο μαρτυρικό Μάτι έκοψε την πίτα η Έυη Χριστοφιλοπούλου (pics), Published: 2019-01-31T12:00:00Z
ID: 157488, Title: Στο μαρτυρικό Μάτι έκοψε την πίτα η Εύη Χριστοφιλοπούλου (pics), Published: 2019-01-31T12:00:00Z
ID: 356203, Title: Μάτι: Διαδικτυακή συλλογή υπογραφών ενάντια στη μείωση των ποινών για την πυρκαγιά, Published: 2019-03-14T15:48:04Z
ID: 333681, Title: Στο Μάτι..., Published: 2019-03-10T06:28:23Z
ID: 245, Title: Χριστουγεννιάτικο «δώρο» οι μεγάλες υποσχέσεις στο Μάτι, Published: 2018-12-16T23:11:44Z
ID: 45550, Title: Με τα μάτια της Ασφάλειας, Published: 2019-01-05T08:18:00Z
ID: 314000, Title: Βγάζει Μάτι…, Published: 2019-03-06T09:21:47Z
ID: 1239, Title: Μάτι : Χριστούγεννα οργής, απελπισίας, θυμού για τα ψεύτικα λόγια, Published: 2018-12-17T07:22:27Z
ID: 38337, Title: Ευχές με τα μάτια κλειστά..., Published: 2019-01-03T15:00:00Z
ID: 15336, Title: Τα Χριστούγεννα δεν ήρθαν στο Μάτι, Published: 2018-12-25T11:59:06Z


In [116]:
# Vector (kNN) search on the `par_embeddings_euc` field using the
# un-normalised query embedding: topK=1000 candidates, 25 rows returned,
# score included.
# NOTE(review): the `_euc` suffix presumably means a Euclidean-distance
# field -- confirm against the Solr schema.
query_params = dict(
    q='{!knn f=par_embeddings_euc topK=1000}' + str(embeddings_euc),
    rows=25,
    fl="*,score",
)

print(query_params)

# Run the search.
results = solr.search(**query_params)

# One line per hit; documents without a title print an empty title.
for result in results:
    print(f"ID: {result['id']}, Score: {result['score']}, Title: {result.get('title', '')}, Published: {result['published_at']}")

{'q': '{!knn f=par_embeddings_euc topK=1000}[-0.18, -0.07, -0.08, -0.57, 0.17, 0.63, 0.3, 0.4, -0.21, -0.41, 0.26, 0.02, 0.27, -0.64, 0.14, -0.29, -0.02, -0.47, -0.44, 0.5, -0.33, -0.1, 0.35, -0.2, -0.92, 0.01, 0.93, 0.23, 0.05, 0.23, 0.21, -0.07, -0.54, 0.12, 0.32, 0.39, 0.15, -0.46, 0.62, -0.24, -0.71, 0.3, -0.52, 0.09, 0.18, 0.59, -0.84, 0.33, 0.14, -0.16, 0.05, -0.16, -0.07, -0.09, -0.09, 0.59, 0.38, -0.46, -0.08, -0.4, 0.49, -0.02, 0.56, -0.31, 0.43, -0.03, -0.33, 0.01, 0.21, -0.36, 0.88, -0.49, -0.3, -0.51, -0.42, 0.48, 0.53, -1.01, -0.06, 0.02, 0.42, -0.08, -0.56, -0.05, -0.32, -0.64, -0.36, 0.48, -0.27, 0.26, 0.62, 0.11, 0.54, -0.41, 0.41, 0.24, 0.18, -0.07, 0.33, 0.06, -0.46, 0.51, 0.56, -0.35, 0.44, -0.16, 0.34, 0.21, -0.08, -1.32, 0.13, -0.14, -0.68, -0.03, 0.69, -0.84, 0.12, 0.07, 0.34, 0.56, 0.05, -0.23, 0.21, -0.66, -0.82, -0.66, -0.19, 0.12, 0.49, -0.08, -0.26, -0.47, -0.53, -0.16, -0.28, -0.34, -0.22, 0.15, 0.12, 0.69, 0.36, 0.75, -0.27, -0.16, 0.87, -0.85, -0.22, 0.24,