In [3]:
import pandas as pd

# Load the pre-cleaned abstracts table from disk.
abstracts = pd.read_parquet(
    r'D:/NLP/tfidf_xgboost/cleaned_abstracts.parquet',
)

# Show the first rows as a quick sanity check of the loaded columns.
print(abstracts.head())


   paper_id                                           abstract  \
0         0  The development of an automated system for the...   
1         1  This paper proposes a novel hybrid forward alg...   
2         2  Modern CCD cameras are usually capable of a sp...   
3         3  This paper deals with the problem of fuzzy non...   
4         4  A number of neural networks can be formulated ...   

                                             cleaned  
0  development automated system quality assessmen...  
1  paper proposes novel hybrid forward algorithm ...  
2  modern ccd cameras usually capable spatial acc...  
3  paper deals problem fuzzy nonlinear model iden...  
4  number neural networks formulated linearinthep...  


In [4]:
from gensim.models import Word2Vec

# 1. Build the list of "sentences" for Word2Vec:
#    each cleaned abstract is split on whitespace into its tokens.
sentences = list(abstracts['cleaned'].str.split())


In [5]:
# Fit a Word2Vec model on the tokenised abstracts.
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,  # embedding dimensionality
    window=5,         # context window size
    min_count=5,      # ignore words that appear fewer than 5 times
    workers=4,        # number of threads
    epochs=10,        # training epochs
)


In [6]:
import numpy as np

def doc_embedding(tokens, model):
    """Return the mean word vector of one tokenised document.

    Parameters
    ----------
    tokens : iterable of str, None, or float NaN
        The document's tokens. A missing value (``None`` or NaN, e.g. what
        pandas string methods propagate for a missing string) is treated as
        an empty document instead of raising ``TypeError``.
    model : object
        Must expose ``wv`` (supporting ``token in wv`` and ``wv[token]``)
        and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in ``model.wv``,
        or a zero vector of length ``model.vector_size`` when no token is
        found (including empty or missing input).
    """
    # A missing abstract arrives as None/NaN rather than a list; iterating
    # over it would fail, so handle it like a document with no tokens.
    if tokens is None or (isinstance(tokens, float) and np.isnan(tokens)):
        tokens = ()
    # Out-of-vocabulary tokens are skipped silently.
    vecs = [model.wv[w] for w in tokens if w in model.wv]
    if not vecs:
        return np.zeros(model.vector_size)
    return np.mean(vecs, axis=0)

# Apply to the whole DataFrame: tokenise each cleaned abstract and
# store its averaged word vector in a new column.
abstracts['w2v_mean'] = (
    abstracts['cleaned']
    .str.split()
    .apply(doc_embedding, args=(w2v_model,))
)


In [7]:
def doc_stats(tokens, model):
    """Return the per-dimension mean and standard deviation of a document's word vectors.

    Parameters
    ----------
    tokens : iterable of str, None, or float NaN
        The document's tokens. A missing value (``None`` or NaN, e.g. what
        pandas string methods propagate for a missing string) is treated as
        an empty document instead of raising ``TypeError``.
    model : object
        Must expose ``wv`` (supporting ``token in wv`` and ``wv[token]``)
        and ``vector_size``.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(mean, std)`` taken along the token axis over all tokens found in
        ``model.wv``. Both are zero vectors of length ``model.vector_size``
        when no token is found (including empty or missing input).
    """
    # A missing abstract arrives as None/NaN rather than a list; iterating
    # over it would fail, so handle it like a document with no tokens.
    if tokens is None or (isinstance(tokens, float) and np.isnan(tokens)):
        tokens = ()
    # Out-of-vocabulary tokens are skipped silently.
    vecs = np.array([model.wv[w] for w in tokens if w in model.wv])
    if vecs.size == 0:
        return np.zeros(model.vector_size), np.zeros(model.vector_size)
    return vecs.mean(axis=0), vecs.std(axis=0)

# Compute the (mean, std) pair for every abstract, then keep only the
# std vector (second element of each pair) as a new column.
stats = abstracts['cleaned'].str.split().apply(doc_stats, args=(w2v_model,))
abstracts['w2v_std'] = stats.map(lambda pair: pair[1])


In [9]:
import numpy as np

# 1. Stack the mean embeddings into one float32 matrix (one row per abstract)
X_mean = np.vstack(abstracts['w2v_mean'].values)
X_mean = X_mean.astype(np.float32)

# 2. Stack the std embeddings the same way
#    (assumes abstracts['w2v_std'] holds numpy arrays of the same dimension)
X_std = np.vstack(abstracts['w2v_std'].values)
X_std = X_std.astype(np.float32)

# 3. The IDs
doc_ids = abstracts['paper_id'].values

# 4. Save everything into a single NPZ archive
np.savez(
    r'D:\NLP\Features_XL\embeddings_abstracts\abstract_w2v_embeds.npz',
    **{'ids': doc_ids, 'mean_embeds': X_mean, 'std_embeds': X_std},
)