In [10]:
# Updated FeatureExtractor with SciBERT and LDA topic paths
import os
import numpy as np
import pandas as pd
import scipy.sparse as sp
import networkx as nx
import pickle
import time
from itertools import combinations, product
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer


class FeatureExtractor:
    """
    Class to organize and compute link prediction features for citation graphs.
    Organized feature extraction with incremental flush to disk and INFO logging.
    """
    def __init__(self, base_path, output_dir, chunked=True, its=100000):
        # Paths for resources
        self.base_path = base_path
        self.split_path = f"{base_path}/split_train_val/citation_pairs_split_train_val.csv"
        self.authors_path = f"{base_path}/paper_to_authors.pkl"
        
        self.abstracts_emb_path = r"D:/NLP/citation_link_prediction/abstracts_embeds.npy"
        self.tfidf_idx_path = f"{base_path}/tfidf_pid_to_idx.pkl"
        
        # αντί για pickle.load, θα φορτώσουμε .npy
        self.specter_path = r'D:\NLP\citation_link_prediction\specter_pretrained.npy'
      
        self.scibert_path = "D:/NLP/data/paper_scibert_embeddings.pkl"
        self.bertopic_path = f"{base_path}/bertopic_features.parquet"
        self.lda_topics_path = f"{base_path}/paper_topics.parquet"
        

        
        # Output config
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        self.chunked = chunked
        self.its = its  # estimated iterations per second for timing
        
        # Data holders
        self.df_pairs = None
        self.paper_to_authors = None
        
        self.specter_emb  = None
        
        self.pid_to_idx = None
        self.specter_dict = None
        self.scibert_dict = None
        self.lda_topics = None
        self.G = None
        self.G_auth = None
        
        # will hold in-memory features if not chunked
        self.feat_dict = {}  
    
    def load_data(self):
        # 1. Load citation pairs
        """
        Load citation pairs (train/val or test), paper→authors, TF–IDF matrix & index,
        SPECTER & SciBERT embeddings, BERTopic & LDA topics, and build graphs.
        """
        print("INFO: Loading data...")
        t0 = time.time()
        # a) Citation pairs
        path = self.split_path
        self.df_pairs = pd.read_csv(
            path, usecols=["citing","cited","split","label"],
            dtype={"citing":int,"cited":int,"split":str,"label":int}
        )
        if 'split' in self.df_pairs.columns:
            self.df_pairs = (
                self.df_pairs[self.df_pairs["split"].isin(["train","val"])]
                .reset_index(drop=True)
            )
        print(f"INFO: Loaded {len(self.df_pairs)} pairs in {time.time()-t0:.1f}s")
        
        # b) Paper→authors mapping
        with open(self.authors_path, "rb") as f:
            self.paper_to_authors = pickle.load(f)
        print(f"INFO: Loaded authors mapping ({len(self.paper_to_authors)} papers)")
        
        # c) TF–IDF index & matrix
        # ► i) Load precomputed SVD embeddings for abstracts & authors
        #(My tfidf instead of Kozel)
        # load abstracts SVD embeddings (LSA 32d)
        self.emb_abs = np.load(self.abstracts_emb_path)
        # emb_abs.shape == (n_rows_in_tfidf_matrix, 32)
        print(f"INFO: Loaded abstracts_embeds {self.emb_abs.shape}")

        # μέσα στο load_data(), μετά το "LOAD TF-IDF index"
        with open(self.tfidf_idx_path, "rb") as f:
            self.pid_to_idx = pickle.load(f)
        print(f"INFO: Loaded pid_to_idx ({len(self.pid_to_idx)} entries)")
      
        
        # d) SPECTER embeddings
        # Load Specter embeddings from .npy (shape = [n_papers_used, D_s])
        self.specter_emb = np.load(self.specter_path)
        print(f"INFO: Loaded Specter embeddings array {self.specter_emb.shape}")
                
        # e) SciBERT embeddings
        with open(self.scibert_path, "rb") as f:
            self.scibert_dict = pickle.load(f)
        print(f"INFO: Loaded SciBERT embeddings ({len(self.scibert_dict)})")
        
        # f) BERTopic & LDA topics
        self.lda_topics = pd.read_parquet(self.lda_topics_path)
        print(f"INFO: Loaded LDA topics ({len(self.lda_topics)}) in {time.time()-t0:.1f}s total")

        ## EMBEDS
        with open(os.path.join(self.base_path,
                               "split_train_val",
                               "citation_node2vec_tuned.pkl"), "rb") as f:
            self.citation_node2vec = pickle.load(f)
        with open(os.path.join(self.base_path,
                               "split_train_val",
                               "citation_walklets_weighted_undirected.pkl"), "rb") as f:
            self.citation_walklets = pickle.load(f)
        with open(os.path.join(self.base_path, "author_node2vec.pkl"), "rb") as f:
            self.author_node2vec = pickle.load(f)
        with open(os.path.join(self.base_path, "author_walklets.pkl"), "rb") as f:
            self.author_walklets = pickle.load(f)
        print(f"INFO: Loaded graph embeddings: "
              f"citation_node2vec={len(self.citation_node2vec)}, "
              f"citation_walklets={len(self.citation_walklets)}, "
              f"author_node2vec={len(self.author_node2vec)}, "
              f"author_walklets={len(self.author_walklets)}")

        # ——————————————————————————————————————————————————————————————
        # ► h) Load BERTopic features and build topic‐dicts once for all functions
        df_bt = pd.read_parquet(self.bertopic_path)
        # dominant topic & entropy
        self.topic_dict   = dict(zip(df_bt.paper_id, df_bt.bertopic_dominant_topic))
        self.entropy_dict = dict(zip(df_bt.paper_id, df_bt.bertopic_topic_entropy))
        # full distribution vectors
        topic_cols = [c for c in df_bt.columns if c.startswith("topic_dist_")]
        td_df = df_bt.set_index("paper_id")[topic_cols]
        self.topic_dist_arr = {
            pid: td_df.loc[pid].to_numpy()
            for pid in td_df.index
        }
        # per‐author average topic vector
        from collections import defaultdict
        author_topic_acc = defaultdict(list)
        for pid, dist in self.topic_dist_arr.items():
            for a in self.paper_to_authors.get(pid, []):
                author_topic_acc[a].append(dist)
        self.auth_topic_dict = {
            a: np.mean(vs, axis=0)
            for a, vs in author_topic_acc.items()
        }
        # per‐paper “domain” vector via its authors
        self.paper_dom_dict = {
            pid: np.mean(
                [ self.auth_topic_dict[a] for a in authors if a in self.auth_topic_dict ],
                axis=0
            )
            for pid, authors in self.paper_to_authors.items()
            if any(a in self.auth_topic_dict for a in authors)
        }
        print(f"INFO: BERTopic dicts ready (topics={len(self.topic_dist_arr)}," 
              f" authors={len(self.auth_topic_dict)}, papers_dom={len(self.paper_dom_dict)})")
        
        
        # ***Μενει Να φτιαξω απο τα δικα μου clean authors
        with open(r"D:\NLP\kozel\embeddings\author_emb.pkl", "rb") as f:
            self.emb_auth = pickle.load(f)   # dict: paper_id → 32‐d vector
        print(f"INFO: Loaded SVD embeddings (abstracts={len(self.emb_abs)}, authors={len(self.emb_auth)})")
        # ——————————————————————————————————————————————————————————————
        
        # j) Build citation graph (unweighted)
        train_pos = self.df_pairs[(self.df_pairs.split=="train") & (self.df_pairs.label==1)]
        self.G = nx.DiGraph()
        self.G.add_nodes_from(
            pd.unique(self.df_pairs[["citing","cited"]].values.ravel())
        )
        for u,v in zip(train_pos['citing'], train_pos['cited']):
            self.G.add_edge(int(u), int(v))
        print(f"INFO: Built citation graph (nodes={self.G.number_of_nodes()}, edges={self.G.number_of_edges()})")
        
        # h) Build co-author graph (weighted by # coauthored papers)
        G_auth = nx.Graph()
        for authors in self.paper_to_authors.values():
            for a,b in combinations(authors,2):
                if G_auth.has_edge(a,b):
                    G_auth[a][b]['weight'] += 1
                else:
                    G_auth.add_edge(a,b,weight=1)
        self.G_auth = G_auth
        print(f"INFO: Built co-author graph (nodes={G_auth.number_of_nodes()}, edges={G_auth.number_of_edges()})")
        
        print("✅ Data loaded, graphs ready.")
        return self
    
    def _flush_feature(self, name, array):
        """Save feature array to disk and optionally free memory."""
        np.save(os.path.join(self.output_dir, f"{name}.npy"), array)
        if self.chunked:
            self.feat_dict.pop(name, None)
        else:
            self.feat_dict[name] = array
        print(f"    • Flushed feature '{name}' ({array.shape})")
    
    def compute_tfidf_similarity(self, batch_size=10000):
        """
        (Παλιό όνομα, αλλά πλέον κάνει cosine similarity
         στα 32-διάστατα LSA embeddings αντί για raw TF–IDF.)
        """
        n = len(self.df_pairs)
        print(f"INFO: Abstract-LSA sim ({n} pairs) est ~{n/self.its:.1f}s")
        t0 = time.time()
    
        # προσωρινός πίνακας για τα αποτελέσματα
        sim = np.zeros(n, dtype=float)
    
        # θα αντλήσουμε σειρές από self.emb_abs βάσει pid_to_idx
        D = self.emb_abs.shape[1]
        zero = np.zeros(D, dtype=float)
    
        u = self.df_pairs['citing'].to_numpy()
        v = self.df_pairs['cited'].to_numpy()
    
        # batch loop για να μην γεμίσουμε μνήμη
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            for i in range(start, end):
                pid_u, pid_v = u[i], v[i]
                idx_u = self.pid_to_idx.get(pid_u, -1)
                idx_v = self.pid_to_idx.get(pid_v, -1)
    
                eu = self.emb_abs[idx_u] if idx_u >= 0 else zero
                ev = self.emb_abs[idx_v] if idx_v >= 0 else zero
    
                num = float(np.dot(eu, ev))
                den = np.linalg.norm(eu) * np.linalg.norm(ev) + 1e-8
                sim[i] = num / den
    
        # και το flush όπως πριν
        self._flush_feature('tfidf_similarity', sim)
        print(f"INFO: Abstract-LSA sim done in {time.time()-t0:.1f}s")

    def compute_specter_similarity(self, batch_size=10000):
        """
        Compute three Specter-based features for each (citing,cited):
          • dot-product
          • cosine similarity
          • L1 distance
        Flushes all three into 'specter_feats.npz'.
        """
        n = len(self.df_pairs)
        print(f"INFO: Specter feats ({n} pairs) est ~{n/self.its:.1f}s")
        t0 = time.time()
    
        # prepare arrays
        dots      = np.zeros(n, dtype=float)
        cos_sims  = np.zeros(n, dtype=float)
        abs_diffs = np.zeros(n, dtype=float)
        specter_l2= np.zeros(n, dtype=float)
    
        # helper
        D     = self.specter_emb.shape[1]
        zero  = np.zeros(D, dtype=float)
        pid2i = self.pid_to_idx  # paper→row in emb array
    
        u = self.df_pairs['citing'].to_numpy()
        v = self.df_pairs['cited'].to_numpy()
    
        # batch-loop
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            for i in range(start, end):
                pid_u, pid_v = u[i], v[i]
                iu = pid2i.get(pid_u, -1)
                iv = pid2i.get(pid_v, -1)
                eu = self.specter_emb[iu] if iu >= 0 else zero
                ev = self.specter_emb[iv] if iv >= 0 else zero
    
                d = float(np.dot(eu, ev))
                dots[i] = d
                # cosine
                nu = np.linalg.norm(eu)
                nv = np.linalg.norm(ev)
                cos_sims[i] = d / (nu * nv + 1e-8)

                diff = eu - ev
                # L1
                abs_diffs[i] = float(np.sum(np.abs(diff)))
                # L2
                specter_l2[i] =float(np.sum(diff * diff))
    
        # flush all three at once
        out = os.path.join(self.output_dir, 'specter_feats.npz')
        np.savez(
            out,
            specter_dot       = dots,
            specter_cosine    = cos_sims,
            specter_l1        = abs_diffs,
            specter_l2        = specter_l2
        )
        print(f"INFO: Specter feats done in {time.time()-t0:.1f}s, flushed to {out}")


        
    # def compute_scibert_similarity(self):
    #     """Compute cosine similarity using SciBERT embeddings."""
    #     # To be implemented
    #     pass
    def compute_scibert_similarity(self, batch_size=100000):
        """Compute and flush SciBERT cosine similarity in batches to save memory."""
        n = len(self.df_pairs)
        est = n/self.its
        print(f"INFO: SciBERT sim: {n} pairs (~{est:.1f}s)")
        t0 = time.time()
        sim = np.zeros(n, float)
        # dimension of SciBERT embeddings
        D = len(next(iter(self.scibert_dict.values())))
        # arrays of ids
        u = self.df_pairs.citing.to_numpy(); v = self.df_pairs.cited.to_numpy()
        # process in batches
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            u_batch = u[start:end]; v_batch = v[start:end]
            # stack embeddings for this batch
            U = np.vstack([self.scibert_dict.get(pid, np.zeros(D)) for pid in u_batch])
            V = np.vstack([self.scibert_dict.get(pid, np.zeros(D)) for pid in v_batch])
            dots = np.einsum('ij,ij->i', U, V)
            nu = np.linalg.norm(U, axis=1); nv = np.linalg.norm(V, axis=1)
            sim[start:end] = dots / (nu * nv + 1e-8)
        # flush full feature
        self._flush_feature('scibert_similarity', sim)
        print(f"INFO: SciBERT done in {time.time()-t0:.1f}s")
    
    # def compute_bertopic_features(self):
    #     pass
    def compute_bertopic_features(self, batch_size=100000):
        """
        Compute BERTopic features for each (citing, cited) pair:
          - citing & cited dominant topic
          - same topic flag
          - citing & cited topic entropy
          - cosine similarity of full distributions
        All 6 features are stacked into one array of shape (n_pairs, 6) 
        and flushed as a single file.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: BERTopic feats: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        # 1. Load BERTopic table
        df_bt = pd.read_parquet(self.bertopic_path)
        dom_dict = dict(zip(df_bt.paper_id, df_bt.bertopic_dominant_topic))
        ent_dict = dict(zip(df_bt.paper_id, df_bt.bertopic_topic_entropy))
        dist_cols = [c for c in df_bt.columns if c.startswith("topic_dist_")]
        dist_mat = df_bt.set_index("paper_id")[dist_cols]

        # 2. Prepare id arrays
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        # 3. Initialize arrays
        citing_dom = np.array([dom_dict.get(pid, -1) for pid in u],   dtype=int)
        cited_dom  = np.array([dom_dict.get(pid, -1) for pid in v],   dtype=int)
        same_bt    = (citing_dom == cited_dom).astype(int)
        citing_ent = np.array([ent_dict.get(pid, 0.0) for pid in u],  dtype=float)
        cited_ent  = np.array([ent_dict.get(pid, 0.0) for pid in v],  dtype=float)
        cos_sim    = np.zeros(n, dtype=float)

        # 4. Cosine similarity in batches
        D = len(dist_cols)
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            u_batch = u[start:end]
            v_batch = v[start:end]

            U = np.vstack([
                dist_mat.loc[pid].to_numpy() if pid in dist_mat.index else np.zeros(D)
                for pid in u_batch
            ])
            V = np.vstack([
                dist_mat.loc[pid].to_numpy() if pid in dist_mat.index else np.zeros(D)
                for pid in v_batch
            ])

            dots = np.einsum('ij,ij->i', U, V)
            nu   = np.linalg.norm(U, axis=1)
            nv   = np.linalg.norm(V, axis=1)
            cos_sim[start:end] = dots / (nu * nv + 1e-8)

        # 5. Stack all 6 features into one 2D array (n_pairs × 6)
        all_feats = np.column_stack([
            citing_dom,
            cited_dom,
            same_bt,
            citing_ent,
            cited_ent,
            cos_sim
        ])

        # 6. Flush as a single file
        self._flush_feature('bertopic_features', all_feats)
        print(f"INFO: BERTopic done in {time.time() - t0:.1f}s")
    
    # def compute_lda_topics_features(self):
    #     """Compute similarity or distributions from LDA topics."""
    #     # To be implemented
    #     pass
    def compute_lda_topics_features(self, batch_size=100000):
        """
        Compute cosine similarity between LDA topic distributions for each (citing, cited) pair in batches.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: LDA topics sim: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        # 1. Prepare mapping from paper_id to topic vector
        #    assume self.lda_topics has columns: "paper_id" + one column per topic
        topic_cols = [c for c in self.lda_topics.columns if c != "paper_id"]
        lda_mat = self.lda_topics.set_index("paper_id")[topic_cols]
        
        # 2. Arrays of citing/cited IDs
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        # 3. Preallocate result
        sim = np.zeros(n, dtype=float)

        # 4. Process in batches
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            u_batch = u[start:end]
            v_batch = v[start:end]

            # stack vectors, default to zero-vector if missing
            U = np.vstack([
                lda_mat.loc[pid].to_numpy() if pid in lda_mat.index else np.zeros(len(topic_cols))
                for pid in u_batch
            ])
            V = np.vstack([
                lda_mat.loc[pid].to_numpy() if pid in lda_mat.index else np.zeros(len(topic_cols))
                for pid in v_batch
            ])

            # cosine similarity
            dots = np.einsum('ij,ij->i', U, V)
            nu = np.linalg.norm(U, axis=1)
            nv = np.linalg.norm(V, axis=1)
            sim[start:end] = dots / (nu * nv + 1e-8)

        # 5. Flush feature array
        self._flush_feature('lda_topics_similarity', sim)
        print(f"INFO: LDA topics done in {time.time()-t0:.1f}s")
    
    # def compute_author_graph_heuristics(self):
    #     pass
    def compute_author_graph_heuristics(self, batch_size=100000):
        """
        Compute author-graph heuristics in batches:
          - average common neighbors count per author-pair
          - average Adamic–Adar per author-pair
          - average Resource Allocation per author-pair
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: Author-graph heuristics: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        # Prepare ID arrays and result buffers
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()
        cn_arr = np.zeros(n, dtype=float)
        aa_arr = np.zeros(n, dtype=float)
        ra_arr = np.zeros(n, dtype=float)

        # Local references for speed
        G_auth = self.G_auth
        deg_auth = dict(G_auth.degree())

        # Process in batches
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            u_batch = u[start:end]
            v_batch = v[start:end]

            for i, (uid, vid) in enumerate(zip(u_batch, v_batch)):
                # Get author lists for each paper
                Au = self.paper_to_authors.get(uid, [])
                Av = self.paper_to_authors.get(vid, [])
                cn_vals, aa_vals, ra_vals = [], [], []

                # Compute per-author-pair metrics
                for a, b in product(Au, Av):
                    if G_auth.has_node(a) and G_auth.has_node(b):
                        common = list(nx.common_neighbors(G_auth, a, b))
                        if common:
                            cn_vals.append(len(common))
                            aa_vals.append(sum(1.0 / np.log(1 + deg_auth[z]) for z in common))
                            ra_vals.append(sum(1.0 / deg_auth[z] for z in common))

                # Aggregate (mean) or default to 0
                idx = start + i
                if cn_vals:
                    cn_arr[idx] = np.mean(cn_vals)
                    aa_arr[idx] = np.mean(aa_vals)
                    ra_arr[idx] = np.mean(ra_vals)

        # Flush to disk
        # Stack all three author-graph features into one array (n_pairs × 3)
        all_feats = np.column_stack([
            cn_arr,
            aa_arr,
            ra_arr
        ])
        # Flush as a single file
        self._flush_feature('author_graph_heuristics', all_feats)
        print(f"INFO: Author-graph heuristics done in {time.time() - t0:.1f}s")

    
    # def compute_embedding_features(self):
    #     pass
    def compute_embedding_features(self, batch_size=100000):
        """
        Compute embedding-based scalars in batches:
          - citation_node2vec_cosine, _dot, _l2
          - citation_walklets_cosine, _dot, _l2
          - author_node2vec_cosine, _dot, _l2
          - author_walklets_cosine, _dot, _l2
        Flushes all 12 arrays in a single .npz.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: Embedding feats: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        # Prepare id arrays
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        # Allocate buffers
        cnv_c_cos = np.zeros(n, dtype=float)
        cnv_c_dot = np.zeros(n, dtype=float)
        cnv_c_l2  = np.zeros(n, dtype=float)
        cnv_w_cos = np.zeros(n, dtype=float)
        cnv_w_dot = np.zeros(n, dtype=float)
        cnv_w_l2  = np.zeros(n, dtype=float)
        # Precompute author-level mean embeddings
        # assume self.author_node2vec & self.author_walklets exist
        # and self.paper_to_authors maps pid→list of a_ids
        def mean_emb(pid, emb_dict, dim):
            vs = [emb_dict[a] for a in self.paper_to_authors.get(pid, []) if a in emb_dict]
            return np.mean(vs, axis=0) if vs else np.zeros(dim, dtype=float)
        # determine dims
        d_cn = next(iter(self.citation_node2vec.values())).shape[0]
        d_aw = next(iter(self.citation_walklets.values())).shape[0]
        d_an = next(iter(self.author_node2vec.values())).shape[0]
        d_awl= next(iter(self.author_walklets.values())).shape[0]
        # process in batches
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            u_b, v_b = u[start:end], v[start:end]
            # citation-node2vec
            Cv = np.vstack([ self.citation_node2vec.get(pid, np.zeros(d_cn)) for pid in u_b ])
            Vv = np.vstack([ self.citation_node2vec.get(pid, np.zeros(d_cn)) for pid in v_b ])
            dot = np.einsum('ij,ij->i', Cv, Vv)
            nu  = np.linalg.norm(Cv, axis=1); nv = np.linalg.norm(Vv, axis=1)
            cnv_c_cos[start:end] = dot / (nu*nv + 1e-8)
            cnv_c_dot[start:end] = dot
            cnv_c_l2[start:end]  = np.linalg.norm(Cv - Vv, axis=1)
            # citation-walklets
            Cw = np.vstack([ self.citation_walklets.get(pid, np.zeros(d_aw)) for pid in u_b ])
            Wv = np.vstack([ self.citation_walklets.get(pid, np.zeros(d_aw)) for pid in v_b ])
            dot = np.einsum('ij,ij->i', Cw, Wv)
            nu  = np.linalg.norm(Cw, axis=1); nv = np.linalg.norm(Wv, axis=1)
            cnv_w_cos[start:end] = dot / (nu*nv + 1e-8)
            cnv_w_dot[start:end] = dot
            cnv_w_l2[start:end]  = np.linalg.norm(Cw - Wv, axis=1)
            # author-node2vec
            An = np.vstack([ mean_emb(pid, self.author_node2vec, d_an) for pid in u_b ])
            Bn = np.vstack([ mean_emb(pid, self.author_node2vec, d_an) for pid in v_b ])
            dot = np.einsum('ij,ij->i', An, Bn)
            nu  = np.linalg.norm(An, axis=1); nv = np.linalg.norm(Bn, axis=1)
            # reuse buffers names for author if desired, or separate
            # here stacking all into one npz with clear keys below
            # similarly for author-walklets
            Aw = np.vstack([ mean_emb(pid, self.author_walklets, d_awl) for pid in u_b ])
            Bw = np.vstack([ mean_emb(pid, self.author_walklets, d_awl) for pid in v_b ])
            dot_aw = np.einsum('ij,ij->i', Aw, Bw)
            nu_aw  = np.linalg.norm(Aw, axis=1); nv_aw = np.linalg.norm(Bw, axis=1)
            # store author embeddings
            if start == 0:
                an_cos = np.zeros(n, dtype=float)
                an_dot = np.zeros(n, dtype=float)
                an_l2  = np.zeros(n, dtype=float)
                aw_cos = np.zeros(n, dtype=float)
                aw_dot = np.zeros(n, dtype=float)
                aw_l2  = np.zeros(n, dtype=float)
            an_cos[start:end] = dot / (nu*nv + 1e-8)
            an_dot[start:end] = dot
            an_l2[start:end]  = np.linalg.norm(An - Bn, axis=1)
            aw_cos[start:end] = dot_aw / (nu_aw*nv_aw + 1e-8)
            aw_dot[start:end] = dot_aw
            aw_l2[start:end]  = np.linalg.norm(Aw - Bw, axis=1)

        # stack and flush all 12 features
        np.savez(
            os.path.join(self.output_dir, 'embedding_features.npz'),
            citation_node2vec_cosine       = cnv_c_cos,
            citation_node2vec_dot          = cnv_c_dot,
            citation_node2vec_l2           = cnv_c_l2,
            citation_walklets_cosine       = cnv_w_cos,
            citation_walklets_dot          = cnv_w_dot,
            citation_walklets_l2           = cnv_w_l2,
            author_node2vec_cosine         = an_cos,
            author_node2vec_dot            = an_dot,
            author_node2vec_l2             = an_l2,
            author_walklets_cosine         = aw_cos,
            author_walklets_dot            = aw_dot,
            author_walklets_l2             = aw_l2
        )
        print(f"INFO: Embedding feats done in {time.time()-t0:.1f}s")
    
    # def compute_coauthor_distance(self):
    #     pass
    def compute_coauthor_distance(self, batch_size=100000):
        """
        Compute co-author distance metrics in batches:
          - coauth_min_dist: minimum shortest-path between any author of citing & cited
          - coauth_mean_dist: average such shortest-path
          - coauth_max_dist: maximum such shortest-path
          - coauth_inv_min: 1 / (min_dist + 1)
          - coauth_close_bin: binary flag if min_dist <= 2
        Flushes all five arrays as a single .npz.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: Co-author distance: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        # Prepare id arrays and result buffers
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()
        min_arr  = np.zeros(n, dtype=float)
        mean_arr = np.zeros(n, dtype=float)
        max_arr  = np.zeros(n, dtype=float)

        # Maximum distance if no path exists
        max_dist = self.G_auth.number_of_nodes()

        # Process in batches
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            u_batch = u[start:end]
            v_batch = v[start:end]

            for i, (uid, vid) in enumerate(zip(u_batch, v_batch)):
                Au = self.paper_to_authors.get(uid, [])
                Av = self.paper_to_authors.get(vid, [])
                dists = []
                for a in Au:
                    for b in Av:
                        if self.G_auth.has_node(a) and self.G_auth.has_node(b):
                            try:
                                dists.append(nx.shortest_path_length(self.G_auth, a, b))
                            except nx.NetworkXNoPath:
                                dists.append(max_dist)
                idx = start + i
                if dists:
                    min_arr[idx]  = min(dists)
                    mean_arr[idx] = sum(dists) / len(dists)
                    max_arr[idx]  = max(dists)
                else:
                    min_arr[idx] = mean_arr[idx] = max_arr[idx] = max_dist

        # Derived metrics
        inv_min  = 1.0 / (min_arr + 1.0)
        close_bin = (min_arr <= 2).astype(int)

        # Flush all features together
        out_path = os.path.join(self.output_dir, 'coauthor_distance.npz')
        np.savez(
            out_path,
            coauth_min_dist  = min_arr,
            coauth_mean_dist = mean_arr,
            coauth_max_dist  = max_arr,
            coauth_inv_min   = inv_min,
            coauth_close_bin = close_bin
        )
        print(f"INFO: Co-author distance done in {time.time() - t0:.1f}s, flushed to {out_path}")

    def compute_author_overlap_jaccard(self, batch_size=100000):
        """
        Compute author overlap and Jaccard coefficient for each pair:
          - author_overlap: count of common authors
          - jaccard_authors: |intersection| / |union|
        Results saved together in 'author_overlap_jaccard.npz'.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: Author overlap/Jaccard: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        # Precompute author sets
        auth_sets = {pid: set(auths) for pid, auths in self.paper_to_authors.items()}

        # Prepare buffers
        overlap = np.zeros(n, dtype=int)
        jaccard = np.zeros(n, dtype=float)

        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        # Batch processing
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            for i, (uid, vid) in enumerate(zip(u[start:end], v[start:end])):
                Au = auth_sets.get(uid, set())
                Av = auth_sets.get(vid, set())
                inter = Au & Av
                uni   = Au | Av
                idx = start + i
                overlap[idx] = len(inter)
                jaccard[idx] = len(inter) / (len(uni) + 1e-8)

        # Flush both features in one .npz
        out_path = os.path.join(self.output_dir, 'author_overlap_jaccard.npz')
        np.savez(
            out_path,
            author_overlap   = overlap,
            jaccard_authors  = jaccard
        )
        print(f"INFO: Author overlap/Jaccard done in {time.time() - t0:.1f}s, flushed to {out_path}")


    def compute_author_aggregate_stats(self):
        """
        Compute author-level aggregate stats for each pair:
          - citing_author_mean_pagerank, cited_author_mean_pagerank
          - citing_author_max_pagerank,  cited_author_max_pagerank
          - citing_author_mean_degree,   cited_author_mean_degree
        Results saved together in 'author_aggregate_stats.npz'.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: Author agg stats: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        # Compute pagerank and degree on co-author graph
        auth_pr  = nx.pagerank(self.G_auth, weight='weight')
        auth_deg = dict(self.G_auth.degree())

        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        mean_pr_u = np.zeros(n, dtype=float)
        mean_pr_v = np.zeros(n, dtype=float)
        max_pr_u  = np.zeros(n, dtype=float)
        max_pr_v  = np.zeros(n, dtype=float)
        mean_deg_u = np.zeros(n, dtype=float)
        mean_deg_v = np.zeros(n, dtype=float)

        for idx, (uid, vid) in enumerate(zip(u, v)):
            Au = self.paper_to_authors.get(uid, [])
            Av = self.paper_to_authors.get(vid, [])
            # pagerank stats
            prs_u = [auth_pr.get(a, 0.0) for a in Au]
            prs_v = [auth_pr.get(a, 0.0) for a in Av]
            if prs_u:
                mean_pr_u[idx] = sum(prs_u) / len(prs_u)
                max_pr_u[idx]  = max(prs_u)
            if prs_v:
                mean_pr_v[idx] = sum(prs_v) / len(prs_v)
                max_pr_v[idx]  = max(prs_v)
            # degree stats
            degs_u = [auth_deg.get(a, 0) for a in Au]
            degs_v = [auth_deg.get(a, 0) for a in Av]
            if degs_u:
                mean_deg_u[idx] = sum(degs_u) / len(degs_u)
            if degs_v:
                mean_deg_v[idx] = sum(degs_v) / len(degs_v)

        # Flush all six stats together
        out_path = os.path.join(self.output_dir, 'author_aggregate_stats.npz')
        np.savez(
            out_path,
            citing_author_mean_pagerank = mean_pr_u,
            cited_author_mean_pagerank  = mean_pr_v,
            citing_author_max_pagerank  = max_pr_u,
            cited_author_max_pagerank   = max_pr_v,
            citing_author_mean_degree   = mean_deg_u,
            cited_author_mean_degree    = mean_deg_v
        )
        print(f"INFO: Author agg stats done in {time.time() - t0:.1f}s, flushed to {out_path}")
    def compute_node_level_metrics(self):
        """
        Compute node‐level graph features for each (citing, cited) pair:
          - citing_in_degree, citing_out_degree, citing_degree
          - cited_in_degree,  cited_out_degree,  cited_degree
          - citing_pagerank,   cited_pagerank
          - citing_triangles,  cited_triangles
          - citing_clustering, cited_clustering
          - citing_core,       cited_core
          - citing_onion,      cited_onion
          - citing_eigen,      cited_eigen
          - common_neighbors (undirected)
        Flush all 21 arrays as one .npz.
        """
        n = len(self.df_pairs)
        print(f"INFO: Node‐level metrics: {n} pairs (~{n/self.its:.1f}s)")
        t0 = time.time()

        # Build undirected version
        und = self.G.to_undirected()

        # Compute raw dicts
        in_deg   = dict(self.G.in_degree())
        out_deg  = dict(self.G.out_degree())
        deg      = dict(self.G.degree())
        pr       = nx.pagerank(self.G, weight=None)
        tri      = nx.triangles(und)
        clust    = nx.clustering(und, weight=None)
        core     = nx.core_number(und)
        onion    = nx.onion_layers(und)
        eig      = nx.eigenvector_centrality(self.G, max_iter=500)

        # Prepare index arrays
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        # Map into arrays
        ci_deg = np.array([in_deg.get(x,0) for x in u])
        co_deg = np.array([out_deg.get(x,0) for x in u])
        ct_deg = np.array([deg.get(x,0) for x in u])
        di_deg = np.array([in_deg.get(x,0) for x in v])
        do_deg = np.array([out_deg.get(x,0) for x in v])
        dt_deg = np.array([deg.get(x,0) for x in v])

        ci_pr = np.array([pr.get(x,0.0) for x in u])
        co_pr = np.array([pr.get(x,0.0) for x in v])

        ci_tri = np.array([tri.get(x,0) for x in u])
        co_tri = np.array([tri.get(x,0) for x in v])

        ci_cl = np.array([clust.get(x,0.0) for x in u])
        co_cl = np.array([clust.get(x,0.0) for x in v])

        ci_co = np.array([core.get(x,0) for x in u])
        co_co = np.array([core.get(x,0) for x in v])

        ci_on = np.array([onion.get(x,0) for x in u])
        co_on = np.array([onion.get(x,0) for x in v])

        ci_ei = np.array([eig.get(x,0.0) for x in u])
        co_ei = np.array([eig.get(x,0.0) for x in v])

        # Common neighbors via adjacency-squared
        nodes = list(und.nodes())
        idx_map = {node:i for i,node in enumerate(nodes)}
        A = nx.to_scipy_sparse_matrix(und, nodes, format='csr')
        A2 = A.dot(A)
        ui = [idx_map[x] for x in u]
        vi = [idx_map[x] for x in v]
        cn = np.array(A2[ui, vi]).ravel()

        # Stack and flush
        np.savez(
            os.path.join(self.output_dir, 'node_level_metrics.npz'),
            citing_in_degree       = ci_deg,
            citing_out_degree      = co_deg,
            citing_degree          = ct_deg,
            cited_in_degree        = di_deg,
            cited_out_degree       = do_deg,
            cited_degree           = dt_deg,
            citing_pagerank        = ci_pr,
            cited_pagerank         = co_pr,
            citing_triangles       = ci_tri,
            cited_triangles        = co_tri,
            citing_clustering      = ci_cl,
            cited_clustering       = co_cl,
            citing_core_number     = ci_co,
            cited_core_number      = co_co,
            citing_onion_number    = ci_on,
            cited_onion_number     = co_on,
            citing_eigenvector     = ci_ei,
            cited_eigenvector      = co_ei,
            common_neighbors       = cn
        )
        print(f"INFO: Node‐level done in {time.time()-t0:.1f}s")


    def compute_pair_heuristics(self, batch_size=100000):
        """
        Compute pair‐level heuristics on the citation graph:
          - citation_jaccard, salton, hub_depressed, adamic_adar
          - preferential_attachment, resource_allocation
          - directed_shortest_path (–1 if none)
        Flush all 7 arrays in 'pair_heuristics.npz'.
        """
        n = len(self.df_pairs)
        print(f"INFO: Pair heuristics: {n} pairs (~{n/self.its:.1f}s)")
        t0 = time.time()

        und = self.G.to_undirected()
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        # Precompute deg and common_neighbors via A2
        deg_dict = dict(self.G.degree())
        ui = u; vi = v
        nodes = list(und.nodes())
        idx_map = {node:i for i,node in enumerate(nodes)}
        A = nx.to_scipy_sparse_matrix(und, nodes, format='csr')
        A2 = A.dot(A)
        cn = np.array([A2[idx_map[x], idx_map[y]] for x,y in zip(u,v)])

        # Buffers
        jacc = cn / (np.array([deg_dict.get(x,0) + deg_dict.get(y,0) - c for x,y,c in zip(u,v,cn)]) + 1e-8)
        sal   = cn / np.sqrt(np.array([deg_dict.get(x,0)*deg_dict.get(y,0) for x,y in zip(u,v)]) + 1e-8)
        hub   = cn / (np.maximum([deg_dict.get(x,0) for x in u],[deg_dict.get(y,0) for y in v]) + 1e-8)
        # Adamic–Adar & RA
        aa = np.zeros(n, dtype=float)
        ra = np.zeros(n, dtype=float)
        for i,(x,y) in enumerate(zip(u,v)):
            aa[i] = sum(1.0/np.log(1+und.degree(z)) for z in nx.common_neighbors(und, x, y))
            ra[i] = sum(1.0/und.degree(z)        for z in nx.common_neighbors(und, x, y))
        # Preferential attachment
        pa = np.array([self.G.out_degree(x)*self.G.in_degree(y) for x,y in zip(u,v)])
        # Directed shortest paths (batch)
        dsp = np.full(n, -1, dtype=int)
        for start in range(0, n, batch_size):
            end = min(start+batch_size, n)
            for i,(x,y) in enumerate(zip(u[start:end],v[start:end])):
                try:
                    dsp[start+i] = nx.shortest_path_length(self.G, x, y)
                except nx.NetworkXNoPath:
                    dsp[start+i] = -1

        

        # Flush
        np.savez(
            os.path.join(self.output_dir,'pair_heuristics.npz'),
            citation_G_jaccard              = jacc,
            citation_G_salton               = sal,
            citation_G_hub_depressed        = hub,
            citation_G_adamic_adar          = aa,
            citation_G_resource_allocation  = ra,
            citation_G_preferential_attachment = pa,
            citation_G_sp_directed          = dsp
        )
        print(f"INFO: Pair heuristics done in {time.time()-t0:.1f}s")
    

    def compute_co_citation_bibliographic(self):
        """
        Compute co-citation and bibliographic coupling counts:
          - co_citation: |predecessors(u) ∩ predecessors(v)|
          - bibliographic_coupling: |successors(u) ∩ successors(v)|
        Flushes both arrays in 'cocite_biblio.npz'.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: Co-citation & bibliographic: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        G = self.G
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        co_cite = np.zeros(n, dtype=int)
        biblio  = np.zeros(n, dtype=int)

        for i, (uid, vid) in enumerate(zip(u, v)):
            preds_u = set(G.predecessors(uid))
            preds_v = set(G.predecessors(vid))
            co_cite[i] = len(preds_u & preds_v)

            succs_u = set(G.successors(uid))
            succs_v = set(G.successors(vid))
            biblio[i] = len(succs_u & succs_v)

        out_path = os.path.join(self.output_dir, 'cocite_biblio.npz')
        np.savez(
            out_path,
            co_citation              = co_cite,
            bibliographic_coupling   = biblio
        )
        print(f"INFO: Co-citation & biblio done in {time.time()-t0:.1f}s, flushed to {out_path}")

    def compute_path_based_scores(self, beta=0.005, epsilon=0.001):
        """
        Katz (χωρίς direct‐term), Local‐Path, και approx directed‐SP:
          • katz_index = β²·A2 + β³·A3
          • local_path = A2 + ε·A3
          • sp_directed = 2 if A2>0, 3 if A3>0, else -1
        """
        import scipy.sparse as sps
    
        n = len(self.df_pairs)
        print(f"INFO: Path-based scores: {n} pairs")
        t0 = time.time()
    
        # 1) adjacency powers on undirected G
        und = self.G.to_undirected()
        nodes = list(und.nodes())
        idx = {node:i for i,node in enumerate(nodes)}
        A  = nx.to_scipy_sparse_matrix(und, nodes, format='csr')
        A2 = A.dot(A)
        A3 = A2.dot(A)
    
        # 2) prepare output arrays
        katz = np.zeros(n, dtype=float)
        lp   = np.zeros(n, dtype=float)
        spd  = np.full(n, -1,   dtype=int)
    
        u_ids = self.df_pairs['citing'].to_numpy()
        v_ids = self.df_pairs['cited'].to_numpy()
    
        # 3) extract per-pair features
        for i,(u,v) in enumerate(zip(u_ids, v_ids)):
            ui, vi = idx[u], idx[v]
            # κλασικό Katz χωρίς direct term
            katz[i] = (beta**2) * A2[ui,vi] + (beta**3) * A3[ui,vi]
            # Local-Path
            lp[i]   = A2[ui,vi] + epsilon * A3[ui,vi]
            # approx directed‐SP
            if A2[ui,vi] > 0:
                spd[i] = 2
            elif A3[ui,vi] > 0:
                spd[i] = 3
            else:
                spd[i] = -1
    
        # 4) flush all together
        out = os.path.join(self.output_dir, 'path_based_scores.npz')
        np.savez(
            out,
            katz_index = katz,
            local_path = lp,
            citation_G_sp_directed = spd
        )
        print(f"INFO: Path-based done in {time.time()-t0:.1f}s → {out}")


    def compute_community_features(self):
        """
        3. Community‐Based Features on citation graph:
          • same_community: 1 if both in same Louvain community
          • comm_size_ratio: |C_u| / |C_v|
        Flushes both arrays in 'community_features.npz'.
        """
        print("INFO: Detecting communities (greedy modularity)...")
        t0 = time.time()
        und = self.G.to_undirected()
        comms = nx.algorithms.community.greedy_modularity_communities(und)
        comm_map = {node:cid for cid,comm in enumerate(comms) for node in comm}
        sizes = {cid: len(comm) for cid,comm in enumerate(comms)}

        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()
        n = len(u)
        same  = np.zeros(n, dtype=int)
        ratio = np.zeros(n, dtype=float)

        for i,(x,y) in enumerate(zip(u,v)):
            cx, cy = comm_map.get(x,-1), comm_map.get(y,-1)
            same[i] = int(cx==cy and cx!=-1)
            if cx in sizes and cy in sizes and sizes[cy]>0:
                ratio[i] = sizes[cx] / sizes[cy]

        out = os.path.join(self.output_dir, 'community_features.npz')
        np.savez(out, same_community=same, comm_size_ratio=ratio)
        print(f"INFO: Community done in {time.time()-t0:.1f}s, flushed to {out}")


    def compute_motif_counts(self):
        """
        4. Higher‐Order Motif Counts:
          • triangles_through_edge = A2[x,y]//2
          • cycles4 ≈ number of paths length‐3 = A3[x,y]
        Flushes both arrays in 'motif_counts.npz'.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: Motif counts: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        und = self.G.to_undirected()
        nodes = list(und.nodes()); idx = {node:i for i,node in enumerate(nodes)}
        A  = nx.to_scipy_sparse_matrix(und, nodes, format='csr')
        A2 = A.dot(A); A3 = A2.dot(A)

        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()
        tri  = np.zeros(n, dtype=int)
        cyc4 = np.zeros(n, dtype=float)

        for i,(x,y) in enumerate(zip(u,v)):
            xi, yi = idx[x], idx[y]
            tri[i]  = int(A2[xi,yi] // 2)
            cyc4[i] = A3[xi,yi]

        out = os.path.join(self.output_dir, 'motif_counts.npz')
        np.savez(out, triangles=tri, cycles4=cyc4)
        print(f"INFO: Motifs done in {time.time()-t0:.1f}s, flushed to {out}")

   
    def extract_keywords_from_abstracts(self, abstracts_path, top_k=10, max_features=5000):
        """
        5a. Extract top-k keywords per paper from abstracts via TF–IDF.
        Stores self.paper_keywords: {paper_id: [kw1,…,kw_topk]}.
        """
        print("INFO: Extracting keywords via TF–IDF from abstracts...")
        t0 = time.time()
        # load abstracts parquet with columns ['paper_id','abstract']
        df_abs = pd.read_parquet(abstracts_path)
        texts  = df_abs['abstract'].fillna("").tolist()
        pids   = df_abs['paper_id'].tolist()

        from sklearn.feature_extraction.text import TfidfVectorizer
        vec = TfidfVectorizer(
            max_features=max_features,
            stop_words='english'
        )
        X = vec.fit_transform(texts)  # shape = (n_papers, max_features)
        features = vec.get_feature_names_out()

        self.paper_keywords = {}
        for i, pid in enumerate(pids):
            row = X[i].tocoo()
            if row.nnz:
                top_idx = np.argsort(row.data)[-top_k:]
                kws = [features[row.col[j]] for j in top_idx]
            else:
                kws = []
            self.paper_keywords[pid] = kws

        print(f"INFO: Keywords extracted for {len(pids)} papers in {time.time()-t0:.1f}s")


    def compute_content_overlap(self):
        """
        5b. Content Overlap Beyond Abstracts:
          • title_jaccard: (if titles exist)
          • keyword_jaccard: Jaccard of extracted keywords
        Requires run of extract_keywords_from_abstracts(...) first.
        Flushes 'content_overlap.npz'.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: Content overlap: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        # assume self.paper_keywords exists
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()
        kw_j = np.zeros(n, dtype=float)

        for i,(x,y) in enumerate(zip(u,v)):
            Au = set(self.paper_keywords.get(x, []))
            Av = set(self.paper_keywords.get(y, []))
            inter = Au & Av
            uni   = Au | Av
            kw_j[i] = len(inter) / (len(uni) + 1e-8)

        out = os.path.join(self.output_dir, 'content_overlap.npz')
        np.savez(out, keyword_jaccard=kw_j)
        print(f"INFO: Content done in {time.time()-t0:.1f}s, flushed to {out}")

    def compute_author_domain_similarity(self, batch_size=100000):
        """
        Compute author-domain similarity features:
          • max_citing_topic_author_dom_cosine
          • mean_citing_topic_author_dom_cosine
          • min_citing_topic_author_dom_cosine
          • max_cited_topic_author_dom_cosine
          • mean_cited_topic_author_dom_cosine
          • min_cited_topic_author_dom_cosine
        Flushes all 6 arrays in 'author_domain_similarity.npz'.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: Author-domain sim: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        # prepare default zero‐vector for missing topics
        D_topic = next(iter(self.topic_dist_arr.values())).shape[0]
        zero_vec = np.zeros(D_topic, dtype=float)

        # result buffers
        max_u = np.zeros(n, float)
        mean_u = np.zeros(n, float)
        min_u = np.zeros(n, float)
        max_v = np.zeros(n, float)
        mean_v = np.zeros(n, float)
        min_v = np.zeros(n, float)

        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        # batch over pairs
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            for i, (uid, vid) in enumerate(zip(u[start:end], v[start:end])):
                idx = start + i

                # domain vector for cited (used by citing→authors)
                dom_v = self.topic_dist_arr.get(vid, zero_vec)
                sims_u = []
                for a in self.paper_to_authors.get(uid, []):
                    vec_a = self.auth_topic_dict.get(a)
                    if vec_a is not None:
                        sims_u.append(
                            float(
                                np.dot(vec_a, dom_v) /
                                (np.linalg.norm(vec_a) * np.linalg.norm(dom_v) + 1e-8)
                            )
                        )
                if sims_u:
                    max_u[idx], mean_u[idx], min_u[idx] = max(sims_u), np.mean(sims_u), min(sims_u)

                # domain vector for citing (used by cited→authors)
                dom_u = self.topic_dist_arr.get(uid, zero_vec)
                sims_v = []
                for a in self.paper_to_authors.get(vid, []):
                    vec_a = self.auth_topic_dict.get(a)
                    if vec_a is not None:
                        sims_v.append(
                            float(
                                np.dot(vec_a, dom_u) /
                                (np.linalg.norm(vec_a) * np.linalg.norm(dom_u) + 1e-8)
                            )
                        )
                if sims_v:
                    max_v[idx], mean_v[idx], min_v[idx] = max(sims_v), np.mean(sims_v), min(sims_v)

        # flush to disk
        out = os.path.join(self.output_dir, 'author_domain_similarity.npz')
        np.savez(
            out,
            max_citing_topic_author_dom_cosine   = max_u,
            mean_citing_topic_author_dom_cosine  = mean_u,
            min_citing_topic_author_dom_cosine   = min_u,
            max_cited_topic_author_dom_cosine    = max_v,
            mean_cited_topic_author_dom_cosine   = mean_v,
            min_cited_topic_author_dom_cosine    = min_v
        )
        print(f"INFO: Author-domain done in {time.time()-t0:.1f}s, flushed to {out}")
        
    def compute_paper_author_domain_similarity(self):
        """
        Compute paper‐level author‐domain similarity:
          • paper_total_auth_dom_cosine_citing_vs_cited
          • paper_total_auth_dom_dot_citing_vs_cited
          • paper_total_auth_dom_l2_citing_vs_cited
        Uses self.paper_dom_dict.
        Flushes 3 arrays in 'paper_author_domain.npz'.
        """
        n = len(self.df_pairs)
        print(f"INFO: Paper–author‐domain sim: {n} pairs (~{n/self.its:.1f}s)")
        t0 = time.time()

        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        # build matrices
        D = next(iter(self.paper_dom_dict.values())).shape[0]
        U = np.vstack([ self.paper_dom_dict.get(pid, np.zeros(D)) for pid in u ])
        V = np.vstack([ self.paper_dom_dict.get(pid, np.zeros(D)) for pid in v ])

        dot = np.einsum('ij,ij->i', U, V)
        nu  = np.linalg.norm(U, axis=1)
        nv  = np.linalg.norm(V, axis=1)
        cos = dot / (nu * nv + 1e-8)
        l2  = np.linalg.norm(U - V, axis=1)

        out = os.path.join(self.output_dir, 'paper_author_domain.npz')
        np.savez(
            out,
            paper_total_auth_dom_cosine_citing_vs_cited = cos,
            paper_total_auth_dom_dot_citing_vs_cited    = dot,
            paper_total_auth_dom_l2_citing_vs_cited     = l2
        )
        print(f"INFO: Paper–author‐domain done in {time.time()-t0:.1f}s, flushed to {out}")
        

    def compute_abstract_author_svd_similarity(self):
        n = len(self.df_pairs)
        dot_abs = np.zeros(n); cos_abs = np.zeros(n)
        dot_auth= np.zeros(n); cos_auth= np.zeros(n)
    
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()
    
        for i,(uid,vid) in enumerate(zip(u,v)):
            idx_u = self.pid_to_idx.get(uid, -1)
            idx_v = self.pid_to_idx.get(vid, -1)
            eu = self.emb_abs[idx_u] if idx_u>=0 else np.zeros(self.emb_abs.shape[1])
            ev = self.emb_abs[idx_v] if idx_v>=0 else np.zeros(self.emb_abs.shape[1])
            da = eu.dot(ev)
            na = np.linalg.norm(eu); nb = np.linalg.norm(ev)
            dot_abs[i] = da
            cos_abs[i] = da/(na*nb+1e-8)
    
            # αν έχεις emb_auth
            au = self.emb_auth.get(uid, np.zeros_like(eu))
            av = self.emb_auth.get(vid, np.zeros_like(eu))
            du = au.dot(av)
            nu = np.linalg.norm(au); nv = np.linalg.norm(av)
            dot_auth[i] = du
            cos_auth[i] = du/(nu*nv+1e-8)
    
        out = os.path.join(self.output_dir,'svd_text_author.npz')
        np.savez(out,
                 cosine_abs_svd_koz  = cos_abs,
                 cosine_auth_svd_koz = cos_auth)
        print(f"INFO: SVD text/author done.")

        
    
    def compute_rooted_pagerank(self, alpha=0.85, max_iter=100, tol=1e-6):
        """
        Compute Personalized (Rooted) PageRank score from each citing u to cited v,
        grouping by u to avoid OOM and redundant computation:
          • For each unique u, run PageRank on self.G with teleport vector focused on u
          • Record the PageRank score at v for every (u,v) pair
        Flushes array 'rooted_pagerank_score.npy'.
        """
        n   = len(self.df_pairs)
        # Estimate time assuming each PR is ~10× heavier than a single it/sec
        est = n / (self.its / 10)
        print(f"INFO: Rooted PageRank: {n} pairs est ~{est:.1f}s")
        t0  = time.time()

        # Prepare output buffer
        scores = np.zeros(n, dtype=float)

        # Group indices by citing‐paper u
        from collections import defaultdict
        idxs_by_u = defaultdict(list)
        for idx, (u, v) in enumerate(zip(self.df_pairs.citing, self.df_pairs.cited)):
            idxs_by_u[u].append((idx, v))

        # Compute PageRank once per unique u, assign and free memory immediately
        for u, lst in idxs_by_u.items():
            # Build personalization vector: teleport only to u
            pers = {node: 0.0 for node in self.G.nodes()}
            pers[u] = 1.0
            pr_u = nx.pagerank(
                G=self.G,
                alpha=alpha,
                personalization=pers,
                max_iter=max_iter,
                tol=tol,
                weight=None
            )
            # Assign scores for all (idx, v) belonging to this u
            for idx, v in lst:
                scores[idx] = pr_u.get(v, 0.0)
            # Free the PageRank vector before next u
            del pr_u

        # Flush to disk
        out = os.path.join(self.output_dir, 'rooted_pagerank_score.npy')
        np.save(out, scores)
        print(f"INFO: Rooted PageRank done in {time.time()-t0:.1f}s, flushed to {out}")
        
    def compute_motif_and_path_features(self, beta=0.005, epsilon=0.001, batch_size=100_000):
        """
        Memory‐safe, chunked computation of:
          • katz_index            = β²·A² + β³·A³
          • local_path            = A² + ε·A³
          • triangles_through_edge= A²[u,v] // 2
          • cycles4               = A³[u,v]
          • citation_G_sp_directed= 2 if A²>0, 3 if A³>0, else -1
    
        Uses np.memmap to avoid holding all features in RAM,
        and processes the pair list in chunks.
        """
        import os
        import time
        import numpy as np
        from numpy.lib.format import open_memmap
    
        n = len(self.df_pairs)
        print(f"INFO: Motif+Path feats (chunked): {n} pairs")
        t0 = time.time()
    
        # — Build undirected adjacency and its powers once —
        und = self.G.to_undirected()
        nodes = list(und.nodes())
        idx_map = {node: i for i, node in enumerate(nodes)}
        A  = nx.to_scipy_sparse_matrix(und, nodes, format='csr')
        A2 = A.dot(A)
        A3 = A2.dot(A)
    
        # — Prepare on‐disk arrays via memmap —
        out_dir = self.output_dir
        katz_mmap = open_memmap(os.path.join(out_dir, 'katz_index_h2.npy'),
                            mode='w+', dtype='float32', shape=(n,))
        lp_mmap   = open_memmap(os.path.join(out_dir, 'local_path_h2.npy'),
                                mode='w+', dtype='float32', shape=(n,))
        tri_mmap  = open_memmap(os.path.join(out_dir, 'triangles_through_edge_h2.npy'),
                                mode='w+', dtype='int32',   shape=(n,))
        cyc4_mmap = open_memmap(os.path.join(out_dir, 'cycles4_h2.npy'),
                                mode='w+', dtype='int32',   shape=(n,))
        spd_mmap  = open_memmap(os.path.join(out_dir, 'citation_G_sp_directed_h2.npy'),
                                mode='w+', dtype='int8',    shape=(n,))
      
        
    
        # — Load u/v arrays once —
        u_arr = self.df_pairs['citing'].to_numpy()
        v_arr = self.df_pairs['cited'].to_numpy()
    
        # — Process in chunks —
        for start in range(0, n, batch_size):
            end = min(n, start + batch_size)
            for i in range(start, end):
                u, v = u_arr[i], v_arr[i]
                ui, vi = idx_map[u], idx_map[v]
                a2 = A2[ui, vi]
                a3 = A3[ui, vi]
    
                katz_mmap[i] = (beta**2) * a2 + (beta**3) * a3
                lp_mmap[i]   = a2 + epsilon * a3
                tri_mmap[i]  = int(a2 // 2)
                cyc4_mmap[i] = int(a3)
                if a2 > 0:
                    spd_mmap[i] = 2
                elif a3 > 0:
                    spd_mmap[i] = 3
                # else leaves -1
    
            print(f"  • Processed rows {start}–{end} in {time.time()-t0:.1f}s")
    
        # — flush & cleanup memmaps —
        del katz_mmap, lp_mmap, tri_mmap, cyc4_mmap, spd_mmap

        print(f"INFO: Motif+Path done in {time.time()-t0:.1f}s ")


    
    
    def compute_katz_centrality(self, beta=0.005, max_iter=1000, tol=1e-6):
        """
        Υπολογίζει Katz centrality (node‐level) με networkx.katz_centrality
        για μεγάλο, αραιό γράφο self.G και αποθηκεύει ως NumPy array
        το χαρακτηριστικό katz_centrality.npy για κάθε (citing, cited) ζεύγος.
        Επιστρέφει το array.
        
        Παράμετροι:
          - beta:     ο συντελεστής ύφεσης (alpha στο NetworkX)
          - max_iter: μέγιστος αριθμός επαναλήψεων
          - tol:      ανοχή σύγκλισης
        """
        # 1) Katz centrality ανά κόμβο (iterative, sparse-friendly)
        katz_scores = nx.katz_centrality(
            self.G,
            alpha=beta,
            beta=1.0,
            max_iter=max_iter,
            tol=tol,
            normalized=True
        )

        # 2) Δημιουργία array κεντρικοτήτων για κάθε ζεύγος (citing)
        n = len(self.df_pairs)
        katz_centrality = np.zeros(n, dtype=float)
        for i, u in enumerate(self.df_pairs['citing'].astype(int)):
            katz_centrality[i] = katz_scores.get(u, 0.0)

        # 3) Αποθήκευση σε .npy
        out_path = os.path.join(self.output_dir, 'katz_centrality.npy')
        np.save(out_path, katz_centrality)
        print(f"INFO: Saved katz_centrality to {out_path}")

        return katz_centrality

    ############################################################## EMBEDDINGS FEATURES #########################################################
    def compute_specter_hadamard(self):
        """
        Για κάθε (citing, cited) ζεύγος στο train split υπολογίζει
        το element-wise Hadamard product των Specter embeddings
        και το αποθηκεύει σε NumPy array shape = (n_train_pairs, D).
        Επιστρέφει το array.
        """
        # 1) Φόρτωση Specter embeddings
        embeds = np.load(self.abstracts_emb_path)  # shape = [n_papers, D]
        D = embeds.shape[1]

        # 2) Επιλογή μόνο των train ζευγών
        train_df = self.df_pairs[self.df_pairs['split'] == 'train']
        u_ids = train_df['citing'].astype(int).to_numpy()
        v_ids = train_df['cited'].astype(int).to_numpy()
        n = len(train_df)

        # 3) Πρίζουμε πίνακα για τα Hadamard features
        hadamard = np.zeros((n, D), dtype=float)

        # 4) Υπολογισμός element-wise product για κάθε ζεύγος
        for i, (u, v) in enumerate(zip(u_ids, v_ids)):
            hadamard[i, :] = embeds[u] * embeds[v]

        # 5) Αποθήκευση σε .npy
        out_path = os.path.join(self.output_dir, 'specter_hadamard.npy')
        np.save(out_path, hadamard)
        print(f"INFO: Saved Specter Hadamard features to {out_path} (shape={hadamard.shape})")

        return hadamard

        
    def extract_all(self):
        """
        Run all feature computation methods in sequence,
        assemble into DataFrame, and return.
        """
        print("Starting feature extraction...")
        t0 = time.time()
        
        # sequence of feature methods
        self.compute_tfidf_similarity()
        self.compute_specter_similarity()
        self.compute_scibert_similarity()
        self.compute_bertopic_features()
        self.compute_lda_topics_features()
        self.compute_author_graph_heuristics()
        self.compute_embedding_features()
        self.compute_coauthor_distance()
        self.compute_temporal_features()
        
        # assemble DataFrame
        df_feat = pd.DataFrame(self.feat_dict, index=self.df_pairs.index)
        result = pd.concat([self.df_pairs.reset_index(drop=True), df_feat], axis=1)
        
        print(f"✅ All features extracted in {time.time() - t0:.2f}s")
        return result
        
       

In [11]:
# Example usage
if __name__ == "__main__":
    base_path = "D:/NLP/tfidf_xgboost"
    output_dir = "D:/NLP/Features_XL/train"
    fe = FeatureExtractor(base_path, output_dir, chunked=True, its=200000)
    fe.load_data()  # loads both train/val or test as needed
   # fe.compute_tfidf_similarity()
    #fe.compute_specter_similarity()
    #fe.compute_scibert_similarity()
    #fe.compute_bertopic_features()
    #fe.compute_lda_topics_features()
    #fe.compute_author_graph_heuristics()
    #fe.compute_coauthor_distance()
    #fe.compute_author_overlap_jaccard()
    #fe.compute_author_aggregate_stats()
    #fe.compute_node_level_metrics()
    
    #fe.compute_pair_heuristics() # takes long
   
    #fe.compute_co_citation_bibliographic()
    
    #fe.compute_path_based_scores() 
    #fe.compute_community_features() # takes long #INFO: Community done in 34467.4s, = 10 hours flushed to D:/NLP/Features_XL/train\community_features.npz

    #fe.compute_motif_counts()
    #fe.extract_keywords_from_abstracts("D:/NLP/tfidf_xgboost/cleaned_abstracts.parquet", top_k=10)
    #fe.compute_content_overlap()



    #fe.compute_author_domain_similarity()
    #fe.compute_paper_author_domain_similarity()
  #  fe.compute_abstract_author_svd_similarity()

    #fe.compute_embedding_features()
    
    #fe.compute_rooted_pagerank()
    
    #fe.compute_motif_and_path_features()
    #fe.compute_katz_centrality()


    ##################################################### EMBEDDING FEATURES ########################################################################
    fe.compute_specter_hadamard()
    # ... and so on for other features

INFO: Loading data...
INFO: Loaded 2183910 pairs in 0.7s
INFO: Loaded authors mapping (138499 papers)
INFO: Loaded abstracts_embeds (138499, 300)
INFO: Loaded pid_to_idx (138499 entries)
INFO: Loaded Specter embeddings array (138499, 768)
INFO: Loaded SciBERT embeddings (138499)
INFO: Loaded LDA topics (131250) in 1.8s total
INFO: Loaded graph embeddings: citation_node2vec=138499, citation_walklets=138499, author_node2vec=136863, author_walklets=136863
INFO: BERTopic dicts ready (topics=131250, authors=133493, papers_dom=138192)
INFO: Loaded SVD embeddings (abstracts=138499, authors=138499)
INFO: Built citation graph (nodes=138499, edges=982760)
INFO: Built co-author graph (nodes=136863, edges=520007)
✅ Data loaded, graphs ready.
INFO: Specter feats (2183910 pairs) est ~10.9s
INFO: Specter feats done in 50.4s, flushed to D:/NLP/Features_XL/train\specter_feats.npz
INFO: Saved Specter Hadamard features to D:/NLP/Features_XL/train\specter_hadamard.npy (shape=(1965519, 300))


In [6]:
df_test.head()

Unnamed: 0,citing,cited,label,split,tfidf_similarity
0,58335,113748,0,train,0.100756
1,18144,90516,0,train,0.005954
2,87793,130708,0,train,0.018084
3,33290,12998,1,train,0.018639
4,77459,19749,0,train,0.014315


In [None]:
# EMBEDDING FEATURES EXTRACTION
# SPECTER HADAMARD

In [None]:
# Extract test features

In [14]:
import os
import time
import numpy as np
import pandas as pd
import scipy.sparse as sp
import networkx as nx
import pickle
from itertools import combinations, product
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer

class TestFeatureExtractor:
    """
    Extract features for citation link‐prediction on an unseen test set.
    Reads a txt file of (citing,cited) pairs with no header, and computes
    the same features as the training extractor.
    """
    def __init__(self, base_path, output_dir, chunked=True, its=200000):
        # same resource paths as training
        self.base_path        = base_path
        self.authors_path     = os.path.join(base_path, "paper_to_authors.pkl")
        
        self.abstracts_emb_path = r"D:/NLP/citation_link_prediction/abstracts_embeds.npy"
        self.tfidf_idx_path = f"{base_path}/tfidf_pid_to_idx.pkl"
        
        self.specter_path = r'D:\NLP\citation_link_prediction\specter_pretrained.npy'
        self.scibert_path = "D:/NLP/data/paper_scibert_embeddings.pkl"
        self.bertopic_path    = os.path.join(base_path, "bertopic_features.parquet")
        self.lda_topics_path  = os.path.join(base_path, "paper_topics.parquet")
        
        # output
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        self.chunked = chunked
        self.its     = its

        # placeholders
        self.df_pairs       = None
        self.paper_to_authors = None
        self.pid_to_idx     = None
        self.tfidf_matrix   = None
        self.specter_dict   = None

        self.specter_emb  = None
        
        self.scibert_dict = None
        self.lda_topics     = None
        self.topic_dict     = None
        self.entropy_dict   = None
        self.topic_dist_arr = None
        self.auth_topic_dict= None
        self.paper_dom_dict = None
        self.G              = None
        self.G_auth         = None
        
        # will hold in-memory features if not chunked
        self.feat_dict = {}  

    def load_data(self, test_pairs_path):
        """
        Load test pairs (headerless txt), plus all mappings & embeddings.
        Creates dummy split/label columns so compute_* methods run unchanged.
        """
        print("INFO: Loading test data…")
        t0 = time.time()
        # -- 1. Load test pairs --
        self.df_pairs = pd.read_csv(
            test_pairs_path,
            header=None,
            names=["citing","cited"],
            dtype={"citing":int,"cited":int}
        )
        # dummy columns
        self.df_pairs["split"] = "test"
        self.df_pairs["label"] = -1
        print(f"INFO: Loaded {len(self.df_pairs)} test pairs in {time.time()-t0:.1f}s")

         # b) Paper→authors mapping
        with open(self.authors_path, "rb") as f:
            self.paper_to_authors = pickle.load(f)
        print(f"INFO: Loaded authors mapping ({len(self.paper_to_authors)} papers)")

        # c) TF–IDF index & matrix
        # ► i) Load precomputed SVD embeddings for abstracts & authors
        #(My tfidf instead of Kozel)
        # load abstracts SVD embeddings (LSA 32d)
        self.emb_abs = np.load(self.abstracts_emb_path)
        # emb_abs.shape == (n_rows_in_tfidf_matrix, 32)
        print(f"INFO: Loaded abstracts_embeds {self.emb_abs.shape}")

        # μέσα στο load_data(), μετά το "LOAD TF-IDF index"
        with open(self.tfidf_idx_path, "rb") as f:
            self.pid_to_idx = pickle.load(f)
        print(f"INFO: Loaded pid_to_idx ({len(self.pid_to_idx)} entries)")
      
        
        # d) SPECTER embeddings
        # Load Specter embeddings from .npy (shape = [n_papers_used, D_s])
        self.specter_emb = np.load(self.specter_path)
        print(f"INFO: Loaded Specter embeddings array {self.specter_emb.shape}")
        
        # e) SciBERT embeddings
        with open(self.scibert_path, "rb") as f:
            self.scibert_dict = pickle.load(f)
        print(f"INFO: Loaded SciBERT embeddings ({len(self.scibert_dict)})")
        
        # f) BERTopic & LDA topics
        self.lda_topics = pd.read_parquet(self.lda_topics_path)
        print(f"INFO: Loaded LDA topics ({len(self.lda_topics)}) in {time.time()-t0:.1f}s total")


        ## EMBEDS
        with open(os.path.join(self.base_path,
                               "split_train_val",
                               "citation_node2vec_tuned.pkl"), "rb") as f:
            self.citation_node2vec = pickle.load(f)
        with open(os.path.join(self.base_path,
                               "split_train_val",
                               "citation_node2vec_directed_weighted_q2.pkl"), "rb") as f:
            self.citation_walklets = pickle.load(f)
        with open(os.path.join(self.base_path, "author_node2vec.pkl"), "rb") as f:
            self.author_node2vec = pickle.load(f)
        with open(os.path.join(self.base_path, "author_walklets.pkl"), "rb") as f:
            self.author_walklets = pickle.load(f)
        print(f"INFO: Loaded graph embeddings: "
              f"citation_node2vec={len(self.citation_node2vec)}, "
              f"citation_walklets={len(self.citation_walklets)}, "
              f"author_node2vec={len(self.author_node2vec)}, "
              f"author_walklets={len(self.author_walklets)}")

        # ——————————————————————————————————————————————————————————————
        # ► h) Load BERTopic features and build topic‐dicts once for all functions
        df_bt = pd.read_parquet(self.bertopic_path)
        # dominant topic & entropy
        self.topic_dict   = dict(zip(df_bt.paper_id, df_bt.bertopic_dominant_topic))
        self.entropy_dict = dict(zip(df_bt.paper_id, df_bt.bertopic_topic_entropy))
        # full distribution vectors
        topic_cols = [c for c in df_bt.columns if c.startswith("topic_dist_")]
        td_df = df_bt.set_index("paper_id")[topic_cols]
        self.topic_dist_arr = {
            pid: td_df.loc[pid].to_numpy()
            for pid in td_df.index
        }
        # per‐author average topic vector
        from collections import defaultdict
        author_topic_acc = defaultdict(list)
        for pid, dist in self.topic_dist_arr.items():
            for a in self.paper_to_authors.get(pid, []):
                author_topic_acc[a].append(dist)
        self.auth_topic_dict = {
            a: np.mean(vs, axis=0)
            for a, vs in author_topic_acc.items()
        }
        # per‐paper “domain” vector via its authors
        self.paper_dom_dict = {
            pid: np.mean(
                [ self.auth_topic_dict[a] for a in authors if a in self.auth_topic_dict ],
                axis=0
            )
            for pid, authors in self.paper_to_authors.items()
            if any(a in self.auth_topic_dict for a in authors)
        }
        print(f"INFO: BERTopic dicts ready (topics={len(self.topic_dist_arr)}," 
              f" authors={len(self.auth_topic_dict)}, papers_dom={len(self.paper_dom_dict)})")
        

       # Kozel Authors SVD
        with open(r"D:\NLP\kozel\embeddings\author_emb.pkl", "rb") as f:
            self.emb_auth = pickle.load(f)   # dict: paper_id → 32‐d vector
        print(f"INFO: Loaded SVD embeddings (abstracts={len(self.emb_abs)}, authors={len(self.emb_auth)})")
        # ——————————————————————————————————————————————————————————————
        
        # j) Build citation graph (unweighted)
        train_pos = self.df_pairs[(self.df_pairs.split=="train") & (self.df_pairs.label==1)]
        self.G = nx.DiGraph()
        self.G.add_nodes_from(
            pd.unique(self.df_pairs[["citing","cited"]].values.ravel())
        )
        for u,v in zip(train_pos['citing'], train_pos['cited']):
            self.G.add_edge(int(u), int(v))
        print(f"INFO: Built citation graph (nodes={self.G.number_of_nodes()}, edges={self.G.number_of_edges()})")
        
        # h) Build co-author graph (weighted by # coauthored papers)
        G_auth = nx.Graph()
        for authors in self.paper_to_authors.values():
            for a,b in combinations(authors,2):
                if G_auth.has_edge(a,b):
                    G_auth[a][b]['weight'] += 1
                else:
                    G_auth.add_edge(a,b,weight=1)
        self.G_auth = G_auth
        print(f"INFO: Built co-author graph (nodes={G_auth.number_of_nodes()}, edges={G_auth.number_of_edges()})")
        
        print("✅ Data loaded, graphs ready.")
        return self

    def _flush(self, name, arr):
        np.save(os.path.join(self.output_dir, f"{name}.npy"), arr)
        print(f"    • Flushed {name} ({arr.shape})")

    # --- example compute methods, same signatures as train class ---
    def compute_tfidf_similarity(self, batch_size=10000):
        """
        (Παλιό όνομα, αλλά πλέον κάνει cosine similarity
         στα 32-διάστατα LSA embeddings αντί για raw TF–IDF.)
        """
        n = len(self.df_pairs)
        print(f"INFO: Abstract-LSA sim ({n} pairs) est ~{n/self.its:.1f}s")
        t0 = time.time()
    
        # προσωρινός πίνακας για τα αποτελέσματα
        sim = np.zeros(n, dtype=float)
    
        # θα αντλήσουμε σειρές από self.emb_abs βάσει pid_to_idx
        D = self.emb_abs.shape[1]
        zero = np.zeros(D, dtype=float)
    
        u = self.df_pairs['citing'].to_numpy()
        v = self.df_pairs['cited'].to_numpy()
    
        # batch loop για να μην γεμίσουμε μνήμη
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            for i in range(start, end):
                pid_u, pid_v = u[i], v[i]
                idx_u = self.pid_to_idx.get(pid_u, -1)
                idx_v = self.pid_to_idx.get(pid_v, -1)
    
                eu = self.emb_abs[idx_u] if idx_u >= 0 else zero
                ev = self.emb_abs[idx_v] if idx_v >= 0 else zero
    
                num = float(np.dot(eu, ev))
                den = np.linalg.norm(eu) * np.linalg.norm(ev) + 1e-8
                sim[i] = num / den
    
        # και το flush όπως πριν
        self._flush('tfidf_similarity', sim)
        print(f"INFO: Abstract-LSA sim done in {time.time()-t0:.1f}s")

    def compute_specter_similarity(self, batch_size=10000):
        """
        Compute three Specter-based features for each (citing,cited):
          • dot-product
          • cosine similarity
          • L1 distance
          • L2 distance
          
        Flushes all three into 'specter_feats.npz'.
        """
        n = len(self.df_pairs)
        print(f"INFO: Specter feats ({n} pairs) est ~{n/self.its:.1f}s")
        t0 = time.time()
    
        # prepare arrays
        dots      = np.zeros(n, dtype=float)
        cos_sims  = np.zeros(n, dtype=float)
        abs_diffs = np.zeros(n, dtype=float)
        specter_l2= np.zeros(n, dtype=float)
    
        # helper
        D     = self.specter_emb.shape[1]
        zero  = np.zeros(D, dtype=float)
        pid2i = self.pid_to_idx  # paper→row in emb array
    
        u = self.df_pairs['citing'].to_numpy()
        v = self.df_pairs['cited'].to_numpy()
    
        # batch-loop
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            for i in range(start, end):
                pid_u, pid_v = u[i], v[i]
                iu = pid2i.get(pid_u, -1)
                iv = pid2i.get(pid_v, -1)
                eu = self.specter_emb[iu] if iu >= 0 else zero
                ev = self.specter_emb[iv] if iv >= 0 else zero
    
                d = float(np.dot(eu, ev))
                dots[i] = d
                # cosine
                nu = np.linalg.norm(eu)
                nv = np.linalg.norm(ev)
                cos_sims[i] = d / (nu * nv + 1e-8)

                diff = eu - ev
                # L1
                abs_diffs[i] = float(np.sum(np.abs(diff)))
                # L2
                specter_l2[i] =float(np.sum(diff * diff))
    
        # flush all three at once
        out = os.path.join(self.output_dir, 'specter_feats.npz')
        np.savez(
            out,
            specter_dot       = dots,
            specter_cosine    = cos_sims,
            specter_l1        = abs_diffs,
            specter_l2        = specter_l2
        )
        print(f"INFO: Specter feats done in {time.time()-t0:.1f}s, flushed to {out}")


    def compute_scibert_similarity(self, batch_size=100000):
        """Compute and flush SciBERT cosine similarity in batches to save memory."""
        n = len(self.df_pairs)
        est = n/self.its
        print(f"INFO: SciBERT sim: {n} pairs (~{est:.1f}s)")
        t0 = time.time()
        sim = np.zeros(n, float)
        # dimension of SciBERT embeddings
        D = len(next(iter(self.scibert_dict.values())))
        # arrays of ids
        u = self.df_pairs.citing.to_numpy(); v = self.df_pairs.cited.to_numpy()
        # process in batches
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            u_batch = u[start:end]; v_batch = v[start:end]
            # stack embeddings for this batch
            U = np.vstack([self.scibert_dict.get(pid, np.zeros(D)) for pid in u_batch])
            V = np.vstack([self.scibert_dict.get(pid, np.zeros(D)) for pid in v_batch])
            dots = np.einsum('ij,ij->i', U, V)
            nu = np.linalg.norm(U, axis=1); nv = np.linalg.norm(V, axis=1)
            sim[start:end] = dots / (nu * nv + 1e-8)
        # flush full feature
        self._flush('scibert_similarity', sim)
        print(f"INFO: SciBERT done in {time.time()-t0:.1f}s")
    
    # def compute_bertopic_features(self):
    #     pass
    def compute_bertopic_features(self, batch_size=100000):
        """
        Compute BERTopic features for each (citing, cited) pair:
          - citing & cited dominant topic
          - same topic flag
          - citing & cited topic entropy
          - cosine similarity of full distributions
        All 6 features are stacked into one array of shape (n_pairs, 6) 
        and flushed as a single file.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: BERTopic feats: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        # 1. Load BERTopic table
        df_bt = pd.read_parquet(self.bertopic_path)
        dom_dict = dict(zip(df_bt.paper_id, df_bt.bertopic_dominant_topic))
        ent_dict = dict(zip(df_bt.paper_id, df_bt.bertopic_topic_entropy))
        dist_cols = [c for c in df_bt.columns if c.startswith("topic_dist_")]
        dist_mat = df_bt.set_index("paper_id")[dist_cols]

        # 2. Prepare id arrays
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        # 3. Initialize arrays
        citing_dom = np.array([dom_dict.get(pid, -1) for pid in u],   dtype=int)
        cited_dom  = np.array([dom_dict.get(pid, -1) for pid in v],   dtype=int)
        same_bt    = (citing_dom == cited_dom).astype(int)
        citing_ent = np.array([ent_dict.get(pid, 0.0) for pid in u],  dtype=float)
        cited_ent  = np.array([ent_dict.get(pid, 0.0) for pid in v],  dtype=float)
        cos_sim    = np.zeros(n, dtype=float)

        # 4. Cosine similarity in batches
        D = len(dist_cols)
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            u_batch = u[start:end]
            v_batch = v[start:end]

            U = np.vstack([
                dist_mat.loc[pid].to_numpy() if pid in dist_mat.index else np.zeros(D)
                for pid in u_batch
            ])
            V = np.vstack([
                dist_mat.loc[pid].to_numpy() if pid in dist_mat.index else np.zeros(D)
                for pid in v_batch
            ])

            dots = np.einsum('ij,ij->i', U, V)
            nu   = np.linalg.norm(U, axis=1)
            nv   = np.linalg.norm(V, axis=1)
            cos_sim[start:end] = dots / (nu * nv + 1e-8)

        # 5. Stack all 6 features into one 2D array (n_pairs × 6)
        all_feats = np.column_stack([
            citing_dom,
            cited_dom,
            same_bt,
            citing_ent,
            cited_ent,
            cos_sim
        ])

        # 6. Flush as a single file
        self._flush('bertopic_features', all_feats)
        print(f"INFO: BERTopic done in {time.time() - t0:.1f}s")
    

    # … υλοποιήστε ανάλογα compute_… για:
    #    compute_co_citation_bibliographic,
    #    compute_author_overlap_jaccard,
    #    compute_pair_heuristics,
    #    compute_lda_topics_features,
    #    compute_community_features,
    #    compute_content_overlap,
    #    compute_node_level_metrics
    # με ακριβώς την ίδια λογική όπως στη FeatureExtractor κλάση.
    def compute_co_citation_bibliographic(self):
        """
        Co‐citation & bibliographic coupling counts:
          • co_citation      = |pred(u) ∩ pred(v)|
          • bibliographic_coupling = |succ(u) ∩ succ(v)|
        """
        n = len(self.df_pairs)
        co_cite = np.zeros(n, int)
        biblio  = np.zeros(n, int)
        for i, (u, v) in enumerate(zip(self.df_pairs.citing, self.df_pairs.cited)):
            preds_u = set(self.G.predecessors(u))
            preds_v = set(self.G.predecessors(v))
            co_cite[i] = len(preds_u & preds_v)
            succs_u = set(self.G.successors(u))
            succs_v = set(self.G.successors(v))
            biblio[i] = len(succs_u & succs_v)
        self._flush('co_citation', co_cite)
        self._flush('bibliographic_coupling', biblio)

    def compute_author_overlap_jaccard(self):
        """
        Author overlap & Jaccard per paper‐pair:
          • author_overlap = |Authors(u) ∩ Authors(v)|
          • jaccard_authors = |Au ∩ Av| / |Au ∪ Av|
        """
        n = len(self.df_pairs)
        overlap = np.zeros(n, int)
        jaccard = np.zeros(n, float)
        for i, (u, v) in enumerate(zip(self.df_pairs.citing, self.df_pairs.cited)):
            Au = set(self.paper_to_authors.get(u, []))
            Av = set(self.paper_to_authors.get(v, []))
            inter = Au & Av
            uni   = Au | Av
            overlap[i] = len(inter)
            jaccard[i] = len(inter) / (len(uni) + 1e-8)
        self._flush('author_overlap', overlap)
        self._flush('jaccard_authors', jaccard)


    def compute_lda_topics_features(self):
        """
        Cosine similarity of LDA topic distributions per paper‐pair.
        """
        topic_cols = [c for c in self.lda_topics.columns if c!='paper_id']
        lda_mat = self.lda_topics.set_index('paper_id')[topic_cols]
        n = len(self.df_pairs)
        sim = np.zeros(n, float)
        for i, (u, v) in enumerate(zip(self.df_pairs.citing, self.df_pairs.cited)):
            a = lda_mat.loc[u].to_numpy() if u in lda_mat.index else np.zeros(len(topic_cols))
            b = lda_mat.loc[v].to_numpy() if v in lda_mat.index else np.zeros(len(topic_cols))
            num = np.dot(a,b)
            den = np.linalg.norm(a)*np.linalg.norm(b)+1e-8
            sim[i] = num/den
        self._flush('lda_topics_similarity', sim)


    # def compute_author_graph_heuristics(self):
    #     pass
    def compute_author_graph_heuristics(self, batch_size=100000):
        """
        Compute author-graph heuristics in batches:
          - average common neighbors count per author-pair
          - average Adamic–Adar per author-pair
          - average Resource Allocation per author-pair
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: Author-graph heuristics: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        # Prepare ID arrays and result buffers
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()
        cn_arr = np.zeros(n, dtype=float)
        aa_arr = np.zeros(n, dtype=float)
        ra_arr = np.zeros(n, dtype=float)

        # Local references for speed
        G_auth = self.G_auth
        deg_auth = dict(G_auth.degree())

        # Process in batches
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            u_batch = u[start:end]
            v_batch = v[start:end]

            for i, (uid, vid) in enumerate(zip(u_batch, v_batch)):
                # Get author lists for each paper
                Au = self.paper_to_authors.get(uid, [])
                Av = self.paper_to_authors.get(vid, [])
                cn_vals, aa_vals, ra_vals = [], [], []

                # Compute per-author-pair metrics
                for a, b in product(Au, Av):
                    if G_auth.has_node(a) and G_auth.has_node(b):
                        common = list(nx.common_neighbors(G_auth, a, b))
                        if common:
                            cn_vals.append(len(common))
                            aa_vals.append(sum(1.0 / np.log(1 + deg_auth[z]) for z in common))
                            ra_vals.append(sum(1.0 / deg_auth[z] for z in common))

                # Aggregate (mean) or default to 0
                idx = start + i
                if cn_vals:
                    cn_arr[idx] = np.mean(cn_vals)
                    aa_arr[idx] = np.mean(aa_vals)
                    ra_arr[idx] = np.mean(ra_vals)

        # Flush to disk
        # Stack all three author-graph features into one array (n_pairs × 3)
        all_feats = np.column_stack([
            cn_arr,
            aa_arr,
            ra_arr
        ])
        # Flush as a single file
        self._flush('author_graph_heuristics', all_feats)
        print(f"INFO: Author-graph heuristics done in {time.time() - t0:.1f}s")

    
    # def compute_embedding_features(self):
    #     pass
    def compute_embedding_features(self, batch_size=100000):
        """
        Compute embedding-based scalars in batches:
          - citation_node2vec_cosine, _dot, _l2
          - citation_walklets_cosine, _dot, _l2
          - author_node2vec_cosine, _dot, _l2
          - author_walklets_cosine, _dot, _l2
        Flushes all 12 arrays in a single .npz.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: Embedding feats: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        # Prepare id arrays
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        # Allocate buffers
        cnv_c_cos = np.zeros(n, dtype=float)
        cnv_c_dot = np.zeros(n, dtype=float)
        cnv_c_l2  = np.zeros(n, dtype=float)
        cnv_w_cos = np.zeros(n, dtype=float)
        cnv_w_dot = np.zeros(n, dtype=float)
        cnv_w_l2  = np.zeros(n, dtype=float)
        # Precompute author-level mean embeddings
        # assume self.author_node2vec & self.author_walklets exist
        # and self.paper_to_authors maps pid→list of a_ids
        def mean_emb(pid, emb_dict, dim):
            vs = [emb_dict[a] for a in self.paper_to_authors.get(pid, []) if a in emb_dict]
            return np.mean(vs, axis=0) if vs else np.zeros(dim, dtype=float)
        # determine dims
        d_cn = next(iter(self.citation_node2vec.values())).shape[0]
        d_aw = next(iter(self.citation_walklets.values())).shape[0]
        d_an = next(iter(self.author_node2vec.values())).shape[0]
        d_awl= next(iter(self.author_walklets.values())).shape[0]
        # process in batches
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            u_b, v_b = u[start:end], v[start:end]
            # citation-node2vec
            Cv = np.vstack([ self.citation_node2vec.get(pid, np.zeros(d_cn)) for pid in u_b ])
            Vv = np.vstack([ self.citation_node2vec.get(pid, np.zeros(d_cn)) for pid in v_b ])
            dot = np.einsum('ij,ij->i', Cv, Vv)
            nu  = np.linalg.norm(Cv, axis=1); nv = np.linalg.norm(Vv, axis=1)
            cnv_c_cos[start:end] = dot / (nu*nv + 1e-8)
            cnv_c_dot[start:end] = dot
            cnv_c_l2[start:end]  = np.linalg.norm(Cv - Vv, axis=1)
            # citation-walklets
            Cw = np.vstack([ self.citation_walklets.get(pid, np.zeros(d_aw)) for pid in u_b ])
            Wv = np.vstack([ self.citation_walklets.get(pid, np.zeros(d_aw)) for pid in v_b ])
            dot = np.einsum('ij,ij->i', Cw, Wv)
            nu  = np.linalg.norm(Cw, axis=1); nv = np.linalg.norm(Wv, axis=1)
            cnv_w_cos[start:end] = dot / (nu*nv + 1e-8)
            cnv_w_dot[start:end] = dot
            cnv_w_l2[start:end]  = np.linalg.norm(Cw - Wv, axis=1)
            # author-node2vec
            An = np.vstack([ mean_emb(pid, self.author_node2vec, d_an) for pid in u_b ])
            Bn = np.vstack([ mean_emb(pid, self.author_node2vec, d_an) for pid in v_b ])
            dot = np.einsum('ij,ij->i', An, Bn)
            nu  = np.linalg.norm(An, axis=1); nv = np.linalg.norm(Bn, axis=1)
            # reuse buffers names for author if desired, or separate
            # here stacking all into one npz with clear keys below
            # similarly for author-walklets
            Aw = np.vstack([ mean_emb(pid, self.author_walklets, d_awl) for pid in u_b ])
            Bw = np.vstack([ mean_emb(pid, self.author_walklets, d_awl) for pid in v_b ])
            dot_aw = np.einsum('ij,ij->i', Aw, Bw)
            nu_aw  = np.linalg.norm(Aw, axis=1); nv_aw = np.linalg.norm(Bw, axis=1)
            # store author embeddings
            if start == 0:
                an_cos = np.zeros(n, dtype=float)
                an_dot = np.zeros(n, dtype=float)
                an_l2  = np.zeros(n, dtype=float)
                aw_cos = np.zeros(n, dtype=float)
                aw_dot = np.zeros(n, dtype=float)
                aw_l2  = np.zeros(n, dtype=float)
            an_cos[start:end] = dot / (nu*nv + 1e-8)
            an_dot[start:end] = dot
            an_l2[start:end]  = np.linalg.norm(An - Bn, axis=1)
            aw_cos[start:end] = dot_aw / (nu_aw*nv_aw + 1e-8)
            aw_dot[start:end] = dot_aw
            aw_l2[start:end]  = np.linalg.norm(Aw - Bw, axis=1)

        # stack and flush all 12 features
        np.savez(
            os.path.join(self.output_dir, 'embedding_features.npz'),
            citation_node2vec_cosine       = cnv_c_cos,
            citation_node2vec_dot          = cnv_c_dot,
            citation_node2vec_l2           = cnv_c_l2,
            citation_walklets_cosine       = cnv_w_cos,
            citation_walklets_dot          = cnv_w_dot,
            citation_walklets_l2           = cnv_w_l2,
            author_node2vec_cosine         = an_cos,
            author_node2vec_dot            = an_dot,
            author_node2vec_l2             = an_l2,
            author_walklets_cosine         = aw_cos,
            author_walklets_dot            = aw_dot,
            author_walklets_l2             = aw_l2
        )
        print(f"INFO: Embedding feats done in {time.time()-t0:.1f}s")

    def compute_author_aggregate_stats(self):
        """
        Compute author-level aggregate stats for each pair:
          - citing_author_mean_pagerank, cited_author_mean_pagerank
          - citing_author_max_pagerank,  cited_author_max_pagerank
          - citing_author_mean_degree,   cited_author_mean_degree
        Results saved together in 'author_aggregate_stats.npz'.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: Author agg stats: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        # Compute pagerank and degree on co-author graph
        auth_pr  = nx.pagerank(self.G_auth, weight='weight')
        auth_deg = dict(self.G_auth.degree())

        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        mean_pr_u = np.zeros(n, dtype=float)
        mean_pr_v = np.zeros(n, dtype=float)
        max_pr_u  = np.zeros(n, dtype=float)
        max_pr_v  = np.zeros(n, dtype=float)
        mean_deg_u = np.zeros(n, dtype=float)
        mean_deg_v = np.zeros(n, dtype=float)

        for idx, (uid, vid) in enumerate(zip(u, v)):
            Au = self.paper_to_authors.get(uid, [])
            Av = self.paper_to_authors.get(vid, [])
            # pagerank stats
            prs_u = [auth_pr.get(a, 0.0) for a in Au]
            prs_v = [auth_pr.get(a, 0.0) for a in Av]
            if prs_u:
                mean_pr_u[idx] = sum(prs_u) / len(prs_u)
                max_pr_u[idx]  = max(prs_u)
            if prs_v:
                mean_pr_v[idx] = sum(prs_v) / len(prs_v)
                max_pr_v[idx]  = max(prs_v)
            # degree stats
            degs_u = [auth_deg.get(a, 0) for a in Au]
            degs_v = [auth_deg.get(a, 0) for a in Av]
            if degs_u:
                mean_deg_u[idx] = sum(degs_u) / len(degs_u)
            if degs_v:
                mean_deg_v[idx] = sum(degs_v) / len(degs_v)

        # Flush all six stats together
        out_path = os.path.join(self.output_dir, 'author_aggregate_stats.npz')
        np.savez(
            out_path,
            citing_author_mean_pagerank = mean_pr_u,
            cited_author_mean_pagerank  = mean_pr_v,
            citing_author_max_pagerank  = max_pr_u,
            cited_author_max_pagerank   = max_pr_v,
            citing_author_mean_degree   = mean_deg_u,
            cited_author_mean_degree    = mean_deg_v
        )
        print(f"INFO: Author agg stats done in {time.time() - t0:.1f}s, flushed to {out_path}")


     # def compute_coauthor_distance(self):
    #     pass
    def compute_coauthor_distance(self, batch_size=100000):
        """
        Compute co-author distance metrics in batches:
          - coauth_min_dist: minimum shortest-path between any author of citing & cited
          - coauth_mean_dist: average such shortest-path
          - coauth_max_dist: maximum such shortest-path
          - coauth_inv_min: 1 / (min_dist + 1)
          - coauth_close_bin: binary flag if min_dist <= 2
        Flushes all five arrays as a single .npz.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: Co-author distance: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        # Prepare id arrays and result buffers
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()
        min_arr  = np.zeros(n, dtype=float)
        mean_arr = np.zeros(n, dtype=float)
        max_arr  = np.zeros(n, dtype=float)

        # Maximum distance if no path exists
        max_dist = self.G_auth.number_of_nodes()

        # Process in batches
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            u_batch = u[start:end]
            v_batch = v[start:end]

            for i, (uid, vid) in enumerate(zip(u_batch, v_batch)):
                Au = self.paper_to_authors.get(uid, [])
                Av = self.paper_to_authors.get(vid, [])
                dists = []
                for a in Au:
                    for b in Av:
                        if self.G_auth.has_node(a) and self.G_auth.has_node(b):
                            try:
                                dists.append(nx.shortest_path_length(self.G_auth, a, b))
                            except nx.NetworkXNoPath:
                                dists.append(max_dist)
                idx = start + i
                if dists:
                    min_arr[idx]  = min(dists)
                    mean_arr[idx] = sum(dists) / len(dists)
                    max_arr[idx]  = max(dists)
                else:
                    min_arr[idx] = mean_arr[idx] = max_arr[idx] = max_dist

        # Derived metrics
        inv_min  = 1.0 / (min_arr + 1.0)
        close_bin = (min_arr <= 2).astype(int)

        # Flush all features together
        out_path = os.path.join(self.output_dir, 'coauthor_distance.npz')
        np.savez(
            out_path,
            coauth_min_dist  = min_arr,
            coauth_mean_dist = mean_arr,
            coauth_max_dist  = max_arr,
            coauth_inv_min   = inv_min,
            coauth_close_bin = close_bin
        )
        print(f"INFO: Co-author distance done in {time.time() - t0:.1f}s, flushed to {out_path}")


    def compute_node_level_metrics(self):
        """
        Compute node‐level graph features for each (citing, cited) pair:
          - citing_in_degree, citing_out_degree, citing_degree
          - cited_in_degree,  cited_out_degree,  cited_degree
          - citing_pagerank,   cited_pagerank
          - citing_triangles,  cited_triangles
          - citing_clustering, cited_clustering
          - citing_core,       cited_core
          - citing_onion,      cited_onion
          - citing_eigen,      cited_eigen
          - common_neighbors (undirected)
        Flush all 21 arrays as one .npz.
        """
        n = len(self.df_pairs)
        print(f"INFO: Node‐level metrics: {n} pairs (~{n/self.its:.1f}s)")
        t0 = time.time()

        # Build undirected version
        und = self.G.to_undirected()

        # Compute raw dicts
        in_deg   = dict(self.G.in_degree())
        out_deg  = dict(self.G.out_degree())
        deg      = dict(self.G.degree())
        pr       = nx.pagerank(self.G, weight=None)
        tri      = nx.triangles(und)
        clust    = nx.clustering(und, weight=None)
        core     = nx.core_number(und)
        onion    = nx.onion_layers(und)
        eig      = nx.eigenvector_centrality(self.G, max_iter=500)

        # Prepare index arrays
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        # Map into arrays
        ci_deg = np.array([in_deg.get(x,0) for x in u])
        co_deg = np.array([out_deg.get(x,0) for x in u])
        ct_deg = np.array([deg.get(x,0) for x in u])
        di_deg = np.array([in_deg.get(x,0) for x in v])
        do_deg = np.array([out_deg.get(x,0) for x in v])
        dt_deg = np.array([deg.get(x,0) for x in v])

        ci_pr = np.array([pr.get(x,0.0) for x in u])
        co_pr = np.array([pr.get(x,0.0) for x in v])

        ci_tri = np.array([tri.get(x,0) for x in u])
        co_tri = np.array([tri.get(x,0) for x in v])

        ci_cl = np.array([clust.get(x,0.0) for x in u])
        co_cl = np.array([clust.get(x,0.0) for x in v])

        ci_co = np.array([core.get(x,0) for x in u])
        co_co = np.array([core.get(x,0) for x in v])

        ci_on = np.array([onion.get(x,0) for x in u])
        co_on = np.array([onion.get(x,0) for x in v])

        ci_ei = np.array([eig.get(x,0.0) for x in u])
        co_ei = np.array([eig.get(x,0.0) for x in v])

        # Common neighbors via adjacency-squared
        nodes = list(und.nodes())
        idx_map = {node:i for i,node in enumerate(nodes)}
        A = nx.to_scipy_sparse_matrix(und, nodes, format='csr')
        A2 = A.dot(A)
        ui = [idx_map[x] for x in u]
        vi = [idx_map[x] for x in v]
        cn = np.array(A2[ui, vi]).ravel()

        # Stack and flush
        np.savez(
            os.path.join(self.output_dir, 'node_level_metrics.npz'),
            citing_in_degree       = ci_deg,
            citing_out_degree      = co_deg,
            citing_degree          = ct_deg,
            cited_in_degree        = di_deg,
            cited_out_degree       = do_deg,
            cited_degree           = dt_deg,
            citing_pagerank        = ci_pr,
            cited_pagerank         = co_pr,
            citing_triangles       = ci_tri,
            cited_triangles        = co_tri,
            citing_clustering      = ci_cl,
            cited_clustering       = co_cl,
            citing_core_number     = ci_co,
            cited_core_number      = co_co,
            citing_onion_number    = ci_on,
            cited_onion_number     = co_on,
            citing_eigenvector     = ci_ei,
            cited_eigenvector      = co_ei,
            common_neighbors       = cn
        )
        print(f"INFO: Node‐level done in {time.time()-t0:.1f}s")



    def compute_pair_heuristics(self, batch_size=100000):
        """
        Compute pair‐level heuristics on the citation graph:
          - citation_jaccard, salton, hub_depressed, adamic_adar
          - preferential_attachment, resource_allocation
          - directed_shortest_path (–1 if none)
        Flush all 7 arrays in 'pair_heuristics.npz'.
        """
        n = len(self.df_pairs)
        print(f"INFO: Pair heuristics: {n} pairs (~{n/self.its:.1f}s)")
        t0 = time.time()

        und = self.G.to_undirected()
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        # Precompute deg and common_neighbors via A2
        deg_dict = dict(self.G.degree())
        ui = u; vi = v
        nodes = list(und.nodes())
        idx_map = {node:i for i,node in enumerate(nodes)}
        A = nx.to_scipy_sparse_matrix(und, nodes, format='csr')
        A2 = A.dot(A)
        cn = np.array([A2[idx_map[x], idx_map[y]] for x,y in zip(u,v)])

        # Buffers
        jacc = cn / (np.array([deg_dict.get(x,0) + deg_dict.get(y,0) - c for x,y,c in zip(u,v,cn)]) + 1e-8)
        sal   = cn / np.sqrt(np.array([deg_dict.get(x,0)*deg_dict.get(y,0) for x,y in zip(u,v)]) + 1e-8)
        hub   = cn / (np.maximum([deg_dict.get(x,0) for x in u],[deg_dict.get(y,0) for y in v]) + 1e-8)
        # Adamic–Adar & RA
        aa = np.zeros(n, dtype=float)
        ra = np.zeros(n, dtype=float)
        for i,(x,y) in enumerate(zip(u,v)):
            aa[i] = sum(1.0/np.log(1+und.degree(z)) for z in nx.common_neighbors(und, x, y))
            ra[i] = sum(1.0/und.degree(z)        for z in nx.common_neighbors(und, x, y))
        # Preferential attachment
        pa = np.array([self.G.out_degree(x)*self.G.in_degree(y) for x,y in zip(u,v)])
        # Directed shortest paths (batch)
        dsp = np.full(n, -1, dtype=int)
        for start in range(0, n, batch_size):
            end = min(start+batch_size, n)
            for i,(x,y) in enumerate(zip(u[start:end],v[start:end])):
                try:
                    dsp[start+i] = nx.shortest_path_length(self.G, x, y)
                except nx.NetworkXNoPath:
                    dsp[start+i] = -1

        # Flush
        np.savez(
            os.path.join(self.output_dir,'pair_heuristics.npz'),
            citation_G_jaccard              = jacc,
            citation_G_salton               = sal,
            citation_G_hub_depressed        = hub,
            citation_G_adamic_adar          = aa,
            citation_G_resource_allocation  = ra,
            citation_G_preferential_attachment = pa,
            citation_G_sp_directed          = dsp
        )
        print(f"INFO: Pair heuristics done in {time.time()-t0:.1f}s")

    

    def compute_path_based_scores(self, beta=0.005, epsilon=0.001):
        """
        2. Path‐based Scores Beyond AA/RA:
          • Katz_index ≈ β·A + β²·A2 + β³·A3
          • Local Path index = A2 + ε·A3
        Flushes both arrays in 'path_based_scores.npz'.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: Path-based scores: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        # α) build undirected adjacency powers
        und = self.G.to_undirected()
        nodes = list(und.nodes())
        idx = {node:i for i,node in enumerate(nodes)}
        A  = nx.to_scipy_sparse_matrix(und, nodes, format='csr')
        A2 = A.dot(A)
        A3 = A2.dot(A)

        # β) buffers
        katz = np.zeros(n, dtype=float)
        lp   = np.zeros(n, dtype=float)
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        # γ) extract per‐pair
        for i,(x,y) in enumerate(zip(u,v)):
            xi, yi = idx[x], idx[y]
            katz[i] = beta * A[xi,yi]       \
                    + (beta**2) * A2[xi,yi] \
                    + (beta**3) * A3[xi,yi]
            lp[i]   = A2[xi,yi] + epsilon * A3[xi,yi]

        # δ) flush
        out = os.path.join(self.output_dir, 'path_based_scores.npz')
        np.savez(out, katz_index=katz, local_path=lp)
        print(f"INFO: Path‐based done in {time.time()-t0:.1f}s, flushed to {out}")


    def compute_community_features(self):
        """
        3. Community‐Based Features on citation graph:
          • same_community: 1 if both in same Louvain community
          • comm_size_ratio: |C_u| / |C_v|
        Flushes both arrays in 'community_features.npz'.
        """
        print("INFO: Detecting communities (greedy modularity)...")
        t0 = time.time()
        und = self.G.to_undirected()
        comms = nx.algorithms.community.greedy_modularity_communities(und)
        comm_map = {node:cid for cid,comm in enumerate(comms) for node in comm}
        sizes = {cid: len(comm) for cid,comm in enumerate(comms)}

        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()
        n = len(u)
        same  = np.zeros(n, dtype=int)
        ratio = np.zeros(n, dtype=float)

        for i,(x,y) in enumerate(zip(u,v)):
            cx, cy = comm_map.get(x,-1), comm_map.get(y,-1)
            same[i] = int(cx==cy and cx!=-1)
            if cx in sizes and cy in sizes and sizes[cy]>0:
                ratio[i] = sizes[cx] / sizes[cy]

        out = os.path.join(self.output_dir, 'community_features.npz')
        np.savez(out, same_community=same, comm_size_ratio=ratio)
        print(f"INFO: Community done in {time.time()-t0:.1f}s, flushed to {out}")


    def compute_motif_counts(self):
        """
        4. Higher‐Order Motif Counts:
          • triangles_through_edge = A2[x,y]//2
          • cycles4 ≈ number of paths length‐3 = A3[x,y]
        Flushes both arrays in 'motif_counts.npz'.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: Motif counts: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        und = self.G.to_undirected()
        nodes = list(und.nodes()); idx = {node:i for i,node in enumerate(nodes)}
        A  = nx.to_scipy_sparse_matrix(und, nodes, format='csr')
        A2 = A.dot(A); A3 = A2.dot(A)

        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()
        tri  = np.zeros(n, dtype=int)
        cyc4 = np.zeros(n, dtype=float)

        for i,(x,y) in enumerate(zip(u,v)):
            xi, yi = idx[x], idx[y]
            tri[i]  = int(A2[xi,yi] // 2)
            cyc4[i] = A3[xi,yi]

        out = os.path.join(self.output_dir, 'motif_counts.npz')
        np.savez(out, triangles=tri, cycles4=cyc4)
        print(f"INFO: Motifs done in {time.time()-t0:.1f}s, flushed to {out}")


    def extract_keywords_from_abstracts(self, abstracts_path, top_k=10, max_features=5000):
        """
        5a. Extract top-k keywords per paper from abstracts via TF–IDF.
        Stores self.paper_keywords: {paper_id: [kw1,…,kw_topk]}.
        """
        print("INFO: Extracting keywords via TF–IDF from abstracts...")
        t0 = time.time()
        # load abstracts parquet with columns ['paper_id','abstract']
        df_abs = pd.read_parquet(abstracts_path)
        texts  = df_abs['abstract'].fillna("").tolist()
        pids   = df_abs['paper_id'].tolist()

        from sklearn.feature_extraction.text import TfidfVectorizer
        vec = TfidfVectorizer(
            max_features=max_features,
            stop_words='english'
        )
        X = vec.fit_transform(texts)  # shape = (n_papers, max_features)
        features = vec.get_feature_names_out()

        self.paper_keywords = {}
        for i, pid in enumerate(pids):
            row = X[i].tocoo()
            if row.nnz:
                top_idx = np.argsort(row.data)[-top_k:]
                kws = [features[row.col[j]] for j in top_idx]
            else:
                kws = []
            self.paper_keywords[pid] = kws

        print(f"INFO: Keywords extracted for {len(pids)} papers in {time.time()-t0:.1f}s")

    


    def compute_content_overlap(self):
        """
        5b. Content Overlap Beyond Abstracts:
          • title_jaccard: (if titles exist)
          • keyword_jaccard: Jaccard of extracted keywords
        Requires run of extract_keywords_from_abstracts(...) first.
        Flushes 'content_overlap.npz'.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: Content overlap: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        # assume self.paper_keywords exists
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()
        kw_j = np.zeros(n, dtype=float)

        for i,(x,y) in enumerate(zip(u,v)):
            Au = set(self.paper_keywords.get(x, []))
            Av = set(self.paper_keywords.get(y, []))
            inter = Au & Av
            uni   = Au | Av
            kw_j[i] = len(inter) / (len(uni) + 1e-8)

        out = os.path.join(self.output_dir, 'content_overlap.npz')
        np.savez(out, keyword_jaccard=kw_j)
        print(f"INFO: Content done in {time.time()-t0:.1f}s, flushed to {out}")

    def compute_author_domain_similarity(self, batch_size=100000):
        """
        Compute author-domain similarity features:
          • max_citing_topic_author_dom_cosine
          • mean_citing_topic_author_dom_cosine
          • min_citing_topic_author_dom_cosine
          • max_cited_topic_author_dom_cosine
          • mean_cited_topic_author_dom_cosine
          • min_cited_topic_author_dom_cosine
        Flushes all 6 arrays in 'author_domain_similarity.npz'.
        """
        n = len(self.df_pairs)
        est = n / self.its
        print(f"INFO: Author-domain sim: {n} pairs (~{est:.1f}s)")
        t0 = time.time()

        # prepare default zero‐vector for missing topics
        D_topic = next(iter(self.topic_dist_arr.values())).shape[0]
        zero_vec = np.zeros(D_topic, dtype=float)

        # result buffers
        max_u = np.zeros(n, float)
        mean_u = np.zeros(n, float)
        min_u = np.zeros(n, float)
        max_v = np.zeros(n, float)
        mean_v = np.zeros(n, float)
        min_v = np.zeros(n, float)

        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        # batch over pairs
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            for i, (uid, vid) in enumerate(zip(u[start:end], v[start:end])):
                idx = start + i

                # domain vector for cited (used by citing→authors)
                dom_v = self.topic_dist_arr.get(vid, zero_vec)
                sims_u = []
                for a in self.paper_to_authors.get(uid, []):
                    vec_a = self.auth_topic_dict.get(a)
                    if vec_a is not None:
                        sims_u.append(
                            float(
                                np.dot(vec_a, dom_v) /
                                (np.linalg.norm(vec_a) * np.linalg.norm(dom_v) + 1e-8)
                            )
                        )
                if sims_u:
                    max_u[idx], mean_u[idx], min_u[idx] = max(sims_u), np.mean(sims_u), min(sims_u)

                # domain vector for citing (used by cited→authors)
                dom_u = self.topic_dist_arr.get(uid, zero_vec)
                sims_v = []
                for a in self.paper_to_authors.get(vid, []):
                    vec_a = self.auth_topic_dict.get(a)
                    if vec_a is not None:
                        sims_v.append(
                            float(
                                np.dot(vec_a, dom_u) /
                                (np.linalg.norm(vec_a) * np.linalg.norm(dom_u) + 1e-8)
                            )
                        )
                if sims_v:
                    max_v[idx], mean_v[idx], min_v[idx] = max(sims_v), np.mean(sims_v), min(sims_v)

        # flush to disk
        out = os.path.join(self.output_dir, 'author_domain_similarity.npz')
        np.savez(
            out,
            max_citing_topic_author_dom_cosine   = max_u,
            mean_citing_topic_author_dom_cosine  = mean_u,
            min_citing_topic_author_dom_cosine   = min_u,
            max_cited_topic_author_dom_cosine    = max_v,
            mean_cited_topic_author_dom_cosine   = mean_v,
            min_cited_topic_author_dom_cosine    = min_v
        )
        print(f"INFO: Author-domain done in {time.time()-t0:.1f}s, flushed to {out}")
        
    def compute_paper_author_domain_similarity(self):
        """
        Compute paper‐level author‐domain similarity:
          • paper_total_auth_dom_cosine_citing_vs_cited
          • paper_total_auth_dom_dot_citing_vs_cited
          • paper_total_auth_dom_l2_citing_vs_cited
        Uses self.paper_dom_dict.
        Flushes 3 arrays in 'paper_author_domain.npz'.
        """
        n = len(self.df_pairs)
        print(f"INFO: Paper–author‐domain sim: {n} pairs (~{n/self.its:.1f}s)")
        t0 = time.time()

        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()

        # build matrices
        D = next(iter(self.paper_dom_dict.values())).shape[0]
        U = np.vstack([ self.paper_dom_dict.get(pid, np.zeros(D)) for pid in u ])
        V = np.vstack([ self.paper_dom_dict.get(pid, np.zeros(D)) for pid in v ])

        dot = np.einsum('ij,ij->i', U, V)
        nu  = np.linalg.norm(U, axis=1)
        nv  = np.linalg.norm(V, axis=1)
        cos = dot / (nu * nv + 1e-8)
        l2  = np.linalg.norm(U - V, axis=1)

        out = os.path.join(self.output_dir, 'paper_author_domain.npz')
        np.savez(
            out,
            paper_total_auth_dom_cosine_citing_vs_cited = cos,
            paper_total_auth_dom_dot_citing_vs_cited    = dot,
            paper_total_auth_dom_l2_citing_vs_cited     = l2
        )
        print(f"INFO: Paper–author‐domain done in {time.time()-t0:.1f}s, flushed to {out}")
        

    def compute_abstract_author_svd_similarity(self):
        n = len(self.df_pairs)
        dot_abs = np.zeros(n); cos_abs = np.zeros(n)
        dot_auth= np.zeros(n); cos_auth= np.zeros(n)
    
        u = self.df_pairs.citing.to_numpy()
        v = self.df_pairs.cited.to_numpy()
    
        for i,(uid,vid) in enumerate(zip(u,v)):
            idx_u = self.pid_to_idx.get(uid, -1)
            idx_v = self.pid_to_idx.get(vid, -1)
            eu = self.emb_abs[idx_u] if idx_u>=0 else np.zeros(self.emb_abs.shape[1])
            ev = self.emb_abs[idx_v] if idx_v>=0 else np.zeros(self.emb_abs.shape[1])
            da = eu.dot(ev)
            na = np.linalg.norm(eu); nb = np.linalg.norm(ev)
            dot_abs[i] = da
            cos_abs[i] = da/(na*nb+1e-8)
    
            # αν έχεις emb_auth
            au = self.emb_auth.get(uid, np.zeros_like(eu))
            av = self.emb_auth.get(vid, np.zeros_like(eu))
            du = au.dot(av)
            nu = np.linalg.norm(au); nv = np.linalg.norm(av)
            dot_auth[i] = du
            cos_auth[i] = du/(nu*nv+1e-8)
    
        out = os.path.join(self.output_dir,'svd_text_author.npz')
        np.savez(out,
                 cosine_abs_svd_koz  = cos_abs,
                 cosine_auth_svd_koz = cos_auth)
        print(f"INFO: SVD text/author done.")
        
    
    def compute_rooted_pagerank(self, alpha=0.85, max_iter=100, tol=1e-6):
        """
        Compute Personalized (Rooted) PageRank score from each citing u to cited v,
        grouping by u to avoid OOM and redundant computation:
          • For each unique u, run PageRank on self.G with teleport vector focused on u
          • Record the PageRank score at v for every (u,v) pair
        Flushes array 'rooted_pagerank_score.npy'.
        """
        n   = len(self.df_pairs)
        # Estimate time assuming each PR is ~10× heavier than a single it/sec
        est = n / (self.its / 10)
        print(f"INFO: Rooted PageRank: {n} pairs est ~{est:.1f}s")
        t0  = time.time()

        # Prepare output buffer
        scores = np.zeros(n, dtype=float)

        # Group indices by citing‐paper u
        from collections import defaultdict
        idxs_by_u = defaultdict(list)
        for idx, (u, v) in enumerate(zip(self.df_pairs.citing, self.df_pairs.cited)):
            idxs_by_u[u].append((idx, v))

        # Compute PageRank once per unique u, assign and free memory immediately
        for u, lst in idxs_by_u.items():
            # Build personalization vector: teleport only to u
            pers = {node: 0.0 for node in self.G.nodes()}
            pers[u] = 1.0
            pr_u = nx.pagerank(
                G=self.G,
                alpha=alpha,
                personalization=pers,
                max_iter=max_iter,
                tol=tol,
                weight=None
            )
            # Assign scores for all (idx, v) belonging to this u
            for idx, v in lst:
                scores[idx] = pr_u.get(v, 0.0)
            # Free the PageRank vector before next u
            del pr_u

        # Flush to disk
        out = os.path.join(self.output_dir, 'rooted_pagerank_score.npy')
        np.save(out, scores)
        print(f"INFO: Rooted PageRank done in {time.time()-t0:.1f}s, flushed to {out}")

    def compute_motif_and_path_features(self, beta=0.005, epsilon=0.001, batch_size=100_000):
        """
        Memory‐safe, chunked computation of:
          • katz_index            = β²·A² + β³·A³
          • local_path            = A² + ε·A³
          • triangles_through_edge= A²[u,v] // 2
          • cycles4               = A³[u,v]
          • citation_G_sp_directed= 2 if A²>0, 3 if A³>0, else -1
    
        Uses np.memmap to avoid holding all features in RAM,
        and processes the pair list in chunks.
        """
        import os
        import time
        import numpy as np
        from numpy.lib.format import open_memmap
    
        n = len(self.df_pairs)
        print(f"INFO: Motif+Path feats (chunked): {n} pairs")
        t0 = time.time()
    
        # — Build undirected adjacency and its powers once —
        und = self.G.to_undirected()
        nodes = list(und.nodes())
        idx_map = {node: i for i, node in enumerate(nodes)}
        A  = nx.to_scipy_sparse_matrix(und, nodes, format='csr')
        A2 = A.dot(A)
        A3 = A2.dot(A)
    
        # — Prepare on‐disk arrays via memmap —
        out_dir = self.output_dir
        katz_mmap = open_memmap(os.path.join(out_dir, 'katz_index_h2.npy'),
                            mode='w+', dtype='float32', shape=(n,))
        lp_mmap   = open_memmap(os.path.join(out_dir, 'local_path_h2.npy'),
                                mode='w+', dtype='float32', shape=(n,))
        tri_mmap  = open_memmap(os.path.join(out_dir, 'triangles_through_edge_h2.npy'),
                                mode='w+', dtype='int32',   shape=(n,))
        cyc4_mmap = open_memmap(os.path.join(out_dir, 'cycles4_h2.npy'),
                                mode='w+', dtype='int32',   shape=(n,))
        spd_mmap  = open_memmap(os.path.join(out_dir, 'citation_G_sp_directed_h2.npy'),
                                mode='w+', dtype='int8',    shape=(n,))
      
        
    
        # — Load u/v arrays once —
        u_arr = self.df_pairs['citing'].to_numpy()
        v_arr = self.df_pairs['cited'].to_numpy()
    
        # — Process in chunks —
        for start in range(0, n, batch_size):
            end = min(n, start + batch_size)
            for i in range(start, end):
                u, v = u_arr[i], v_arr[i]
                ui, vi = idx_map[u], idx_map[v]
                a2 = A2[ui, vi]
                a3 = A3[ui, vi]
    
                katz_mmap[i] = (beta**2) * a2 + (beta**3) * a3
                lp_mmap[i]   = a2 + epsilon * a3
                tri_mmap[i]  = int(a2 // 2)
                cyc4_mmap[i] = int(a3)
                if a2 > 0:
                    spd_mmap[i] = 2
                elif a3 > 0:
                    spd_mmap[i] = 3
                # else leaves -1
    
            print(f"  • Processed rows {start}–{end} in {time.time()-t0:.1f}s")
    
        # — flush & cleanup memmaps —
        del katz_mmap, lp_mmap, tri_mmap, cyc4_mmap, spd_mmap

        print(f"INFO: Motif+Path done in {time.time()-t0:.1f}s ")

     ############################################################## EMBEDDINGS FEATURES #########################################################
    def compute_specter_hadamard(self):
        """
        Για κάθε (citing, cited) ζεύγος στο train split υπολογίζει
        το element-wise Hadamard product των Specter embeddings
        και το αποθηκεύει σε NumPy array shape = (n_train_pairs, D).
        Επιστρέφει το array.
        """
        # 1) Φόρτωση Specter embeddings
        embeds = np.load(self.abstracts_emb_path)  # shape = [n_papers, D]
        D = embeds.shape[1]

        # 2) Επιλογή μόνο των train ζευγών
        train_df = self.df_pairs
        u_ids = train_df['citing'].astype(int).to_numpy()
        v_ids = train_df['cited'].astype(int).to_numpy()
        n = len(train_df)

        # 3) Πρίζουμε πίνακα για τα Hadamard features
        hadamard = np.zeros((n, D), dtype=float)

        # 4) Υπολογισμός element-wise product για κάθε ζεύγος
        for i, (u, v) in enumerate(zip(u_ids, v_ids)):
            hadamard[i, :] = embeds[u] * embeds[v]

        # 5) Αποθήκευση σε .npy
        out_path = os.path.join(self.output_dir, 'specter_hadamard.npy')
        np.save(out_path, hadamard)
        print(f"INFO: Saved Specter Hadamard features to {out_path} (shape={hadamard.shape})")

        return hadamard





In [15]:
# 1) Δημιουργία extractor για test
fe = TestFeatureExtractor(base_path="D:/NLP/tfidf_xgboost",
                          output_dir="D:/NLP/Features_XL/test",
                          chunked=True,
                          its=200000)

# 2) Φόρτωμα των test pairs
fe.load_data(test_pairs_path=r"C:\Users\mysmu\Desktop\Natural Language Processing\nlp-cse-uoi-2025\data_new\test.txt")

# 3) Κλήση όλων των compute_... μεθόδων

#fe.compute_tfidf_similarity()
#fe.compute_specter_similarity()

# fe.compute_co_citation_bibliographic()
# fe.compute_author_overlap_jaccard()
# fe.compute_pair_heuristics()
# fe.compute_lda_topics_features()

#fe.compute_community_features()       # same_community, comm_size_ratio
#fe.compute_motif_counts()
#fe.extract_keywords_from_abstracts("D:/NLP/tfidf_xgboost/cleaned_abstracts.parquet", top_k=10)
#fe.compute_content_overlap()

#fe.compute_node_level_metrics()       # citing_degree + raw common_neighbors

#fe.compute_motif_and_path_features()
#fe.compute_abstract_author_svd_similarity()

# fe.compute_scibert_similarity()
# fe.compute_bertopic_features()
# fe.compute_paper_author_domain_similarity()
# fe.compute_author_domain_similarity()
# fe.compute_author_aggregate_stats()
# fe.compute_coauthor_distance()
# fe.compute_author_graph_heuristics()

#fe.compute_embedding_features()
# fe.compute_rooted_pagerank()

 ############################################################## EMBEDDINGS FEATURES #########################################################
fe.compute_specter_hadamard()

# Τώρα μέσα στον φάκελο output_dir θα έχεις όλα τα .npy/.npz με τα test-features

INFO: Loading test data…
INFO: Loaded 106692 test pairs in 0.0s
INFO: Loaded authors mapping (138499 papers)
INFO: Loaded abstracts_embeds (138499, 300)
INFO: Loaded pid_to_idx (138499 entries)
INFO: Loaded Specter embeddings array (138499, 768)
INFO: Loaded SciBERT embeddings (138499)
INFO: Loaded LDA topics (131250) in 1.6s total
INFO: Loaded graph embeddings: citation_node2vec=138499, citation_walklets=138499, author_node2vec=136863, author_walklets=136863
INFO: BERTopic dicts ready (topics=131250, authors=133493, papers_dom=138192)
INFO: Loaded SVD embeddings (abstracts=138499, authors=138499)
INFO: Built citation graph (nodes=99760, edges=0)
INFO: Built co-author graph (nodes=136863, edges=520007)
✅ Data loaded, graphs ready.
INFO: Saved Specter Hadamard features to D:/NLP/Features_XL/test\specter_hadamard.npy (shape=(106692, 300))


array([[ 1.09443619e-01, -2.59840318e-02,  6.74534152e-02, ...,
         3.23523901e-04, -3.45791189e-04, -1.65615114e-03],
       [ 0.00000000e+00, -0.00000000e+00,  0.00000000e+00, ...,
        -0.00000000e+00,  0.00000000e+00, -0.00000000e+00],
       [ 1.43329907e-01,  3.82459367e-03, -1.04114404e-02, ...,
         1.74879250e-04, -1.84282744e-03, -1.03489366e-03],
       ...,
       [ 9.19556235e-02,  5.87895661e-03, -1.69018526e-02, ...,
         6.64736521e-04,  9.31989034e-04,  1.07722241e-03],
       [ 6.10241669e-02,  7.37106453e-03,  8.96277853e-03, ...,
        -8.63924838e-04, -8.96194906e-05, -1.55157385e-06],
       [ 8.38653397e-02, -1.58332135e-02,  6.22138566e-03, ...,
         1.17844388e-03, -1.80144037e-04, -1.15641253e-04]])