In [1]:
import pandas as pd
from LinkPredictor import EnhancedLinkPredictor
from EncoderDecoder import BottleneckT5Autoencoder
import networkx as nx
import numpy as np
import nltk
# Fetch each required NLTK resource only when it is not already present in
# the local nltk data path (nltk.data.find raises LookupError when missing).
for _resource_path, _package in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package)
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import re
import unicodedata
from sklearn.decomposition import LatentDirichletAllocation
import spacy
#!python -m spacy download en_core_web_sm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the semicolon-delimited DBLP sample and replace missing abstracts with
# an empty string so downstream text handling always receives a str.
df = (
    pd.read_csv('dblp_processing/dblp_papers_2020_SampleWithRefs.csv', sep=';')
    .fillna({'abstract': ''})
)
df.head()

Unnamed: 0,id,title,year,references,abstract
0,3005773274,Malware classification algorithm using advance...,2020.0,[],"Recently, Internet of Drones (IoD) are issu..."
1,3002219790,Generalized transitivity: A systematic compari...,2020.0,"[79079207, 170687150, 756957829, 1517040319, 1...",Reciprocal relations are binary relations Q...
2,2999570098,A New DGNSS Positioning Infrastructure for And...,2020.0,"[1978255623, 2109436662, 2146023544, 218050761...",One’s position has become an important piece o...
3,2998778732,A New Cycle Slip Detection and Repair Method U...,2020.0,[2482503770],The detection and repair of the cycle slip is ...
4,2972569439,Existence and concentration of positive ground...,2020.0,[],This paper is concerned with the following ...


In [3]:
class CitationGraphAbstractGenerator:
    """
    Build a citation graph from papers, add synthetic "new paper" nodes whose
    links come from a link-prediction model, and generate/evaluate an abstract
    for each new node from the embeddings of the papers it links to.

    Node attributes stored in ``self.citation_graph``:
        abstract:     text of the abstract (or None for a node not yet generated)
        embedding:    value returned by the embedding model (or None)
        key_concepts: list of terms returned by ``extract_key_concepts``
    """

    def __init__(self, embedding_model_name='t5', linkPredictor_model_name='enhanced'):
        """
        Initialize the citation graph abstract generator

        Args:
            embedding_model_name: Name of the embedding model; only 't5' is
                wired up here (BottleneckT5Autoencoder).
            linkPredictor_model_name: Name of the link predictor; only
                'enhanced' is wired up here (EnhancedLinkPredictor).
        """
        self.embedding_model_name = embedding_model_name
        self.linkPredictor_model_name = linkPredictor_model_name

        # Default to None so an unsupported name leaves a well-defined
        # attribute instead of an AttributeError on first use.
        self.embedding_model = None
        self.linkPredictor_model = None

        if embedding_model_name == 't5':
            self.embedding_model = BottleneckT5Autoencoder()
        if linkPredictor_model_name == 'enhanced':
            self.linkPredictor_model = EnhancedLinkPredictor()
        self.citation_graph = nx.DiGraph()
        self.stop_words = list(stopwords.words('english'))

    def _update_linkPredictor(self):
        """
        Rebuild the link predictor from the current graph; used whenever the
        graph changes (e.g., after loading papers or adding a new paper).
        """
        if self.linkPredictor_model_name == 'enhanced':
            self.linkPredictor_model = EnhancedLinkPredictor(self.citation_graph)

    @staticmethod
    def _parse_references(references):
        """
        Normalize a paper's 'references' field to a list of paper ids.

        A CSV round-trip stores lists as text (e.g. "[123, 456]"); iterating
        such a string directly would walk its characters and never match a
        node id, so strings are parsed as Python literals first.

        Args:
            references: list/tuple/set of ids, a string literal of such a
                collection, None, or NaN.

        Returns:
            list: The referenced ids; empty when the value is missing or
            cannot be parsed.
        """
        if references is None:
            return []
        if isinstance(references, str):
            import ast  # local import: only needed for string-encoded lists

            literal = references.strip()
            if not literal:
                return []
            try:
                parsed = ast.literal_eval(literal)
            except (ValueError, SyntaxError):
                # Unparseable text cannot name any node; treat as no references.
                return []
            if isinstance(parsed, (list, tuple, set)):
                return list(parsed)
            return [parsed]
        if isinstance(references, float) and references != references:
            # NaN is how pandas represents a missing cell.
            return []
        try:
            return list(references)
        except TypeError:
            # A single scalar id rather than a collection.
            return [references]

    def load_citation_graph(self, papers):
        """
        Load papers and their references into a graph

        Args:
            papers: Iterable of dictionaries, each containing 'id', 'abstract',
                and 'references' (a list of paper ids, or the string form of
                such a list as produced by reading a CSV).
        """
        papers = list(papers)  # iterated twice below; tolerate one-shot iterables

        # Add nodes (papers) to the graph
        for paper in papers:
            self.citation_graph.add_node(
                paper['id'],
                abstract=paper['abstract'],
                embedding=None,  # populated later by compute_embeddings
                key_concepts=self.extract_key_concepts(paper['abstract']),
            )

        # Add edges (references); only references to papers present in the
        # graph are kept.
        for paper in papers:
            for cited_paper_id in self._parse_references(paper['references']):
                if cited_paper_id in self.citation_graph:
                    self.citation_graph.add_edge(paper['id'], cited_paper_id)

        # The predictor created in __init__ was built without a graph; rebuild
        # it now that the graph has content.
        self._update_linkPredictor()

        print(f"Graph created with {self.citation_graph.number_of_nodes()} nodes and {self.citation_graph.number_of_edges()} edges")

    def extract_key_concepts(self,text, score_threshold=0.2):
        """
        Extract key concepts from an abstract using TF-IDF with a score threshold.

        Args:
            text (str): Input text (e.g., an abstract).
            score_threshold (float): Minimum TF-IDF score to consider as a concept.

        Returns:
            list: List of key concepts. For non-string or blank input the list
            is empty; for texts with fewer than 5 meaningful tokens the unique
            tokens themselves are returned (in order of first appearance).
        """
        if not isinstance(text, str) or not text.strip():
            return []

        # Tokenization and cleaning (word characters only, lower-cased)
        tokens = re.findall(r'\b\w+\b', text.lower())

        # Drop stop words and very short tokens; a set makes membership O(1)
        stop_set = set(self.stop_words)
        filtered_tokens = [t for t in tokens if t not in stop_set and len(t) > 2]

        # dict.fromkeys de-duplicates while keeping a reproducible order
        unique_filtered = list(dict.fromkeys(filtered_tokens))

        if len(filtered_tokens) < 5:
            # Too little text for TF-IDF to be meaningful
            return unique_filtered

        # Apply TF-IDF to extract important terms (uni- to tri-grams)
        tfidf = TfidfVectorizer(ngram_range=(1, 3), stop_words='english')

        try:
            scores = tfidf.fit_transform([text]).toarray()[0]
            feature_names = tfidf.get_feature_names_out()
        except ValueError as e:
            # Raised e.g. when the vocabulary is empty after stop-word removal
            print(f"Error with TF-IDF: {e}")
            return unique_filtered
        except Exception as e:
            # Best-effort fallback kept from the original behaviour
            print(f"TF-IDF failed: {e}")
            return list(dict.fromkeys(tokens))[:10]

        # Highest-scoring terms first; sorted() is stable so ties keep
        # vocabulary order.
        ranked = sorted(zip(feature_names, scores), key=lambda pair: pair[1], reverse=True)

        # Only include concepts above the threshold (as plain str)
        return [str(concept) for concept, score in ranked if score >= score_threshold]

    def _embed_abstract(self, abstract):
        """
        Embed one abstract with the same model call used by compute_embeddings,
        so every value stored under a node's 'embedding' has the same origin.
        """
        # NOTE(review): the original code mixed `.embed` (here) and `.encode`
        # for abstracts; `.embed` is used throughout now - confirm against the
        # EncoderDecoder module.
        return self.embedding_model.embed(abstract)

    def compute_embeddings(self):
        """
        Compute and store embeddings for all papers in the graph.

        Nodes whose abstract is empty/None or not a string are skipped and
        keep ``embedding=None``.
        """
        embedded_count = 0
        for node_id, attrs in self.citation_graph.nodes(data=True):
            abstract = attrs['abstract']

            if not abstract:
                print(f"Skipping node {node_id}: Abstract is empty or None.")
                continue

            if not isinstance(abstract, str):
                print(f"Invalid abstract format for node {node_id}: {abstract}")
                continue

            attrs['embedding'] = self._embed_abstract(abstract)
            embedded_count += 1

        print(f"Embeddings computed for {embedded_count} of {self.citation_graph.number_of_nodes()} nodes")

    def create_new_node(self):
        """
        Create a new node, link it to predicted papers and generate its abstract.

        Steps: add an empty node, ask the link predictor for targets, add those
        edges, generate an abstract from the linked papers, then rebuild the
        link predictor on the updated graph.

        Returns:
            new_node_id: ID of the new node
        """
        # New ids are numbered by how many 'new_paper' nodes already exist
        existing_new = sum(1 for n in self.citation_graph.nodes() if 'new_paper' in str(n))
        new_node_id = f"new_paper_{existing_new}"

        # Add the node first so the predictor can refer to it
        self.citation_graph.add_node(new_node_id, abstract=None, embedding=None, key_concepts=[])

        predicted_links = self.predict_links(new_node_id)
        added = self.add_links(new_node_id, predicted_links)

        # Generate abstract based on the links
        self.generate_abstract_from_links(new_node_id)

        # Rebuild the predictor so it reflects the new node and edges
        self._update_linkPredictor()

        print(f"Created new node {new_node_id} with {added} predicted links")
        return new_node_id

    def predict_links(self, node_id):
        """
        Predict potential links for a node by delegating to the configured
        link predictor's ``predict_links`` method.

        Args:
            node_id: ID of the node to predict links for

        Returns:
            predicted_links: Whatever the link predictor returns; the rest of
            this class treats it as an iterable of node IDs.
        """
        return self.linkPredictor_model.predict_links(node_id)

    def add_links(self, node_id, link_ids):
        """
        Add links from a node to other nodes

        Args:
            node_id: ID of the source node
            link_ids: Iterable of target node IDs (None is treated as empty).
                Targets not present in the graph are ignored.

        Returns:
            int: Number of edges actually added.
        """
        if link_ids is None:
            link_ids = []

        added = 0
        for target_id in link_ids:
            if target_id in self.citation_graph:
                self.citation_graph.add_edge(node_id, target_id)
                added += 1

        print(f"Added {added} links from node {node_id}")
        return added

    def generate_abstract_from_links(self, node_id):
        """
        Generate an abstract for a node based on its links using embedding averaging

        The embeddings of the node's successors are averaged, stored as the
        node's embedding, and passed to the embedding model's
        ``generate_from_latent``; the result is stored as the node's abstract
        together with its key concepts.

        Args:
            node_id: ID of the node to generate an abstract for

        Returns:
            The generated abstract, or None when the node has no links or none
            of the linked nodes can provide an embedding.
        """
        linked_nodes = list(self.citation_graph.successors(node_id))

        if not linked_nodes:
            print("No links found for the node")
            return None

        # Collect embeddings, computing missing ones where an abstract exists
        linked_embeddings = []
        for n in linked_nodes:
            attrs = self.citation_graph.nodes[n]
            if attrs['embedding'] is None:
                if not attrs['abstract']:
                    # Nothing to embed (empty abstract or ungenerated node)
                    continue
                attrs['embedding'] = self._embed_abstract(attrs['abstract'])
            linked_embeddings.append(attrs['embedding'])

        if not linked_embeddings:
            print("No embeddings available for the linked nodes")
            return None

        # Create an aggregated embedding for the new node by averaging
        # NOTE(review): np.mean yields a NumPy array - confirm that
        # generate_from_latent accepts it for the configured model.
        aggregated_embedding = np.mean(linked_embeddings, axis=0)
        self.citation_graph.nodes[node_id]['embedding'] = aggregated_embedding

        abstract = self.embedding_model.generate_from_latent(aggregated_embedding)
        self.citation_graph.nodes[node_id]['abstract'] = abstract

        # Extract key concepts from the abstract
        key_concepts = self.extract_key_concepts(abstract)
        self.citation_graph.nodes[node_id]['key_concepts'] = key_concepts

        print(f"Generated abstract with {len(key_concepts)} key concepts: {', '.join(key_concepts)}")
        return abstract

    def _match_mask(self, embeddings1, embeddings2, threshold):
        """
        Return a boolean array with one entry per row of ``embeddings1`` that is
        True when that row's cosine similarity to any row of ``embeddings2``
        exceeds ``threshold``.
        """
        similarity_matrix = np.asarray(cosine_similarity(embeddings1, embeddings2))
        return (similarity_matrix > threshold).any(axis=1)

    def calculate_concept_similarity(self, concepts1, concepts2, threshold=0.7):
        """
        Calculate similarity between two sets of concepts

        Args:
            concepts1: First list of concepts
            concepts2: Second list of concepts
            threshold: Similarity threshold for considering concepts as matching

        Returns:
            float: Fraction (0.0-1.0) of concepts1 that have a match in
            concepts2; 0.0 when either list is empty.
        """
        if not concepts1 or not concepts2:
            return 0.0

        # NOTE(review): concept lists are embedded with `.encode(list)` while
        # abstracts use `.embed(str)` - confirm the model exposes both.
        embeddings1 = self.embedding_model.encode(concepts1)
        embeddings2 = self.embedding_model.encode(concepts2)

        matched = self._match_mask(embeddings1, embeddings2, threshold)
        return float(matched.sum()) / len(concepts1)

    def evaluate_node_concepts(self, node_id, threshold=0.7):
        """
        Evaluate the concepts of a node against its linked nodes

        Args:
            node_id: ID of the node to evaluate
            threshold: Similarity threshold for considering concepts as matching

        Returns:
            dict: ``avg_concept_similarity`` (mean, over linked nodes, of the
            fraction of this node's concepts matched there),
            ``new_concepts_count``, ``new_concepts_percentage`` (a 0.0-1.0
            fraction) and ``new_concepts`` (concepts matched in no linked
            node). None when the node has no concepts or no links.
        """
        node_concepts = self.citation_graph.nodes[node_id]['key_concepts']

        if not node_concepts:
            print(f"No concepts found for node {node_id}")
            return None

        linked_nodes = list(self.citation_graph.successors(node_id))

        if not linked_nodes:
            print(f"No links found for node {node_id}")
            return None

        # Encode this node's concepts once and each linked node's concepts
        # once; both metrics are derived from the same match masks.
        # NOTE(review): see calculate_concept_similarity regarding `.encode`.
        node_embeddings = self.embedding_model.encode(node_concepts)

        similarity_percentages = []
        matched_anywhere = np.zeros(len(node_concepts), dtype=bool)

        for linked_id in linked_nodes:
            linked_concepts = self.citation_graph.nodes[linked_id]['key_concepts']
            if not linked_concepts:
                # Same result calculate_concept_similarity gives for an empty list
                similarity_percentages.append(0.0)
                continue

            linked_embeddings = self.embedding_model.encode(linked_concepts)
            matched = self._match_mask(node_embeddings, linked_embeddings, threshold)

            similarity_percentages.append(float(matched.sum()) / len(node_concepts))
            matched_anywhere |= matched

        # Concepts with no match in any linked node are considered new
        new_concepts = [
            concept for concept, seen in zip(node_concepts, matched_anywhere) if not seen
        ]

        return {
            'avg_concept_similarity': float(np.mean(similarity_percentages)),
            'new_concepts_count': len(new_concepts),
            'new_concepts_percentage': len(new_concepts) / len(node_concepts),
            'new_concepts': new_concepts,
        }

    def evaluate_abstract(self, node_id):
        """
        Evaluate the generated abstract

        Computes the cosine similarity between the node's embedding and the
        embedding of every linked paper that has one, then merges in the
        concept metrics from ``evaluate_node_concepts`` when available.

        Args:
            node_id: ID of the node

        Returns:
            metrics: Dictionary of evaluation metrics, or None when the node
            has no abstract, no links, or no linked paper with an embedding.
        """
        attrs = self.citation_graph.nodes[node_id]
        abstract = attrs['abstract']

        if abstract is None:
            print("No abstract found for the node")
            return None

        # Compute embedding if not already done
        if attrs['embedding'] is None:
            attrs['embedding'] = self._embed_abstract(abstract)
        embedding = attrs['embedding']

        linked_nodes = list(self.citation_graph.successors(node_id))

        if not linked_nodes:
            print("No links found for the node")
            return None

        # Compute similarity with linked papers; papers skipped by
        # compute_embeddings have embedding=None and cannot be compared.
        similarities = []
        for n in linked_nodes:
            other_embedding = self.citation_graph.nodes[n]['embedding']
            if other_embedding is None:
                continue
            similarities.append(cosine_similarity([embedding], [other_embedding])[0][0])

        if not similarities:
            print("No embeddings available for the linked nodes")
            return None

        # float() keeps the values plain Python floats regardless of the
        # embedding dtype, so callers can format them uniformly.
        metrics = {
            'avg_similarity': float(np.mean(similarities)),
            'max_similarity': float(np.max(similarities)),
            'min_similarity': float(np.min(similarities)),
            'std_similarity': float(np.std(similarities)),
            'num_linked_papers': len(linked_nodes)
        }

        # Add concept evaluation metrics
        concept_metrics = self.evaluate_node_concepts(node_id)
        if concept_metrics:
            metrics.update(concept_metrics)

        return metrics

    def visualize_graph(self, highlight_node=None):
        """
        Visualize the citation graph

        The highlighted node is red, other generated ('new_paper') nodes are
        green, and all remaining nodes are light blue; node size grows with
        in-degree (citation count).

        Args:
            highlight_node: Node to highlight in the visualization
        """
        plt.figure(figsize=(12, 8))

        def _color_for(node):
            if node == highlight_node:
                return 'red'
            if 'new_paper' in str(node):
                return 'green'
            return 'lightblue'

        graph_nodes = list(self.citation_graph.nodes())
        node_colors = [_color_for(node) for node in graph_nodes]
        node_sizes = [300 + self.citation_graph.in_degree(node) * 50 for node in graph_nodes]

        # Fixed seed keeps the layout reproducible across runs
        pos = nx.spring_layout(self.citation_graph, seed=42)

        nx.draw(
            self.citation_graph,
            pos=pos,
            with_labels=True,
            node_color=node_colors,
            node_size=node_sizes,
            alpha=0.7,
            arrows=True
        )

        plt.title("Citation Graph with Generated Papers")
        plt.show()

In [4]:
# Convert the loaded DataFrame into one dict per paper
papers = df.to_dict(orient='records')

# Initialize the generator
generator = CitationGraphAbstractGenerator()

# Load citation graph
generator.load_citation_graph(papers)

# Compute embeddings for existing papers
generator.compute_embeddings()

# Create a new node
new_node_id = generator.create_new_node()

# Evaluate the abstract; evaluate_abstract returns None when the new node has
# no abstract or no links, so guard before iterating.
metrics = generator.evaluate_abstract(new_node_id)
if metrics is None:
    print("No evaluation metrics available for the new node")
else:
    print("Evaluation Metrics:")
    for metric, value in metrics.items():
        if isinstance(value, (float, np.floating)):
            print(f"- {metric}: {value:.4f}")
        else:
            print(f"- {metric}: {value}")

# Visualize the graph
generator.visualize_graph(highlight_node=new_node_id)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


ValueError: Converting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokenizer', 'OpenAIGPTTokenizer', 'PegasusTokenizer', 'Qwen2Tokenizer', 'RealmTokenizer', 'ReformerTokenizer', 'RemBertTokenizer', 'RetriBertTokenizer', 'RobertaTokenizer', 'RoFormerTokenizer', 'SeamlessM4TTokenizer', 'SqueezeBertTokenizer', 'T5Tokenizer', 'UdopTokenizer', 'WhisperTokenizer', 'XLMRobertaTokenizer', 'XLNetTokenizer', 'SplinterTokenizer', 'XGLMTokenizer', 'LlamaTokenizer', 'CodeLlamaTokenizer', 'GemmaTokenizer', 'Phi3Tokenizer']