In [None]:
import networkx as nx
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

class CitationGraphAbstractGenerator:
    """
    Generate abstracts for hypothetical papers placed in a citation graph.

    Typical workflow: load papers (load_citation_graph), embed their abstracts
    (compute_embeddings), create an unlinked node (create_new_node), choose the
    papers it cites (predict_links / add_links), write an abstract conditioned
    on the cited papers (generate_abstract_from_links) and score the result
    (evaluate_abstract).
    """

    # Substring identifying node IDs produced by create_new_node
    NEW_NODE_MARKER = "new_paper"
    # Number of tokens the language model may append to the prompt
    MAX_NEW_TOKENS = 250
    # Number of years over which the recency score decays linearly from 1 to 0
    RECENCY_WINDOW_YEARS = 10

    def __init__(self, embedding_model="sentence-transformers/sentence-t5-xxl", generator_model="gpt2"):
        """
        Initialize the citation graph abstract generator

        Args:
            embedding_model: Model name for embedding abstracts
            generator_model: Causal language model name used to write abstracts
        """
        self.embedding_model = SentenceTransformer(embedding_model)
        self.tokenizer = AutoTokenizer.from_pretrained(generator_model)
        self.generator = AutoModelForCausalLM.from_pretrained(generator_model)
        self.citation_graph = nx.DiGraph()

    def _is_new_node(self, node_id):
        """
        Tell whether a node ID looks like one produced by create_new_node

        Args:
            node_id: ID of the node to check

        Returns:
            True if the ID contains the new-node marker, False otherwise
        """
        return self.NEW_NODE_MARKER in str(node_id)

    def load_citation_graph(self, papers):
        """
        Load papers and their citations into a graph

        Args:
            papers: List of dictionaries, each containing 'id', 'abstract', and 'citations'
                (list of paper ids). An optional 'year' is stored on the node so that
                predict_links can compute a recency score.
        """
        # Add nodes (papers) to the graph
        for paper in papers:
            attributes = {
                'abstract': paper['abstract'],
                'embedding': None,  # Will be populated later
            }
            if paper.get('year') is not None:
                attributes['year'] = paper['year']
            self.citation_graph.add_node(paper['id'], **attributes)

        # Add edges (citations); citations to papers outside the graph are ignored
        for paper in papers:
            for cited_paper_id in paper.get('citations') or []:
                if cited_paper_id in self.citation_graph:
                    self.citation_graph.add_edge(paper['id'], cited_paper_id)

        print(f"Graph created with {self.citation_graph.number_of_nodes()} nodes and {self.citation_graph.number_of_edges()} edges")

    def compute_embeddings(self):
        """
        Compute and store embeddings for all papers in the graph

        Nodes without an abstract (e.g. freshly created new nodes) are skipped
        because there is no text to encode.
        """
        pending = [
            (node_id, attributes['abstract'])
            for node_id, attributes in self.citation_graph.nodes(data=True)
            if attributes.get('abstract')
        ]

        if pending:
            # Encode every abstract in a single batched call
            embeddings = self.embedding_model.encode([text for _, text in pending])
            for (node_id, _), embedding in zip(pending, embeddings):
                self.citation_graph.nodes[node_id]['embedding'] = embedding

        skipped = self.citation_graph.number_of_nodes() - len(pending)
        if skipped:
            print(f"Skipped {skipped} nodes without an abstract")
        print("Embeddings computed for all nodes")

    def create_new_node(self):
        """
        Create a new node in the graph without any links

        Returns:
            new_node_id: ID of the new node
        """
        # Start from the number of existing new nodes, then advance until the ID is
        # unused so an existing node is never overwritten
        index = sum(1 for n in self.citation_graph.nodes() if self._is_new_node(n))
        new_node_id = f"new_paper_{index}"
        while new_node_id in self.citation_graph:
            index += 1
            new_node_id = f"new_paper_{index}"

        # Add new node to graph without any connections
        self.citation_graph.add_node(new_node_id, abstract=None, embedding=None)

        print(f"Created new node {new_node_id} (no connections yet)")
        return new_node_id

    def _estimate_num_links(self, in_degrees, out_degrees):
        """
        Draw a link count from the network's degree statistics

        Args:
            in_degrees: Mapping of node ID to in-degree
            out_degrees: Mapping of node ID to out-degree

        Returns:
            Tuple (num_links, avg_out_degree, degree_variance_factor, stochastic_factor)
        """
        total_degrees = [
            in_degrees.get(n, 0) + out_degrees.get(n, 0)
            for n in self.citation_graph.nodes()
        ]
        avg_degree = np.mean(total_degrees)
        std_degree = np.std(total_degrees)

        # The average out-degree approximates how many papers a typical paper cites
        avg_out_degree = np.mean(list(out_degrees.values()))
        base_num_links = max(1, int(round(avg_out_degree)))

        # Perturb the base count with noise scaled by the normalized degree spread,
        # bounded to 0.5x - 1.5x
        degree_variance_factor = std_degree / (avg_degree + 1)
        stochastic_factor = np.random.normal(loc=1.0, scale=degree_variance_factor)
        stochastic_factor = max(0.5, min(stochastic_factor, 1.5))

        num_links = max(1, int(round(base_num_links * stochastic_factor)))
        return num_links, avg_out_degree, degree_variance_factor, stochastic_factor

    def _score_candidates(self, candidate_nodes, in_degrees, clustering_coef, current_year):
        """
        Score each candidate by structural measures blended with random noise

        Args:
            candidate_nodes: Node IDs eligible to be linked
            in_degrees: Mapping of node ID to in-degree
            clustering_coef: Average clustering coefficient of the graph
            current_year: Reference year for the recency score

        Returns:
            Tuple (combined_scores, randomness_factor) where combined_scores maps
            each candidate to a non-negative score
        """
        # 1. Preferential attachment: share of all citations received by the candidate
        total_citations = sum(in_degrees.values())
        if total_citations > 0:
            pref_attach_scores = {n: in_degrees.get(n, 0) / total_citations for n in candidate_nodes}
        else:
            pref_attach_scores = {n: 1.0 / len(candidate_nodes) for n in candidate_nodes}

        # 2. Transitivity, approximated by degree centrality times average clustering
        degree_centrality = nx.degree_centrality(self.citation_graph)
        transitivity_scores = {n: degree_centrality.get(n, 0) * clustering_coef for n in candidate_nodes}

        # 3. Recency: linear decay over the window, 0.5 when the year is unknown
        recency_scores = {}
        for n in candidate_nodes:
            year = self.citation_graph.nodes[n].get('year')
            if year is None:
                recency_scores[n] = 0.5  # Default value
            else:
                decay = (current_year - year) / self.RECENCY_WINDOW_YEARS
                recency_scores[n] = min(1.0, max(0.0, 1 - decay))

        # Random component for each node
        random_scores = {n: np.random.rand() for n in candidate_nodes}

        # Lower clustering suggests more random connections are needed
        randomness_factor = max(0.2, min(0.5, 1.0 - clustering_coef))

        combined_scores = {}
        for n in candidate_nodes:
            structural_score = (
                0.5 * pref_attach_scores[n] +   # Preferential attachment (citations)
                0.3 * transitivity_scores[n] +  # Transitivity (cluster formation)
                0.2 * recency_scores[n]         # Recency
            )
            combined_scores[n] = (
                (1 - randomness_factor) * structural_score +
                randomness_factor * random_scores[n]
            )

        return combined_scores, randomness_factor

    def predict_links(self, node_id, num_links=None, current_year=None):
        """
        Predict potential links for a new node using only structural information
        with stochastic elements similar to random graph generation.

        Candidates are scored by preferential attachment, an approximation of
        transitivity and recency, blended with random noise, and then drawn by
        weighted sampling without replacement.

        Args:
            node_id: ID of the node to predict links for
            num_links: Number of links to predict. When None, the number is drawn
                from the network's degree statistics.
            current_year: Reference year for the recency score. When None, the
                current calendar year is used.

        Returns:
            predicted_links: List of node IDs that are predicted to be linked
        """
        # Get all nodes except the target and other new nodes
        candidate_nodes = [
            n for n in self.citation_graph.nodes()
            if n != node_id and not self._is_new_node(n)
        ]

        if not candidate_nodes:
            print("No candidate nodes available for link prediction")
            return []

        if current_year is None:
            # Local import keeps this method self-contained
            from datetime import date
            current_year = date.today().year

        in_degrees = dict(self.citation_graph.in_degree())
        out_degrees = dict(self.citation_graph.out_degree())

        # Clustering coefficient - represents transitivity in the network
        clustering_coef = nx.average_clustering(self.citation_graph)

        if num_links is None:
            num_links, avg_out_degree, degree_variance_factor, stochastic_factor = (
                self._estimate_num_links(in_degrees, out_degrees)
            )
            # Ensure we don't predict more links than available candidates
            num_links = min(num_links, len(candidate_nodes))

            print(f"Network analysis: avg_out_degree={avg_out_degree:.2f}, clustering={clustering_coef:.2f}, "
                f"degree_variance={degree_variance_factor:.2f}")
            print(f"Dynamically determined num_links={num_links} for node {node_id} "
                f"(stochastic_factor={stochastic_factor:.2f})")
        else:
            # Caller-supplied count, kept within [0, number of candidates]
            num_links = min(max(0, int(num_links)), len(candidate_nodes))

        if num_links == 0:
            return []

        combined_scores, randomness_factor = self._score_candidates(
            candidate_nodes, in_degrees, clustering_coef, current_year
        )

        # Convert scores to probabilities
        scores = np.array([combined_scores[n] for n in candidate_nodes], dtype=float)
        total_score = scores.sum()
        if total_score > 0:
            probabilities = scores / total_score
        else:
            probabilities = np.full(len(candidate_nodes), 1.0 / len(candidate_nodes))

        # Perform weighted sampling without replacement
        try:
            selected_indices = np.random.choice(
                len(candidate_nodes),
                size=num_links,
                replace=False,
                p=probabilities
            )
            predicted_links = [candidate_nodes[i] for i in selected_indices]
        except ValueError as e:
            # Raised e.g. when fewer candidates have non-zero probability than num_links
            print(f"Sampling error: {e}. Using deterministic selection instead.")
            ranked = sorted(candidate_nodes, key=combined_scores.get, reverse=True)
            predicted_links = ranked[:num_links]

        print(f"Predicted {len(predicted_links)} links for node {node_id} "
            f"with randomness factor {randomness_factor:.2f}")

        return predicted_links

    def add_links(self, node_id, link_ids):
        """
        Add links from a node to other nodes

        Targets that are not in the graph are skipped.

        Args:
            node_id: ID of the source node
            link_ids: List of target node IDs
        """
        added = 0
        for target_id in link_ids:
            if target_id in self.citation_graph:
                self.citation_graph.add_edge(node_id, target_id)
                added += 1

        # Report the links actually added, not the number requested
        print(f"Added {added} links from node {node_id}")

    def generate_abstract_from_links(self, node_id):
        """
        Generate an abstract for a node based on its links

        The mean embedding of the cited papers is stored on the node as
        'aggregated_embedding'. The node's 'abstract' is set to the generated text
        and its 'embedding' to the embedding of that text.

        Args:
            node_id: ID of the node

        Returns:
            abstract: Generated abstract, or None if the node has no usable links
        """
        # Get linked nodes
        linked_nodes = list(self.citation_graph.successors(node_id))

        if not linked_nodes:
            print("No links found for the node")
            return None

        # Collect abstracts and embeddings of linked papers, skipping papers without
        # text and embedding on demand any paper that has not been embedded yet
        linked_abstracts = []
        linked_embeddings = []
        for n in linked_nodes:
            attributes = self.citation_graph.nodes[n]
            linked_abstract = attributes.get('abstract')
            if not linked_abstract:
                continue
            linked_embedding = attributes.get('embedding')
            if linked_embedding is None:
                linked_embedding = self.embedding_model.encode(linked_abstract)
                attributes['embedding'] = linked_embedding
            linked_abstracts.append(linked_abstract)
            linked_embeddings.append(linked_embedding)

        if not linked_abstracts:
            print("No linked papers with abstracts found for the node")
            return None

        # Create an aggregated embedding for the new node
        aggregated_embedding = np.mean(linked_embeddings, axis=0)
        self.citation_graph.nodes[node_id]['aggregated_embedding'] = aggregated_embedding

        # Create a prompt for the generator based on linked papers
        prompt = f"Based on the following research papers, write a novel abstract for a new paper that builds upon and extends these ideas:\n\n"
        prompt += "\n\n".join([f"Paper {i+1}: {abstract}" for i, abstract in enumerate(linked_abstracts)])
        prompt += "\n\nWrite a cohesive abstract that integrates concepts from these papers and proposes a novel approach or finding:"

        # Generate abstract
        inputs = self.tokenizer(prompt, return_tensors="pt")
        input_ids = inputs.input_ids
        attention_mask = inputs.attention_mask

        # Keep prompt + generated tokens inside the model's context window; drop the
        # oldest tokens so the closing instruction is preserved
        context_size = getattr(self.generator.config, "max_position_embeddings", None)
        if context_size:
            max_prompt_tokens = max(1, context_size - self.MAX_NEW_TOKENS)
            if input_ids.shape[1] > max_prompt_tokens:
                input_ids = input_ids[:, -max_prompt_tokens:]
                attention_mask = attention_mask[:, -max_prompt_tokens:]

        outputs = self.generator.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=self.MAX_NEW_TOKENS,  # Allow for a reasonably sized abstract
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id
        )

        # Decode only the tokens produced after the prompt
        generated_tokens = outputs[0][input_ids.shape[1]:]
        abstract = self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

        # Store the abstract together with the embedding of its own text, so that
        # evaluation measures the generated abstract rather than the cited papers
        self.citation_graph.nodes[node_id]['abstract'] = abstract
        self.citation_graph.nodes[node_id]['embedding'] = self.embedding_model.encode(abstract)

        return abstract

    def evaluate_abstract(self, node_id):
        """
        Evaluate the generated abstract

        Computes cosine similarity between the node's embedding and the embedding
        of each paper it links to.

        Args:
            node_id: ID of the node

        Returns:
            metrics: Dictionary of evaluation metrics, or None if the node has no
                abstract, no links, or no linked paper with an embedding
        """
        attributes = self.citation_graph.nodes[node_id]
        abstract = attributes['abstract']

        if abstract is None:
            print("No abstract found for the node")
            return None

        # Compute embedding if not already done
        embedding = attributes.get('embedding')
        if embedding is None:
            embedding = self.embedding_model.encode(abstract)
            attributes['embedding'] = embedding

        # Get linked nodes
        linked_nodes = list(self.citation_graph.successors(node_id))

        if not linked_nodes:
            print("No links found for the node")
            return None

        # Linked papers without an embedding cannot be compared
        linked_embeddings = [
            linked_embedding
            for linked_embedding in (self.citation_graph.nodes[n].get('embedding') for n in linked_nodes)
            if linked_embedding is not None
        ]

        if not linked_embeddings:
            print("No embeddings found for the linked papers")
            return None

        # Compute similarity with linked papers in a single call
        similarities = cosine_similarity([embedding], linked_embeddings)[0]

        # Calculate metrics
        metrics = {
            'avg_similarity': np.mean(similarities),
            'max_similarity': np.max(similarities),
            'min_similarity': np.min(similarities),
            'std_similarity': np.std(similarities),
            'num_linked_papers': len(linked_nodes)
        }

        return metrics

    def visualize_graph(self, highlight_node=None):
        """
        Visualize the citation graph

        The highlighted node is drawn in red, other new nodes in green and all
        remaining nodes in light blue; node size grows with in-degree.

        Args:
            highlight_node: Node to highlight in the visualization
        """
        plt.figure(figsize=(12, 8))

        def color_for(node):
            if node == highlight_node:
                return 'red'
            if self._is_new_node(node):
                return 'green'
            return 'lightblue'

        nodes = list(self.citation_graph.nodes())
        node_colors = [color_for(node) for node in nodes]

        # Node sizes based on in-degree (citation count)
        node_sizes = [300 + self.citation_graph.in_degree(node) * 50 for node in nodes]

        # Fixed seed keeps the layout reproducible between calls
        pos = nx.spring_layout(self.citation_graph, seed=42)

        # Draw the graph
        nx.draw(
            self.citation_graph,
            pos=pos,
            with_labels=True,
            node_color=node_colors,
            node_size=node_sizes,
            alpha=0.7,
            arrows=True
        )

        plt.title("Citation Graph with Generated Papers")
        plt.show()

# Example usage
def run_demo():
    """
    Run the full pipeline on five sample papers.

    Builds the citation graph, embeds the abstracts, creates a new node,
    predicts and adds its links, generates an abstract for it, prints the
    evaluation metrics and shows the graph.
    """
    # Create sample papers with abstracts and citations
    papers = [
        {
            'id': 'paper1',
            'abstract': "This paper introduces a new method for graph-based text generation using attention mechanisms and transformer models.",
            'citations': [],
            'year': 2020
        },
        {
            'id': 'paper2',
            'abstract': "We propose a novel approach to citation network analysis using reinforcement learning and knowledge graphs.",
            'citations': ['paper1'],
            'year': 2021
        },
        {
            'id': 'paper3',
            'abstract': "This work extends previous research on knowledge graphs with transformer architectures and bidirectional encodings.",
            'citations': ['paper1', 'paper2'],
            'year': 2022
        },
        {
            'id': 'paper4',
            'abstract': "Our research combines graph neural networks with language models for scientific discovery and automated hypothesis generation.",
            'citations': ['paper2', 'paper3'],
            'year': 2023
        },
        {
            'id': 'paper5',
            'abstract': "This paper presents a novel approach to feature propagation in citation networks using diffusion models.",
            'citations': ['paper1', 'paper4'],
            'year': 2024
        }
    ]

    # Initialize the generator
    generator = CitationGraphAbstractGenerator()

    # Load citation graph
    generator.load_citation_graph(papers)

    # Compute embeddings for existing papers
    generator.compute_embeddings()

    # Create a new node
    new_node_id = generator.create_new_node()

    # Predict links for the new node; the number of links is determined by
    # predict_links itself from the network's properties
    predicted_links = generator.predict_links(new_node_id)
    print(f"Predicted links: {predicted_links}")

    # Add the predicted links
    generator.add_links(new_node_id, predicted_links)

    # Generate abstract based on the links
    abstract = generator.generate_abstract_from_links(new_node_id)
    print(f"\nGenerated Abstract:\n{abstract}\n")

    # Evaluate the abstract; evaluate_abstract returns None when the node has no
    # abstract or no links, so guard before iterating
    metrics = generator.evaluate_abstract(new_node_id)
    if metrics is None:
        print("No evaluation metrics available")
    else:
        print("Evaluation Metrics:")
        for metric, value in metrics.items():
            print(f"- {metric}: {value:.4f}")

    # Visualize the graph
    generator.visualize_graph(highlight_node=new_node_id)

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    run_demo()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
