# Paper Citation Prediction and Recommendation Models

This notebook implements citation prediction and paper recommendation models designed for the OpenAlex dataset. Sections 2–6 run on a synthetic papers-and-citations dataset generated in Section 2; on it we build baseline and advanced models to predict citation relationships and recommend relevant papers to researchers.

## 1. Import Required Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, roc_curve, auc,
                             confusion_matrix, classification_report)
from sklearn.decomposition import TruncatedSVD
from scipy.spatial.distance import cosine
from scipy.sparse import csr_matrix
import networkx as nx
from collections import defaultdict, Counter
import warnings
# NOTE(review): this silences every warning category for the whole notebook,
# including deprecation and convergence warnings from the libraries imported above.
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
# Seeds NumPy's global RNG; functions below that call np.random.seed() re-seed it.
np.random.seed(42)
# Show every column when a DataFrame is printed, but truncate long cell text at 50 chars.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 50)

## 2. Synthetic Dataset Creation and Exploration

In [None]:
# Create synthetic paper dataset
def create_synthetic_papers(n_papers=500, n_authors=200, n_venues=50, seed=42):
    """
    Create a synthetic academic papers dataset with metadata.

    Parameters
    ----------
    n_papers : int
        Number of paper rows to generate; ``paper_id`` runs from 0 to n_papers - 1.
    n_authors : int
        Size of the author pool (``Author_0`` ... ``Author_{n_authors-1}``).
    n_venues : int
        Size of the venue pool (``Venue_0`` ... ``Venue_{n_venues-1}``).
    seed : int
        Seed passed to ``np.random.seed``; note this re-seeds NumPy's *global* RNG.

    Returns
    -------
    pd.DataFrame
        One row per paper with columns ``paper_id``, ``title``, ``authors``
        (list of 1-4 distinct author names), ``venue``, ``year`` (2015-2023),
        ``citations_count`` (a float drawn from an exponential distribution,
        not an integer count), ``abstract_length`` and ``references_count``.
    """
    np.random.seed(seed)

    columns = ['paper_id', 'title', 'authors', 'venue', 'year',
               'citations_count', 'abstract_length', 'references_count']
    venues = [f"Venue_{i}" for i in range(n_venues)]
    authors_list = [f"Author_{i}" for i in range(n_authors)]

    papers = []
    for paper_id in range(n_papers):
        # Team size is 1-4, but can never exceed the pool because authors are
        # sampled without replacement below.
        team_size = min(np.random.randint(1, 5), n_authors)
        paper = {
            'paper_id': paper_id,
            'title': f'Paper_{paper_id}',
            # replace=False: a paper must not list the same author twice.
            # .tolist() yields plain Python strings instead of np.str_.
            'authors': np.random.choice(authors_list, size=team_size, replace=False).tolist(),
            'venue': np.random.choice(venues),
            'year': np.random.randint(2015, 2024),
            'citations_count': np.random.exponential(scale=10),
            'abstract_length': np.random.randint(100, 1000),
            'references_count': np.random.randint(20, 100),
        }
        papers.append(paper)

    # Explicit columns keep the schema stable even when n_papers == 0.
    papers_df = pd.DataFrame(papers, columns=columns)
    return papers_df

# Create citation links with realistic patterns
def create_citation_network(papers_df, sparsity=0.95, seed=42):
    """
    Create citation edges with patterns:
    - Papers from same venue are more likely to cite each other
    - Recent papers cite older papers (strictly earlier ``year`` only)
    - Papers by same authors cite each other
    - Papers with a higher ``citations_count`` are cited more

    Parameters
    ----------
    papers_df : pd.DataFrame
        Must provide ``year``, ``venue``, ``authors`` and ``citations_count``.
        Rows are addressed by position, so any index is accepted.
    sparsity : float
        Currently unused; kept so existing callers keep working.
    seed : int
        Seed passed to ``np.random.seed``; re-seeds NumPy's *global* RNG.

    Returns
    -------
    pd.DataFrame
        Columns ``citing_paper_id``, ``cited_paper_id`` (both row positions in
        ``papers_df``) and ``label`` (always 1). The columns are present even
        when no edge was sampled.
    """
    np.random.seed(seed)

    edge_columns = ['citing_paper_id', 'cited_paper_id', 'label']

    # Pull each column out once; per-pair .iloc row lookups dominate the
    # runtime of the O(n^2) loop below otherwise.
    years = papers_df['year'].to_numpy()
    venues = papers_df['venue'].tolist()
    author_sets = [set(authors) for authors in papers_df['authors']]
    citation_counts = papers_df['citations_count'].tolist()

    citation_edges = []
    for citing_idx in range(len(papers_df)):
        # Potential papers that can be cited (published before this one),
        # as ascending row positions.
        potential_cited = np.flatnonzero(years < years[citing_idx]).tolist()
        if not potential_cited:
            continue

        citing_authors = author_sets[citing_idx]
        citing_venue = venues[citing_idx]

        # Probability of citation influenced by multiple factors
        probabilities = []
        for cited_idx in potential_cited:
            prob = 0.02  # base probability
            # Author overlap boost
            prob += len(citing_authors & author_sets[cited_idx]) * 0.1
            # Venue similarity boost
            if citing_venue == venues[cited_idx]:
                prob += 0.05
            # Citation count boost (popular papers cited more)
            prob += min(0.1, citation_counts[cited_idx] / 100)
            probabilities.append(min(prob, 0.3))  # Cap at 0.3

        # One Bernoulli draw per candidate, in candidate order.
        citations = np.random.binomial(1, np.array(probabilities))

        for cited_idx, is_cited in zip(potential_cited, citations):
            if is_cited == 1:
                citation_edges.append({
                    'citing_paper_id': citing_idx,
                    'cited_paper_id': cited_idx,
                    'label': 1
                })

    return pd.DataFrame(citation_edges, columns=edge_columns)

# Generate datasets
# Both generators are called with their default seed=42 and each re-seeds
# NumPy's global RNG, so this cell gives the same frames on every run.
papers_df = create_synthetic_papers(n_papers=500, n_authors=150, n_venues=40)
# `sparsity` is left at its default; the ids in citations_df are row positions in papers_df.
citations_df = create_citation_network(papers_df)

# Quick look at shapes and the first rows of each frame.
print("Papers Dataset Shape:", papers_df.shape)
print("\nCitations Dataset Shape:", citations_df.shape)
print("\nPapers Sample:")
print(papers_df.head())
print("\nCitations Sample:")
print(citations_df.head())

## 3. Exploratory Data Analysis

In [None]:
# Analysis of citation patterns: three descriptive plots of the papers table,
# summary statistics of the citation graph, and a degree scatter plot.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Citation count distribution
axes[0, 0].hist(papers_df['citations_count'], bins=30, color='skyblue', edgecolor='black')
axes[0, 0].set_title('Distribution of Citation Counts')
axes[0, 0].set_xlabel('Number of Citations')
axes[0, 0].set_ylabel('Frequency')

# Papers by year
year_counts = papers_df['year'].value_counts().sort_index()
axes[0, 1].bar(year_counts.index, year_counts.values, color='lightcoral', edgecolor='black')
axes[0, 1].set_title('Number of Papers by Year')
axes[0, 1].set_xlabel('Year')
axes[0, 1].set_ylabel('Count')

# Citations by venue (top 10)
venue_citations = papers_df.groupby('venue')['citations_count'].mean().nlargest(10)
axes[1, 0].barh(range(len(venue_citations)), venue_citations.values, color='lightgreen', edgecolor='black')
axes[1, 0].set_yticks(range(len(venue_citations)))
axes[1, 0].set_yticklabels(venue_citations.index)
axes[1, 0].set_title('Average Citations by Top 10 Venues')
axes[1, 0].set_xlabel('Average Citations')

# Citation network statistics
# The citation graph is directed and has no self-citations, so there are
# n * (n - 1) possible edges. Using that single denominator makes sparsity and
# density complementary and makes "Citation density" agree with the
# nx.density(G) value printed at the end of this cell.
n_nodes = len(papers_df)
n_edges = len(citations_df)
max_directed_edges = n_nodes * (n_nodes - 1)
citation_density = n_edges / max_directed_edges if max_directed_edges > 0 else 0.0
avg_citations = n_edges / n_nodes if n_nodes > 0 else 0.0

print("\n=== Citation Network Statistics ===")
print(f"Total papers: {n_nodes}")
print(f"Total citation pairs: {n_edges}")
print(f"Citation sparsity: {1 - citation_density:.4f}")
print(f"Average citations per paper: {avg_citations:.2f}")
print(f"Citation density: {citation_density:.4f}")

# Network visualization
# Every paper is a node (including papers with no citations); edges point
# from the citing paper to the cited paper.
G = nx.DiGraph()
G.add_nodes_from(papers_df['paper_id'])
if n_edges > 0:
    # Bulk insert instead of a per-row iterrows() loop.
    G.add_edges_from(zip(citations_df['citing_paper_id'], citations_df['cited_paper_id']))

in_degree = dict(G.in_degree())
out_degree = dict(G.out_degree())
axes[1, 1].scatter([in_degree[node] for node in G.nodes()],
                   [out_degree[node] for node in G.nodes()],
                   alpha=0.6, s=50)
axes[1, 1].set_title('In-Degree vs Out-Degree')
axes[1, 1].set_xlabel('In-Degree (Cited By)')
axes[1, 1].set_ylabel('Out-Degree (Cites)')

plt.tight_layout()
plt.show()

print(f"\nNetwork density: {nx.density(G):.4f}")
print(f"Number of weakly connected components: {nx.number_weakly_connected_components(G)}")
# Clustering is computed on the undirected projection of the citation graph.
print(f"Average clustering coefficient: {nx.average_clustering(G.to_undirected()):.4f}")

## 4. Feature Engineering for Citation Prediction

In [None]:
def extract_features(citing_idx, cited_idx, papers_df, citations_df, G):
    """
    Extract features for a (citing, cited) paper pair.

    Parameters
    ----------
    citing_idx, cited_idx : int
        Row positions in ``papers_df`` (used with ``.iloc``); the same values
        are used as node keys in ``G``.
    papers_df : pd.DataFrame
        Paper metadata with ``year``, ``authors``, ``venue``,
        ``citations_count``, ``references_count``, ``abstract_length`` and
        ``title`` columns.
    citations_df : pd.DataFrame
        Currently unused; kept so existing callers keep working.
    G : directed graph
        Must support ``node in G``, ``in_degree(node)``, ``out_degree(node)``
        and ``predecessors(node)`` (e.g. ``networkx.DiGraph``).

    Returns
    -------
    dict
        Feature name -> value. Graph features are 0 for a node missing from ``G``.
    """
    citing_paper = papers_df.iloc[citing_idx]
    cited_paper = papers_df.iloc[cited_idx]

    features = {}

    # 1. Temporal Features
    features['time_diff'] = citing_paper['year'] - cited_paper['year']
    features['citing_recency'] = 2024 - citing_paper['year']
    features['cited_recency'] = 2024 - cited_paper['year']

    # 2. Author Features
    citing_authors = set(citing_paper['authors'])
    cited_authors = set(cited_paper['authors'])
    author_overlap = len(citing_authors & cited_authors)
    all_authors = citing_authors | cited_authors
    features['author_overlap'] = author_overlap
    features['author_overlap_jaccard'] = author_overlap / len(all_authors) if all_authors else 0

    # 3. Venue Features
    features['same_venue'] = 1 if citing_paper['venue'] == cited_paper['venue'] else 0

    # 4. Citation Count Features
    features['citing_citation_count'] = citing_paper['citations_count']
    features['cited_citation_count'] = cited_paper['citations_count']
    features['cite_count_diff'] = citing_paper['citations_count'] - cited_paper['citations_count']

    # 5. References Count Features
    features['citing_references'] = citing_paper['references_count']
    features['cited_references'] = cited_paper['references_count']

    # 6. Abstract/Title Similarity (based on length as proxy)
    features['abstract_diff'] = abs(citing_paper['abstract_length'] - cited_paper['abstract_length'])
    # Exact string match only, so this is 1.0 only when the two titles are identical.
    features['title_similarity'] = 1.0 if citing_paper['title'] == cited_paper['title'] else 0.0

    # 7. Graph Features
    # The previous version intersected a list with a set, which always raised
    # TypeError; a bare `except` then reset every graph feature to 0. Missing
    # nodes are now handled explicitly and real errors are no longer swallowed.
    # NOTE(review): degrees are read from G as given. If G already contains the
    # (citing, cited) edge being scored, citing_out_degree and cited_in_degree
    # include that edge -- confirm this is intended when building training data.
    def _node_stats(node):
        """Return (in_degree, out_degree, set of predecessors); zeros if node is absent."""
        if node not in G:
            return 0, 0, set()
        return G.in_degree(node), G.out_degree(node), set(G.predecessors(node))

    citing_in, citing_out, citing_citers = _node_stats(citing_idx)
    cited_in, cited_out, cited_citers = _node_stats(cited_idx)

    features['citing_in_degree'] = citing_in
    features['citing_out_degree'] = citing_out
    features['cited_in_degree'] = cited_in
    features['cited_out_degree'] = cited_out
    # Number of papers that cite both papers of the pair (shared predecessors).
    features['common_citations'] = len(citing_citers & cited_citers)

    return features

# Extract features for positive examples (existing citations)
# One pass over the edge list collects both the positive feature rows and the
# set of existing (citing, cited) pairs used to exclude them from the negatives.
positive_features_list = []
existing_citations = set()
for edge in citations_df.itertuples(index=False):
    pair = (int(edge.citing_paper_id), int(edge.cited_paper_id))
    existing_citations.add(pair)
    features = extract_features(pair[0], pair[1], papers_df, citations_df, G)
    features['label'] = 1
    positive_features_list.append(features)

# Create negative examples (pairs that don't cite each other)
n_negative = len(positive_features_list)  # Balance the dataset

np.random.seed(42)

# Candidate negatives are all ordered pairs where the citing paper is STRICTLY
# newer than the cited one and no citation exists. Positives are generated with
# the same strict rule, so same-year pairs can never be positive; admitting them
# as negatives (the old `>=` check) let a model separate classes on time_diff == 0.
paper_years = papers_df['year'].to_numpy()
newer_pos, older_pos = np.nonzero(paper_years[:, None] > paper_years[None, :])
candidate_negatives = [
    (citing_idx, cited_idx)
    for citing_idx, cited_idx in zip(newer_pos.tolist(), older_pos.tolist())
    if (citing_idx, cited_idx) not in existing_citations
]

# Sample without replacement: no duplicate negatives, and the classes are
# balanced whenever enough candidates exist. The old rejection loop stopped
# after a fixed 10000 attempts regardless of how many positives there were.
n_sampled = min(n_negative, len(candidate_negatives))
if n_sampled < n_negative:
    print(f"Warning: only {n_sampled} valid negative pairs available for {n_negative} positives")
sampled_positions = (
    np.random.choice(len(candidate_negatives), size=n_sampled, replace=False)
    if n_sampled > 0 else []
)

negative_features_list = []
for position in sampled_positions:
    citing_idx, cited_idx = candidate_negatives[position]
    features = extract_features(citing_idx, cited_idx, papers_df, citations_df, G)
    features['label'] = 0
    negative_features_list.append(features)
count = len(negative_features_list)

# Combine and create dataset
all_features_list = positive_features_list + negative_features_list
features_df = pd.DataFrame(all_features_list)

print(f"Total features extracted: {len(features_df)}")
print(f"Positive examples: {(features_df['label'] == 1).sum()}")
print(f"Negative examples: {(features_df['label'] == 0).sum()}")
print(f"\nFeature columns: {features_df.columns.tolist()}")
print(f"\nFeature statistics:")
print(features_df.describe())

## 5. Data Preprocessing and Baseline Models

In [None]:
# Separate the target column from the predictor columns
y = features_df['label']
X = features_df.drop(columns=['label'])

# Hold out 20% as the test set, then take 20% of the remainder for validation;
# both splits are stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Standardize with statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print(
    f"Training set size: {len(X_train)}",
    f"Validation set size: {len(X_val)}",
    f"Test set size: {len(X_test)}",
    "\nClass distribution in training set:",
    sep="\n",
)
print(y_train.value_counts())

# ===== BASELINE MODELS =====
print("\n" + "=" * 60, "BASELINE MODELS", "=" * 60, sep="\n")

# 1. Random Baseline: coin flips weighted by the positive rate seen in training
random_preds = np.random.binomial(n=1, p=y_train.mean(), size=len(y_test))
print(
    "\n1. RANDOM BASELINE",
    f"Accuracy: {accuracy_score(y_test, random_preds):.4f}",
    f"Precision: {precision_score(y_test, random_preds, zero_division=0):.4f}",
    f"Recall: {recall_score(y_test, random_preds, zero_division=0):.4f}",
    sep="\n",
)

# 2. Majority Class Baseline: always predict the more frequent training label
majority_preds = np.full(len(y_test), 1.0 if y_train.mean() > 0.5 else 0.0)
print(
    "\n2. MAJORITY CLASS BASELINE",
    f"Accuracy: {accuracy_score(y_test, majority_preds):.4f}",
    f"Precision: {precision_score(y_test, majority_preds, zero_division=0):.4f}",
    f"Recall: {recall_score(y_test, majority_preds, zero_division=0):.4f}",
    sep="\n",
)

def _threshold_baseline(title, feature_name, X_val, X_test, y_test):
    """
    Single-feature baseline: predict 1 when the feature exceeds its validation median.

    Prints accuracy, precision and recall of the thresholded predictions.
    AUC-ROC is computed from the raw feature values rather than the 0/1
    predictions, so it measures ranking quality and is comparable to AUCs
    computed from model probabilities.

    Returns the 0/1 predictions for ``X_test``.
    """
    threshold = X_val[feature_name].median()
    scores = X_test[feature_name]
    preds_test = (scores > threshold).astype(int)

    print(f"\n{title}")
    print(f"Accuracy: {accuracy_score(y_test, preds_test):.4f}")
    print(f"Precision: {precision_score(y_test, preds_test, zero_division=0):.4f}")
    print(f"Recall: {recall_score(y_test, preds_test, zero_division=0):.4f}")
    print(f"AUC-ROC: {roc_auc_score(y_test, scores):.4f}")

    return preds_test


# 3. Popularity-based Baseline (cite high-citation papers)
def popularity_baseline(X_val, X_test, y_test, feature_name='cited_citation_count'):
    """Predict a citation when ``feature_name`` is above its validation-set median."""
    return _threshold_baseline("3. POPULARITY-BASED BASELINE", feature_name, X_val, X_test, y_test)

popularity_preds = popularity_baseline(X_val, X_test, y_test)

# 4. Author Overlap Baseline
def author_overlap_baseline(X_val, X_test, y_test):
    """Predict a citation when ``author_overlap`` is above its validation-set median."""
    return _threshold_baseline("4. AUTHOR OVERLAP BASELINE", 'author_overlap', X_val, X_test, y_test)

author_preds = author_overlap_baseline(X_val, X_test, y_test)

## 6. Advanced Citation Prediction Models

In [None]:
# Train advanced models
models = {}
predictions = {}


def _fit_and_report(heading, model, X_fit, y_fit, X_eval, y_eval):
    """
    Fit ``model`` on (X_fit, y_fit), evaluate on (X_eval, y_eval) and print metrics.

    Returns a tuple ``(pred, pred_proba)``: hard class predictions and the
    predicted probability of the positive class for ``X_eval``.
    """
    print(f"\n{heading}")
    model.fit(X_fit, y_fit)
    pred = model.predict(X_eval)
    pred_proba = model.predict_proba(X_eval)[:, 1]

    print(f"Accuracy: {accuracy_score(y_eval, pred):.4f}")
    print(f"Precision: {precision_score(y_eval, pred):.4f}")
    print(f"Recall: {recall_score(y_eval, pred):.4f}")
    print(f"F1-Score: {f1_score(y_eval, pred):.4f}")
    print(f"AUC-ROC: {roc_auc_score(y_eval, pred_proba):.4f}")
    return pred, pred_proba


print("\n" + "="*60)
print("ADVANCED CITATION PREDICTION MODELS")
print("="*60)

# 1. Logistic Regression (trained on standardized features)
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_pred, lr_pred_proba = _fit_and_report(
    "1. LOGISTIC REGRESSION", lr_model, X_train_scaled, y_train, X_test_scaled, y_test
)
models['Logistic Regression'] = lr_model
predictions['Logistic Regression'] = lr_pred

# Feature importance for LR
# Rank by coefficient magnitude: a large negative coefficient is as influential
# as a large positive one, and sorting by the signed value would hide it.
lr_importance = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': lr_model.coef_[0]
})
lr_importance = lr_importance.loc[
    lr_importance['coefficient'].abs().sort_values(ascending=False).index
]
print("\nTop 10 Features (Logistic Regression):")
print(lr_importance.head(10))

# 2. Random Forest (tree models use the unscaled features)
rf_model = RandomForestClassifier(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
rf_pred, rf_pred_proba = _fit_and_report(
    "2. RANDOM FOREST", rf_model, X_train, y_train, X_test, y_test
)
models['Random Forest'] = rf_model
predictions['Random Forest'] = rf_pred

# Feature importance for RF
rf_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
print("\nTop 10 Features (Random Forest):")
print(rf_importance.head(10))

# 3. Gradient Boosting (unscaled features)
gb_model = GradientBoostingClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
gb_pred, gb_pred_proba = _fit_and_report(
    "3. GRADIENT BOOSTING", gb_model, X_train, y_train, X_test, y_test
)
models['Gradient Boosting'] = gb_model
predictions['Gradient Boosting'] = gb_pred

# Feature importance for GB
gb_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': gb_model.feature_importances_
}).sort_values('importance', ascending=False)
print("\nTop 10 Features (Gradient Boosting):")
print(gb_importance.head(10))