In [1]:
pip install requests pandas




In [10]:
import requests
import pandas as pd

# Function to fetch paper data from Semantic Scholar API
def fetch_papers(query="research", num_papers=1000, batch_size=100):
    # Base URL for the Semantic Scholar API
    base_url = "https://api.semanticscholar.org/graph/v1/paper/search"
    papers = []  # List to store all papers

    # Loop through in batches to fetch all requested papers
    for offset in range(0, num_papers, batch_size):
        # Parameters for the API request
        params = {
            'query': query,
            'offset': offset,  # Offset for pagination
            'limit': min(batch_size, num_papers - offset),  # Fetch in batches
            'fields': 'title,abstract,authors'  # Select only title, abstract, and authors
        }

        # Send the request to the API
        response = requests.get(base_url, params=params)

        # Handle response errors
        if response.status_code != 200:
            print(f"Error: Unable to fetch data. HTTP Status Code {response.status_code}")
            break

        data = response.json()  # Parse the JSON response

        # Extract paper details
        for paper in data.get('data', []):
            title = paper['title']
            abstract = paper.get('abstract', '')  # Some papers might not have abstracts
            authors = ', '.join([author['name'] for author in paper.get('authors', [])])  # Join author names
            papers.append({
                'title': title,
                'abstract': abstract,
                'authors': authors
            })

        # Break if no more papers are returned
        if len(data.get('data', [])) == 0:
            break

    return papers

# Fetch research papers with a general query
papers = fetch_papers(query="computer science", num_papers=1000)

# Convert the fetched papers into a pandas DataFrame
df = pd.DataFrame(papers)

# Save the DataFrame to a CSV file
df.to_csv('fetched_research_papers.csv', index=False)

# Display the number of samples fetched
print(f"The dataset contains {df.shape[0]} samples.")


The dataset contains 1000 samples.


In [12]:
from sentence_transformers import SentenceTransformer, util

# Load the pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Handle missing abstracts
df['abstract'] = df['abstract'].fillna('No abstract available')  # Replace None with a placeholder

# Encode the abstracts
df['embedding'] = df['abstract'].apply(lambda x: model.encode(x, convert_to_tensor=True))

# Example user query
user_query = "Deep Learning in Medicine"
query_embedding = model.encode(user_query, convert_to_tensor=True)

# Calculate cosine similarity
df['similarity'] = df['embedding'].apply(lambda x: util.pytorch_cos_sim(x, query_embedding).item())

# Sort by similarity
ranked_df = df.sort_values(by='similarity', ascending=False).reset_index(drop=True)
print(ranked_df.head(10))  # Display top 10 papers


                                               title  \
0  eDoctor: machine learning and the future of me...   
1  Geometric Deep Learning: Going beyond Euclidea...   
2  TensorFlow: Large-Scale Machine Learning on He...   
3                           Expert Systems Research.   
4  Review: A Primer on Aspects of Cognition for M...   
5  All-optical machine learning using diffractive...   
6  Geometric Deep Learning on Graphs and Manifold...   
7  Combining Machine Learning and Computational C...   
8                     The Graph Neural Network Model   
9  Machine learning: Trends, perspectives, and pr...   

                                            abstract  \
0  Machine learning (ML) is a burgeoning field of...   
1  Many scientific fields study data with an unde...   
2  TensorFlow is an interface for expressing mach...   
3  Artificial intelligence, long a topic of basic...   
4  As a multidisciplinary field, medical informat...   
5  All-optical deep learning Deep learning uses

In [13]:
from transformers import pipeline

# Load T5 model for summarization from Hugging Face
summarizer = pipeline("summarization", model="t5-small")

# Function to summarize the top N papers
def summarize_papers(df, top_n=5):
    summaries = []

    # Extract the top N papers based on similarity
    top_papers = df.head(top_n)

    for index, row in top_papers.iterrows():
        # Summarize the abstract of each paper
        abstract = row['abstract']

        # Use T5 to summarize the abstract (max_length=200, min_length=50 are adjustable parameters)
        summary = summarizer(abstract, max_length=200, min_length=50, do_sample=False)

        # Collect the paper title, authors, and summary
        summaries.append({
            'title': row['title'],
            'authors': row['authors'],
            'summary': summary[0]['summary_text']  # Extract the summarized text
        })

    return summaries

# Generate summaries for the top 5 papers
summaries = summarize_papers(ranked_df, top_n=5)

# Display the summaries
for paper in summaries:
    print(f"Title: {paper['title']}")
    print(f"Authors: {paper['authors']}")
    print(f"Summary: {paper['summary']}")
    print("-" * 80)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Your max_length is set to 200, but your input_length is only 158. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=79)
Your max_length is set to 200, but your input_length is only 178. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=89)
Your max_length is set to 200, but your input_length is only 80. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)
Your max_length is set to 200, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)


Title: eDoctor: machine learning and the future of medicine
Authors: G. Handelman, H. Kok, R. Chandra, A. H. Razavi, M. Lee, H. Asadi
Summary: ML is a burgeoning field of medicine with huge resources being applied to fuse computer science and statistics to medical problems . proponents of ML extol its ability to deal with large, complex and disparate data, often found within medicine .
--------------------------------------------------------------------------------
Title: Geometric Deep Learning: Going beyond Euclidean data
Authors: M. Bronstein, Joan Bruna, Yann LeCun, Arthur Szlam, P. Vandergheynst
Summary: many scientific fields study data with an underlying structure that is non-Euclidean . some examples include social networks in computational social sciences, sensor networks in communications, functional networks in brain imaging, regulatory networks in genetics, and meshed surfaces in computer graphics . deep neural networks have proven to be powerful tools for a broad range of 

In [14]:
from transformers import pipeline

# Load GPT-2 model for text generation (or use T5 for summarization)
review_generator = pipeline("text-generation", model="gpt2")

# Function to synthesize the literature review from the summaries
def generate_literature_review(summaries):
    # Combine the paper summaries into one text
    combined_summaries = "\n\n".join([f"Title: {paper['title']}\nSummary: {paper['summary']}" for paper in summaries])

    # Generate a coherent review
    review = review_generator(combined_summaries, max_length=500, num_return_sequences=1)[0]['generated_text']

    return review

# Generate the literature review from the summaries
literature_review = generate_literature_review(summaries)

# Display the synthesized literature review
print(literature_review)


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Title: eDoctor: machine learning and the future of medicine
Summary: ML is a burgeoning field of medicine with huge resources being applied to fuse computer science and statistics to medical problems . proponents of ML extol its ability to deal with large, complex and disparate data, often found within medicine .

Title: Geometric Deep Learning: Going beyond Euclidean data
Summary: many scientific fields study data with an underlying structure that is non-Euclidean . some examples include social networks in computational social sciences, sensor networks in communications, functional networks in brain imaging, regulatory networks in genetics, and meshed surfaces in computer graphics . deep neural networks have proven to be powerful tools for a broad range of problems from computer vision, natural-language processing, and audio analysis .

Title: TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems
Summary: TensorFlow is an interface for expressing machine learni

# Inference


In [15]:
import requests
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Load pre-trained sentence transformer model for similarity calculation
model = SentenceTransformer('all-MiniLM-L6-v2')

# Fetch papers dynamically based on the query
def fetch_papers(query, num_papers=100, batch_size=100):
    base_url = "https://api.semanticscholar.org/graph/v1/paper/search"
    papers = []

    for offset in range(0, num_papers, batch_size):
        params = {
            'query': query,
            'offset': offset,
            'limit': min(batch_size, num_papers - offset),
            'fields': 'title,abstract,authors'
        }

        response = requests.get(base_url, params=params)

        if response.status_code == 200:
            data = response.json()
            for paper in data.get('data', []):
                title = paper['title']
                abstract = paper.get('abstract', '')
                authors = ', '.join([author['name'] for author in paper.get('authors', [])])
                papers.append({
                    'title': title,
                    'abstract': abstract,
                    'authors': authors
                })

        if len(data.get('data', [])) == 0:
            break

    return papers

# Example usage of fetch_papers function
def search_and_summarize(query, num_papers=10, top_n=5):
    papers = fetch_papers(query=query, num_papers=num_papers)
    df = pd.DataFrame(papers)

    # Handle missing abstracts
    df['abstract'] = df['abstract'].fillna('No abstract available')

    # Encode abstracts for similarity calculation
    df['embedding'] = df['abstract'].apply(lambda x: model.encode(x, convert_to_tensor=True))

    # Example query embedding (you can replace with user input)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Calculate cosine similarity
    df['similarity'] = df['embedding'].apply(lambda x: util.pytorch_cos_sim(x, query_embedding).item())

    # Sort by similarity
    ranked_df = df.sort_values(by='similarity', ascending=False).reset_index(drop=True)

    # Summarize the top N papers
    return summarize_papers(ranked_df, top_n)

# Summarize the papers
def summarize_papers(df, top_n=5):
    from transformers import pipeline

    summarizer = pipeline("summarization", model="t5-small")
    summaries = []

    # Extract the top N papers based on similarity
    top_papers = df.head(top_n)

    for index, row in top_papers.iterrows():
        abstract = row['abstract']
        summary = summarizer(abstract, max_length=200, min_length=50, do_sample=False)
        summaries.append({
            'title': row['title'],
            'authors': row['authors'],
            'summary': summary[0]['summary_text']
        })

    return summaries

# Example: Running the system with a user-defined query
query = input("Enter your query (e.g., 'Deep Learning in Healthcare'): ")
num_papers = int(input("Enter the number of papers to retrieve (default 10): ") or 10)
top_n = int(input("Enter the number of top papers to summarize (default 5): ") or 5)

summaries = search_and_summarize(query, num_papers=num_papers, top_n=top_n)

# Display the summaries
for paper in summaries:
    print(f"Title: {paper['title']}")
    print(f"Authors: {paper['authors']}")
    print(f"Summary: {paper['summary']}")
    print("-" * 80)


Enter your query (e.g., 'Deep Learning in Healthcare'): deep learning in finance
Enter the number of papers to retrieve (default 10): 5
Enter the number of top papers to summarize (default 5): 3


Your max_length is set to 200, but your input_length is only 130. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=65)
Your max_length is set to 200, but your input_length is only 6. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)


Title: Deep Learning in Finance
Authors: J. B. Heaton, Nicholas G. Polson, J. Witte
Summary: deep learning can detect and exploit interactions in the data that are, at least currently, invisible to any existing nancial economic theory . deep learning methods can produce more useful results than standard methods in nance . we explore the use of deep learning hierarchical models for problems in financial prediction and classication.
--------------------------------------------------------------------------------
Title: A bibliometric analysis on the application of deep learning in finance: status, development and future directions
Authors: R. L. Manogna, Aayush Anand
Summary: PurposeDeep learning (DL) is a relatively unexplored field that finds immense applications in many industries, especially ones that must make detailed observations, inferences and predictions based on extensive and scattered datasets . authors collected 693 articles published in 2000–2022 from journals indexed in th