# Research Bot
This project helps researchers who want to explore a topic but do not know where to start. If you have a research idea or a project abstract ready, you can enter it into this program; it will perform a literature survey for you and produce a literature review that gives an overview of recent work in that domain, with proper references.
Steps:


1.   Connect to a GPU runtime
2.   Run the notebook
3.   Enter your Hugging Face token when prompted
4.   Finally, enter your abstract or research topic at the prompt and wait for the results




In [None]:
!pip install langchain-huggingface
!pip install langchain-community
!pip install arxiv
!pip install sentence-transformers
!pip install accelerate

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TokenClassificationPipeline, AutoModelForTokenClassification, pipeline
from langchain_community.utilities import ArxivAPIWrapper
from transformers.pipelines import AggregationStrategy
from sentence_transformers import SentenceTransformer
import arxiv
import numpy as np
import tensorflow as tf
from huggingface_hub import notebook_login
import torch
notebook_login()

In [None]:
# Hugging Face model identifiers used by init_pipeline().
# Causal language model that writes the related-work text.
summarizer_model_name = "microsoft/Phi-3-mini-4k-instruct"
# Token-classification checkpoint wrapped by KeyphraseExtractionPipeline.
feature_extractor_model_name = "ml6team/keyphrase-extraction-kbir-inspec"
# SentenceTransformer checkpoint used to embed abstracts for re-ranking.
ranker_model_name = "sentence-transformers/all-MiniLM-L6-v2"

class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that yields the distinct keyphrases of a text.

    The constructor takes a checkpoint name (not a model object) and loads
    both the model and its tokenizer from it.
    """

    def __init__(self, model, *args, **kwargs):
        """Load the checkpoint ``model`` and its tokenizer, then build the pipeline.

        Extra positional and keyword arguments are forwarded unchanged to
        ``TokenClassificationPipeline.__init__``.
        """
        token_classifier = AutoModelForTokenClassification.from_pretrained(model)
        checkpoint_tokenizer = AutoTokenizer.from_pretrained(model)
        super().__init__(
            *args,
            model=token_classifier,
            tokenizer=checkpoint_tokenizer,
            **kwargs
        )

    def postprocess(self, all_outputs):
        """Aggregate token predictions into phrases and return the unique ones.

        Uses the SIMPLE aggregation strategy, strips surrounding whitespace
        from every phrase and de-duplicates them with ``np.unique``.
        """
        spans = super().postprocess(
            all_outputs=all_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        phrases = [span.get("word").strip() for span in spans]
        return np.unique(phrases)

def init_pipeline() :
    """Load every model and client the literature-review workflow needs.

    Returns:
        A dict with the keys ``"summarizer"``, ``"summarizer_tokenizer"``,
        ``"feature_extractor"``, ``"ranker"`` and ``"arxiv_agent"``.
    """
    components = {}

    # The summarizer is loaded in half precision and placed on the CUDA device.
    components["summarizer"] = AutoModelForCausalLM.from_pretrained(
        summarizer_model_name,
        device_map="cuda",
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )
    components["summarizer_tokenizer"] = AutoTokenizer.from_pretrained(summarizer_model_name)
    components["feature_extractor"] = KeyphraseExtractionPipeline(model=feature_extractor_model_name)
    components["ranker"] = SentenceTransformer(ranker_model_name)
    components["arxiv_agent"] = ArxivAPIWrapper(
        top_k_results=5,
        doc_content_chars_max=None,
        load_max_docs=10,
    )
    return components

def extract_keywords(model, abstract):
    """Run the keyphrase extractor on ``abstract``, echo the result and return it.

    Args:
        model: Callable taking the abstract text and returning its keyphrases.
        abstract: Text to extract keyphrases from.

    Returns:
        Whatever ``model(abstract)`` returns, unchanged.
    """
    extracted = model(abstract)
    print(extracted)  # progress output for the notebook user
    return extracted


def search_papers(arxiv_agent, keywords):
    """Search arXiv for the given keywords.

    Args:
        arxiv_agent: Object exposing ``get_summaries_as_docs(query)``.
        keywords: Iterable of keyword strings; they are joined with single
            spaces to form the query.

    Returns:
        The value returned by ``arxiv_agent.get_summaries_as_docs``.
    """
    return arxiv_agent.get_summaries_as_docs(" ".join(keywords))

def re_rank_papers(model, query_abstract, papers):
    """Order the retrieved papers by similarity to the query abstract.

    Args:
        model: Embedding model exposing ``encode(list_of_texts)`` and
            returning one vector per text.
        query_abstract: Text the papers are compared against.
        papers: Iterable of documents with a ``page_content`` string and a
            ``metadata`` mapping that has a ``'Title'`` entry.

    Returns:
        A dict mapping each paper's ``page_content`` to
        ``{"Title": ..., "score": ...}``, ordered from highest to lowest
        cosine similarity. Papers with identical ``page_content`` collapse
        into one entry. An empty dict is returned when ``papers`` is empty.
    """
    summaries = {paper.page_content : {"Title":paper.metadata['Title']} for paper in papers}
    print(summaries)
    if not summaries:
        # Nothing to rank; skip encoding entirely.
        return {}

    target_embeddings = np.asarray(model.encode([query_abstract]), dtype=np.float64)
    summaries_embeddings = np.asarray(model.encode(list(summaries)), dtype=np.float64)

    def _unit_rows(matrix):
        # Scale each row to unit length; the floor avoids division by zero.
        norms = np.linalg.norm(matrix, axis=-1, keepdims=True)
        return matrix / np.maximum(norms, 1e-12)

    # Cosine similarity is the dot product of the unit vectors. NumPy is
    # enough for this, so the block no longer needs TensorFlow.
    cosine_similarities = (_unit_rows(summaries_embeddings) @ _unit_rows(target_embeddings)[0]).tolist()

    # Dicts keep insertion order, so values() lines up with the encoded keys.
    for entry, score in zip(summaries.values(), cosine_similarities):
        entry["score"] = score
    return dict(sorted(summaries.items(), key=lambda item: item[1]["score"], reverse=True))

def format_abstracts_as_references(papers):
    """Render the paper abstracts as a numbered reference list.

    Args:
        papers: Mapping whose keys are the abstract texts; values are ignored.

    Returns:
        One ``[n]: abstract`` line per key, numbered from 1 in iteration
        order and each terminated by a newline; ``""`` for an empty mapping.
    """
    return "".join(
        f"[{number}]: {abstract}\n"
        for number, abstract in enumerate(papers.keys(), start=1)
    )

def format_authors(authors):
    """Format author names as ``"Last FM"`` entries joined by commas.

    Args:
        authors: Iterable of objects with a ``name`` string attribute.

    Returns:
        A comma-separated string. Each entry is the last word of the name
        followed by the first letters of the preceding words. A single-word
        name is emitted as-is, with no trailing space. Authors whose name is
        empty or only whitespace are skipped.
    """
    formatted_authors = []
    for author in authors:
        name_parts = author.name.split()
        if not name_parts:
            # A blank name has no last name to index; skip it instead of raising.
            continue
        *given_names, last_name = name_parts
        initials = ''.join(part[0] for part in given_names)
        formatted_authors.append(f"{last_name} {initials}" if initials else last_name)
    return ', '.join(formatted_authors)

def to_vancouver_style(entry):
    """Build a Vancouver-style citation string for one arXiv entry.

    Reads ``entry.authors``, ``entry.title``, ``entry.published.year`` and
    ``entry.get_short_id()``; the journal is always ``arXiv``.
    """
    citation_fields = (
        format_authors(entry.authors),
        entry.title,
        'arXiv',
        entry.published.year,
        f"arXiv:{entry.get_short_id()}",
    )
    return ". ".join(f"{field}" for field in citation_fields)

def generate_refs(papers) :
    """Build the numbered "References" section for the ranked papers.

    Each paper is looked up on arXiv by its title and the most relevant hit
    is formatted with ``to_vancouver_style``.

    Args:
        papers: Mapping whose values are dicts containing a ``"Title"`` entry.

    Returns:
        A string starting with ``"\\n\\nReferences:\\n"`` followed by one
        ``[n] reference`` line per paper, numbered from 1 in iteration order.
    """
    client = arxiv.Client()
    references = []
    for info in papers.values():
        title = info["Title"]
        search = arxiv.Search(
          query = title,
          max_results = 1,
          sort_by = arxiv.SortCriterion.Relevance
        )
        # Take the first hit lazily; an empty result set must not raise.
        entry = next(iter(client.results(search)), None)
        if entry is None:
            # Keep a placeholder so reference numbers stay aligned with the
            # citation numbers used in the generated text.
            references.append(f"{title}. arXiv. (no matching arXiv entry found)")
        else:
            references.append(to_vancouver_style(entry))

    numbered = "".join(
        f"[{number}] {reference}\n"
        for number, reference in enumerate(references, start=1)
    )
    return f"\n\nReferences:\n{numbered}"

def generate_related_work(model, tokenizer, query_abstract, ranked_papers, base_prompt, sentence_plan):
    """Generate the related-work text, append references and save it to disk.

    Args:
        model: Causal language model handed to the text-generation pipeline.
        tokenizer: Tokenizer matching ``model``.
        query_abstract: The user's abstract or topic description.
        ranked_papers: Mapping of abstract text to ``{"Title": ..., ...}`` in
            citation order.
        base_prompt: Instruction text placed before the data block.
        sentence_plan: Plan text appended after the reference abstracts.

    Returns:
        The generated text followed by the references section. The same
        string is written to ``literature review.txt`` in the working
        directory, overwriting any existing file.
    """
    data = f"Abstract: {query_abstract} \n {format_abstracts_as_references(ranked_papers)} \n Plan: {sentence_plan}"
    complete_prompt = f"{base_prompt}\n```{data}```"
    messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": complete_prompt}]

    pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    )

    # Sampling is disabled and only the newly generated text is returned.
    generation_args = {
    "max_new_tokens": 1600,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
    }

    output = pipe(messages, **generation_args)
    print(output)
    related_work = output[0]['generated_text'] + generate_refs(ranked_papers)

    # The context manager closes the file even if the write fails. UTF-8 keeps
    # non-ASCII characters in the model output or author names from raising
    # on platforms whose default encoding is not UTF-8.
    with open("literature review.txt", "w", encoding="utf-8") as review_file:
        review_file.write(related_work)
    return related_work

In [None]:
# Load all models once; the resulting dict is passed to gen_lit_review() below.
print("Program running")
llms = init_pipeline()
print('Model loaded')

In [None]:
# Instruction text sent to the summarizer ahead of the abstract, references and plan.
# NOTE(review): the prompt says the data arrives "in triple quotes", while
# generate_related_work wraps it in triple backticks — confirm which is intended.
base_prompt = "You will be provided with an abstract of a scientific document and other references papers in triple quotes. Your task is to write the related work section of the document using only the provided abstracts and other references papers. Please write the related work section creating a cohesive storyline by doing a critical analysis of prior work comparing the strengths and weaknesses while also motivating the proposed approach. You are also provided a sentence plan mentioning the total number of lines and the citations to refer in different lines. You should cite all the other related documents as [#] whenever you are referring it in the related work. Do not cite abstract. Do not include any extra notes or newline characters at the end. Do not copy the abstracts of reference papers directly but compare and contrast to the main work concisely. Do not provide the output in bullet points. Do not provide references at the end. Please cite all the provided reference papers. Please follow the plan when generating sentences, especially the number of lines to generate."
# Outline appended to the prompt after the "Plan:" label.
sentence_plan = "1. Introduction sentence\n2. Overview of relevant studies\n3. Detailed discussion on key papers\n4. Summary of related work\n"

def gen_lit_review(query, llms) :
    """Run the full workflow: keywords -> arXiv search -> re-ranking -> generation.

    Args:
        query: The user's abstract or topic description.
        llms: Dict of components as returned by ``init_pipeline()``.

    Returns:
        The literature review text produced by ``generate_related_work``.
        The module-level ``base_prompt`` and ``sentence_plan`` are used for
        the prompt.
    """
    keyphrases = extract_keywords(llms['feature_extractor'], query)
    candidates = search_papers(llms['arxiv_agent'], keyphrases)
    ordered_papers = re_rank_papers(llms['ranker'], query, candidates)
    return generate_related_work(
        llms['summarizer'],
        llms['summarizer_tokenizer'],
        query,
        ordered_papers,
        base_prompt,
        sentence_plan,
    )

In [None]:
# Ask for the abstract or topic to survey.
query = input("Enter abstract of your paper or what topic your research is all about: ")
# Fall back to the sample abstract when nothing meaningful was typed;
# whitespace-only input counts as empty.
if not query.strip():
    query = "We explore the possibility of using exponents for Image Augmentation in Convolutional Neural Networks (CNN). Furthermore we also explore the extent of controlled non-linearity we can introduce in the Neural Networks using this augmentation technique."
output = gen_lit_review(query, llms)

Enter abstract of your paper or what topic your research is all about: We explore the possibility of using exponents for Image Augmentation in Convolutional Neural Networks (CNN). Furthermore we also explore the extent of controlled non-linearity we can introduce in the Neural Networks using this augmentation technique.
['Convolutional Neural Networks' 'Image Augmentation' 'augmentation'
 'controlled non-linearity']
{'Successful training of convolutional neural networks (CNNs) requires a\nsubstantial amount of data. With small datasets networks generalize poorly.\nData Augmentation techniques improve the generalizability of neural networks by\nusing existing training data more effectively. Standard data augmentation\nmethods, however, produce limited plausible alternative data. Generative\nAdversarial Networks (GANs) have been utilized to generate new data and improve\nthe performance of CNNs. Nevertheless, data augmentation techniques for\ntraining GANs are under-explored compared to 

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


[{'generated_text': ' Related Work\n\nThe exploration of image augmentation techniques for enhancing the performance of convolutional neural networks (CNNs) has been a subject of extensive research. Image augmentation has been recognized as a powerful tool for improving the generalization ability of neural networks by introducing controlled non-linearity and expanding the diversity of training data. This paper builds upon the foundational work in the field, including the use of rotation, shearing, and color distortion as traditional augmentation methods [1], the development of the Albumentations library for efficient image transformations [2], the investigation into the correlation between different types of image augmentation techniques and neural loss functions [3], the introduction of style augmentation for improving robustness in CNNs [4], and the application of GANs for data augmentation in medical imaging [5].\n\nThe work of [1] has demonstrated the effectiveness of basic image t

In [None]:
# Display the literature review returned by gen_lit_review().
print(output)

 Related Work

The exploration of image augmentation techniques for enhancing the performance of convolutional neural networks (CNNs) has been a subject of extensive research. Image augmentation has been recognized as a powerful tool for improving the generalization ability of neural networks by introducing controlled non-linearity and expanding the diversity of training data. This paper builds upon the foundational work in the field, including the use of rotation, shearing, and color distortion as traditional augmentation methods [1], the development of the Albumentations library for efficient image transformations [2], the investigation into the correlation between different types of image augmentation techniques and neural loss functions [3], the introduction of style augmentation for improving robustness in CNNs [4], and the application of GANs for data augmentation in medical imaging [5].

The work of [1] has demonstrated the effectiveness of basic image transformations in combati