

<h1><center><font size=7>PubMed Abstract Categorization and Tagging</center></font></h1>


## Installing and Importing Necessary Libraries and Dependencies

In [4]:
# Installation for GPU llama-cpp-python
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m161.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m216.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.2/133.2 kB[0m [31m218.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lida 0.0.10 requi

In [5]:
# For downloading the models from HF Hub
!pip install huggingface_hub -q

In [6]:
# Importing library for data manipulation
import pandas as pd

# Function to download the model from the Hugging Face model hub
from huggingface_hub import hf_hub_download

# Importing the Llama class from the llama_cpp module
from llama_cpp import Llama

# Importing the json module
import json

## Loading the Data

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import pandas as pd
data = pd.read_excel("/content/drive/MyDrive/Previous_Projects/PubMed.xlsx")

## Data Overview

### Checking the first 5 rows of the data

In [9]:
data.head()

Unnamed: 0,title,number_of_authors,publication_year,citation_count,abstract
0,Somatic mutation in cancer and normal cells,2,2015,1308,Spontaneously occurring mutations accumulate i...
1,Diagnosis and Management of Central Diabetes I...,5,2022,5,Central diabetes insipidus (CDI) is a clinical...
2,"Long COVID: major findings, mechanisms and rec...",4,2023,744,Long COVID is an often debilitating illness th...
3,The Utility of Exosomes in Diagnosis and Thera...,6,2021,12,Diabetes mellitus and the associated complicat...
4,Exosomes: A potential tool for immunotherapy o...,6,2023,5,Ovarian cancer is a malignant tumor of the fem...


### Checking the shape of the data

In [10]:
data.shape

(20, 5)

* There are 20 rows and 5 columns

### Checking the missing values in the data

In [12]:
data.isnull().sum()

title                0
number_of_authors    0
publication_year     0
citation_count       0
abstract             0
dtype: int64

* There are no missing values in the data

## Model Building

### Loading the model

In [13]:
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGUF"
model_basename = "llama-2-13b-chat.Q5_K_M.gguf" # the model is in gguf format

In [14]:
model_path = hf_hub_download(
    repo_id=model_name_or_path,
    filename=model_basename
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [15]:
lcpp_llm = Llama(
    model_path=model_path,
    n_threads=2,  # CPU cores
    n_batch=512,  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
    n_gpu_layers=43,  # Change this value based on your model and your GPU VRAM pool.
    n_ctx=4096,  # Context window
)

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /root/.cache/huggingface/hub/models--TheBloke--Llama-2-13B-chat-GGUF/snapshots/4458acc949de0a9914c3eab623904d4fe999050a/llama-2-13b-chat.Q5_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_

### Defining Model Response Parameters

In [16]:
def generate_llama_response(instruction, review):

    # System message explicitly instructing not to include the review text
    system_message = """
        [INST]<<SYS>>
        {}
        <</SYS>>[/INST]
    """.format(instruction)

    # Combine user_prompt and system_message to create the prompt
    prompt = f"{review}\n{system_message}"

    # Generate a response from the LLaMA model
    response = lcpp_llm(
        prompt=prompt,
        max_tokens=1024,
        temperature=0,
        top_p=0.95,
        repeat_penalty=1.1,
        top_k=50,
        stop=['INST'],
        echo=False,
        seed=42,
    )

    # Extract the sentiment from the response
    response_text = response["choices"][0]["text"]
    return response_text

In [21]:
df = data.sample(frac = 1)

Unnamed: 0,title,number_of_authors,publication_year,citation_count,abstract
2,"Long COVID: major findings, mechanisms and rec...",4,2023,744,Long COVID is an often debilitating illness th...
9,Burdens of post-acute sequelae of COVID-19 \nb...,3,2021,142,The Post-Acute Sequelae of SARS-CoV-2 infectio...
18,DR3 stimulation of adipose resident ILC2s amel...,10,2020,22,Disturbances in glucose homeostasis and low-gr...
11,Computational correction of copy number effect...,31,2017,1009,The CRISPR–Cas9 system has revolutionized gene...
13,The Cancer Cell Line Encyclopedia enables pred...,63,2012,5005,The systematic translation of cancer genomic d...
8,Long-term cardiovascular outcomes of COVID-19,4,2022,861,The cardiovascular complications of acute coro...
16,RANK links thymic regulatory T cells to fetal ...,22,2021,42,Successful pregnancies rely on adaptations wit...
3,The Utility of Exosomes in Diagnosis and Thera...,6,2021,12,Diabetes mellitus and the associated complicat...
10,An integrated functional and clinical genomics...,18,2021,15,Genomic sequencing of thousands of tumors has ...
4,Exosomes: A potential tool for immunotherapy o...,6,2023,5,Ovarian cancer is a malignant tumor of the fem...


## Task 1: Article Categorization

In [34]:
# creating a copy of the data
df1 = df.copy()

In [35]:
# defining the instructions for the model
instruction_A = """
    You are an AI expert. Classify the abstracts based on the Abstract Text presented
    in the input into the following categories
      - Cancer Article
      - Covid Article
      - Diabetes Article
"""

In [36]:
df1['llama_response'] = df1['abstract'].apply(lambda x: generate_llama_response(instruction_A, x))

Llama.generate: prefix-match hit

llama_print_timings:        load time =     668.09 ms
llama_print_timings:      sample time =     259.59 ms /   414 runs   (    0.63 ms per token,  1594.80 tokens per second)
llama_print_timings: prompt eval time =     924.22 ms /   386 tokens (    2.39 ms per token,   417.65 tokens per second)
llama_print_timings:        eval time =   28469.02 ms /   413 runs   (   68.93 ms per token,    14.51 tokens per second)
llama_print_timings:       total time =   31130.93 ms /   799 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =     668.09 ms
llama_print_timings:      sample time =      74.59 ms /   125 runs   (    0.60 ms per token,  1675.94 tokens per second)
llama_print_timings: prompt eval time =     923.95 ms /   335 tokens (    2.76 ms per token,   362.57 tokens per second)
llama_print_timings:        eval time =    9093.34 ms /   124 runs   (   73.33 ms per token,    13.64 tokens per second)
llama_print_timings:       to

In [37]:
df1['llama_response'].head()

2      Sure, I can help you classify the abstracts b...
9      Sure! Based on the input abstract, I would cl...
18     Sure, I can help you classify the abstracts b...
11     Sure! Based on the input abstracts, I would c...
13     Sure, I'd be happy to help! Based on the abst...
Name: llama_response, dtype: object

In [38]:
i = 1
print(df1.loc[i, 'abstract'])

Central diabetes insipidus (CDI) is a clinical syndrome which results from loss or impaired function of vasopressinergic neurons in the hypothalamus/posterior pituitary, resulting in impaired synthesis and/or secretion of arginine vasopressin (AVP). 
AVP deficiency leads to the inability to concentrate urine and excessive renal water losses, resulting in a clinical syndrome of hypotonic polyuria with compensatory thirst.
 CDI is caused by diverse etiologies, although it typically develops due to neoplastic, traumatic, or autoimmune destruction of AVP-synthesizing/secreting neurons. 
This review focuses on the diagnosis and management of CDI, providing insights into the physiological disturbances underpinning the syndrome. 
Recent developments in diagnostic techniques, particularly the development of the copeptin assay, have improved accuracy and acceptability of the diagnostic approach to the hypotonic polyuria syndrome. 
We discuss the management of CDI with particular emphasis on man

In [39]:
print(df1.loc[i, 'llama_response'])

 Sure, I can help you classify the abstracts based on the text you provided. Here are my classifications:

1. Cancer Article: None of the abstracts mention cancer, so there are no cancer articles.
2. Covid Article: None of the abstracts mention Covid-19 or any other coronavirus-related topics, so there are no Covid articles.
3. Diabetes Article: The third abstract mentions "diabetes insipidus" and "polyuria," which are related to diabetes, so this abstract can be classified as a diabetes article.

Therefore, there is one diabetes article among the three abstracts.


In [40]:
def extract_category(llama_response):
    if 'cancer article' in llama_response.lower():
        return 'Cancer article'
    elif 'diabetes article' in llama_response.lower():
        return 'Diabetes article'
    elif 'covid article' in llama_response.lower():
        return 'Covid article'

In [41]:
# applying the function to the model response
df1['Category'] = df1['llama_response'].apply(extract_category)
df1['Category'].head()

2        Covid article
9        Covid article
18    Diabetes article
11      Cancer article
13      Cancer article
Name: Category, dtype: object

In [42]:
df1['Category'].value_counts()

Cancer article      15
Covid article        3
Diabetes article     2
Name: Category, dtype: int64

In [43]:
final_df1 = df1.drop(['llama_response'], axis=1)
final_df1.head()

Unnamed: 0,title,number_of_authors,publication_year,citation_count,abstract,Category
2,"Long COVID: major findings, mechanisms and rec...",4,2023,744,Long COVID is an often debilitating illness th...,Covid article
9,Burdens of post-acute sequelae of COVID-19 \nb...,3,2021,142,The Post-Acute Sequelae of SARS-CoV-2 infectio...,Covid article
18,DR3 stimulation of adipose resident ILC2s amel...,10,2020,22,Disturbances in glucose homeostasis and low-gr...,Diabetes article
11,Computational correction of copy number effect...,31,2017,1009,The CRISPR–Cas9 system has revolutionized gene...,Cancer article
13,The Cancer Cell Line Encyclopedia enables pred...,63,2012,5005,The systematic translation of cancer genomic d...,Cancer article


## Task 2: Abstract Categorization and Returning Structured Output

In [44]:
# creating a copy of the data
df2 = data.copy()

In [45]:
# defining the instructions for the model
instruction_B = """
   You are an AI expert. Classify the abstracts based on the Abstract Text presented
    in the input into the following categories
      - Cancer Article
      - Covid Article
      - Diabetes Article

    Format the output as a JSON object with a single key-value pair as shown below:
    {"Category": "your_category_prediction"}
"""

In [46]:
df2['llama_response'] = df2['abstract'].apply(lambda x: generate_llama_response(instruction_B, x))

Llama.generate: prefix-match hit

llama_print_timings:        load time =     668.09 ms
llama_print_timings:      sample time =      51.61 ms /    94 runs   (    0.55 ms per token,  1821.35 tokens per second)
llama_print_timings: prompt eval time =     781.04 ms /   286 tokens (    2.73 ms per token,   366.18 tokens per second)
llama_print_timings:        eval time =    5680.64 ms /    93 runs   (   61.08 ms per token,    16.37 tokens per second)
llama_print_timings:       total time =    6765.29 ms /   379 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =     668.09 ms
llama_print_timings:      sample time =     121.44 ms /   182 runs   (    0.67 ms per token,  1498.68 tokens per second)
llama_print_timings: prompt eval time =     950.87 ms /   435 tokens (    2.19 ms per token,   457.48 tokens per second)
llama_print_timings:        eval time =   11700.67 ms /   181 runs   (   64.64 ms per token,    15.47 tokens per second)
llama_print_timings:       to

In [47]:
df2['llama_response'].head()

0     Sure! Here's the classification of the abstra...
1     Sure! Here's my classification of the abstrac...
2     Sure, I can help you classify the abstracts b...
3     Sure, I can help you classify the abstracts b...
4     Sure! I can help you classify the abstracts b...
Name: llama_response, dtype: object

In [48]:
i = 2
print(df2.loc[i, 'abstract'])

Long COVID is an often debilitating illness that occurs in at least 10% of severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) infections. 
More than 200 symptoms have been identified with impacts on multiple organ systems. At least 65 million individuals worldwide are estimated to have long COVID, with cases increasing daily. Biomedical research has made substantial progress in identifying various pathophysiological changes and risk factors and in characterizing the illness; further, similarities with other viral-onset illnesses such as myalgic encephalomyelitis/chronic fatigue syndrome and postural orthostatic tachycardia syndrome have laid the groundwork for research in the field. In this Review, we explore the current literature and highlight key findings, the overlap with other conditions, the variable onset of symptoms, long COVID in children and the impact of vaccinations. Although these key findings are critical to understanding long COVID, current diagnostic and treat

In [49]:
print(df2.loc[i, 'llama_response'])

 Sure, I can help you classify the abstracts based on the text you provided. Here's my prediction based on the input text:

    {
        "Category": "Covid Article"
    }

Here's my reasoning:

The input text describes a review article about long COVID, which is a condition characterized by a range of symptoms and organ dysfunction following severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) infection. The text highlights the current understanding of the pathophysiological changes and risk factors associated with long COVID, as well as the similarities with other viral-onset illnesses such as myalgic encephalomyelitis/chronic fatigue syndrome and postural orthostatic tachycardia syndrome.

Based on the content of the text, it appears to be a Covid Article, rather than a Cancer Article or a Diabetes Article. The focus on SARS-CoV-2 infection and the discussion of long COVID as a distinct condition support this classification.


In [50]:
# defining a function to parse the JSON output from the model
def json_extractor(json_str):
    try:
        # Find the indices of the opening and closing curly braces
        json_start = json_str.find('{')
        json_end = json_str.rfind('}')

        if json_start != -1 and json_end != -1:
            extracted_category = json_str[json_start:json_end + 1]  # Extract the JSON object
            dfdict = json.loads(extracted_category)
            return dfdict
        else:
            print(f"Warning: JSON object not found in response: {json_str}")
            return {}
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        return {}

In [51]:
# Applying the function to the model response
df2['llama_response_parsed'] = df2['llama_response'].apply(json_extractor)
df2['llama_response_parsed'].head()

Error parsing JSON: Extra data: line 3 column 1 (char 34)
Error parsing JSON: Extra data: line 3 column 6 (char 681)
Error parsing JSON: Extra data: line 3 column 5 (char 36)
Error parsing JSON: Extra data: line 3 column 5 (char 38)
Error parsing JSON: Extra data: line 3 column 5 (char 38)


0      {'Category': 'Cancer Article'}
1    {'Category': 'Diabetes Article'}
2       {'Category': 'Covid Article'}
3                                  {}
4      {'Category': 'Cancer Article'}
Name: llama_response_parsed, dtype: object

In [52]:
# Normalizing the llama_response_parsed column
llama_response_parsed_df_2 = pd.json_normalize(df2['llama_response_parsed'])
llama_response_parsed_df_2.head()

Unnamed: 0,Category
0,Cancer Article
1,Diabetes Article
2,Covid Article
3,
4,Cancer Article


In [53]:
# Concatinating two dataframes
dfwith_parsed_model_output_2 = pd.concat([df2, llama_response_parsed_df_2], axis=1)
dfwith_parsed_model_output_2.head()

Unnamed: 0,title,number_of_authors,publication_year,citation_count,abstract,llama_response,llama_response_parsed,Category
0,Somatic mutation in cancer and normal cells,2,2015,1308,Spontaneously occurring mutations accumulate i...,Sure! Here's the classification of the abstra...,{'Category': 'Cancer Article'},Cancer Article
1,Diagnosis and Management of Central Diabetes I...,5,2022,5,Central diabetes insipidus (CDI) is a clinical...,Sure! Here's my classification of the abstrac...,{'Category': 'Diabetes Article'},Diabetes Article
2,"Long COVID: major findings, mechanisms and rec...",4,2023,744,Long COVID is an often debilitating illness th...,"Sure, I can help you classify the abstracts b...",{'Category': 'Covid Article'},Covid Article
3,The Utility of Exosomes in Diagnosis and Thera...,6,2021,12,Diabetes mellitus and the associated complicat...,"Sure, I can help you classify the abstracts b...",{},
4,Exosomes: A potential tool for immunotherapy o...,6,2023,5,Ovarian cancer is a malignant tumor of the fem...,Sure! I can help you classify the abstracts b...,{'Category': 'Cancer Article'},Cancer Article


In [54]:
# Dropping llama_response and llama_response_parsed columns
final_df2 = dfwith_parsed_model_output_2.drop(['llama_response','llama_response_parsed'], axis=1)
final_df2.head()

Unnamed: 0,title,number_of_authors,publication_year,citation_count,abstract,Category
0,Somatic mutation in cancer and normal cells,2,2015,1308,Spontaneously occurring mutations accumulate i...,Cancer Article
1,Diagnosis and Management of Central Diabetes I...,5,2022,5,Central diabetes insipidus (CDI) is a clinical...,Diabetes Article
2,"Long COVID: major findings, mechanisms and rec...",4,2023,744,Long COVID is an often debilitating illness th...,Covid Article
3,The Utility of Exosomes in Diagnosis and Thera...,6,2021,12,Diabetes mellitus and the associated complicat...,
4,Exosomes: A potential tool for immunotherapy o...,6,2023,5,Ovarian cancer is a malignant tumor of the fem...,Cancer Article


In [55]:
# Checking the value counts of Category column
final_df2['Category'].value_counts()

Cancer Article      6
Covid Article       6
Diabetes Article    3
Name: Category, dtype: int64

* `The Utility of Exosomes...` article has not be assigned a category

## Task 3: Abstract Categorization, Creating Tags, and Returning Structured Output

In [56]:
# creating a copy of the data
df3 = df.copy()

In [57]:
# defining the instructions for the model
instruction_C = """
 You are an AI expert. Classify the abstracts based on the Abstract Text presented
    in the input into the following categories
      - Cancer Article
      - Covid Article
      - Diabetes Article

    Once you have identified the Category, then :
        1) Create Tags - This will help us classify the ticket and don't include the category

        Output the category and tags for each category in a JSON format with the following keys:
    {
        "Category": "categories",
        "Tags": "Tag"
    }
"""

In [58]:
# Applying generate_llama_response function on abstract column
df3['llama_response'] = df3['abstract'].apply(lambda x: generate_llama_response(instruction_C, x))

Llama.generate: prefix-match hit

llama_print_timings:        load time =     668.09 ms
llama_print_timings:      sample time =     225.32 ms /   343 runs   (    0.66 ms per token,  1522.27 tokens per second)
llama_print_timings: prompt eval time =     969.66 ms /   461 tokens (    2.10 ms per token,   475.42 tokens per second)
llama_print_timings:        eval time =   22780.47 ms /   342 runs   (   66.61 ms per token,    15.01 tokens per second)
llama_print_timings:       total time =   25216.62 ms /   803 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =     668.09 ms
llama_print_timings:      sample time =     266.76 ms /   420 runs   (    0.64 ms per token,  1574.46 tokens per second)
llama_print_timings: prompt eval time =    1019.00 ms /   410 tokens (    2.49 ms per token,   402.36 tokens per second)
llama_print_timings:        eval time =   30390.01 ms /   419 runs   (   72.53 ms per token,    13.79 tokens per second)
llama_print_timings:       to

In [59]:
df3['llama_response'].head()

2      Sure, I can help you classify the abstracts b...
9      Sure! I can help you with that. Here are the ...
18     Sure, I'd be happy to help! Here are the cate...
11     Sure! I can help you with that. Here are the ...
13     Sure, I'd be happy to help! Here are the cate...
Name: llama_response, dtype: object

In [68]:
i = 4
print(df3.loc[i, 'abstract'])

Ovarian cancer is a malignant tumor of the female reproductive system, with a very poor prognosis and high mortality rates. 
Chemotherapy and radiotherapy are the most common treatments for ovarian cancer, with unsatisfactory results. Exosomes are a subpopulation of extracellular vesicles, which have a diameter of approximately 30-100 nm and are secreted by many different types of cells in various body fluids. Exosomes are highly stable and are effective carriers of immunotherapeutic drugs. Recent studies have shown that exosomes are involved in various cellular responses in the tumor microenvironment, influencing the development and therapeutic efficacy of ovarian cancer, and exhibiting dual roles in inhibiting and promoting tumor development. Exosomes also contain a variety of genes related to ovarian cancer immunotherapy that could be potential biomarkers for ovarian cancer diagnosis and prognosis. Undoubtedly, exosomes have great therapeutic potential in the field of ovarian cancer

In [69]:
print(df3.loc[i, 'llama_response'])

 Sure, I'd be happy to help! Here are the categories and tags for each abstract based on the input:

    1. Cancer Article:
        Category: Cancer Article
        Tags: ovarian cancer, immunotherapy, exosomes

    2. Covid Article:
        Category: Covid Article
        Tags: none

    3. Diabetes Article:
        Category: Diabetes Article
        Tags: none

Here's the output in JSON format:

{
"Category": "Cancer Article",
"Tags": "ovarian cancer, immunotherapy, exosomes"
}

{
"Category": "Covid Article",
"Tags": "none"
}

{
"Category": "Diabetes Article",
"Tags": "none"
}


In [70]:
# Applying the function to the model response
df3['llama_response_parsed'] = df3['llama_response'].apply(json_extractor)
df3['llama_response_parsed'].head()

Error parsing JSON: Extra data: line 6 column 1 (char 110)
Error parsing JSON: Extra data: line 6 column 1 (char 169)
Error parsing JSON: Extra data: line 6 column 5 (char 116)
Error parsing JSON: Extra data: line 6 column 1 (char 151)
Error parsing JSON: Extra data: line 6 column 1 (char 131)
Error parsing JSON: Extra data: line 6 column 1 (char 85)
Error parsing JSON: Extra data: line 6 column 5 (char 112)
Error parsing JSON: Extra data: line 6 column 1 (char 107)
Error parsing JSON: Extra data: line 6 column 1 (char 90)
Error parsing JSON: Extra data: line 6 column 1 (char 91)

Cancer Article:

* Abstract: Somatic mutations accumulate in cancer cells throughout a person's lifetime and contribute to aging.
* Tags: cancer, aging, somatic mutations

Category: Cancer Article
Tags: cancer, aging, somatic mutations

Covid Article:

* Abstract: None of the input provided is related to Covid-19.

No Category or Tags generated for this abstract.

Diabetes Article:

* Abstract: None of the in

2                                                    {}
9     {'Category': {'Cancer': ['Cancer', 'Oncology',...
18    {'Category': {'Cancer': ['Cancer', 'Tumor', 'I...
11    {'Category': {'Cancer': [{'Tags': 'Cancer, Gen...
13                                                   {}
Name: llama_response_parsed, dtype: object

In [71]:
# Normalizing the llama_response_parsed column
llama_response_parsed_df_3 = pd.json_normalize(df3['llama_response_parsed'])
llama_response_parsed_df_3.head(5)

Unnamed: 0,Category.Cancer,Category.Covid,Category.Diabetes,Tags.Cancer,Tags.Covid,Tags.Diabetes,Category,Tags,Category.Covid Article,Category.Cancer Article,Category.Diabetes Article,Tags.#COVID19,Tags.#cancer,Tags.#diabetes,Tags.#treatment,Tags.#prevention
0,,,,,,,,,,,,,,,,
1,"[Cancer, Oncology, Tumor, Neoplasm]","[Covid-19, Pandemic, Vaccine, Public Health]","[Diabetes, Insulin, Glucose, Metabolism]","[Cancer, Oncology, Tumor, Neoplasm]","[Covid-19, Pandemic, Vaccine, Public Health]","[Diabetes, Insulin, Glucose, Metabolism]",,,,,,,,,,
2,"[Cancer, Tumor, Immune System]","[Covid-19, Pandemic, Vaccine]","[Diabetes, Insulin Resistance, Glucose Homeost...","[Cancer, Tumor, Immune System]","[Covid-19, Pandemic, Vaccine]","[Diabetes, Insulin Resistance, Glucose Homeost...",,,,,,,,,,
3,"[{'Tags': 'Cancer, Gene Editing, CRISPR, Cas9,...","[{'Tags': 'Covid, Pandemic, Vaccine, Treatment'}]","[{'Tags': 'Diabetes, Gene Editing, CRISPR, Cas...",,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,


In [64]:
# Concatinating two dataframes
dfwith_parsed_model_output_3 = pd.concat([df3, llama_response_parsed_df_3], axis=1)
dfwith_parsed_model_output_3.head()

Unnamed: 0,title,number_of_authors,publication_year,citation_count,abstract,llama_response,llama_response_parsed,Category.Cancer,Category.Covid,Category.Diabetes,...,Category,Tags,Category.Covid Article,Category.Cancer Article,Category.Diabetes Article,Tags.#COVID19,Tags.#cancer,Tags.#diabetes,Tags.#treatment,Tags.#prevention
2,"Long COVID: major findings, mechanisms and rec...",4,2023,744,Long COVID is an often debilitating illness th...,"Sure, I can help you classify the abstracts b...",{},"[Cancer, Tumor, Immune System]","[Covid-19, Pandemic, Vaccine]","[Diabetes, Insulin Resistance, Glucose Homeost...",...,,,,,,,,,,
9,Burdens of post-acute sequelae of COVID-19 \nb...,3,2021,142,The Post-Acute Sequelae of SARS-CoV-2 infectio...,Sure! I can help you with that. Here are the ...,"{'Category': {'Cancer': ['Cancer', 'Oncology',...",,,,...,,,,,,,,,,
18,DR3 stimulation of adipose resident ILC2s amel...,10,2020,22,Disturbances in glucose homeostasis and low-gr...,"Sure, I'd be happy to help! Here are the cate...","{'Category': {'Cancer': ['Cancer', 'Tumor', 'I...","[Cancer, Oncology, Tumor, Neoplasm]","[Covid-19, Pandemic, Vaccine, Infection]","[Diabetes, Type 2 Diabetes, Kidney Transplant,...",...,,,,,,,,,,
11,Computational correction of copy number effect...,31,2017,1009,The CRISPR–Cas9 system has revolutionized gene...,Sure! I can help you with that. Here are the ...,"{'Category': {'Cancer': [{'Tags': 'Cancer, Gen...",,,,...,,,,,,,,,,
13,The Cancer Cell Line Encyclopedia enables pred...,63,2012,5005,The systematic translation of cancer genomic d...,"Sure, I'd be happy to help! Here are the cate...",{},,,,...,,,,,,,,,,


In [72]:
# Dropping llama_response and llama_response_parsed columns
final_df3 = dfwith_parsed_model_output_3.drop(['llama_response','llama_response_parsed'], axis=1)
final_df3.head()

Unnamed: 0,title,number_of_authors,publication_year,citation_count,abstract,Category.Cancer,Category.Covid,Category.Diabetes,Tags.Cancer,Tags.Covid,...,Category,Tags,Category.Covid Article,Category.Cancer Article,Category.Diabetes Article,Tags.#COVID19,Tags.#cancer,Tags.#diabetes,Tags.#treatment,Tags.#prevention
2,"Long COVID: major findings, mechanisms and rec...",4,2023,744,Long COVID is an often debilitating illness th...,"[Cancer, Tumor, Immune System]","[Covid-19, Pandemic, Vaccine]","[Diabetes, Insulin Resistance, Glucose Homeost...","[Cancer, Tumor, Immune System]","[Covid-19, Pandemic, Vaccine]",...,,,,,,,,,,
9,Burdens of post-acute sequelae of COVID-19 \nb...,3,2021,142,The Post-Acute Sequelae of SARS-CoV-2 infectio...,,,,,,...,,,,,,,,,,
18,DR3 stimulation of adipose resident ILC2s amel...,10,2020,22,Disturbances in glucose homeostasis and low-gr...,"[Cancer, Oncology, Tumor, Neoplasm]","[Covid-19, Pandemic, Vaccine, Infection]","[Diabetes, Type 2 Diabetes, Kidney Transplant,...","[Cancer, Oncology, Tumor, Neoplasm]","[Covid-19, Pandemic, Vaccine, Infection]",...,,,,,,,,,,
11,Computational correction of copy number effect...,31,2017,1009,The CRISPR–Cas9 system has revolutionized gene...,,,,,,...,,,,,,,,,,
13,The Cancer Cell Line Encyclopedia enables pred...,63,2012,5005,The systematic translation of cancer genomic d...,,,,,,...,,,,,,,,,,
