In [2]:
import numpy as np
import pandas as pd

In [3]:
# Read the knowledge-graph edge list. low_memory=False makes pandas parse the
# file in one pass instead of chunk-wise, avoiding mixed-dtype inference.
kg_giant = pd.read_csv(
    "/home/ubuntu/Project_Files/Finetune/Data/kg_giant.csv",
    low_memory=False,
)

In [4]:
# Preview the first rows of the edge list (head() defaults to five rows).
kg_giant.head()

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,protein_protein,ppi,9796,gene/protein,PHYHIP,NCBI,56992,gene/protein,KIF15,NCBI
1,protein_protein,ppi,7918,gene/protein,GPANK1,NCBI,9240,gene/protein,PNMA1,NCBI
2,protein_protein,ppi,8233,gene/protein,ZRSR2,NCBI,23548,gene/protein,TTC33,NCBI
3,protein_protein,ppi,4899,gene/protein,NRF1,NCBI,11253,gene/protein,MAN1B1,NCBI
4,protein_protein,ppi,5297,gene/protein,PI4KA,NCBI,8601,gene/protein,RGS20,NCBI


In [5]:
# (number of rows, number of columns) of the loaded edge list.
kg_giant.shape

(8196862, 10)

In [6]:
# Report the distinct values found in each categorical / name column.
# Each entry pairs the exact heading to print with the column it describes.
unique_value_reports = [
    ("----|||| Unique relations are:", 'relation'),
    ("----|||| Unique display relations are:", 'display_relation'),
    ("----|||| Unique x_type values are:", 'x_type'),
    ("----|||| Unique y_type values are:", 'y_type'),
    ("----|||| Unique x_name values are:", 'x_name'),
    ("----|||| Unique y_name values are:", 'y_name'),
]
for heading, column_name in unique_value_reports:
    print(heading, kg_giant[column_name].unique())

----|||| Unique relations are: ['protein_protein' 'drug_protein' 'contraindication' 'indication'
 'off-label use' 'drug_drug' 'phenotype_protein' 'phenotype_phenotype'
 'disease_phenotype_negative' 'disease_phenotype_positive'
 'disease_protein' 'disease_disease' 'drug_effect' 'bioprocess_bioprocess'
 'molfunc_molfunc' 'cellcomp_cellcomp' 'molfunc_protein'
 'cellcomp_protein' 'bioprocess_protein' 'exposure_protein'
 'exposure_disease' 'exposure_exposure' 'exposure_bioprocess'
 'exposure_molfunc' 'exposure_cellcomp' 'pathway_pathway'
 'pathway_protein' 'anatomy_anatomy' 'anatomy_protein_present'
 'anatomy_protein_absent']
----|||| Unique display relations are: ['ppi' 'carrier' 'enzyme' 'target' 'transporter' 'contraindication'
 'indication' 'off-label use' 'synergistic interaction' 'associated with'
 'parent-child' 'phenotype absent' 'phenotype present' 'side effect'
 'interacts with' 'linked to' 'expression present' 'expression absent']
----|||| Unique x_type values are: ['gene/protein

In [7]:
# Count of distinct values in every column of the edge list.
kg_giant.nunique()

relation                30
display_relation        18
x_id                 90354
x_type                  10
x_name              134381
x_source                 8
y_id                 90354
y_type                  10
y_name              134381
y_source                 8
dtype: int64

In [8]:
def generate_well_formed_sentences_from_df(df):
    """Turn every knowledge-graph edge in ``df`` into one English sentence.

    Each row is rendered with one of three sentence templates, chosen by the
    value of its ``relation`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``relation``, ``display_relation``,
        ``x_type``, ``x_name``, ``y_type`` and ``y_name``.

    Returns
    -------
    list of str
        One sentence per row, in row order. An empty list for a frame with
        no rows.

    Raises
    ------
    KeyError
        If one of the required columns is missing (and the frame has rows).
    ValueError
        If any row carries a ``relation`` that has no sentence template.
        Previously such a row either crashed with ``UnboundLocalError`` (first
        row) or silently re-used the sentence built for the previous row.
    """
    located_template = "In the {x_name}, which is a type of {x_type}, there is a noted {display_relation} of the {y_type} {y_name}."
    paired_template = "The {x_type} {x_name} and the {y_type} {y_name} have a {display_relation}."
    related_template = "The {x_type} {x_name} is related to the {y_type} {y_name} in terms of {display_relation}."

    relations_by_template = {
        located_template: [
            "anatomy_protein_present", "anatomy_anatomy", "protein_protein",
            "molfunc_protein", "disease_protein", "bioprocess_protein",
            "cellcomp_protein", "anatomy_protein_absent", "drug_protein",
            "pathway_protein", "phenotype_protein", "exposure_protein",
        ],
        paired_template: [
            "drug_drug", "exposure_disease", "disease_disease", "disease_phenotype_positive",
            "drug_effect", "contraindication", "indication", "off-label use",
            "disease_phenotype_negative", "exposure_exposure",
        ],
        related_template: [
            "bioprocess_bioprocess", "molfunc_molfunc", "phenotype_phenotype",
            "cellcomp_cellcomp", "pathway_pathway", "exposure_bioprocess",
            "exposure_cellcomp", "exposure_molfunc",
        ],
    }
    # Flatten to relation -> template so each row needs a single dict lookup.
    template_by_relation = {
        relation: template
        for template, relations in relations_by_template.items()
        for relation in relations
    }

    # A frame without rows yields no sentences, whatever its columns are.
    if len(df) == 0:
        return []

    # Fail fast, before iterating a potentially huge frame, if any relation
    # has no template; otherwise a stale sentence would be emitted for it.
    unknown_relations = set(df['relation'].unique()).difference(template_by_relation)
    if unknown_relations:
        raise ValueError(
            "No sentence template for relation(s): "
            + ", ".join(sorted(map(str, unknown_relations)))
        )

    field_names = ['relation', 'display_relation', 'x_type', 'x_name', 'y_type', 'y_name']
    sentences = []
    # Iterate the needed columns in lockstep; this avoids building a Series
    # per row, which is what makes DataFrame.iterrows() slow on large frames.
    for relation, display_relation, x_type, x_name, y_type, y_name in zip(
        *(df[name] for name in field_names)
    ):
        sentences.append(
            template_by_relation[relation].format(
                display_relation=display_relation,
                x_type=x_type,
                x_name=x_name,
                y_type=y_type,
                y_name=y_name,
            )
        )
    return sentences

# Generating well-formed sentences from the dataframe
well_formed_sentences = generate_well_formed_sentences_from_df(kg_giant)

# Creating a new DataFrame with the sentences (single column "Sentence")
sentences_df = pd.DataFrame(well_formed_sentences, columns=["Sentence"])

# Saving the sentences to a csv file (index omitted so the file has one column)
sentences_df.to_csv('/home/ubuntu/Project_Files/Finetune/Data/sentences.csv', index=False)

# Displaying the first few sentences to verify. This has to be the last
# expression of the cell: a bare expression in the middle of a cell is
# evaluated and discarded without being rendered.
sentences_df.head()


In [9]:
# Short alias for the sentence table; this binds a second name to the same
# DataFrame object, it does not copy it.
df = sentences_df

In [10]:
# Preview the first generated sentences.
df.head()

Unnamed: 0,Sentence
0,"In the PHYHIP, which is a type of gene/protein..."
1,"In the GPANK1, which is a type of gene/protein..."
2,"In the ZRSR2, which is a type of gene/protein,..."
3,"In the NRF1, which is a type of gene/protein, ..."
4,"In the PI4KA, which is a type of gene/protein,..."


In [11]:
import torch
import pandas as pd
import transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Checkpoint used for the rewriting step; tokenizer and weights are both
# fetched under this name.
# Other checkpoints noted as candidates (not loaded here):
#   "meta-llama/Llama-2-7b"
#   "moreh/MoMo-72B-lora-1.8.7-DPO"   (chat model)
#   "cloudyu/Mixtral_34Bx2_MoE_60B"   (pretrained)
model_name = "sberbank-ai/mGPT"

tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Load the weights, move them to the GPU and switch the module to eval mode.
# Both .cuda() and .eval() return the module itself, so the calls can be chained.
model = GPT2LMHeadModel.from_pretrained(model_name).cuda().eval()

# Seed the random number generators so sampled generations are repeatable.
transformers.set_seed(11)

# One demonstration pair for the few-shot prompt: a templated sentence and
# the phrasing the model is expected to produce for it.
example_incorrect = "In the UBC, which is a type of gene/protein, there is a noted ppi of the gene/protein DCAF1"
example_correct = "In the UBC, a type of gene/protein, there is a noted protein-protein interaction (ppi) with the gene/protein DCAF1."

  from .autonotebook import tqdm as notebook_tqdm
tokenizer_config.json: 100%|██████████| 1.46k/1.46k [00:00<00:00, 9.37MB/s]
vocab.json: 100%|██████████| 1.89M/1.89M [00:00<00:00, 33.8MB/s]
merges.txt: 100%|██████████| 1.20M/1.20M [00:00<00:00, 39.8MB/s]
special_tokens_map.json: 100%|██████████| 606/606 [00:00<00:00, 5.35MB/s]
config.json: 100%|██████████| 738/738 [00:00<00:00, 5.84MB/s]
pytorch_model.bin: 100%|██████████| 3.45G/3.45G [00:53<00:00, 64.7MB/s]
  return self.fget.__get__(instance, owner)()


In [12]:
# Token budget for the model's continuation only. The earlier `max_length`
# setting also counted the prompt tokens, so the room left for the answer
# shrank as the prompt grew. Tune as needed.
MAX_NEW_TOKENS = 100

# NOTE(review): this makes one generate() call per row of `df`; on the full
# sentence table that is a very long run. Consider working on a sample or
# batching prompts before running it end to end.
corrected_sentences = []
for sentence_to_correct in df['Sentence']:
    # Few-shot prompt: one worked example, then the sentence to rewrite.
    prompt = f"Incorrect: {example_incorrect} Correct: {example_correct} Incorrect: {sentence_to_correct} Correct:"
    input_ids = tokenizer.encode(prompt, return_tensors="pt").cuda()
    out = model.generate(
        input_ids,
        max_new_tokens=MAX_NEW_TOKENS,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        no_repeat_ngram_size=2
    )
    # generate() returns prompt + continuation; keep only the newly generated
    # tokens so the stored text does not repeat the prompt and its examples.
    new_token_ids = out[0][input_ids.shape[-1]:]
    continuation = tokenizer.decode(new_token_ids, skip_special_tokens=True)
    # The model may go on inventing further "Incorrect: ... Correct: ..."
    # pairs; the answer is whatever precedes the next "Incorrect:" marker.
    corrected_sentence = continuation.split("Incorrect:", 1)[0].strip()
    corrected_sentences.append(corrected_sentence)


KeyboardInterrupt: 

In [None]:

# Collect whatever rewrites were produced into a one-column table and write
# it, without the index, to corrected_sentences.csv in the working directory.
corrected_df = pd.DataFrame({'Corrected Sentence': corrected_sentences})
corrected_df.to_csv("corrected_sentences.csv", index=False)
