In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the knowledge-graph edge table from CSV. low_memory=False makes pandas
# infer each column's dtype from the whole file instead of chunk by chunk,
# which avoids mixed-dtype columns.
kg_giant = pd.read_csv("/home/ubuntu/Project_Files/Finetune/Data/kg_giant.csv", low_memory=False)

In [None]:
# Preview the first rows of the knowledge-graph table.
kg_giant.head()

In [None]:
# (number of rows, number of columns) of the knowledge-graph table.
kg_giant.shape

In [None]:
# Print the distinct values of each descriptive column (relation,
# display_relation, x_type, y_type, x_name, y_name), one labelled line per column.
unique_value_labels = {
    'relation': "----|||| Unique relations are:",
    'display_relation': "----|||| Unique display relations are:",
    'x_type': "----|||| Unique x_type values are:",
    'y_type': "----|||| Unique y_type values are:",
    'x_name': "----|||| Unique x_name values are:",
    'y_name': "----|||| Unique y_name values are:",
}
for column_name, label in unique_value_labels.items():
    print(label, kg_giant[column_name].unique())

In [None]:
# Count of distinct values in every column of the knowledge-graph table.
kg_giant.nunique()

In [None]:
def generate_well_formed_sentences_from_df(df):
    """Turn every knowledge-graph edge into one templated English sentence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``relation``, ``display_relation``,
        ``x_type``, ``x_name``, ``y_type`` and ``y_name``.

    Returns
    -------
    list of str
        Exactly one sentence per row of ``df``, in row order. The template is
        chosen from the value of ``relation``; a relation that is not listed
        in any group falls back to the generic "is related to" template.
    """
    # Relations phrased as "In the X ... there is a noted <relation> of the Y".
    presence_relations = frozenset([
        "anatomy_protein_present", "anatomy_anatomy", "protein_protein",
        "molfunc_protein", "disease_protein", "bioprocess_protein",
        "cellcomp_protein", "anatomy_protein_absent", "drug_protein",
        "pathway_protein", "phenotype_protein", "exposure_protein",
    ])
    # Relations phrased as "The X and the Y have a <relation>".
    pairwise_relations = frozenset([
        "drug_drug", "exposure_disease", "disease_disease", "disease_phenotype_positive",
        "drug_effect", "contraindication", "indication", "off-label use",
        "disease_phenotype_negative", "exposure_exposure",
    ])
    # Every other relation uses the generic "is related to" template. The
    # relations that were originally mapped to it explicitly are:
    # bioprocess_bioprocess, molfunc_molfunc, phenotype_phenotype,
    # cellcomp_cellcomp, pathway_pathway, exposure_bioprocess,
    # exposure_cellcomp, exposure_molfunc.

    # An empty frame (possibly without any columns) yields no sentences.
    if df.empty:
        return []

    needed_columns = ['relation', 'display_relation', 'x_type', 'x_name', 'y_type', 'y_name']

    sentences = []
    # Iterating the six columns in lock-step avoids building a Series per row,
    # which is what makes iterrows() slow on large frames.
    for relation, display_relation, x_type, x_name, y_type, y_name in zip(
        *(df[column] for column in needed_columns)
    ):
        if relation in presence_relations:
            sentence = f"In the {x_name}, which is a type of {x_type}, there is a noted {display_relation} of the {y_type} {y_name}."
        elif relation in pairwise_relations:
            sentence = f"The {x_type} {x_name} and the {y_type} {y_name} have a {display_relation}."
        else:
            # Always assign here: without this branch an unlisted relation
            # would either raise UnboundLocalError (first row) or silently
            # re-append the previous row's sentence.
            sentence = f"The {x_type} {x_name} is related to the {y_type} {y_name} in terms of {display_relation}."

        sentences.append(sentence)
    return sentences

# Build one templated sentence per knowledge-graph row.
well_formed_sentences = generate_well_formed_sentences_from_df(kg_giant)

# Wrap the sentences in a single-column DataFrame.
sentences_df = pd.DataFrame(well_formed_sentences, columns=["Sentence"])

# Persist the sentences so later cells can reload them from disk.
sentences_df.to_csv('/home/ubuntu/Project_Files/Finetune/Data/sentences.csv', index=False)

# Preview the first few sentences. A notebook only renders the value of the
# cell's last expression, so the preview has to come after the save.
sentences_df.head()


In [None]:
# Bind the generated sentence frame to the shorter name `df` (same object, no copy).
df = sentences_df

In [None]:
# Preview the first rows of the sentence frame.
df.head()

In [7]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.42.0


In [1]:
import torch
import pandas as pd
import transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face identifier of the checkpoint to load. The commented-out
# `model_name` lines are other checkpoints, presumably tried earlier and kept
# for reference; none of them is used.
# model_name = "sberbank-ai/mGPT"
# model_name = "meta-llama/Llama-2-7b"
# model_name = "moreh/MoMo-72B-lora-1.8.7-DPO" ## ChatModel
model_path = "cloudyu/Mixtral_34Bx2_MoE_60B" ## Pretrained

# Reload the templated sentences from the CSV written by the sentence-generation
# cell. This rebinds `df`, replacing the earlier in-memory alias of `sentences_df`.
df = pd.read_csv("/home/ubuntu/Project_Files/Finetune/Data/sentences.csv", low_memory=False)



  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Load the tokenizer and the causal language model named by `model_path`.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_default_system_prompt=False)

# device_map='auto' lets accelerate decide where each weight shard lives
# (possibly spread over several GPUs and, if needed, CPU).
# NOTE(review): float32 weights cost 4 bytes per parameter; for a checkpoint of
# this size confirm the hardware can hold it, or consider a half-precision dtype.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map='auto',
    torch_dtype=torch.float32,
    local_files_only=False,
    load_in_4bit=False,
)

# No model.cuda() here: the model is already placed by device_map='auto'.
# Moving a dispatched model onto a single device would undo that placement.
model.eval()

# Set the seed for reproducibility of the sampled generations.
transformers.set_seed(11)

# One-shot example pair used to build the rewrite prompt. The "incorrect"
# example ends with a period so that it has the same shape as the templated
# sentences it stands in for.
example_incorrect = "In the UBC, which is a type of gene/protein, there is a noted ppi of the gene/protein DCAF1."
example_correct = "The gene/protein UBC exhibits a notable protein-protein interaction (PPI) with another gene/protein, DCAF1."

Loading checkpoint shards:  15%|█▌        | 2/13 [00:11<01:01,  5.57s/it]


KeyboardInterrupt: 

In [None]:
# Rewrite every templated sentence with a one-shot "Incorrect: ... Correct: ..."
# prompt and collect only the model's rewrite.

# Token budget for the rewrite itself. `max_length` would also count the prompt
# tokens, so a long prompt could leave little or no room for the answer.
MAX_NEW_TOKENS = 64
# Marker that starts the next example if the model keeps going after its answer.
NEXT_EXAMPLE_MARKER = "Incorrect:"

corrected_sentences = []
for sentence_to_correct in df['Sentence']:
    prompt = f"Incorrect: {example_incorrect} Correct: {example_correct} Incorrect: {sentence_to_correct} Correct:"
    # Put the inputs on the model's own device instead of assuming plain .cuda().
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    # no_repeat_ngram_size is deliberately not set: the ban also covers n-grams
    # in the prompt, which would stop the model from copying multi-token entity
    # names out of the sentence it is asked to rewrite.
    out = model.generate(
        input_ids,
        max_new_tokens=MAX_NEW_TOKENS,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=50,
        top_p=0.95,
    )
    # Decode only the newly generated tokens; out[0] starts with the prompt.
    prompt_length = input_ids.shape[-1]
    continuation = tokenizer.decode(out[0][prompt_length:], skip_special_tokens=True)
    # Keep the text up to the point where the model starts another example.
    corrected_sentence = continuation.split(NEXT_EXAMPLE_MARKER, 1)[0].strip()
    corrected_sentences.append(corrected_sentence)


In [None]:

# Collect the model outputs in a single-column frame and write it to CSV
# (relative path, no index column).
corrected_df = pd.DataFrame({'Corrected Sentence': corrected_sentences})
corrected_df.to_csv("corrected_sentences.csv", index=False)
