<a href="https://colab.research.google.com/github/rauana-carvalho/Singularity/blob/main/chatBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import unicodedata
import re
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pandas as pd

# Load the experiment table from the CSV file.
file_path = '/content/testerelatorios.csv'  # Replace with the path to your CSV file
data = pd.read_csv(file_path, delimiter=',')

# Print the column names of the loaded table, header first, list on the next line.
print("Nomes das colunas:", data.columns.tolist(), sep="\n")

# Function to query the GPT-2 model
def query_gpt2(question):
    """Generate a GPT-2 continuation of ``question``.

    Relies on module-level ``tokenizer`` and ``model`` objects, which must be
    defined before this function is called.

    Args:
        question: Prompt text to feed to the model.

    Returns:
        The decoded model output with special tokens removed and surrounding
        whitespace stripped. A blank or missing prompt yields an empty string.
    """
    if not question or not question.strip():
        # NOTE(review): GPT-2's tokenizer is expected to produce no tokens for
        # an empty prompt, leaving generation nothing to start from.
        return ""

    encoded = tokenizer(question, return_tensors='pt')

    with torch.no_grad():
        output = model.generate(
            encoded["input_ids"],
            # Pass the mask explicitly so generation does not have to infer it.
            attention_mask=encoded["attention_mask"],
            # Budget only the generated tokens: ``max_length`` also counts the
            # prompt, so a prompt of 50+ tokens left no room for a reply.
            max_new_tokens=50,
            # GPT-2 defines no pad token; reuse EOS for padding.
            pad_token_id=tokenizer.eos_token_id,
        )

    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response.strip()

def normalize_string(text):
    """Return ``text`` lower-cased, trimmed and reduced to ASCII characters.

    The text is NFKD-decomposed and every character that cannot be encoded
    as ASCII (such as combining accent marks) is dropped.
    """
    lowered = text.lower().strip()
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_bytes = decomposed.encode('ascii', 'ignore')  # Remove accents
    return ascii_bytes.decode('utf-8')

def remove_duplicate_spaces(text):
    """Replace every run of whitespace in ``text`` with a single space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

def fetch_experiment_data(question_input):
    """Answer a question about one sample of the module-level ``data`` table.

    The question and every ``Sample_Name`` value are compared after
    lower-casing, accent removal and whitespace collapsing. A sample is
    selected when the question contains ``experiment <sample name>``; if
    several sample names match (one being a prefix of another), the longest
    name wins, and the earliest row wins among equal names.

    Args:
        question_input: Raw question text typed by the user.

    Returns:
        Newline-separated ``"<Keyword>: <value>"`` lines. All mapped columns
        are listed when the question contains one of the "info" prefixes;
        otherwise only the columns whose keyword appears in the question.
        An explanatory message is returned when no sample is recognised or
        when no keyword is recognised for the matched sample.
    """
    # Dictionary mapping keywords to dataframe columns
    mapping = {
        "experiment name": "Source_Name",
        "sample name": "Sample_Name",
        "organism": "Characteristics_Organism",
        "strain": "Characteristics_Strain",
        "laboratory": "Characteristics_Animal_Source",
        "genotype": "Characteristics_Genotype",
        "material": "Characteristics_Material_Type",
        "space flight": "Factor_Value_Spaceflight",
        "age at launch": "Characteristics_Age_at_Launch",
        "sex": "Characteristics_Sex",
        "habitat": "Parameter_Value_habitat",
        "duration": "Parameter_Value_duration",
        "light cycle": "Parameter_Value_light_cycle",
        "enrichment": "Parameter_Value_Enrichment_material",
        "diet": "Parameter_Value_diet",
        "feeding schedule": "Parameter_Value_Feeding_Schedule",
        "euthanasia method": "Parameter_Value_Euthanasia_Method",
        "age at euthanasia": "Parameter_Value_Age_at_Euthanasia",
        "preservation": "Parameter_Value_Sample_Preservation_Method",
        "storage temperature": "Parameter_Value_Sample_Storage_Temperature",
        "body weight at euthanasia": "Parameter_Value_Body_Weight_upon_Euthanasia",
        "euthanasia date": "Comment_Euthanasia_Date",
        "description": "Comment_Source_Description",
        "protocol": "Protocol_REF",
    }

    # Prefixes that indicate that all information should be returned
    info_prefixes = ("information about", "info on", "characteristics about", "characteristics of")

    question = remove_duplicate_spaces(normalize_string(question_input))

    # Pick the longest sample name mentioned in the question, so that a name
    # which is a prefix of another cannot shadow the longer one.
    best_position = None
    best_name = ""
    for position, raw_name in enumerate(data["Sample_Name"]):
        if not isinstance(raw_name, str):
            continue  # missing or non-text cells cannot be normalized
        # Normalize exactly like the question so the substring test is fair.
        candidate = remove_duplicate_spaces(normalize_string(raw_name))
        if len(candidate) > len(best_name) and f"experiment {candidate}" in question:
            best_position, best_name = position, candidate

    if best_position is None:
        return "Sorry, I couldn't find any information about the mentioned experiment."

    wants_everything = any(prefix in question for prefix in info_prefixes)
    # Positional ``iloc`` access keeps working when the frame's index is not
    # the default 0..n-1 range.
    response = [
        f"{key.capitalize()}: {data[column].iloc[best_position]}"
        for key, column in mapping.items()
        if wants_everything or key in question
    ]

    if not response:
        # Previously an empty string was returned here, which looked like
        # the assistant had said nothing at all.
        return (
            "I found that experiment, but couldn't tell which characteristic you are asking about. "
            "Try 'info on experiment <sample name>' to see everything."
        )

    return "\n".join(response)

# Canned replies for standard queries, keyed by trigger word.
predefined_responses = dict(
    help=(
        "I am a virtual assistant to help with information about experiments. "
        "You can ask about the name, organism, or any other characteristic of NASA space experiments."
    ),
    experiments=(
        "Currently, we have various information about NASA space experiments available. "
        "Please provide the sample name or any other information you're looking for."
    ),
)

def list_experiments():
    """Return the distinct values of the ``Sample_Name`` column of ``data``."""
    sample_names = data["Sample_Name"]
    return sample_names.unique()

def search_available_experiments(question):
    """List the known sample names when the question asks what is available.

    Args:
        question: Raw question text; matched case-insensitively.

    Returns:
        A multi-line string headed ``"The available experiments are:"`` when
        the question contains "which experiments" or "available experiments";
        otherwise ``None``.
    """
    lowered = question.lower()
    if "which experiments" not in lowered and "available experiments" not in lowered:
        return None  # Return None if no relevant question is found

    # ``str.join`` accepts only strings: drop missing cells and coerce any
    # non-text names so one odd row cannot raise TypeError.
    samples = [str(name) for name in list_experiments() if not pd.isna(name)]
    return "The available experiments are:\n" + "\n".join(samples)

def search_predefined_response(question):
    """Return the first canned reply whose trigger occurs in ``question``.

    Triggers are the keys of ``predefined_responses``, tested in dictionary
    order against the lower-cased question. Returns ``None`` if no trigger
    is found.
    """
    lowered = question.lower()
    matches = (
        reply
        for trigger, reply in predefined_responses.items()
        if trigger in lowered
    )
    return next(matches, None)

def interact_with_user():
    """Run the console chat loop until the user types ``exit`` or input ends.

    Each question is answered by the first of these that yields a non-empty
    result: the available-experiments listing, the canned replies, and
    finally the per-sample lookup (whose result is printed as-is).
    """
    farewell = "Assistant: Goodbye! If you need more information, feel free to ask."

    print("Hello! I am a virtual assistant. How can I help you with information about the experiments?")
    while True:
        try:
            # Strip so that " exit " or a trailing newline still ends the chat.
            user_question = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed input stream or interrupted cell: leave cleanly instead
            # of ending the session with a traceback.
            print()
            print(farewell)
            break

        if user_question.lower() == 'exit':
            print(farewell)
            break

        answer = (
            search_available_experiments(user_question)
            or search_predefined_response(user_question)
            or fetch_experiment_data(user_question)
        )
        print(f"Assistant:\n{answer}\n")


# Start the chat only when run as a script or notebook cell, so that importing
# this code does not block on input().
if __name__ == "__main__":
    interact_with_user()


Nomes das colunas:
['Source_File', 'Source_Name', 'Sample_Name', 'Characteristics_Organism', 'Characteristics_Strain', 'Characteristics_Animal_Source', 'Characteristics_Genotype', 'Characteristics_Material_Type', 'Factor_Value_Spaceflight', 'Characteristics_Age_at_Launch', 'Characteristics_Sex', 'Protocol_REF', 'Parameter_Value_habitat', 'Parameter_Value_duration', 'Parameter_Value_light_cycle', 'Parameter_Value_Enrichment_material', 'Parameter_Value_diet', 'Parameter_Value_Feeding_Schedule', 'Protocol_REF_1', 'Parameter_Value_Euthanasia_Method', 'Parameter_Value_Age_at_Euthanasia', 'Parameter_Value_Sample_Preservation_Method', 'Parameter_Value_Sample_Storage_Temperature', 'Parameter_Value_Body_Weight_upon_Euthanasia', 'Comment_Euthanasia_Date', 'Comment_Source_Description']
Hello! I am a virtual assistant. How can I help you with information about the experiments?

You: info on experiment RR23_R-EDL_FLT_F1_techrep1
Assistant:
Experiment name: RR-23_F1
Sample name: RR23_R-EDL_FLT_F1_te

In [None]:
!pip install transformers
!pip install gradio

Collecting gradio
  Downloading gradio-4.44.1-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from g

In [74]:
import torch
import unicodedata
import re
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pandas as pd
import gradio as gr

# Load the experiment table from the CSV file.
file_path = '/content/testerelatorios.csv'  # Replace with the path to your CSV file
data = pd.read_csv(file_path, delimiter=',')

# Load the pretrained GPT-2 tokenizer and language model.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

def query_gpt2(question):
    """Generate a GPT-2 continuation of ``question``.

    Uses the module-level ``tokenizer`` and ``model`` objects.

    Args:
        question: Prompt text to feed to the model.

    Returns:
        The decoded model output with special tokens removed and surrounding
        whitespace stripped. A blank or missing prompt yields an empty string.
    """
    if not question or not question.strip():
        # NOTE(review): GPT-2's tokenizer is expected to produce no tokens for
        # an empty prompt, leaving generation nothing to start from.
        return ""

    encoded = tokenizer(question, return_tensors='pt')

    with torch.no_grad():
        output = model.generate(
            encoded["input_ids"],
            # Pass the mask explicitly so generation does not have to infer it.
            attention_mask=encoded["attention_mask"],
            # Budget only the generated tokens: ``max_length`` also counts the
            # prompt, so a prompt of 50+ tokens left no room for a reply.
            max_new_tokens=50,
            # GPT-2 defines no pad token; reuse EOS for padding.
            pad_token_id=tokenizer.eos_token_id,
        )

    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response.strip()

def normalize_string(text):
    """Return ``text`` lower-cased, trimmed and reduced to ASCII characters.

    The text is NFKD-decomposed and every character that cannot be encoded
    as ASCII (such as combining accent marks) is dropped.
    """
    lowered = text.lower().strip()
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_bytes = decomposed.encode('ascii', 'ignore')  # Remove accents
    return ascii_bytes.decode('utf-8')

def remove_duplicate_spaces(text):
    """Replace every run of whitespace in ``text`` with a single space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

def fetch_experiment_data(question_input):
    """Answer a question about one sample of the module-level ``data`` table.

    The question and every ``Sample_Name`` value are compared after
    lower-casing, accent removal and whitespace collapsing. A sample is
    selected when the question contains ``experiment <sample name>``; if
    several sample names match (one being a prefix of another), the longest
    name wins, and the earliest row wins among equal names.

    Args:
        question_input: Raw question text typed by the user.

    Returns:
        Newline-separated ``"<Keyword>: <value>"`` lines. All mapped columns
        are listed when the question contains one of the "info" prefixes;
        otherwise only the columns whose keyword appears in the question.
        An explanatory message is returned when no sample is recognised or
        when no keyword is recognised for the matched sample.
    """
    # Question keyword -> dataframe column holding the answer.
    mapping = {
        "experiment name": "Source_Name",
        "sample name": "Sample_Name",
        "organism": "Characteristics_Organism",
        "strain": "Characteristics_Strain",
        "laboratory": "Characteristics_Animal_Source",
        "genotype": "Characteristics_Genotype",
        "material": "Characteristics_Material_Type",
        "space flight": "Factor_Value_Spaceflight",
        "age at launch": "Characteristics_Age_at_Launch",
        "sex": "Characteristics_Sex",
        "habitat": "Parameter_Value_habitat",
        "duration": "Parameter_Value_duration",
        "light cycle": "Parameter_Value_light_cycle",
        "enrichment": "Parameter_Value_Enrichment_material",
        "diet": "Parameter_Value_diet",
        "feeding schedule": "Parameter_Value_Feeding_Schedule",
        "euthanasia method": "Parameter_Value_Euthanasia_Method",
        "age at euthanasia": "Parameter_Value_Age_at_Euthanasia",
        "preservation": "Parameter_Value_Sample_Preservation_Method",
        "storage temperature": "Parameter_Value_Sample_Storage_Temperature",
        "body weight at euthanasia": "Parameter_Value_Body_Weight_upon_Euthanasia",
        "euthanasia date": "Comment_Euthanasia_Date",
        "description": "Comment_Source_Description",
        "protocol": "Protocol_REF",
    }

    # Phrases that request every mapped column instead of specific keywords.
    info_prefixes = ("information about", "info on", "characteristics about", "characteristics of")

    question = remove_duplicate_spaces(normalize_string(question_input))

    # Pick the longest sample name mentioned in the question, so that a name
    # which is a prefix of another cannot shadow the longer one.
    best_position = None
    best_name = ""
    for position, raw_name in enumerate(data["Sample_Name"]):
        if not isinstance(raw_name, str):
            continue  # missing or non-text cells cannot be normalized
        # Normalize exactly like the question so the substring test is fair.
        candidate = remove_duplicate_spaces(normalize_string(raw_name))
        if len(candidate) > len(best_name) and f"experiment {candidate}" in question:
            best_position, best_name = position, candidate

    if best_position is None:
        return "Sorry, I couldn't find any information about the mentioned experiment."

    wants_everything = any(prefix in question for prefix in info_prefixes)
    # Positional ``iloc`` access keeps working when the frame's index is not
    # the default 0..n-1 range.
    response = [
        f"{key.capitalize()}: {data[column].iloc[best_position]}"
        for key, column in mapping.items()
        if wants_everything or key in question
    ]

    if not response:
        # Previously an empty string was returned here, which rendered as a
        # blank reply.
        return (
            "I found that experiment, but couldn't tell which characteristic you are asking about. "
            "Try 'info on experiment <sample name>' to see everything."
        )

    return "\n".join(response)

def chatbot_response(user_input):
    """Produce the chatbot's reply to ``user_input``.

    Tries, in order, the available-experiments listing, the canned replies
    and the per-sample lookup. The first truthy result is returned; if the
    first two yield nothing, the lookup's result is returned as-is.
    """
    return (
        search_available_experiments(user_input)
        or search_predefined_response(user_input)
        or fetch_experiment_data(user_input)
    )

# Canned replies for standard queries, keyed by trigger word.
predefined_responses = dict(
    help=(
        "I am a virtual assistant to help with information about experiments. "
        "You can ask about the name, organism, or any other characteristic of NASA space experiments."
    ),
    experiments=(
        "Currently, we have various information about NASA space experiments available. "
        "Please provide the sample name or any other information you're looking for."
    ),
)

def list_experiments():
    """Return the distinct values of the ``Sample_Name`` column of ``data``."""
    sample_names = data["Sample_Name"]
    return sample_names.unique()

def search_available_experiments(question):
    """List the known sample names when the question asks what is available.

    Args:
        question: Raw question text; matched case-insensitively.

    Returns:
        A multi-line string headed ``"The available experiments are:"`` when
        the question contains "which experiments" or "available experiments";
        otherwise ``None``.
    """
    lowered = question.lower()
    if "which experiments" not in lowered and "available experiments" not in lowered:
        return None

    # ``str.join`` accepts only strings: drop missing cells and coerce any
    # non-text names so one odd row cannot raise TypeError.
    samples = [str(name) for name in list_experiments() if not pd.isna(name)]
    return "The available experiments are:\n" + "\n".join(samples)

def search_predefined_response(question):
    """Return the first canned reply whose trigger occurs in ``question``.

    Triggers are the keys of ``predefined_responses``, tested in dictionary
    order against the lower-cased question. Returns ``None`` if no trigger
    is found.
    """
    lowered = question.lower()
    matches = (
        reply
        for trigger, reply in predefined_responses.items()
        if trigger in lowered
    )
    return next(matches, None)

# Gradio Interface: a single text box in, a single text box out, with
# chatbot_response producing the reply.
iface = gr.Interface(
    fn=chatbot_response,
    inputs="text",
    outputs="text",
    title="NASA Experiment Chatbot",
    description="Ask about NASA space experiments and get information based on our database.",
)

# Start serving the interface.
iface.launch()




Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://2845adca7c47668d01.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


