In [2]:
import os
import requests
import re
import pandas as pd
import logging
import json
import argparse
import glob
import re
import pandas as pd

In [3]:
def get_hgnc_complete_list(symbol_json_file='./hgnc_complete_set_2020-10-01.json'):
  # read json from url 
  if not os.path.exists(symbol_json_file):
    logging.info('Downloading HGNC complete list')
    url = "https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/quarterly/json/hgnc_complete_set_2020-10-01.json"
    gene_list_json = requests.get(url).json()['response']['docs']
    symbol_list = []
    for item in gene_list_json:
      
      symbol = item.get("symbol", "").upper().replace(" ", "")
      alias_symbol = item.get("alias_symbol", [])
      alias_symbol = [alias.upper().replace(" ", "") for alias in alias_symbol]
      prev_symbol = item.get("prev_symbol", [])
      prev_symbol = [prev.upper().replace(" ", "") for prev in prev_symbol]      
      if symbol != '': 
        symbol_list.append({"symbol": symbol, "other": symbol})
        for alias in alias_symbol:
          symbol_list.append({"symbol": symbol, "other": alias})
        for prev in prev_symbol:
          symbol_list.append({"symbol": symbol, "other": prev})
    logging.info('HGNC complete list downloaded')
    logging.info('length of HGNC complete list: {}'.format(len(symbol_list)))
    with open(symbol_json_file, 'w') as f:
      json.dump(symbol_list, f)
  else:
    with open(symbol_json_file, 'r') as f:
      logging.info('Reading HGNC complete list from local file')
      symbol_list = json.load(f)
      logging.info('length of HGNC complete list: {}'.format(len(symbol_list)))
  return symbol_list

hgnc_complete_list = get_hgnc_complete_list()
hgnc_complete_df = pd.DataFrame(hgnc_complete_list)

In [10]:
mega_list = [['id', 'answer', 'generated_gene_list', 'final_gene_list', 'generated_gene_count', 'gene_count', 'true_genes', 'true_gene_count', 'fake_ratio', 'duplicate_ratio', 'q_a']]

def extract_question(text):
    pattern = "Question: (.*?)(?=Question:|Answer:|$)"
    questions = re.findall(pattern, text, re.DOTALL)
    questions = [question.strip() for question in questions]

    if len(questions) == 1:
        pattern = r'Can you suggest a list of (\d+) possible genes to test?'
        match = re.search(pattern, questions[0])
        if match:
            x = int(match.group(1))
            print(x)
            return x

    
def extract_answer(text, file_name):
    pattern = "Answer: (.*?)(?=Answer:|Question:|$)"
    answers = re.findall(pattern, text, re.DOTALL)
    answers = [answer.strip() for answer in answers]

    # Define a variable to hold the result outside the if condition
    result = None
    generated_genes = []
    genes = []
    true_genes = []
    gene_number = 0
    generated_gene_number = 0
    true_gene_number = 0
    generated_ratio = 0
    fake_ratio = 0
    final_genes = set()


    if len(answers) == 1:
        # Assuming you want to save the answer to a file
        directory_path = "D:/연구/llama2_classified"
        os.makedirs(directory_path, exist_ok=True)  # Ensure the directory exists
        full_file_path = os.path.join(directory_path, f"{file_name}.txt")

        with open(full_file_path, "w") as file:
            file.write(answers[0])
            result = answers[0]

            if "The gene list is:" in result:
                gene_list_pattern = r"The gene list is: ([\w, ]+)"
                match = re.search(gene_list_pattern, result)
                if match:
                    gene_list_str = match.group(1)
                    generated_genes = [gene.strip() for gene in match.group(1).split(",")]

            elif "The top 50 genes are:" in result:
                gene_list_pattern2 = r"The top 50 genes are: ([\w, ]+)"
                match = re.search(gene_list_pattern2, result)
                if match:
                    gene_list_str = match.group(1)
                    generated_genes.extend([gene.strip() for gene in gene_list_str.split(",")])

            elif "The genes are:" in result:
                gene_list_pattern3 = r"The genes are: ([\w, ]+)"
                match = re.search(gene_list_pattern3, result)
                if match:
                    gene_list_str = match.group(1)
                    generated_genes.extend([gene.strip() for gene in gene_list_str.split(",")])

            elif "The predicted gene list is 10 genes:" in result:
                gene_list_pattern4 = r"The predicted gene list is 10 genes: ([\w, ]+)"
                match = re.search(gene_list_pattern4, result)
                if match:
                    gene_list_str = match.group(1)
                    generated_genes.extend([gene.strip() for gene in gene_list_str.split(",")])
            
            else:
                pattern = r'\d+\.\s*([A-Z0-9_]+)(?=\s*\d+\.\s*|\s*$)'
                matches = re.findall(pattern, result)
                if matches:
                    generated_genes.extend([match.strip() for match in matches])

        generated_gene_number = len(generated_genes)
        final_genes = list(set(generated_genes))
        gene_number = len(final_genes)

        for gene in final_genes:
            if gene in hgnc_complete_df['symbol'].values:
                true_genes.append(gene)
            if gene in hgnc_complete_df['other'].values:
                true_genes.append(gene)
        true_genes = list(set(true_genes))

        true_gene_number = len(true_genes)
        fake_gene_number = gene_number - true_gene_number
        fake_ratio = fake_gene_number/gene_number if gene_number > 0 else 0
        generated_ratio = (generated_gene_number - gene_number) / generated_gene_number if generated_gene_number > 0 else 0
        
        print(true_genes, fake_ratio, generated_ratio)
        return result, generated_genes, final_genes, generated_gene_number, gene_number, true_genes, true_gene_number, fake_ratio, generated_ratio, 1
    return result, generated_genes, final_genes, generated_gene_number, gene_number, true_gene_number, true_genes, fake_ratio, generated_ratio, 0  # Return result even if it's None

folder_path = 'C:/Users/cptas/Downloads/Experiment_004subset/Experiment_004subset'
count_total = 0

for file_name in os.listdir(folder_path):
    if file_name.endswith('.response'):
        full_file_path = os.path.join(folder_path, file_name)
        with open(full_file_path, 'r') as file:
            input_text = file.read()
            x = extract_question(input_text)
            answer, generated_genes, final_genes, generated_gene_number, gene_number, true_genes, true_gene_number, fake_ratio, generated_ratio, count = extract_answer(input_text, os.path.splitext(file_name)[0])
            count_total += count
            if x == gene_number:
                q = 1
            else:
                q = 0
            mega_list.append([file_name, answer, generated_genes, final_genes, generated_gene_number, gene_number, true_genes, true_gene_number, fake_ratio, generated_ratio, q])

print(count_total)
mega_df = pd.DataFrame(mega_list)
mega_csv = mega_df.to_csv('D:/연구/llama2_classified/mega_df.csv', index=False, header=False)


10
['CYP1A1', 'ABC1', 'CYP1B1', 'CYP1', 'CYP1A2'] 0.9473684210526315 0.010416666666666666
50
['CYP21A2', 'ABC1', 'DHCR24', 'DHCR7'] 0.42857142857142855 0.9285714285714286
50
['ABCA5', 'ABCB6', 'ABCA8', 'ABCB7', 'ABCA1', 'ABCA3', 'ABCA13', 'ABCA4', 'ABCA12', 'ABCA11', 'ABCB9', 'ABCB5', 'ABCA10', 'ABCA9', 'ABCB8', 'ABCB11', 'ABCB10', 'ABCB1', 'ABCB4', 'ABCA6', 'ABCA7'] 0.8627450980392157 0.006493506493506494
50
['CYP21A2', 'EIF2S3', 'EIF2S2', 'EIF2S1', 'DYRK1A', 'ABC1'] 0.9318181818181818 0.0
50
['ABCA5', 'ABCB6', 'ABCA8', 'ABCD2', 'ABCB7', 'ABCA1', 'ABCA3', 'ABCA4', 'ABCB5', 'ABCB9', 'ABCA9', 'ABCB8', 'ABCD4', 'ABCB1', 'ABCD3', 'ABCB4', 'ABCD1', 'ABCA6', 'ABCA7'] 0.8841463414634146 0.0
50
['ABCA5', 'ABCB6', 'ABCA8', 'ABCB7', 'ABCA1', 'ABCA3', 'ABCA13', 'ABCA4', 'ABCA12', 'ABCA11', 'ABCB9', 'ABCB5', 'ABCA10', 'ABCA9', 'ABCB8', 'ABCB11', 'ABCB10', 'ABCB1', 'ABCB4', 'ABCA6', 'ABCA7'] 0.86875 0.0
10
['CYP21A2', 'EIF2S3', 'ABC1', 'DYRK1A'] 0.9518072289156626 0.0
10
['CYP21A2', 'ABC1', 'DHCR2

In [11]:
df = pd.read_csv('D:/연구/llama2_classified/mega_df.csv')
n_df = df[df['generated_gene_list'] != '[]']
n_df.head()

Unnamed: 0,id,answer,generated_gene_list,final_gene_list,generated_gene_count,gene_count,true_genes,true_gene_count,fake_ratio,duplicate_ratio,q_a
2,AJHG.PMID28686853_p1__WDR26__10__e__llama2-7b_...,The predicted gene list is 10 genes. The gene ...,"['ABC1', 'BRAC2', 'BRAC1', 'CYP1B1', 'CYP1A1',...","['CYP1A1', 'CYP1B11', 'CYP1B62', 'CYP1B12', 'C...",96,95,"['CYP1A1', 'ABC1', 'CYP1B1', 'CYP1', 'CYP1A2']",5,0.947368,0.010417,0
4,AJHG.PMID28686853_p3__WDR26__50__e__llama2-7b_...,The predicted gene list is 1. ABC1 2. BRAC1 3....,"['ABC1', 'BRAC1', 'BRAC2', 'CYP21A2', 'DHCR7',...","['CYP21A2', 'BRAC2', 'DH', 'DHCR24', 'DHCR7', ...",98,7,"['CYP21A2', 'ABC1', 'DHCR24', 'DHCR7']",4,0.428571,0.928571,0
5,AJHG.PMID28686853_p9__WDR26__50__e__llama2-7b_...,The predicted gene list is 100 genes. The top ...,"['ABCA1', 'ABCA3', 'ABCA4', 'ABCA5', 'ABCA6', ...","['ABCB6', 'ABCA61', 'ABCB41', 'ABCB16', 'ABCB4...",154,153,"['ABCA5', 'ABCB6', 'ABCA8', 'ABCB7', 'ABCA1', ...",21,0.862745,0.006494,0
6,AJHG.PMID28757203_p2__LIPT2__50__e__llama2-7b_...,The predicted gene list is 1. ABC1 2. BRAC1 3....,"['ABC1', 'BRAC1', 'BRAC2', 'CYP21A2', 'DYRK1A'...","['EIF2S66', 'EIF2S17', 'EIF2S37', 'EIF2S73', '...",88,88,"['CYP21A2', 'EIF2S3', 'EIF2S2', 'EIF2S1', 'DYR...",6,0.931818,0.0,0
7,AJHG.PMID28757203_p3__LIPT2__50__e__llama2-7b_...,The predicted gene list is 100 genes. The top ...,"['ABCA1', 'ABCA3', 'ABCA4', 'ABCA5', 'ABCA6', ...","['ABCB6', 'ABCD122', 'ABCD25', 'ABCD136', 'ABC...",164,164,"['ABCA5', 'ABCB6', 'ABCA8', 'ABCD2', 'ABCB7', ...",19,0.884146,0.0,0


In [12]:
n_df.shape

(109, 11)

In [13]:
n_df['true_gene_count'] = pd.to_numeric(n_df['true_gene_count'], errors='coerce')
n_df['true_gene_ratio'] = n_df['true_gene_count'] / n_df['gene_count']
n_df['generate_diff'] = n_df['generated_gene_count'] - n_df['gene_count']
n_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  n_df['true_gene_count'] = pd.to_numeric(n_df['true_gene_count'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  n_df['true_gene_ratio'] = n_df['true_gene_count'] / n_df['gene_count']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  n_df['generate_diff'] = n_df['generated_gene_count']

Unnamed: 0,id,answer,generated_gene_list,final_gene_list,generated_gene_count,gene_count,true_genes,true_gene_count,fake_ratio,duplicate_ratio,q_a,true_gene_ratio,generate_diff
2,AJHG.PMID28686853_p1__WDR26__10__e__llama2-7b_...,The predicted gene list is 10 genes. The gene ...,"['ABC1', 'BRAC2', 'BRAC1', 'CYP1B1', 'CYP1A1',...","['CYP1A1', 'CYP1B11', 'CYP1B62', 'CYP1B12', 'C...",96,95,"['CYP1A1', 'ABC1', 'CYP1B1', 'CYP1', 'CYP1A2']",5,0.947368,0.010417,0,0.052632,1
4,AJHG.PMID28686853_p3__WDR26__50__e__llama2-7b_...,The predicted gene list is 1. ABC1 2. BRAC1 3....,"['ABC1', 'BRAC1', 'BRAC2', 'CYP21A2', 'DHCR7',...","['CYP21A2', 'BRAC2', 'DH', 'DHCR24', 'DHCR7', ...",98,7,"['CYP21A2', 'ABC1', 'DHCR24', 'DHCR7']",4,0.428571,0.928571,0,0.571429,91
5,AJHG.PMID28686853_p9__WDR26__50__e__llama2-7b_...,The predicted gene list is 100 genes. The top ...,"['ABCA1', 'ABCA3', 'ABCA4', 'ABCA5', 'ABCA6', ...","['ABCB6', 'ABCA61', 'ABCB41', 'ABCB16', 'ABCB4...",154,153,"['ABCA5', 'ABCB6', 'ABCA8', 'ABCB7', 'ABCA1', ...",21,0.862745,0.006494,0,0.137255,1
6,AJHG.PMID28757203_p2__LIPT2__50__e__llama2-7b_...,The predicted gene list is 1. ABC1 2. BRAC1 3....,"['ABC1', 'BRAC1', 'BRAC2', 'CYP21A2', 'DYRK1A'...","['EIF2S66', 'EIF2S17', 'EIF2S37', 'EIF2S73', '...",88,88,"['CYP21A2', 'EIF2S3', 'EIF2S2', 'EIF2S1', 'DYR...",6,0.931818,0.0,0,0.068182,0
7,AJHG.PMID28757203_p3__LIPT2__50__e__llama2-7b_...,The predicted gene list is 100 genes. The top ...,"['ABCA1', 'ABCA3', 'ABCA4', 'ABCA5', 'ABCA6', ...","['ABCB6', 'ABCD122', 'ABCD25', 'ABCD136', 'ABC...",164,164,"['ABCA5', 'ABCB6', 'ABCA8', 'ABCD2', 'ABCB7', ...",19,0.884146,0.0,0,0.115854,0


In [14]:
n_df.describe()

Unnamed: 0,generated_gene_count,gene_count,true_gene_count,fake_ratio,duplicate_ratio,q_a,true_gene_ratio,generate_diff
count,109.0,109.0,109.0,109.0,109.0,109.0,109.0,109.0
mean,113.880734,104.688073,11.100917,0.831304,0.086624,0.0,0.168696,9.192661
std,42.320804,51.173578,7.938356,0.194312,0.267387,0.0,0.194312,30.445603
min,10.0,6.0,2.0,0.0,0.0,0.0,0.023256,0.0
25%,86.0,78.0,4.0,0.854545,0.0,0.0,0.048077,0.0
50%,108.0,104.0,10.0,0.879747,0.0,0.0,0.120253,0.0
75%,154.0,152.0,21.0,0.951923,0.006494,0.0,0.145455,1.0
max,173.0,172.0,25.0,0.976744,0.955128,0.0,1.0,157.0


In [None]:
n_df.to_csv("D:/연구/llama2_classified/n_df.csv", index = False)

In [15]:
def update_id(row_id):
    pattern = r"__([0-9]+)__e"
    match = re.search(pattern, row_id)
    if match:
        if match.group(1) == '50':
            return '50'
        elif match.group(1) == '10':
            return '10'
    return row_id  
    
n_df['id'] = n_df['id'].apply(update_id)
n_df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  n_df['id'] = n_df['id'].apply(update_id)


Unnamed: 0,id,answer,generated_gene_list,final_gene_list,generated_gene_count,gene_count,true_genes,true_gene_count,fake_ratio,duplicate_ratio,q_a,true_gene_ratio,generate_diff
2,10,The predicted gene list is 10 genes. The gene ...,"['ABC1', 'BRAC2', 'BRAC1', 'CYP1B1', 'CYP1A1',...","['CYP1A1', 'CYP1B11', 'CYP1B62', 'CYP1B12', 'C...",96,95,"['CYP1A1', 'ABC1', 'CYP1B1', 'CYP1', 'CYP1A2']",5,0.947368,0.010417,0,0.052632,1
4,50,The predicted gene list is 1. ABC1 2. BRAC1 3....,"['ABC1', 'BRAC1', 'BRAC2', 'CYP21A2', 'DHCR7',...","['CYP21A2', 'BRAC2', 'DH', 'DHCR24', 'DHCR7', ...",98,7,"['CYP21A2', 'ABC1', 'DHCR24', 'DHCR7']",4,0.428571,0.928571,0,0.571429,91
5,50,The predicted gene list is 100 genes. The top ...,"['ABCA1', 'ABCA3', 'ABCA4', 'ABCA5', 'ABCA6', ...","['ABCB6', 'ABCA61', 'ABCB41', 'ABCB16', 'ABCB4...",154,153,"['ABCA5', 'ABCB6', 'ABCA8', 'ABCB7', 'ABCA1', ...",21,0.862745,0.006494,0,0.137255,1
6,50,The predicted gene list is 1. ABC1 2. BRAC1 3....,"['ABC1', 'BRAC1', 'BRAC2', 'CYP21A2', 'DYRK1A'...","['EIF2S66', 'EIF2S17', 'EIF2S37', 'EIF2S73', '...",88,88,"['CYP21A2', 'EIF2S3', 'EIF2S2', 'EIF2S1', 'DYR...",6,0.931818,0.0,0,0.068182,0
7,50,The predicted gene list is 100 genes. The top ...,"['ABCA1', 'ABCA3', 'ABCA4', 'ABCA5', 'ABCA6', ...","['ABCB6', 'ABCD122', 'ABCD25', 'ABCD136', 'ABC...",164,164,"['ABCA5', 'ABCB6', 'ABCA8', 'ABCD2', 'ABCB7', ...",19,0.884146,0.0,0,0.115854,0


In [16]:
n_df_10 = n_df[n_df['id'] == '10']
n_df_50 = n_df[n_df['id'] == '50']

In [17]:
n_df_10.describe()

Unnamed: 0,generated_gene_count,gene_count,true_gene_count,fake_ratio,duplicate_ratio,q_a,true_gene_ratio,generate_diff
count,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0
mean,85.5,69.794118,4.852941,0.801875,0.113157,0.0,0.198125,15.705882
std,41.14737,40.970632,3.568654,0.257679,0.302872,0.0,0.257679,44.076056
min,10.0,7.0,2.0,0.0,0.0,0.0,0.023256,0.0
25%,76.25,10.5,4.0,0.625,0.0,0.0,0.038462,0.0
50%,98.5,89.5,4.0,0.953255,0.00463,0.0,0.046745,0.5
75%,104.75,103.0,4.75,0.961538,0.009877,0.0,0.375,1.0
max,165.0,109.0,23.0,0.976744,0.955128,0.0,1.0,157.0


In [18]:
n_df_50.describe()

Unnamed: 0,generated_gene_count,gene_count,true_gene_count,fake_ratio,duplicate_ratio,q_a,true_gene_ratio,generate_diff
count,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0
mean,126.746667,120.506667,13.933333,0.844645,0.074595,0.0,0.155355,6.24
std,36.346714,47.542974,7.760491,0.157849,0.250933,0.0,0.157849,21.404622
min,27.0,6.0,2.0,0.333333,0.0,0.0,0.025641,0.0
25%,96.0,89.0,5.0,0.856174,0.0,0.0,0.066446,0.0
50%,136.0,136.0,15.0,0.873494,0.0,0.0,0.126506,0.0
75%,158.0,158.0,21.0,0.933554,0.0,0.0,0.143826,0.0
max,173.0,172.0,25.0,0.974359,0.938776,0.0,0.666667,92.0
