In [1]:
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
import csv

In [2]:
# Set PubTator3-annotated file path
input_file_path = os.getcwd() + '/Pubtator_Output/'

In [3]:
# List of pubtator-annotated files
pubtator_file_list = []
for file in listdir(input_file_path):
     if isfile(join(input_file_path, file)):
            pubtator_file_list.append(input_file_path + file)

In [7]:
# Function to extract data from the text file after the specified line
def extract_data_from_file(filename):
    with open(filename, 'r') as file:
        lines = file.readlines()
    
    # Find the index of the specified line
    start_index = 0
    for i, line in enumerate(lines):
        if "00000|a|-NoAbstract-" in line:
            start_index = i + 1
            break
    
    # Extract the data after the specified line
    extracted_data = ''.join(lines[start_index:]).strip().replace('\t', ',')
    return extracted_data

# List of text files
files = pubtator_file_list

# Initialize an empty list to store dataframes
dataframes = []

# output files
output_file = os.getcwd() + '/OECD_pubtator_annotated.csv'

# Write column names
column_names = ['XX', 'Start', 'End', 'Term', 'Ontology', 'Identifier', 'Test_no']
with open(output_file, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(column_names)

# Process each file
for file in files:
    filename = os.path.basename(file)
    if filename != '.DS_Store':
        data = extract_data_from_file(file)
        data_rows = data.split('\n')  # Split the data into rows

        with open(output_file, 'a', newline='') as file:
            writer = csv.writer(file)
            for row in data_rows:
                if row.strip():  # Check if the row is not empty
                    data_list = row.split(',')  # Split the row into columns
                    filename_str = filename.split('.')[0]
                    data_list.append(filename_str)  # Add the filename to the end
                    writer.writerow(data_list)


In [10]:
import pandas as pd

# Read the CSV file into a pandas DataFrame
csv_file = os.getcwd() + '/OECD_pubtator_annotated.csv'
df = pd.read_csv(csv_file)

# Initialize sets to store unique gene types and cell line types
gene_types = set()
cell_line_types = set()
unique_test_nos = set()

# Iterate over rows in the DataFrame
for index, row in df.iterrows():
    term = row['Term']
    ontology = row['Ontology']
    identifier = row['Identifier']
    test_no = row['Test_no']

    # Check if the row represents a gene or cell line
    if ontology == 'Gene':
        gene_types.add((term, identifier))
    elif ontology == 'CellLine':
        cell_line_types.add((term, identifier))

    # Add Test_no to the set of unique test numbers
    unique_test_nos.add(test_no)

# Print out the results
print(f"Types of Genes:")
for term, identifier in gene_types:
    print(f"{term} (Identifier: {identifier})")

print("\nTypes of Cell Lines:")
for term, identifier in cell_line_types:
    print(f"{term} (Identifier: {identifier})")

print(f"\nNumber of Unique Test_no: {len(unique_test_nos)}")


Types of Genes:
Hprt (Identifier: 3251)
estrogen receptor (Identifier: 2099)
IL-2 (Identifier: 3558)
CD54 (Identifier: 3383)
IFN-g (Identifier: 3458)
HPRT (Identifier: 3251)
Estrogen Receptor (Identifier: 2099)
IL-8 (Identifier: 3576)
Interleukin-8 (Identifier: 3576)
estrogen receptor (Identifier: 100136026)
GAPDH (Identifier: 2597)
Nrf2 (Identifier: 4780)
Estrogen Receptor (Identifier: 100136026)
CD86 (Identifier: 942)
SENS (Identifier: 284252)

Types of Cell Lines:
ERa-HeLa-9903 (Identifier: CVCL:2485)
E2 (Identifier: CVCL:6769)
BG1 (Identifier: CVCL:6570)
U937 (Identifier: CVCL:0007)
SIRC (Identifier: CVCL:2724)
7.2C (Identifier: CVCL:J025)

Number of Unique Test_no: 23


In [11]:
# Calculate unique identifiers for genes and cell lines, and unique Test_no
unique_gene_ids = df[df['Ontology'] == 'Gene']['Identifier'].nunique()
unique_cellline_ids = df[df['Ontology'] == 'CellLine']['Identifier'].nunique()
unique_test_no = df['Test_no'].nunique()

# Extract corresponding terms
gene_terms = df[df['Ontology'] == 'Gene'][['Term', 'Identifier']].drop_duplicates()
cellline_terms = df[df['Ontology'] == 'CellLine'][['Term', 'Identifier']].drop_duplicates()

# Print results
print(f"Number of unique Gene identifiers: {unique_gene_ids}")
print(f"Number of unique CellLine identifiers: {unique_cellline_ids}")
print(f"Number of unique Test_no: {unique_test_no}\n")

print("Corresponding Gene terms and identifiers:")
print(gene_terms)

print("\nCorresponding CellLine terms and identifiers:")
print(cellline_terms)

Number of unique Gene identifiers: 11
Number of unique CellLine identifiers: 6
Number of unique Test_no: 22

Corresponding Gene terms and identifiers:
                  Term Identifier
11                SENS     284252
12       Interleukin-8       3576
13                IL-8       3576
14                CD54       3383
15                CD86        942
28                IL-2       3558
31               IFN-g       3458
32               GAPDH       2597
33                Nrf2       4780
63                HPRT       3251
93                Hprt       3251
139  Estrogen Receptor       2099
141  estrogen receptor       2099
167  Estrogen Receptor  100136026
169  estrogen receptor  100136026

Corresponding CellLine terms and identifiers:
              Term Identifier
10            U937  CVCL:0007
103           7.2C  CVCL:J025
142  ERa-HeLa-9903  CVCL:2485
145            BG1  CVCL:6570
147             E2  CVCL:6769
185           SIRC  CVCL:2724
