In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Locate the raw-data folder. The project root is taken to be the parent of
# the current working directory, so these paths only resolve when the kernel
# is started from a direct child folder of the project.
project_root = Path.cwd().parent
data_dir = project_root.joinpath("data")
raw_dir = data_dir.joinpath("raw")

# Load the two source tables, echoing each one's (rows, columns) right after
# it is read so a failed second read still leaves the first shape visible.
genage_df = pd.read_csv(raw_dir.joinpath('genage_human.csv'))
print(genage_df.shape)
uniprot_df = pd.read_csv(raw_dir.joinpath('uniprot_sequences.csv'))
print(uniprot_df.shape)

# Assemble the combined table in a single pass. Columns are grouped by the
# question they answer about a protein; groups that have no data source yet
# are filled with None so the schema is already in place.
combined_df = pd.DataFrame().assign(
    # What protein? Identifiers are copied straight from the genage table.
    gene_symbol=genage_df['symbol'],
    uniprot_id=genage_df['uniprot'],
    # What part of the protein? Placeholders: these would need to be
    # extracted from literature or other sources.
    variant_id=None,
    domain_name=None,
    sequence_location=None,
    # What does it do? The genage 'why' column stands in for text that
    # would eventually be extracted from literature using NLP.
    phenotypic_outcome_text=genage_df['why'],
    # How do we know? Placeholders: the DOI would come from literature
    # extraction and the score would be assigned by the NLP model.
    source_paper_doi=None,
    confidence_score=None,
    # Extra descriptive columns carried over from the genage table.
    protein_name=genage_df['name'],
    entrez_gene_id=genage_df['entrez gene id'],
    genage_id=genage_df['GenAge ID'],
)

# Attach protein sequences by looking each row's uniprot_id up in a plain
# dict built from the uniprot table; IDs with no entry in the dict map to NaN.
uniprot_lookup = (
    uniprot_df
    .set_index('uniprot_id')['sequence']
    .to_dict()
)
combined_df['protein_sequence'] = combined_df['uniprot_id'].map(uniprot_lookup)

# Print a plain-text overview of the combined frame: its shape first, then a
# series of headed sections.
print("Combined DataFrame shape:", combined_df.shape)

# Each section pairs a heading with a zero-argument callable. The value is
# computed only after its heading has been printed, which keeps the output
# order the same as writing the print calls out one by one.
report_sections = (
    ("\nColumn names:", lambda: combined_df.columns.tolist()),
    ("\nFirst few rows:", lambda: combined_df.head()),
    ("\nData types:", lambda: combined_df.dtypes),
    ("\nSummary:", lambda: combined_df.describe(include='all')),
)
for heading, render in report_sections:
    print(heading)
    print(render())

(307, 6)
(4, 4)
Combined DataFrame shape: (307, 12)

Column names:
['gene_symbol', 'uniprot_id', 'variant_id', 'domain_name', 'sequence_location', 'phenotypic_outcome_text', 'source_paper_doi', 'confidence_score', 'protein_name', 'entrez_gene_id', 'genage_id', 'protein_sequence']

First few rows:
  gene_symbol   uniprot_id variant_id domain_name sequence_location  \
0         GHR    GHR_HUMAN       None        None              None   
1        GHRH   SLIB_HUMAN       None        None              None   
2        SHC1   SHC1_HUMAN       None        None              None   
3      POU1F1   PIT1_HUMAN       None        None              None   
4       PROP1  PROP1_HUMAN       None        None              None   

  phenotypic_outcome_text source_paper_doi confidence_score  \
0                  mammal             None             None   
1                  mammal             None             None   
2                  mammal             None             None   
3                  mamm