# Chemical Extra Data Examples

This notebook demonstrates how to use the `ExtraData` class to retrieve reference counts and metadata from the CompTox Dashboard.

In [None]:
from pycomptox import ExtraData
import pandas as pd

# Initialize the ExtraData client; the examples below issue their API lookups through `extra`.
# NOTE(review): constructed with no arguments — presumably pycomptox resolves the API key
# from its own configuration/environment; confirm before running on a fresh machine.
extra = ExtraData()

## Example 1: Get Reference Data for a Single Chemical

In [None]:
# Look up the reference counts recorded for Bisphenol A
dtxsid = "DTXSID7020182"
data = extra.get_data_by_dtxsid(dtxsid)

# (label, key) pairs in display order; each value is read from `data` by its key
report_fields = (
    ("Chemical", 'dtxsid'),
    ("Total references", 'refs'),
    ("PubMed citations", 'pubmed'),
    ("Patent references", 'googlePatent'),
    ("Literature references", 'literature'),
)
for label, key in report_fields:
    print(f"{label}: {data[key]}")

## Example 2: Batch Reference Lookup

In [None]:
# DTXSIDs to fetch together in one batch request
chemicals = [
    "DTXSID7020182",  # Bisphenol A
    "DTXSID2021315",  # Caffeine
    "DTXSID5020001",  # 1,2,3-Trichloropropane
    "DTXSID3020637",  # Formaldehyde
    "DTXSID6020139",  # Benzene
]

results = extra.get_data_by_dtxsid_batch(chemicals)

# Load the batch response into a DataFrame and show only the identifier and count columns
df = pd.DataFrame(results)
count_columns = ['dtxsid', 'refs', 'pubmed', 'googlePatent', 'literature']
print(df[count_columns])

## Example 3: Rank Chemicals by Total References

In [None]:
# Order the batch results from most- to least-referenced
df_sorted = df.sort_values(by='refs', ascending=False)

print("Chemicals ranked by total references:")
# Render without the index so only the identifier and counts are shown
ranked_columns = df_sorted[['dtxsid', 'refs', 'pubmed']]
print(ranked_columns.to_string(index=False))

## Example 4: Visualize Reference Distribution

In [None]:
import matplotlib.pyplot as plt

# Stacked bar chart: one bar per chemical, one coloured segment per reference source
fig, ax = plt.subplots(figsize=(10, 6))

x = range(len(df))
width = 0.5

# Segments are drawn bottom-up as (column, legend label, colour)
segments = (
    ('pubmed', 'PubMed', 'steelblue'),
    ('googlePatent', 'Patents', 'orange'),
    ('literature', 'Literature', 'green'),
)
stack_height = None  # None lets the first segment start at the default baseline
for column, label, color in segments:
    heights = df[column]
    ax.bar(x, heights, width, bottom=stack_height, label=label, color=color)
    # The next segment sits on top of everything drawn so far
    stack_height = heights if stack_height is None else stack_height + heights

ax.set_xlabel('Chemical')
ax.set_ylabel('Reference Count')
ax.set_title('Reference Distribution by Source')
ax.set_xticks(x)
ax.set_xticklabels(df['dtxsid'], rotation=45, ha='right')
ax.legend()

plt.tight_layout()
plt.show()

## Example 5: Compare Reference Sources

In [None]:
# Calculate percentage breakdown for each chemical.
#
# A single missing count anywhere in the batch makes pandas store that column as
# float, and iterrows() then yields floats for every row, so counts are coerced
# back to int before applying the integer ':4d' format.
breakdown_sources = (
    ('PubMed', 'pubmed'),
    ('Patents', 'googlePatent'),
    ('Literature', 'literature'),
)

for _, row in df.iterrows():
    total = row['refs']
    # Skip chemicals with no usable total: missing values cannot be compared,
    # and a zero total cannot be used as a denominator.
    if pd.isna(total) or total <= 0:
        continue

    print(f"\n{row['dtxsid']}:")
    for label, column in breakdown_sources:
        count = row[column]
        if pd.isna(count):
            # The source count is absent for this chemical; report it rather than fail
            print(f"  {label}:  n/a")
            continue
        count = int(count)
        print(f"  {label}: {count:4d} ({count/total*100:5.1f}%)")

## Example 6: Filter Highly-Referenced Chemicals

In [None]:
# Keep only the rows whose PubMed citation count exceeds 100
is_highly_cited = df['pubmed'] > 100
highly_cited = df.loc[is_highly_cited]

print(f"Found {len(highly_cited)} highly-cited chemicals:")
print(highly_cited[['dtxsid', 'pubmed', 'refs']])

## Example 7: Integration with Chemical Search

In [None]:
from pycomptox import Chemical

# Run a name search for phthalates
chem = Chemical()
search_results = chem.search_by_name("phthalate")

# Restrict to the first 5 hits and fetch their reference data in one batch
top_hits = search_results[:5]
dtxsids = [hit['dtxsid'] for hit in top_hits]
ref_data = extra.get_data_by_dtxsid_batch(dtxsids)

# Pair each hit with the first reference record carrying the same DTXSID
for result in top_hits:
    refs = next(
        (record for record in ref_data if record['dtxsid'] == result['dtxsid']),
        None,
    )
    if not refs:
        # No (or empty) reference record for this hit: nothing to print
        continue
    print(f"{result['preferredName']}")
    print(f"  DTXSID: {result['dtxsid']}")
    print(f"  Total refs: {refs['refs']}, PubMed: {refs['pubmed']}")
    print()

## Example 8: Create Summary Statistics

In [None]:
# Print the describe() summary of each count column under its own heading
print("Reference Statistics:")

summary_sections = (
    ("Total References", 'refs'),
    ("PubMed Citations", 'pubmed'),
    ("Patent References", 'googlePatent'),
)
for heading, column in summary_sections:
    print(f"\n{heading}:")
    print(df[column].describe())

## Example 9: Compare Multiple Chemical Classes

In [None]:
# Define chemical classes (class name -> DTXSIDs of its members)
chemical_classes = {
    'Bisphenols': ['DTXSID7020182', 'DTXSID4020216', 'DTXSID1020265'],
    'Phthalates': ['DTXSID5020607', 'DTXSID6021232', 'DTXSID2021781'],
    'Solvents': ['DTXSID6020139', 'DTXSID8021360', 'DTXSID2021360']
}

class_stats = []

# Loop-specific names (class_dtxsids / class_results) are used so this cell does not
# overwrite the `dtxsids` and `results` variables created by earlier examples.
for class_name, class_dtxsids in chemical_classes.items():
    try:
        class_results = extra.get_data_by_dtxsid_batch(class_dtxsids)

        n_chemicals = len(class_results)
        if n_chemicals == 0:
            # Without this check an empty batch surfaces only as "division by zero"
            print(f"Error processing {class_name}: no reference data returned")
            continue

        avg_refs = sum(r['refs'] for r in class_results) / n_chemicals
        avg_pubmed = sum(r['pubmed'] for r in class_results) / n_chemicals

        class_stats.append({
            'Class': class_name,
            'Avg Total Refs': avg_refs,
            'Avg PubMed': avg_pubmed
        })
    except Exception as e:
        # Kept broad so one failing class is reported and the remaining classes still run
        print(f"Error processing {class_name}: {e}")

# Display results (skipped when every class failed and nothing was collected)
if class_stats:
    stats_df = pd.DataFrame(class_stats)
    print(stats_df)