In [1]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import re
import warnings
warnings.filterwarnings('ignore')

# Import plotly for interactive visualizations
import plotly.express as px
import plotly.graph_objects as go

# Create taxonomy visualization
import plotly.io as pio
pio.renderers.default = 'vscode'

In [2]:
images_dir = Path('./images')
# Create the directory if it doesn't exist
images_dir.mkdir(exist_ok=True)

# Dataset Analysis

In [3]:
# Load generated datasets
output_dir = Path('ml_datasets')
ml_data = pd.read_csv(output_dir / 'preprocessed_spectra_with_labels.csv')

print("Dataset loaded successfully")
print(f"Shape: {ml_data.shape}")

Dataset loaded successfully
Shape: (160, 3457)


In [4]:
# Replicas and unique samples
print("\n" + "=" * 80)
print("REPLICAS AND UNIQUE SAMPLES")
print("=" * 80)

total_spectra = len(ml_data)
unique_samples = ml_data['Sample_ID'].nunique()
replica_counts = ml_data.groupby('Sample_ID')['Replica'].max()

print(f"\nTotal Spectra: {total_spectra}")
print(f"Unique Samples: {unique_samples}")
print(f"Average Replicas per Sample: {replica_counts.mean():.2f}")

print("\nReplica Distribution:")
replica_dist = replica_counts.value_counts().sort_index()
for num_replicas, count in replica_dist.items():
    print(f"  {int(num_replicas)} replica(s): {count} samples")


REPLICAS AND UNIQUE SAMPLES

Total Spectra: 160
Unique Samples: 141
Average Replicas per Sample: 1.13

Replica Distribution:
  1 replica(s): 125 samples
  2 replica(s): 13 samples
  3 replica(s): 3 samples


In [5]:
# Origins with respect to their types
print("\n" + "=" * 80)
print("ORIGINS AND THEIR TYPES")
print("=" * 80)

for origin in sorted(ml_data['Origin'].unique()):
    origin_data = ml_data[ml_data['Origin'] == origin]
    types = origin_data['Type'].unique()
    
    # Count unique samples for this origin
    origin_samples = origin_data['Sample_ID'].nunique()
    print(f"\n{origin}: {len(types)} type(s), {origin_samples} samples")
    
    for ftype in sorted(types):
        ftype_data = origin_data[origin_data['Type'] == ftype]
        count = ftype_data['Sample_ID'].nunique()
        spectra = len(ftype_data)
        print(f"  - {ftype}: {count} samples, {spectra} spectra")


ORIGINS AND THEIR TYPES

Man-made: 8 type(s), 96 samples
  - Acrylic (≥ 85% acrylonitrile): 13 samples, 14 spectra
  - Aramid (Aromatic polyamide): 4 samples, 4 spectra
  - Cellulose acetate: 4 samples, 4 spectra
  - Modacrylic (35 - 85% acrylonitrile): 16 samples, 19 spectra
  - Polyamide (Nylon): 18 samples, 18 spectra
  - Polyester: 15 samples, 17 spectra
  - Polyolefin: 8 samples, 8 spectra
  - Regenerated cellulose (Rayon): 18 samples, 18 spectra

Natural: 2 type(s), 45 samples
  - Cellulose: 17 samples, 22 spectra
  - Protein: 28 samples, 36 spectra


In [6]:
# Types with respect to their subtypes
print("=" * 80)
print("TYPES AND THEIR SUBTYPES")
print("=" * 80)

for ftype in sorted(ml_data['Type'].unique()):
    ftype_data = ml_data[ml_data['Type'] == ftype]
    subtypes = ftype_data['Subtype'].unique()
    
    print(f"\n{ftype}: {len(subtypes)} subtype(s)")
    for subtype in sorted(subtypes):
        subtype_data = ftype_data[ftype_data['Subtype'] == subtype]
        unique_samples = subtype_data['Sample_ID'].nunique()
        total_spectra = len(subtype_data)
        print(f"  - {subtype}: {unique_samples} samples, {total_spectra} spectra")

TYPES AND THEIR SUBTYPES

Acrylic (≥ 85% acrylonitrile): 5 subtype(s)
  - PAN/AA/MA terpolymer: 2 samples, 2 spectra
  - PAN/MA copolymer: 7 samples, 7 spectra
  - PAN/MMA/MA terpolymer: 1 samples, 2 spectra
  - PAN/VA copolymer: 1 samples, 1 spectra
  - Unspecified PAN (acrylic) copolymer: 2 samples, 2 spectra

Aramid (Aromatic polyamide): 3 subtype(s)
  - Meta-aramid (Nomex®): 1 samples, 1 spectra
  - Para-aramid (Kevlar®): 2 samples, 2 spectra
  - Unspecified aramid: 1 samples, 1 spectra

Cellulose: 3 subtype(s)
  - Cotton: 11 samples, 14 spectra
  - Jute: 2 samples, 2 spectra
  - Linen: 4 samples, 6 spectra

Cellulose acetate: 2 subtype(s)
  - Cellulose diacetate (secondary acetate): 3 samples, 3 spectra
  - Cellulose triacetate: 1 samples, 1 spectra

Modacrylic (35 - 85% acrylonitrile): 5 subtype(s)
  - PAN/VA/VC terpolymer: 5 samples, 5 spectra
  - PAN/VBr copolymer: 1 samples, 2 spectra
  - PAN/VC copolymer: 2 samples, 2 spectra
  - PAN/VDC copolymer: 3 samples, 4 spectra
  - Un

# Visualisation

In [7]:
# Create hierarchical sunburst visualization
print("Creating hierarchical sunburst visualization...")
print("=" * 80)

# Get unique samples only (one row per sample)
unique_samples = ml_data.drop_duplicates(subset='Sample_ID')
total_samples = len(unique_samples)

# Prepare data for sunburst plot with correct percentages
sunburst_data = []

# Add root
sunburst_data.append({
    'labels': 'All Fibers',
    'parents': '',
    'values': total_samples,
    'ids': 'All Fibers',
    'text': f'All Fibers<br>{total_samples} samples<br>100%'
})

# Add Origin level
for origin in unique_samples['Origin'].unique():
    origin_df = unique_samples[unique_samples['Origin'] == origin]
    origin_count = len(origin_df)
    origin_pct = (origin_count / total_samples) * 100
    
    sunburst_data.append({
        'labels': origin,
        'parents': 'All Fibers',
        'values': origin_count,
        'ids': origin,
        'text': f'{origin}<br>{origin_count} samples<br>{origin_pct:.1f}%'
    })
    
    # Add Type level
    for ftype in origin_df['Type'].unique():
        type_df = origin_df[origin_df['Type'] == ftype]
        type_count = len(type_df)
        type_pct = (type_count / origin_count) * 100
        type_id = f"{origin}|{ftype}"
        
        sunburst_data.append({
            'labels': ftype,
            'parents': origin,
            'values': type_count,
            'ids': type_id,
            'text': f'{ftype}<br>{type_count} samples<br>{type_pct:.1f}%'
        })
        
        # Add Subtype level
        for subtype in type_df['Subtype'].unique():
            subtype_df = type_df[type_df['Subtype'] == subtype]
            subtype_count = len(subtype_df)
            subtype_pct = (subtype_count / type_count) * 100
            subtype_id = f"{type_id}|{subtype}"
            
            sunburst_data.append({
                'labels': subtype,
                'parents': type_id,
                'values': subtype_count,
                'ids': subtype_id,
                'text': f'{subtype}<br>{subtype_count} sample{"s" if subtype_count > 1 else ""}<br>{subtype_pct:.1f}%'
            })

# Create DataFrame for plotly
sunburst_df = pd.DataFrame(sunburst_data)

print(f"Sunburst data prepared: {len(sunburst_df)} nodes")
print(f"Total samples: {total_samples}")
print(f"  - Natural: {len(unique_samples[unique_samples['Origin'] == 'Natural'])} ({(len(unique_samples[unique_samples['Origin'] == 'Natural'])/total_samples*100):.1f}%)")
print(f"  - Man-made: {len(unique_samples[unique_samples['Origin'] == 'Man-made'])} ({(len(unique_samples[unique_samples['Origin'] == 'Man-made'])/total_samples*100):.1f}%)")
sunburst_df.head()


Creating hierarchical sunburst visualization...
Sunburst data prepared: 44 nodes
Total samples: 141
  - Natural: 45 (31.9%)
  - Man-made: 96 (68.1%)


Unnamed: 0,labels,parents,values,ids,text
0,All Fibers,,141,All Fibers,All Fibers<br>141 samples<br>100%
1,Man-made,All Fibers,96,Man-made,Man-made<br>96 samples<br>68.1%
2,Cellulose acetate,Man-made,4,Man-made|Cellulose acetate,Cellulose acetate<br>4 samples<br>4.2%
3,Cellulose diacetate (secondary acetate),Man-made|Cellulose acetate,3,Man-made|Cellulose acetate|Cellulose diacetate...,Cellulose diacetate (secondary acetate)<br>3 s...
4,Cellulose triacetate,Man-made|Cellulose acetate,1,Man-made|Cellulose acetate|Cellulose triacetate,Cellulose triacetate<br>1 sample<br>25.0%


In [8]:
# Create Icicle plot 
import plotly.io as pio
pio.renderers.default = 'vscode'

print("\nCreating icicle visualization (better for long labels)...")
print("=" * 80)

fig_icicle = px.icicle(
    sunburst_df,
    names='labels',
    parents='parents',
    values='values',
    ids='ids',
    title='Hierarchical Distribution of Fiber Samples (Icicle Layout)<br><sub>Origin → Type → Subtype (Unique Samples Only)</sub>',
    color='values',
    color_continuous_scale='RdYlBu_r',
    hover_data={'values': True}
)

# Update layout for better text visibility
fig_icicle.update_layout(
    width=1400,
    height=900,
    font=dict(size=11),
    title_font=dict(size=18, family='Arial Black'),
    margin=dict(t=120, l=10, r=10, b=10)
)

# Update traces with custom text showing correct percentages
fig_icicle.update_traces(
    textinfo='text',
    text=sunburst_df['text'],
    hovertemplate='<b>%{label}</b><extra></extra>',
    marker=dict(line=dict(width=2, color='white')),
    textfont=dict(size=10),
    branchvalues='total'  # Ensures values are summed correctly in hierarchy
)

fig_icicle.show()

# Verify data integrity
print("\nData verification:")
natural_count = sunburst_df[sunburst_df['labels'] == 'Natural']['values'].values[0]
manmade_count = sunburst_df[sunburst_df['labels'] == 'Man-made']['values'].values[0]
total_count = sunburst_df[sunburst_df['labels'] == 'All Fibers']['values'].values[0]
print(f"Natural: {natural_count}, Man-made: {manmade_count}, Total: {total_count}")
print(f"Sum matches total: {natural_count + manmade_count == total_count}")


Creating icicle visualization (better for long labels)...



Data verification:
Natural: 45, Man-made: 96, Total: 141
Sum matches total: True


In [9]:
# Alternative: Treemap visualization
print("\nCreating treemap visualization...")
print("=" * 80)

# Create treemap
fig_treemap = px.treemap(
    sunburst_df,
    names='labels',
    parents='parents',
    values='values',
    ids='ids',
    title='Hierarchical Treemap of Fiber Samples<br><sub>Origin → Type → Subtype</sub>',
    color='values',
    color_continuous_scale='RdYlBu_r',
    hover_data={'values': True}
)

# Update layout
fig_treemap.update_layout(
    width=1200,
    height=800,
    font=dict(size=11),
    title_font=dict(size=18, family='Arial Black'),
    margin=dict(t=100, l=10, r=10, b=10)
)

# Update traces
fig_treemap.update_traces(
    textinfo='text',
    text=sunburst_df['text'],
    hovertemplate='<b>%{label}</b><br>Samples: %{value}<br>Percentage: %{text}<extra></extra>',
    marker=dict(line=dict(width=2, color='white')),
    branchvalues='total'  # Ensures values are summed correctly in hierarchy
)

fig_treemap.show()

# Verify data integrity
print("\nData verification:")
natural_count = sunburst_df[sunburst_df['labels'] == 'Natural']['values'].values[0]
manmade_count = sunburst_df[sunburst_df['labels'] == 'Man-made']['values'].values[0]
total_count = sunburst_df[sunburst_df['labels'] == 'All Fibers']['values'].values[0]
print(f"Natural: {natural_count}, Man-made: {manmade_count}, Total: {total_count}")
print(f"Sum matches total: {natural_count + manmade_count == total_count}")


Creating treemap visualization...



Data verification:
Natural: 45, Man-made: 96, Total: 141
Sum matches total: True


In [10]:
# Create static sunburst chart for publication using Plotly
print("\nCreating static sunburst chart for publication...")
print("=" * 80)

# Simplified color scheme based on hierarchy level
import matplotlib.colors as mcolors

# Define colors
ROOT_COLOR = '#757575'  # Gray
NATURAL_ORIGIN = '#2E7D32'  # Dark green
NATURAL_TYPE = '#4CAF50'  # Medium green  
NATURAL_SUBTYPE = '#A5D6A7'  # Light green
MANMADE_ORIGIN = '#1565C0'  # Dark blue
MANMADE_TYPE = '#2196F3'  # Medium blue
MANMADE_SUBTYPE = '#90CAF9'  # Light blue

# Assign colors based on hierarchy level
colors_list = []
text_colors = []

for idx, row in sunburst_df.iterrows():
    label = row['labels']
    parent = row['parents']
    parent_id = row.get('ids', '')
    
    if parent == '':  # Root
        colors_list.append(ROOT_COLOR)
        text_colors.append('white')
    elif parent == 'All Fibers':  # Origin level
        if label == 'Natural':
            colors_list.append(NATURAL_ORIGIN)
        else:  # Man-made
            colors_list.append(MANMADE_ORIGIN)
        text_colors.append('white')
    elif parent in ['Natural', 'Man-made']:  # Type level
        if parent == 'Natural':
            colors_list.append(NATURAL_TYPE)
        else:  # Man-made
            colors_list.append(MANMADE_TYPE)
        text_colors.append('white')
    else:  # Subtype level
        # Determine if parent is Natural or Man-made by checking parent chain
        if 'Natural' in parent_id:
            colors_list.append(NATURAL_SUBTYPE)
        else:  # Man-made
            colors_list.append(MANMADE_SUBTYPE)
        text_colors.append('black')

# Create the sunburst with custom colors
fig_static = go.Figure(go.Sunburst(
    labels=sunburst_df['labels'],
    parents=sunburst_df['parents'],
    values=sunburst_df['values'],
    ids=sunburst_df['ids'],
    text=sunburst_df['text'],
    texttemplate='%{text}',
    hovertemplate='<b>%{label}</b><extra></extra>',
    branchvalues='total',
    marker=dict(
        colors=colors_list,
        line=dict(color='white', width=3)
    ),
    textfont=dict(size=13, family='Arial Black'),
    insidetextorientation='auto',
    domain=dict(x=[0, 1], y=[0, 1])
))

# Apply custom text colors
fig_static.data[0].textfont.color = text_colors

# Verify the data adds up correctly
print("\nData verification:")
natural_count = sunburst_df[sunburst_df['labels'] == 'Natural']['values'].values[0]
manmade_count = sunburst_df[sunburst_df['labels'] == 'Man-made']['values'].values[0]
total_count = sunburst_df[sunburst_df['labels'] == 'All Fibers']['values'].values[0]
print(f"Natural: {natural_count}")
print(f"Man-made: {manmade_count}")
print(f"Sum: {natural_count + manmade_count}")
print(f"Total (All Fibers): {total_count}")
print(f"Match: {natural_count + manmade_count == total_count}")

# Update layout for publication quality
fig_static.update_layout(
    width=2000,
    height=2000,
    title=dict(
        text='Hierarchical Distribution of Fiber Samples<br><sub>Origin → Type → Subtype (n=124 unique samples)</sub>',
        font=dict(size=28, family='Arial Black'),
        x=0.5,
        xanchor='center'
    ),
    font=dict(size=16, family='Arial'),
    margin=dict(t=200, l=150, r=150, b=150),
    paper_bgcolor='white',
    plot_bgcolor='white'
)

# Display the figure
fig_static.show()

# Export as high-resolution static image
static_path = images_dir / 'fiber_distribution_sunburst_static.png'
fig_static.write_image(static_path, width=2000, height=2000, scale=3)

print(f"\n✓ Static sunburst chart saved to: {static_path}")
print(f"  Resolution: 6000x6000 pixels")
print(f"  File size: {static_path.stat().st_size / 1024:.2f} KB")


Creating static sunburst chart for publication...

Data verification:
Natural: 45
Man-made: 96
Sum: 141
Total (All Fibers): 141
Match: True



✓ Static sunburst chart saved to: images\fiber_distribution_sunburst_static.png
  Resolution: 6000x6000 pixels
  File size: 2587.92 KB


## Fiber Classification Hierarchy - Complete Taxonomy

In [11]:
# Create comprehensive hierarchical classification diagram
print("Creating complete fiber classification hierarchy...")
print("=" * 80)

# Define complete classification hierarchy (sampled + unsampled)
complete_hierarchy = [
    # Root
    {'labels': 'All Fibers', 'parents': '', 'sampled': True},
    
    # Origin Level
    {'labels': 'Natural', 'parents': 'All Fibers', 'sampled': True},
    {'labels': 'Man-made', 'parents': 'All Fibers', 'sampled': True},
    
    # Natural - Type Level
    {'labels': 'Cellulose', 'parents': 'Natural', 'sampled': True},
    {'labels': 'Protein', 'parents': 'Natural', 'sampled': True},
    
    # Natural - Cellulose Subtypes
    {'labels': 'Cotton', 'parents': 'Cellulose', 'sampled': True},
    {'labels': 'Linen', 'parents': 'Cellulose', 'sampled': True},
    {'labels': 'Jute', 'parents': 'Cellulose', 'sampled': True},
    {'labels': 'Hemp', 'parents': 'Cellulose', 'sampled': False},
    {'labels': 'Ramie', 'parents': 'Cellulose', 'sampled': False},
    {'labels': 'Sisal', 'parents': 'Cellulose', 'sampled': False},
    
    # Natural - Protein Subtypes
    {'labels': 'Silk', 'parents': 'Protein', 'sampled': True},
    {'labels': 'Wool', 'parents': 'Protein', 'sampled': True},
    {'labels': 'Alpaca', 'parents': 'Protein', 'sampled': False},
    {'labels': 'Cashmere', 'parents': 'Protein', 'sampled': False},
    #{'labels': 'Mohair', 'parents': 'Protein', 'sampled': False},
    
    # Man-made - Type Level
    {'labels': 'Polyester', 'parents': 'Man-made', 'sampled': True},
    {'labels': 'Polyamide (Nylon)', 'parents': 'Man-made', 'sampled': True},
    {'labels': 'Aramid (Aromatic polyamide)', 'parents': 'Man-made', 'sampled': False},
    {'labels': 'Acrylic (≥ 85% acrylonitrile)', 'parents': 'Man-made', 'sampled': True},
    {'labels': 'Modacrylic (35-85% acrylonitrile)', 'parents': 'Man-made', 'sampled': True},
    {'labels': 'Regenerated cellulose (Rayon)', 'parents': 'Man-made', 'sampled': True},
    {'labels': 'Cellulose acetate', 'parents': 'Man-made', 'sampled': True},
    {'labels': 'Polyolefin', 'parents': 'Man-made', 'sampled': False},
    {'labels': 'Elastane', 'parents': 'Man-made', 'sampled': False},
    {'labels': 'PVC', 'parents': 'Man-made', 'sampled': False},
    {'labels': 'PVA', 'parents': 'Man-made', 'sampled': False},
    {'labels': 'Fluorofibre', 'parents': 'Man-made', 'sampled': False},
    # {'labels': 'Carbon', 'parents': 'Man-made', 'sampled': False},
    
    # Polyester Subtypes
    {'labels': 'Poly(ethylene terephthalate) (PET)', 'parents': 'Polyester', 'sampled': True},
    {'labels': 'Poly(butylene terephthalate) (PBT)', 'parents': 'Polyester', 'sampled': False},
    
    # Polyamide Subtypes
    {'labels': 'Polyamide 6 (PA 6)', 'parents': 'Polyamide (Nylon)', 'sampled': True},
    {'labels': 'Polyamide 6,6 (PA 66)', 'parents': 'Polyamide (Nylon)', 'sampled': True},
    {'labels': 'Unspecified polyamide', 'parents': 'Polyamide (Nylon)', 'sampled': True},
    
    # Aramid Subtypes
    {'labels': 'Para-Aramid (Poly para-phenylene terephthalamide/Kevlar®)', 'parents': 'Aramid (Aromatic polyamide)', 'sampled': False},
    {'labels': 'Meta-Aramid (Poly meta-phenylene isophthalamide/Nomex®)', 'parents': 'Aramid (Aromatic polyamide)', 'sampled': False},
    {'labels': 'Unspecified aramid', 'parents': 'Aramid (Aromatic polyamide)', 'sampled': True},
    # {'labels': 'Twaron', 'parents': 'Aramid (Aromatic polyamide)', 'sampled': False},
    
    # Acrylic Subtypes
    {'labels': 'PAN/MMA copolymer', 'parents': 'Acrylic (≥ 85% acrylonitrile)', 'sampled': True},
    {'labels': 'PAN/MA copolymer', 'parents': 'Acrylic (≥ 85% acrylonitrile)', 'sampled': True},
    {'labels': 'PAN/VA copolymer', 'parents': 'Acrylic (≥ 85% acrylonitrile)', 'sampled': True},
    {'labels': 'PAN/AA copolymer', 'parents': 'Acrylic (≥ 85% acrylonitrile)', 'sampled': True},
    {'labels': 'Unspecified PAN (acrylic)', 'parents': 'Acrylic (≥ 85% acrylonitrile)', 'sampled': True},
    
    # Modacrylic Subtypes
    {'labels': 'PAN/VDC copolymer', 'parents': 'Modacrylic (35-85% acrylonitrile)', 'sampled': True},
    {'labels': 'PAN/VA/VC copolymer', 'parents': 'Modacrylic (35-85% acrylonitrile)', 'sampled': True},
    {'labels': 'VDC copolymer', 'parents': 'Modacrylic (35-85% acrylonitrile)', 'sampled': True},
    {'labels': 'Unspecified PAN (modacrylic)', 'parents': 'Modacrylic (35-85% acrylonitrile)', 'sampled': True},
    
    # Rayon Subtypes
    {'labels': 'Viscose', 'parents': 'Regenerated cellulose (Rayon)', 'sampled': True},
    {'labels': 'Lyocell', 'parents': 'Regenerated cellulose (Rayon)', 'sampled': True},
    {'labels': 'Modal', 'parents': 'Regenerated cellulose (Rayon)', 'sampled': False},
    {'labels': 'Unspecified rayon', 'parents': 'Regenerated cellulose (Rayon)', 'sampled': True},
    
    # Acetate Subtypes
    {'labels': 'Cellulose diacetate (secondary acetate)', 'parents': 'Cellulose acetate', 'sampled': True},
    {'labels': 'Cellulose triacetate', 'parents': 'Cellulose acetate', 'sampled': True},
    
    # Polyolefin Subtypes (Not sampled)
    {'labels': 'Polypropylene (PP)', 'parents': 'Polyolefin', 'sampled': False},
    {'labels': 'Polyethylene (PE)', 'parents': 'Polyolefin', 'sampled': False},
    {'labels': 'Unspecified polyolefin', 'parents': 'Polyolefin', 'sampled': False},
    
    # Elastane Subtypes (Not sampled)
    {'labels': 'Polyurethane (PU)', 'parents': 'Elastane', 'sampled': False},
    # {'labels': 'Spandex', 'parents': 'Elastane', 'sampled': False},
    # {'labels': 'Lycra', 'parents': 'Elastane', 'sampled': False},
    
    # PVC Subtypes (Not sampled)
    {'labels': 'Polyvinyl Chloride (PVC)', 'parents': 'PVC', 'sampled': False},
    
    # PVA Subtypes (Not sampled)
    {'labels': 'Polyvinyl alcohol (PVA)', 'parents': 'PVA', 'sampled': False},
    
    # Fluorofibre Subtypes (Not sampled)
    {'labels': 'Polytetrafluoroethylene (PTFE/Teflon®)', 'parents': 'Fluorofibre', 'sampled': False},
    #{'labels': 'Teflon', 'parents': 'Fluorofibre', 'sampled': False},
    
    # Carbon Subtypes (Not sampled)
    # {'labels': 'Carbon fiber (PAN-based)', 'parents': 'Carbon', 'sampled': False},
    # {'labels': 'Carbon fiber (Pitch-based)', 'parents': 'Carbon', 'sampled': False},
]

# Create DataFrame
hierarchy_df = pd.DataFrame(complete_hierarchy)

# Add dummy values for visualization (all set to 1 for equal-sized display)
hierarchy_df['values'] = 1

print(f"Complete hierarchy: {len(hierarchy_df)} categories")
print(f"  - Sampled: {len(hierarchy_df[hierarchy_df['sampled'] == True])}")
print(f"  - Not yet sampled: {len(hierarchy_df[hierarchy_df['sampled'] == False])}")
print("\nHierarchy prepared successfully!")
hierarchy_df.head(10)

Creating complete fiber classification hierarchy...
Complete hierarchy: 57 categories
  - Sampled: 35
  - Not yet sampled: 22

Hierarchy prepared successfully!


Unnamed: 0,labels,parents,sampled,values
0,All Fibers,,True,1
1,Natural,All Fibers,True,1
2,Man-made,All Fibers,True,1
3,Cellulose,Natural,True,1
4,Protein,Natural,True,1
5,Cotton,Cellulose,True,1
6,Linen,Cellulose,True,1
7,Jute,Cellulose,True,1
8,Hemp,Cellulose,False,1
9,Ramie,Cellulose,False,1


In [12]:
# VALIDATE HIERARCHY STRUCTURE
print("\n" + "=" * 80)
print("HIERARCHY VALIDATION CHECK")
print("=" * 80)

# Check for orphaned nodes (parents that don't exist as labels)
all_labels = set(hierarchy_df['labels'].values)
all_parents = set(hierarchy_df['parents'].values) - {''}  # Exclude root's empty parent

orphaned = all_parents - all_labels
if orphaned:
    print(f"\nWARNING: Orphaned nodes found (parents don't exist as labels):")
    for parent in orphaned:
        children = hierarchy_df[hierarchy_df['parents'] == parent]['labels'].values
        print(f"  - Parent '{parent}' has children: {list(children)}")
else:
    print("\nNo orphaned nodes - all parents exist as labels")

# Display hierarchy levels
print("\n" + "=" * 80)
print("HIERARCHY BREAKDOWN BY LEVEL")
print("=" * 80)

root = hierarchy_df[hierarchy_df['parents'] == '']
origins = hierarchy_df[hierarchy_df['parents'] == 'All Fibers']

print(f"\nLevel 0 - Root: {len(root)}")
for label in root['labels'].values:
    print(f"  • {label}")

print(f"\nLevel 1 - Origins: {len(origins)}")
for label in origins['labels'].values:
    print(f"  • {label}")
    
print(f"\nLevel 2 - Types:")
print(f"  Under Natural: {len(hierarchy_df[hierarchy_df['parents'] == 'Natural'])}")
for label in hierarchy_df[hierarchy_df['parents'] == 'Natural']['labels'].values:
    print(f"    • {label}")
    
print(f"\n  Under Man-made: {len(hierarchy_df[hierarchy_df['parents'] == 'Man-made'])}")
for label in hierarchy_df[hierarchy_df['parents'] == 'Man-made']['labels'].values:
    print(f"    • {label}")

print(f"\nLevel 3 - Subtypes:")
# Natural subtypes
natural_type_labels = hierarchy_df[hierarchy_df['parents'] == 'Natural']['labels'].values
subtypes_natural = hierarchy_df[hierarchy_df['parents'].isin(natural_type_labels)]
print(f"  Under Natural types: {len(subtypes_natural)}")
for parent in natural_type_labels:
    children = hierarchy_df[hierarchy_df['parents'] == parent]
    if len(children) > 0:
        print(f"    {parent}: {len(children)} subtypes")
        for label in children['labels'].values:
            print(f"      - {label}")

# Man-made subtypes  
manmade_type_labels = hierarchy_df[hierarchy_df['parents'] == 'Man-made']['labels'].values
subtypes_manmade = hierarchy_df[hierarchy_df['parents'].isin(manmade_type_labels)]
print(f"\n  Under Man-made types: {len(subtypes_manmade)}")
for parent in manmade_type_labels:
    children = hierarchy_df[hierarchy_df['parents'] == parent]
    if len(children) > 0:
        print(f"    {parent}: {len(children)} subtypes")
        for label in children['labels'].values:
            print(f"      - {label}")

print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
print(f"Total categories: {len(hierarchy_df)}")
print(f"  Level 0 (Root): {len(root)}")
print(f"  Level 1 (Origins): {len(origins)}")
print(f"  Level 2 (Types): {len(hierarchy_df[hierarchy_df['parents'].isin(['Natural', 'Man-made'])])}")
print(f"  Level 3 (Subtypes): {len(subtypes_natural) + len(subtypes_manmade)}")


HIERARCHY VALIDATION CHECK

No orphaned nodes - all parents exist as labels

HIERARCHY BREAKDOWN BY LEVEL

Level 0 - Root: 1
  • All Fibers

Level 1 - Origins: 2
  • Natural
  • Man-made

Level 2 - Types:
  Under Natural: 2
    • Cellulose
    • Protein

  Under Man-made: 12
    • Polyester
    • Polyamide (Nylon)
    • Aramid (Aromatic polyamide)
    • Acrylic (≥ 85% acrylonitrile)
    • Modacrylic (35-85% acrylonitrile)
    • Regenerated cellulose (Rayon)
    • Cellulose acetate
    • Polyolefin
    • Elastane
    • PVC
    • PVA
    • Fluorofibre

Level 3 - Subtypes:
  Under Natural types: 10
    Cellulose: 6 subtypes
      - Cotton
      - Linen
      - Jute
      - Hemp
      - Ramie
      - Sisal
    Protein: 4 subtypes
      - Silk
      - Wool
      - Alpaca
      - Cashmere

  Under Man-made types: 30
    Polyester: 2 subtypes
      - Poly(ethylene terephthalate) (PET)
      - Poly(butylene terephthalate) (PBT)
    Polyamide (Nylon): 3 subtypes
      - Polyamide 6 (PA 6)
    

In [13]:
print("Creating fiber classification taxonomy mindmap...")
print("=" * 80)

import plotly.io as pio
pio.renderers.default = 'vscode'
import plotly.graph_objects as go

# Create a network-style mindmap using plotly graph
# Build node positions and connections

# Assign hierarchy levels for layout
hierarchy_df['level'] = 0
for idx, row in hierarchy_df.iterrows():
    if row['parents'] == '':
        hierarchy_df.at[idx, 'level'] = 0
    elif row['parents'] == 'All Fibers':
        hierarchy_df.at[idx, 'level'] = 1
    elif row['parents'] in ['Natural', 'Man-made']:
        hierarchy_df.at[idx, 'level'] = 2
    else:
        hierarchy_df.at[idx, 'level'] = 3

# Calculate positions for mindmap layout
import numpy as np

# Get nodes by level
level_0 = hierarchy_df[hierarchy_df['level'] == 0]  # Root
level_1 = hierarchy_df[hierarchy_df['level'] == 1]  # Origins
level_2 = hierarchy_df[hierarchy_df['level'] == 2]  # Types
level_3 = hierarchy_df[hierarchy_df['level'] == 3]  # Subtypes

# Create positions
node_x = []
node_y = []
node_labels = []
node_colors = []

# Root at center
node_x.append(0)
node_y.append(0)
node_labels.append(level_0.iloc[0]['labels'])
node_colors.append('#333333')

# Origins: left (Natural) and right (Man-made)
natural_idx = None
manmade_idx = None
for idx, row in level_1.iterrows():
    if row['labels'] == 'Natural':
        node_x.append(-3)
        node_y.append(0)
        natural_idx = len(node_x) - 1
        node_colors.append('#2E7D32')
    else:  # Man-made
        node_x.append(3)
        node_y.append(0)
        manmade_idx = len(node_x) - 1
        node_colors.append('#1565C0')
    node_labels.append(row['labels'])

# Types under Natural (left side)
natural_types = level_2[level_2['parents'] == 'Natural']
n_nat_types = len(natural_types)
for i, (idx, row) in enumerate(natural_types.iterrows()):
    y_pos = (i - (n_nat_types-1)/2) * 1.5
    node_x.append(-5.5)
    node_y.append(y_pos)
    node_labels.append(row['labels'])
    node_colors.append('#4CAF50')

# Types under Man-made (right side)
manmade_types = level_2[level_2['parents'] == 'Man-made']
n_mm_types = len(manmade_types)
for i, (idx, row) in enumerate(manmade_types.iterrows()):
    y_pos = (i - (n_mm_types-1)/2) * 1.2
    node_x.append(5.5)
    node_y.append(y_pos)
    node_labels.append(row['labels'])
    node_colors.append('#2196F3')

# Subtypes (outer ring)
for parent_label in level_2['labels']:
    subtypes = level_3[level_3['parents'] == parent_label]
    n_subtypes = len(subtypes)
    
    # Find parent position
    parent_idx = node_labels.index(parent_label)
    parent_x = node_x[parent_idx]
    parent_y = node_y[parent_idx]
    
    # Determine if Natural or Man-made branch
    is_natural = parent_x < 0
    
    for i, (idx, row) in enumerate(subtypes.iterrows()):
        # Spread subtypes around parent
        x_offset = -7.5 if is_natural else 7.5
        y_offset = parent_y + (i - (n_subtypes-1)/2) * 0.6
        
        node_x.append(x_offset)
        node_y.append(y_offset)
        node_labels.append(row['labels'])
        node_colors.append('#A5D6A7' if is_natural else '#90CAF9')

# Create edges
edge_x = []
edge_y = []

for idx, row in hierarchy_df.iterrows():
    if row['parents'] != '':
        # Find parent and child indices
        child_label = row['labels']
        parent_label = row['parents']
        
        child_idx = node_labels.index(child_label)
        parent_idx = node_labels.index(parent_label)
        
        # Add edge
        edge_x.extend([node_x[parent_idx], node_x[child_idx], None])
        edge_y.extend([node_y[parent_idx], node_y[child_idx], None])

# Create figure
fig_taxonomy = go.Figure()

# Add edges
fig_taxonomy.add_trace(go.Scatter(
    x=edge_x, y=edge_y,
    mode='lines',
    line=dict(color='#DDDDDD', width=1),
    hoverinfo='none',
    showlegend=False
))

# Add small marker nodes (just for connection points)
fig_taxonomy.add_trace(go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    marker=dict(
        size=8,
        color=node_colors,
        line=dict(color='white', width=1)
    ),
    hoverinfo='none',
    showlegend=False
))

# Add text labels OUTSIDE the markers for better readability
# Determine text position based on location
text_positions = []
for i, x in enumerate(node_x):
    if i == 0:  # Root
        text_positions.append('top center')
    elif x < -6:  # Far left (subtypes)
        text_positions.append('middle left')
    elif x > 6:  # Far right (subtypes)
        text_positions.append('middle right')
    elif x < 0:  # Left side (Natural)
        text_positions.append('middle left')
    else:  # Right side (Man-made)
        text_positions.append('middle right')

fig_taxonomy.add_trace(go.Scatter(
    x=node_x, y=node_y,
    mode='text',
    text=node_labels,
    textposition=text_positions,
    textfont=dict(size=11, family='Arial', color='#333333'),
    hovertemplate='<b>%{text}</b><extra></extra>',
    showlegend=False
))

fig_taxonomy.update_layout(
    title={
        'text': 'Fiber Classification Taxonomy - Mindmap View<br><sub>Origin → Type → Subtype Hierarchy</sub>',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 20, 'family': 'Arial Black'}
    },
    showlegend=False,
    hovermode='closest',
    width=1800,
    height=1400,
    margin=dict(t=100, l=200, r=200, b=50),
    paper_bgcolor='white',
    plot_bgcolor='white',
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
)

fig_taxonomy.show()

print(f"\nTaxonomy mindmap created successfully!")
print(f"Total categories: {len(hierarchy_df)}")
print(f"  - Origin level: 2")
print(f"  - Type level: {len(hierarchy_df[hierarchy_df['parents'].isin(['Natural', 'Man-made'])])}")
print(f"  - Subtype level: {len(hierarchy_df) - 1 - 2 - len(hierarchy_df[hierarchy_df['parents'].isin(['Natural', 'Man-made'])])}")

Creating fiber classification taxonomy mindmap...



Taxonomy mindmap created successfully!
Total categories: 57
  - Origin level: 2
  - Type level: 14
  - Subtype level: 40


In [14]:
# Export taxonomy diagram
print("Exporting taxonomy diagrams...")
print("=" * 80)

# Save static PNG version (high resolution for publication)
taxonomy_png_path = images_dir / 'fiber_taxonomy_complete.png'
fig_taxonomy.write_image(taxonomy_png_path, width=2000, height=2000, scale=3)
png_size = taxonomy_png_path.stat().st_size / 1024
print(f"\n✓ Static PNG exported: {taxonomy_png_path}")
print(f"  Resolution: 6000x6000 pixels")
print(f"  File size: {png_size:.2f} KB")

Exporting taxonomy diagrams...

✓ Static PNG exported: images\fiber_taxonomy_complete.png
  Resolution: 6000x6000 pixels
  File size: 1538.98 KB
