In [27]:
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.preprocessing import StandardScaler

In [28]:
# Configuration
INPUT_FILE_PATH = 'MPEA_dataset.csv'
TARGET_COLUMN = 'yield_strength_mpa'
MIN_SAMPLE = 1000

In [29]:
df = pd.read_csv(INPUT_FILE_PATH)

In [30]:
# This is not inplace, as you can see, original df still points to previous data
# As you'll see in the next cell
new_columns_df = df.rename(columns={
    'IDENTIFIER: Reference ID': 'reference_id',
    'FORMULA': 'formula',
    'PROPERTY: Microstructure': 'microstructure',
    'PROPERTY: Processing method': 'processing_method',
    'PROPERTY: BCC/FCC/other': 'crystal_structure',
    'PROPERTY: grain size ($\\mu$m)': 'grain_size_um',
    'PROPERTY: Exp. Density (g/cm$^3$)': 'density_experimental',
    'PROPERTY: Calculated Density (g/cm$^3$)': 'density_calculated',
    'PROPERTY: HV': 'hardness_vickers',
    'PROPERTY: Type of test': 'test_type',
    'PROPERTY: Test temperature ($^\\circ$C)': 'test_temperature_c',
    'PROPERTY: YS (MPa)': 'yield_strength_mpa',
    'PROPERTY: UTS (MPa)': 'tensile_strength_mpa',
    'PROPERTY: Elongation (%)': 'elongation_percent',
    'PROPERTY: Elongation plastic (%)': 'elongation_plastic_percent',
    'PROPERTY: Exp. Young modulus (GPa)': 'youngs_modulus_experimental',
    'PROPERTY: Calculated Young modulus (GPa)': 'youngs_modulus_calculated',
    'PROPERTY: O content (wppm)': 'oxygen_content_ppm',
    'PROPERTY: N content (wppm)': 'nitrogen_content_ppm',
    'PROPERTY: C content (wppm)': 'carbon_content_ppm',
    'REFERENCE: doi': 'doi',
    'REFERENCE: year': 'publication_year',
    'REFERENCE: title': 'publication_title'
})

new_columns_df

Unnamed: 0,reference_id,formula,microstructure,processing_method,crystal_structure,grain_size_um,density_experimental,density_calculated,hardness_vickers,test_type,...,elongation_percent,elongation_plastic_percent,youngs_modulus_experimental,youngs_modulus_calculated,oxygen_content_ppm,nitrogen_content_ppm,carbon_content_ppm,doi,publication_year,publication_title
0,27,Al0.25 Co1 Fe1 Ni1,FCC,CAST,FCC,,,7.9,138.0,C,...,,,,192.0,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
1,27,Al0.5 Co1 Fe1 Ni1,FCC+BCC,CAST,other,,,7.4,212.0,C,...,,,,,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
2,27,Al0.75 Co1 Fe1 Ni1,FCC+BCC,CAST,other,,,7.0,385.0,C,...,,,,,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
3,27,Al1 Co1 Fe1 Ni1,BCC,CAST,BCC,,,6.6,456.0,C,...,,,,162.0,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
4,27,Co1 Fe1 Ni1,FCC,CAST,FCC,,,8.5,125.0,C,...,,,,207.0,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1540,265,Hf1 Nb1 Ta1 Ti1,BCC,CAST,BCC,,,10.9,270.0,,...,,,,119.0,,,,10.1016/j.matdes.2018.06.003,2018,"Phase equilibria, mechanical properties and de..."
1541,265,Mo1 Nb1 Ta1 Ti1 W1,BCC,ANNEAL,BCC,,,11.8,482.0,,...,,,,222.0,,,,10.1016/j.matdes.2018.06.003,2018,"Phase equilibria, mechanical properties and de..."
1542,265,Mo1 Nb1 Ta1 Ti1 W1,BCC,CAST,BCC,,,11.8,446.0,,...,,,,222.0,,,,10.1016/j.matdes.2018.06.003,2018,"Phase equilibria, mechanical properties and de..."
1543,265,Mo1 Nb1 Ta1 Ti1,BCC,ANNEAL,BCC,,,10.0,407.0,,...,,,,179.0,,,,10.1016/j.matdes.2018.06.003,2018,"Phase equilibria, mechanical properties and de..."


In [31]:
df.head()

Unnamed: 0,IDENTIFIER: Reference ID,FORMULA,PROPERTY: Microstructure,PROPERTY: Processing method,PROPERTY: BCC/FCC/other,PROPERTY: grain size ($\mu$m),PROPERTY: Exp. Density (g/cm$^3$),PROPERTY: Calculated Density (g/cm$^3$),PROPERTY: HV,PROPERTY: Type of test,...,PROPERTY: Elongation (%),PROPERTY: Elongation plastic (%),PROPERTY: Exp. Young modulus (GPa),PROPERTY: Calculated Young modulus (GPa),PROPERTY: O content (wppm),PROPERTY: N content (wppm),PROPERTY: C content (wppm),REFERENCE: doi,REFERENCE: year,REFERENCE: title
0,27,Al0.25 Co1 Fe1 Ni1,FCC,CAST,FCC,,,7.9,138.0,C,...,,,,192.0,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
1,27,Al0.5 Co1 Fe1 Ni1,FCC+BCC,CAST,other,,,7.4,212.0,C,...,,,,,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
2,27,Al0.75 Co1 Fe1 Ni1,FCC+BCC,CAST,other,,,7.0,385.0,C,...,,,,,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
3,27,Al1 Co1 Fe1 Ni1,BCC,CAST,BCC,,,6.6,456.0,C,...,,,,162.0,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
4,27,Co1 Fe1 Ni1,FCC,CAST,FCC,,,8.5,125.0,C,...,,,,207.0,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...


In [32]:
# Now original df points to new memory
df = new_columns_df
df.head()

Unnamed: 0,reference_id,formula,microstructure,processing_method,crystal_structure,grain_size_um,density_experimental,density_calculated,hardness_vickers,test_type,...,elongation_percent,elongation_plastic_percent,youngs_modulus_experimental,youngs_modulus_calculated,oxygen_content_ppm,nitrogen_content_ppm,carbon_content_ppm,doi,publication_year,publication_title
0,27,Al0.25 Co1 Fe1 Ni1,FCC,CAST,FCC,,,7.9,138.0,C,...,,,,192.0,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
1,27,Al0.5 Co1 Fe1 Ni1,FCC+BCC,CAST,other,,,7.4,212.0,C,...,,,,,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
2,27,Al0.75 Co1 Fe1 Ni1,FCC+BCC,CAST,other,,,7.0,385.0,C,...,,,,,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
3,27,Al1 Co1 Fe1 Ni1,BCC,CAST,BCC,,,6.6,456.0,C,...,,,,162.0,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
4,27,Co1 Fe1 Ni1,FCC,CAST,FCC,,,8.5,125.0,C,...,,,,207.0,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...


In [33]:
df[TARGET_COLUMN]

0       158.0
1       346.0
2       794.0
3       964.0
4       204.0
        ...  
1540      NaN
1541      NaN
1542      NaN
1543      NaN
1544      NaN
Name: yield_strength_mpa, Length: 1545, dtype: float64

In [34]:
df[TARGET_COLUMN].notna()

0        True
1        True
2        True
3        True
4        True
        ...  
1540    False
1541    False
1542    False
1543    False
1544    False
Name: yield_strength_mpa, Length: 1545, dtype: bool

In [35]:
df[TARGET_COLUMN].notna().sum() # Samples with Yield Strength

np.int64(1067)

In [36]:
(df[TARGET_COLUMN].notna().sum())/len(df[TARGET_COLUMN])*100

np.float64(69.06148867313917)

*More than 69% rows didn't have target. So we are dropping those. Later we will try and fill them. *

`In pandas, standard assignment (=) creates a reference to the original data, meaning changes to the new variable affect the original DataFrame or Series. In contrast, the .copy() method creates a new, independent object in memory, so changes to one do not affect the other`

In [37]:
notNullMask = df[TARGET_COLUMN].notna()
df[notNullMask] # The dataframe only here yield strength has some value

Unnamed: 0,reference_id,formula,microstructure,processing_method,crystal_structure,grain_size_um,density_experimental,density_calculated,hardness_vickers,test_type,...,elongation_percent,elongation_plastic_percent,youngs_modulus_experimental,youngs_modulus_calculated,oxygen_content_ppm,nitrogen_content_ppm,carbon_content_ppm,doi,publication_year,publication_title
0,27,Al0.25 Co1 Fe1 Ni1,FCC,CAST,FCC,,,7.9,138.0,C,...,,,,192.0,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
1,27,Al0.5 Co1 Fe1 Ni1,FCC+BCC,CAST,other,,,7.4,212.0,C,...,,,,,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
2,27,Al0.75 Co1 Fe1 Ni1,FCC+BCC,CAST,other,,,7.0,385.0,C,...,,,,,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
3,27,Al1 Co1 Fe1 Ni1,BCC,CAST,BCC,,,6.6,456.0,C,...,,,,162.0,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
4,27,Co1 Fe1 Ni1,FCC,CAST,FCC,,,8.5,125.0,C,...,,,,207.0,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1518,264,Al1 Cr1 Fe1 Mo0.5 Ni1 Ti0.25,BCC+B2,CAST,other,,,6.6,712.0,C,...,,14.3,,,,,,10.1088/0256-307X/35/3/036102,2018,Microstructures and Mechanical Properties of A...
1519,264,Al1 Cr1 Fe1 Mo0.5 Ni1 Ti0.4,BCC+B2,CAST,other,,,6.5,731.9,C,...,,14.8,,,,,,10.1088/0256-307X/35/3/036102,2018,Microstructures and Mechanical Properties of A...
1520,264,Al1 Cr1 Fe1 Mo0.5 Ni1 Ti0.5,BCC+B2,CAST,other,,,6.5,751.7,C,...,,10.4,,,,,,10.1088/0256-307X/35/3/036102,2018,Microstructures and Mechanical Properties of A...
1521,264,Al1 Cr1 Fe1 Mo0.5 Ni1 Ti0.6,BCC+B2,CAST,other,,,6.4,756.1,C,...,,,,,,,,10.1088/0256-307X/35/3/036102,2018,Microstructures and Mechanical Properties of A...


In [38]:
df = df[notNullMask].copy()
print(f" Kept {len(df)} samples with target values")

 Kept 1067 samples with target values


In [39]:
if len(df) < MIN_SAMPLE:
    raise ValueError(f"Only {len(df)} samples found, need at least {MIN_SAMPLES}")

In [40]:
# Selecting relevant features
features_to_keep = [
    'formula', # Will parse into composition
    'crystal_structure', # BCC/FCC/Mixed
    'density_calculated',
    'processing_method',
    'microstructure',
    'test_temperature_c',
    TARGET_COLUMN
]

df = df[features_to_keep].copy()
df.shape

(1067, 7)

In [41]:
# Show missing column counts
df.columns

Index(['formula', 'crystal_structure', 'density_calculated',
       'processing_method', 'microstructure', 'test_temperature_c',
       'yield_strength_mpa'],
      dtype='object')

In [42]:
for col in df.columns:
  missing = df[col].isnull().sum()
  if missing > 0:
    print(f" {col}: {missing} ({missing/len(df)*100:.1f}%)")

 processing_method: 31 (2.9%)
 microstructure: 51 (4.8%)


In [43]:
# Little Na values, so we are gonna fill them
df['processing_method'] = df['processing_method'].fillna('Unknown')
df['microstructure'] = df['microstructure'].fillna('Unknown')

In [44]:
df['test_temperature_c'].isnull().sum()

np.int64(0)

In [45]:
df.isnull().sum()

formula               0
crystal_structure     0
density_calculated    0
processing_method     0
microstructure        0
test_temperature_c    0
yield_strength_mpa    0
dtype: int64

In [46]:
# Parse chemical formulas
def parse_formula(formula):
  """
  Parse formula like "AlCoCrFeNi" into composition dict.
  Returns: {'Al': 0.2, 'Co' : 0.2, 'Cr': 0.2, 'Fe': 0.2, 'Ni': 0.2}
  """

  # Find all element-count pairs
  # Examples : "Al0.5Co" -> [(Al , 0.5, Co, '')]
  elements = re.findall(r'([A-Z][a-z]?)(\d*\.?\d*)', str(formula))

  composition = {}
  for elem, count in elements:
    count = float(count) if count else 1.0
    composition[elem] = composition.get(elem, 0) + count

  # Normalize to percentage (sum = 1.0)
  total = sum(composition.values())
  if total > 0:
    composition = {k: v/total for k, v in composition.items()}

  return composition


In [47]:
df['composition'] = df['formula'].apply(parse_formula)

In [48]:
df.head()

Unnamed: 0,formula,crystal_structure,density_calculated,processing_method,microstructure,test_temperature_c,yield_strength_mpa,composition
0,Al0.25 Co1 Fe1 Ni1,FCC,7.9,CAST,FCC,25.0,158.0,"{'Al': 0.07692307692307693, 'Co': 0.3076923076..."
1,Al0.5 Co1 Fe1 Ni1,other,7.4,CAST,FCC+BCC,25.0,346.0,"{'Al': 0.14285714285714285, 'Co': 0.2857142857..."
2,Al0.75 Co1 Fe1 Ni1,other,7.0,CAST,FCC+BCC,25.0,794.0,"{'Al': 0.2, 'Co': 0.26666666666666666, 'Fe': 0..."
3,Al1 Co1 Fe1 Ni1,BCC,6.6,CAST,BCC,25.0,964.0,"{'Al': 0.25, 'Co': 0.25, 'Fe': 0.25, 'Ni': 0.25}"
4,Co1 Fe1 Ni1,FCC,8.5,CAST,FCC,25.0,204.0,"{'Co': 0.3333333333333333, 'Fe': 0.33333333333..."


In [49]:
new_df = df.copy()

In [50]:
new_df['Name'] = 32

In [51]:
new_df.head()

Unnamed: 0,formula,crystal_structure,density_calculated,processing_method,microstructure,test_temperature_c,yield_strength_mpa,composition,Name
0,Al0.25 Co1 Fe1 Ni1,FCC,7.9,CAST,FCC,25.0,158.0,"{'Al': 0.07692307692307693, 'Co': 0.3076923076...",32
1,Al0.5 Co1 Fe1 Ni1,other,7.4,CAST,FCC+BCC,25.0,346.0,"{'Al': 0.14285714285714285, 'Co': 0.2857142857...",32
2,Al0.75 Co1 Fe1 Ni1,other,7.0,CAST,FCC+BCC,25.0,794.0,"{'Al': 0.2, 'Co': 0.26666666666666666, 'Fe': 0...",32
3,Al1 Co1 Fe1 Ni1,BCC,6.6,CAST,BCC,25.0,964.0,"{'Al': 0.25, 'Co': 0.25, 'Fe': 0.25, 'Ni': 0.25}",32
4,Co1 Fe1 Ni1,FCC,8.5,CAST,FCC,25.0,204.0,"{'Co': 0.3333333333333333, 'Fe': 0.33333333333...",32


In [52]:
df.head()

Unnamed: 0,formula,crystal_structure,density_calculated,processing_method,microstructure,test_temperature_c,yield_strength_mpa,composition
0,Al0.25 Co1 Fe1 Ni1,FCC,7.9,CAST,FCC,25.0,158.0,"{'Al': 0.07692307692307693, 'Co': 0.3076923076..."
1,Al0.5 Co1 Fe1 Ni1,other,7.4,CAST,FCC+BCC,25.0,346.0,"{'Al': 0.14285714285714285, 'Co': 0.2857142857..."
2,Al0.75 Co1 Fe1 Ni1,other,7.0,CAST,FCC+BCC,25.0,794.0,"{'Al': 0.2, 'Co': 0.26666666666666666, 'Fe': 0..."
3,Al1 Co1 Fe1 Ni1,BCC,6.6,CAST,BCC,25.0,964.0,"{'Al': 0.25, 'Co': 0.25, 'Fe': 0.25, 'Ni': 0.25}"
4,Co1 Fe1 Ni1,FCC,8.5,CAST,FCC,25.0,204.0,"{'Co': 0.3333333333333333, 'Fe': 0.33333333333..."


In [53]:
for comp in df['composition']:
  print(comp.keys)

<built-in method keys of dict object at 0x0000029812BF9F80>
<built-in method keys of dict object at 0x0000029812D94500>
<built-in method keys of dict object at 0x0000029812085B40>
<built-in method keys of dict object at 0x000002987F2C9D40>
<built-in method keys of dict object at 0x0000029812D963C0>
<built-in method keys of dict object at 0x0000029812D96300>
<built-in method keys of dict object at 0x0000029812D94DC0>
<built-in method keys of dict object at 0x0000029812C12AC0>
<built-in method keys of dict object at 0x0000029812DA28C0>
<built-in method keys of dict object at 0x0000029812DA1740>
<built-in method keys of dict object at 0x0000029812C116C0>
<built-in method keys of dict object at 0x0000029812DA26C0>
<built-in method keys of dict object at 0x0000029812C10800>
<built-in method keys of dict object at 0x0000029812C11640>
<built-in method keys of dict object at 0x0000029812D97600>
<built-in method keys of dict object at 0x0000029812C10F80>
<built-in method keys of dict object at 

In [54]:
# Find all unique elements
all_elements = set()
for comp in df['composition']:
  all_elements.update(comp.keys())
all_elements = sorted(all_elements)
print(all_elements)
print(f"✓ Found {len(all_elements)} unique elements: {', '.join(all_elements)}")


['Al', 'C', 'Ca', 'Co', 'Cr', 'Cu', 'Fe', 'Hf', 'Li', 'Mg', 'Mn', 'Mo', 'Nb', 'Ni', 'Pd', 'Re', 'Sc', 'Si', 'Sn', 'Ta', 'Ti', 'V', 'W', 'Y', 'Zn', 'Zr']
✓ Found 26 unique elements: Al, C, Ca, Co, Cr, Cu, Fe, Hf, Li, Mg, Mn, Mo, Nb, Ni, Pd, Re, Sc, Si, Sn, Ta, Ti, V, W, Y, Zn, Zr


In [55]:
# Create one column per element
for elem in all_elements:
  df[f'comp_{elem}'] = df['composition'].apply(lambda x: x.get(elem, 0.0))

In [56]:
df.head()

Unnamed: 0,formula,crystal_structure,density_calculated,processing_method,microstructure,test_temperature_c,yield_strength_mpa,composition,comp_Al,comp_C,...,comp_Sc,comp_Si,comp_Sn,comp_Ta,comp_Ti,comp_V,comp_W,comp_Y,comp_Zn,comp_Zr
0,Al0.25 Co1 Fe1 Ni1,FCC,7.9,CAST,FCC,25.0,158.0,"{'Al': 0.07692307692307693, 'Co': 0.3076923076...",0.076923,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Al0.5 Co1 Fe1 Ni1,other,7.4,CAST,FCC+BCC,25.0,346.0,"{'Al': 0.14285714285714285, 'Co': 0.2857142857...",0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Al0.75 Co1 Fe1 Ni1,other,7.0,CAST,FCC+BCC,25.0,794.0,"{'Al': 0.2, 'Co': 0.26666666666666666, 'Fe': 0...",0.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Al1 Co1 Fe1 Ni1,BCC,6.6,CAST,BCC,25.0,964.0,"{'Al': 0.25, 'Co': 0.25, 'Fe': 0.25, 'Ni': 0.25}",0.25,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Co1 Fe1 Ni1,FCC,8.5,CAST,FCC,25.0,204.0,"{'Co': 0.3333333333333333, 'Fe': 0.33333333333...",0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
df = df.drop('composition', axis=1)

In [58]:
len(all_elements)

26

In [59]:
columns=['crystal_structure', 'processing_method', 'microstructure']
columns


['crystal_structure', 'processing_method', 'microstructure']

In [60]:
# Notice the difference
df['crystal_structure']

0         FCC
1       other
2       other
3         BCC
4         FCC
        ...  
1518    other
1519    other
1520    other
1521    other
1522    other
Name: crystal_structure, Length: 1067, dtype: object

df[columns] # because columns get replaced by [ , , ,] that is why no extra 3rd bracket needed
# when you write a single column stirng, no third bracket needed
# but for multiple, third bracket needed

In [61]:
df = pd.get_dummies(
    df,
    columns=['crystal_structure', 'processing_method', 'microstructure'],
    prefix=['struct', 'proc', 'micro']
)
print(f"✓ After encoding: {len(df.columns)} total columns")

✓ After encoding: 78 total columns


In [62]:
df.columns

Index(['formula', 'density_calculated', 'test_temperature_c',
       'yield_strength_mpa', 'comp_Al', 'comp_C', 'comp_Ca', 'comp_Co',
       'comp_Cr', 'comp_Cu', 'comp_Fe', 'comp_Hf', 'comp_Li', 'comp_Mg',
       'comp_Mn', 'comp_Mo', 'comp_Nb', 'comp_Ni', 'comp_Pd', 'comp_Re',
       'comp_Sc', 'comp_Si', 'comp_Sn', 'comp_Ta', 'comp_Ti', 'comp_V',
       'comp_W', 'comp_Y', 'comp_Zn', 'comp_Zr', 'struct_BCC', 'struct_FCC',
       'struct_other', 'proc_ANNEAL', 'proc_CAST', 'proc_OTHER', 'proc_POWDER',
       'proc_Unknown', 'proc_WROUGHT', 'micro_B2', 'micro_B2+BCC',
       'micro_B2+L12', 'micro_B2+Laves+Sec.', 'micro_B2+Sec.', 'micro_BCC',
       'micro_BCC+B2', 'micro_BCC+B2+FCC', 'micro_BCC+B2+FCC+Sec.',
       'micro_BCC+B2+Laves', 'micro_BCC+BCC', 'micro_BCC+BCC+HCP',
       'micro_BCC+BCC+Laves', 'micro_BCC+BCC+Sec.', 'micro_BCC+FCC',
       'micro_BCC+HCP', 'micro_BCC+Laves', 'micro_BCC+Laves+Sec.',
       'micro_BCC+Sec.', 'micro_FCC', 'micro_FCC+B2', 'micro_FCC+B2+Sec.',
  

In [63]:
# Feature engineering
# Count the number of elements in each allooy
comp_cols = [col for col in df.columns if col.startswith('comp_')]
df['num_elements'] = (df[comp_cols] > 0).sum(axis=1)

In [64]:
df['num_elements']

0       4
1       4
2       4
3       4
4       3
       ..
1518    6
1519    6
1520    6
1521    6
1522    6
Name: num_elements, Length: 1067, dtype: int64

In [65]:
# Calculate composition entropy
df['composition_entropy'] = -1 * (
    df[comp_cols] * np.log(df[comp_cols] + 1e-10)
).sum(axis=1)

In [66]:
df['composition_entropy']

0       1.285293
1       1.351784
2       1.379292
3       1.386294
4       1.098612
          ...   
1518    1.704070
1519    1.734764
1520    1.748067
1521    1.757293
1522    1.765340
Name: composition_entropy, Length: 1067, dtype: float64

In [67]:
df['max_element_fraction'] = df[comp_cols].max(axis=1)
df['max_element_fraction']

0       0.307692
1       0.285714
2       0.266667
3       0.250000
4       0.333333
          ...   
1518    0.210526
1519    0.204082
1520    0.200000
1521    0.196078
1522    0.190476
Name: max_element_fraction, Length: 1067, dtype: float64

In [68]:
# Removing outliers
y = df[TARGET_COLUMN]

In [69]:
# Calculate outlier bounds (+- 3 standard deviations)
mean_ys = y.mean()
mean_ys

np.float64(889.9614714151828)

In [70]:
std_ys = y.std()
std_ys

np.float64(570.1354368328949)

In [71]:
lower_bound = mean_ys - 3 * std_ys
lower_bound

np.float64(-820.4448390835021)

In [72]:
upper_bound = mean_ys + 3 * std_ys
upper_bound

np.float64(2600.3677819138675)

In [73]:
# even though lower and upper bounds are single number, why outliers is a dataframe?
outliers = (y < lower_bound) | (y > upper_bound)
outliers

0       False
1       False
2       False
3       False
4       False
        ...  
1518    False
1519    False
1520    False
1521    False
1522    False
Name: yield_strength_mpa, Length: 1067, dtype: bool

In [74]:
num_outliers = outliers.sum()
num_outliers

np.int64(9)

In [75]:
print(f"Outlier bounds: [{lower_bound: .0f}, {upper_bound: .0f}] MPa")
print(f"Outlier detected: {num_outliers} ({num_outliers/len(y)*100: .2f}%)")

Outlier bounds: [-820,  2600] MPa
Outlier detected: 9 ( 0.84%)


In [76]:
# Remove if less than 5% of data
if num_outliers / len(y) < 0.05:
    df = df[~outliers].copy()
    print(f"✓ Removed {num_outliers} outliers")
else:
    print(f"✓ Kept outliers (too many to remove)")

print(f"Final dataset: {len(df)} samples")

✓ Removed 9 outliers
Final dataset: 1058 samples


In [77]:
# Drop non-feature columns
X = df.drop(['formula', TARGET_COLUMN], axis=1)
y = df[TARGET_COLUMN]

print(f" Main dataset (df): {df.shape}")
print(f" Features (X): {X.shape}")
print(f" Target (y): {y.shape}")


 Main dataset (df): (1058, 81)
 Features (X): (1058, 79)
 Target (y): (1058,)
