<a href="https://colab.research.google.com/github/sheikhahnaf/jarvis/blob/master/Analyzing_data_in_the_matminer_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install matminer jarvis aflow

Collecting aflow
  Downloading aflow-0.0.11-py3-none-any.whl.metadata (3.0 kB)
Collecting ase (from aflow)
  Downloading ase-3.24.0-py3-none-any.whl.metadata (3.9 kB)
Downloading aflow-0.0.11-py3-none-any.whl (35 kB)
Downloading ase-3.24.0-py3-none-any.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ase, aflow
Successfully installed aflow-0.0.11 ase-3.24.0


In [17]:
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
import pandas as pd
import numpy as np
from pprint import pprint
import requests
import json

def clean_dataframe(df):
    """Clean dataframe by converting complex types to strings."""
    df_clean = df.copy()
    for col in df_clean.columns:
        # Check if column contains any dict or list
        if df_clean[col].apply(lambda x: isinstance(x, (dict, list))).any():
            df_clean[col] = df_clean[col].apply(lambda x: json.dumps(x) if isinstance(x, (dict, list)) else x)
    return df_clean

def get_mp_data(elements, api_key):
    """Get all available properties from Materials Project."""
    print("\nQuerying Materials Project...")
    mpdr = MPDataRetrieval(api_key=api_key)

    criteria = {
        'elements': {'$in': elements},  # Changed from $all to $in to get any compounds containing the elements
    }

    try:
        props = [
            "material_id", "formula", "formation_energy_per_atom",
            "band_gap", "density", "volume", "nsites",
            "spacegroup", "total_magnetization"
        ]

        mp_df = mpdr.get_dataframe(criteria=criteria, properties=props)

        if not mp_df.empty:
            mp_df = clean_dataframe(mp_df)
            mp_df = mp_df.add_prefix('mp_')
            mp_df.rename(columns={'mp_formula': 'formula'}, inplace=True)
            available_props = [col.replace('mp_', '') for col in mp_df.columns if col != 'formula']
            print(f"Found {len(mp_df)} entries in Materials Project")
            return mp_df, available_props

    except Exception as e:
        print(f"Error querying Materials Project: {e}")
    return pd.DataFrame(), []

def get_oqmd_data(elements):
    """Get data from OQMD using direct API calls."""
    print("\nQuerying OQMD API directly...")

    base_url = "http://oqmd.org/oqmdapi/formationenergy"

    try:
        # Query for any compounds containing the specified elements
        element_filters = [f"element_set={element}" for element in elements]
        query_string = "&".join(element_filters)

        response = requests.get(f"{base_url}?{query_string}")

        if response.status_code != 200:
            print(f"Error accessing OQMD API: {response.status_code}")
            return pd.DataFrame(), []

        data = response.json()

        if not data.get('data'):
            print("No data found in OQMD")
            return pd.DataFrame(), []

        oqmd_df = pd.json_normalize(data['data'])
        oqmd_df = clean_dataframe(oqmd_df)
        oqmd_df.columns = [f'oqmd_{col}' for col in oqmd_df.columns]

        if 'oqmd_composition' in oqmd_df.columns:
            oqmd_df['formula'] = oqmd_df['oqmd_composition'].astype(str)

        available_props = [col.replace('oqmd_', '') for col in oqmd_df.columns if col != 'formula']
        print(f"Found {len(oqmd_df)} entries in OQMD")
        return oqmd_df, available_props

    except Exception as e:
        print(f"Error querying OQMD: {e}")

    return pd.DataFrame(), []

def collect_mp_oqmd(elements=['Ti'], api_key="cMB2Zun6Vg8G6d9C"):
    """
    Collect properties from Materials Project and OQMD.
    Retrieves all compounds containing any of the specified elements.

    Args:
        elements (list): List of elements to search for
        api_key (str): Materials Project API key
    Returns:
        tuple: (Combined DataFrame, dict of available properties by source)
    """
    print(f"\nCollecting properties for compounds containing any of these elements: {', '.join(elements)}...")

    # Get data from both sources
    mp_df, mp_props = get_mp_data(elements, api_key)
    oqmd_df, oqmd_props = get_oqmd_data(elements)

    # Combine dataframes
    dfs = []
    if not mp_df.empty:
        dfs.append(mp_df)
    if not oqmd_df.empty:
        dfs.append(oqmd_df)

    if dfs:
        # Start with the first dataframe
        combined_df = dfs[0]

        # Merge with remaining dataframes
        for df in dfs[1:]:
            # Ensure formula column is string type in both dataframes
            combined_df['formula'] = combined_df['formula'].astype(str)
            df['formula'] = df['formula'].astype(str)
            combined_df = pd.merge(combined_df, df, on='formula', how='outer')
    else:
        combined_df = pd.DataFrame()

    # Create property summary
    property_summary = {
        'materials_project': mp_props,
        'oqmd': oqmd_props,
        'total_compositions': len(combined_df),
        'total_properties': len(combined_df.columns)
    }

    # Analyze data availability
    if not combined_df.empty:
        print("\nData availability summary:")
        print(f"Total unique compositions found: {len(combined_df)}")
        print("\nDatabase coverage:")
        for prefix in ['mp_', 'oqmd_']:
            cols = [col for col in combined_df.columns if col.startswith(prefix)]
            if cols:
                print(f"{prefix[:-1].upper()}: {len(cols)} properties")

        print("\nProperty availability per database:")
        for col in combined_df.columns:
            non_null = combined_df[col].count()
            print(f"{col}: {non_null}/{len(combined_df)} entries available")

    return combined_df, property_summary

# Example usage
if __name__ == "__main__":
    elements = ['Ti',"W"]
    api_key = "cMB2Zun6Vg8G6d9C"

    # Collect data from MP and OQMD
    df, property_summary = collect_mp_oqmd(elements, api_key)

    # Save results
    if not df.empty:
        filename = f"{'_'.join(elements)}_mp_oqmd.csv"
        df.to_csv(filename, index=False)
        print(f"\nComplete dataset saved to {filename}")

        # Save property summary
        with open(f"{'_'.join(elements)}_property_summary.txt", 'w') as f:
            pprint(property_summary, stream=f)


Collecting properties for compounds containing any of these elements: Ti, W...

Querying Materials Project...


  return _MPResterLegacy(*args, **kwargs)
100%|██████████| 9487/9487 [00:06<00:00, 1404.25it/s]


Found 9487 entries in Materials Project

Querying OQMD API directly...
Found 50 entries in OQMD

Data availability summary:
Total unique compositions found: 9537

Database coverage:
MP: 7 properties
OQMD: 20 properties

Property availability per database:
formula: 9537/9537 entries available
mp_formation_energy_per_atom: 9487/9537 entries available
mp_band_gap: 9487/9537 entries available
mp_density: 9487/9537 entries available
mp_volume: 9487/9537 entries available
mp_nsites: 9487/9537 entries available
mp_spacegroup: 9487/9537 entries available
mp_total_magnetization: 9487/9537 entries available
oqmd_name: 50/9537 entries available
oqmd_entry_id: 50/9537 entries available
oqmd_calculation_id: 50/9537 entries available
oqmd_icsd_id: 48/9537 entries available
oqmd_formationenergy_id: 50/9537 entries available
oqmd_duplicate_entry_id: 48/9537 entries available
oqmd_composition: 50/9537 entries available
oqmd_composition_generic: 50/9537 entries available
oqmd_prototype: 50/9537 entries 

In [19]:
df["formula"]

Unnamed: 0,formula
0,Ag1 Ti1
1,Ag1 Tm1
2,Ag1 Y1
3,Ag1 Yb1
4,Al1
...,...
9532,"{""Zr"": 5.0, ""Ti"": 7.0, ""O"": 24.0}"
9533,"{""Zr"": 9.0, ""B"": 1.0, ""W"": 4.0}"
9534,"{""Zr"": 9.0, ""Ti"": 3.0, ""P"": 4.0}"
9535,"{""Zr"": 9.0, ""W"": 4.0, ""O"": 3.0}"


In [14]:
df

Unnamed: 0,formula,mp_formation_energy_per_atom,mp_band_gap,mp_density,mp_volume,mp_nsites,mp_spacegroup,mp_total_magnetization,oqmd_name,oqmd_entry_id,...,oqmd_volume,oqmd_ntypes,oqmd_natoms,oqmd_unit_cell,oqmd_sites,oqmd_band_gap,oqmd_delta_e,oqmd_stability,oqmd_fit,oqmd_calculation_label
0,Ag1 Ti1,,,,,,,,TiAg,10676.0,...,33.9741,2.0,2.0,"[[2.858117, 0.0, 0.0], [0.0, 2.858117, 0.0], [...","[""Ag @ 0 0 0"", ""Ti @ 0.5 0.5 0.5""]",0.0,0.069534,0.169806,standard,static
1,Ag1 Tm1,,,,,,,,TmAg,10678.0,...,45.4458,2.0,2.0,"[[3.5686, 0.0, 0.0], [0.0, 3.5686, 0.0], [0.0,...","[""Ag @ 0 0 0"", ""Tm @ 0.5 0.5 0.5""]",0.0,-0.350774,0.001628,standard,static
2,Ag1 Y1,,,,,,,,YAg,10681.0,...,47.7059,2.0,2.0,"[[3.626804, 0.0, 0.0], [0.0, 3.626804, 0.0], [...","[""Ag @ 0 0 0"", ""Y @ 0.5 0.5 0.5""]",0.0,-0.343094,0.001558,standard,static
3,Ag1 Yb1,,,,,,,,YbAg,10683.0,...,48.1894,2.0,2.0,"[[3.639014, 0.0, 0.0], [0.0, 3.639014, 0.0], [...","[""Ag @ 0 0 0"", ""Yb @ 0.5 0.5 0.5""]",0.0,-0.429266,0.006615,standard,static
4,Al1,,,,,,,,Al,8100.0,...,16.4826,1.0,1.0,"[[2.019908, -2.019908, 0.0], [-2.019908, 0.0, ...","[""Al @ 0 0 0""]",0.0,0.000789,0.000789,standard,static
5,B1 Tc1,,,,,,,,TcB,21452.0,...,20.3153,2.0,2.0,"[[2.728608, 0.0, 0.0], [0.0, 2.728608, 0.0], [...","[""B @ 0.5 0.5 0.5"", ""Tc @ 0 0 0""]",0.0,0.144659,0.529524,standard,static
6,Bi1 Li1,,,,,,,,LiBi,10937.0,...,47.1338,2.0,2.0,"[[3.370293, 0.0, 0.0], [0.0, 3.370293, 0.0], [...","[""Bi @ 0 0 0"", ""Li @ 0.5 0.5 0.5""]",0.0,-0.387425,0.000175,standard,static
7,C1 Si1,,,,,,,,SiC,22472.0,...,16.624,2.0,2.0,"[[2.025669, -2.025669, 0.0], [-2.025669, 0.0, ...","[""C @ 0.5 0 0.5"", ""Si @ 0 0 0""]",0.0,0.53432,0.743225,standard,static
8,Er1,,,,,,,,Er,9566.0,...,30.7106,1.0,1.0,"[[1.972769, 1.972769, 1.972769], [1.972769, -1...","[""Er @ 0 0 0""]",0.0,0.141404,0.141404,standard,static
9,Fe1,,,,,,,,Fe,22514.0,...,11.1739,1.0,1.0,"[[1.408362, 1.408362, 1.408362], [1.408362, -1...","[""Fe @ 0 0 0""]",0.0,0.000355,0.000355,standard,static
