In [1]:
from mp_api.client import MPRester
print("MPRester imported successfully!")

  from .autonotebook import tqdm as notebook_tqdm


MPRester imported successfully!


In [4]:
from mp_api.client import MPRester
import pandas as pd
import os

api_key = os.getenv("MAPI_KEY")
mpr = MPRester(api_key)

def fetch_range(min_gap, max_gap, max_count=500):
    docs = mpr.materials.summary.search(
        criteria={
            "band_gap": {"$gte": min_gap, "$lte": max_gap},
            "energy_above_hull": {"$lte": 0.1}
        },
        fields=[
            "material_id",
            "formula_pretty",
            "band_gap",
            "energy_above_hull",
            "density",
            "volume",
            "symmetry.spacegroup_number",
        ],
        limit=max_count,
    )

    data = [doc.dict() for doc in docs]
    df = pd.DataFrame(data)
    return df

# Fetch datasets
metals = fetch_range(0.0, 0.1, max_count=500)
semis = fetch_range(0.1, 3.8, max_count=1000)
insul = fetch_range(3.8, 50.0, max_count=500)

# Add labels
metals["label"] = "metal"
semis["label"] = "semiconductor"
insul["label"] = "insulator"

# Combine
df_all = pd.concat([metals, semis, insul], ignore_index=True)
df_all.to_csv("data/all_classes_bandgaps.csv", index=False)

print(df_all["label"].value_counts())



MPRestError: You have specified the following kwargs which are unknown to [34m`search`[39m, but may be known to [31m`_search`[39m
    [36mcriteria, limit[39m
Please see the documentation:
    [34m`search`: https://materialsproject.github.io/api/_autosummary/mp_api.client.routes.materials.summary.SummaryRester.html#mp_api.client.routes.materials.summary.SummaryRester.search[39m
   [31m`_search`: https://api.materialsproject.org/redoc#tag/Materials-Summary/operation/search_materials_summary__get[39m

In [6]:
# Import the correct MPRester
from mp_api.client import MPRester

# Replace this with your actual API key (stored securely if possible)
MAPI_KEY = "SS3reUV9Y5n9UtyavCqkimWscPqnDW03"
# Create the client and query
with MPRester(MAPI_KEY) as mpr:
    # Use the new API endpoint: materials.summary.search()
    results = mpr.materials.summary.search(
        # Define filtering criteria
        band_gap=(0.5, 3.0),   # Select semiconductors: 0.5 eV < Eg < 3.0 eV
        fields=["material_id", "formula_pretty", "band_gap", "structure"],
        num_chunks=1,           # Number of data chunks to fetch
        chunk_size=10           # Number of entries per chunk
    )

# Print the first few results
for r in results:
    print(f"{r.formula_pretty:10s}  Band gap: {r.band_gap:.2f} eV")


Retrieving SummaryDoc documents: 100%|██████████████████████████████████████████████| 10/10 [00:00<00:00, 50351.79it/s]

Ac2S3       Band gap: 2.30 eV
AcBO3       Band gap: 0.81 eV
AcCrO3      Band gap: 2.00 eV
AcFeO3      Band gap: 0.99 eV
AcGaO3      Band gap: 2.90 eV
AcH3        Band gap: 0.64 eV
AcI3        Band gap: 2.59 eV
AcTlTe2     Band gap: 0.63 eV
Ag(BCl)6    Band gap: 2.70 eV
Ag(CO)2     Band gap: 0.74 eV





In [8]:
# Check what kind of object the results variable is
print(type(results))

# Look at the first result more closely
second_result = results[1]
print(second_result)


<class 'list'>
[4m[1mMPDataDoc<BaseModel>[0;0m[0;0m
[1mformula_pretty[0;0m='AcBO3',
[1mmaterial_id[0;0m=MPID(mp-1183052),
[1mstructure[0;0m=Structure Summary
Lattice
    abc : 3.7216679999999993 3.7216679999999993 3.7216679999999993
 angles : 90.0 90.0 90.0
 volume : 51.548126407860565
      A : np.float64(3.7216679999999993) np.float64(0.0) np.float64(0.0)
      B : np.float64(0.0) np.float64(3.7216679999999993) np.float64(0.0)
      C : np.float64(0.0) np.float64(0.0) np.float64(3.7216679999999993)
    pbc : True True True
PeriodicSite: Ac (0.0, 0.0, 0.0) [0.0, 0.0, 0.0]
PeriodicSite: B (1.861, 1.861, 1.861) [0.5, 0.5, 0.5]
PeriodicSite: O (1.861, 1.861, 0.0) [0.5, 0.5, 0.0]
PeriodicSite: O (1.861, 0.0, 1.861) [0.5, 0.0, 0.5]
PeriodicSite: O (0.0, 1.861, 1.861) [0.0, 0.5, 0.5],
[1mband_gap[0;0m=0.8071

[1mFields not requested:[0;0m


In [9]:
import pandas as pd

# Extract only key fields into a list of dictionaries
data = []
for r in results:
    data.append({
        "material_id": str(r.material_id),
        "formula": r.formula_pretty,
        "band_gap": r.band_gap
    })

# Convert to DataFrame
df = pd.DataFrame(data)

# Display the first few rows
print(df.head())


  material_id formula  band_gap
0    mp-32800   Ac2S3    2.2962
1  mp-1183052   AcBO3    0.8071
2   mp-866101  AcCrO3    2.0031
3   mp-861502  AcFeO3    0.9888
4  mp-1183053  AcGaO3    2.8959


In [10]:
df.head()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   material_id  10 non-null     object 
 1   formula      10 non-null     object 
 2   band_gap     10 non-null     float64
dtypes: float64(1), object(2)
memory usage: 372.0+ bytes


In [11]:
df_reduced = df[['formula_pretty', 'band_gap']]
df_reduced.head()


KeyError: "['formula_pretty'] not in index"

In [12]:
df.columns

Index(['material_id', 'formula', 'band_gap'], dtype='object')

In [13]:
df_reduced = df[['material_id','formula', 'band_gap']]
df_reduced.head()

Unnamed: 0,material_id,formula,band_gap
0,mp-32800,Ac2S3,2.2962
1,mp-1183052,AcBO3,0.8071
2,mp-866101,AcCrO3,2.0031
3,mp-861502,AcFeO3,0.9888
4,mp-1183053,AcGaO3,2.8959


In [14]:
def categorize_material(band_gap):
    if band_gap < 0.1:
        return "Conductor"
    elif band_gap < 3.8:
        return "Semiconductor"
    else:
        return "Insulator"

df_reduced['category'] = df_reduced['band_gap'].apply(categorize_material)
df_reduced.head()


Unnamed: 0,material_id,formula,band_gap,category
0,mp-32800,Ac2S3,2.2962,Semiconductor
1,mp-1183052,AcBO3,0.8071,Semiconductor
2,mp-866101,AcCrO3,2.0031,Semiconductor
3,mp-861502,AcFeO3,0.9888,Semiconductor
4,mp-1183053,AcGaO3,2.8959,Semiconductor


In [16]:
df_reduced.to_csv("bandgap_dataset.csv", index=False)


In [17]:
import os
os.getcwd()


'C:\\Users\\HP'