## Imports

In [31]:
from pymongo import MongoClient
client = MongoClient()
db = client.structure_sets

from IPython.display import display
import pandas as pd
pd.options.display.max_rows = 9999

from pymatgen import Element, Structure
from bson.objectid import ObjectId
import math
import time
import pprint
import math
import warnings
import numpy as np
import json
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 8.0)

from matminer.figrecipes.plotly.make_plots import PlotlyFig

bonds = {}
bl_pred_dfs = []

### Set up parallel processing units

In [2]:
import ipyparallel as ipp
c = ipp.Client()
print c.ids
# dview = c[:]
lview = c.load_balanced_view()

[0, 1, 2, 3]


## Define functions

### Define the volume predictor

In [13]:
def predict_volume(inp):
    doc = inp[0]
    bonds = inp[1]
    # hacky, had to do this to use parallel engines
    # to run code on the engines, need to pass everything (imports, variables, functions, etc) required to run
    
    """
    Given a Structure (in dict form) and a dictionary of bond length predictions,
    returns a volume prediction and data about which bond was used.
    
    Args:
        doc (dict) : a dictionary corresponding to a Structure (generated via Structure.as_dict())
        bonds (dict) : a dictionary containing bonds with their predicted lengths
                            
    Returns:
        a dictionary containing the following keys:
        name (str) : structure's reduced formula
        actual_volume (float) : volume in the structure dict
        predicted_volume (float) : volume prediction
        num_atoms (int) : number of atoms in the structure
        worst_predicted_bond (str) : ex. 'Al-F' in alphabtical order
        actual_distance (float) : actual distance for the worst_predicted_bond
        expected_distance (float) : predicted distance for the worst_predicted_bond
        _id (ObjectId) : id of the dictionary in mongodb (useful for quick retrievals/deletes/updates)
    """
    
    
    import numpy as np
    from matminer.descriptors.composition_features import get_pymatgen_descriptor
    from pymatgen import Element, Structure
    import warnings
    structure = Structure.from_dict(doc)
    cutoff=4
    ionic_factor = 0.30
    
    smallest_dist = None
    smallest_expected_dist = None
    smallest_ratio = None
    smallest_e1 = None
    smallest_e2 = None
    
    ionic_mix = min(np.std(get_pymatgen_descriptor(structure.composition, 'X')) * ionic_factor, 1)
    
    for site in structure:
        el1 = site.specie

        if el1.atomic_radius:
            r1 = el1.average_ionic_radius * ionic_mix + \
                 el1.atomic_radius * (1-ionic_mix) if el1.average_ionic_radius else el1.atomic_radius
                
            neighbors = structure.get_neighbors(site, el1.atomic_radius + cutoff)
            
            for site2, dist in neighbors:
                el2 = site2.specie
                

                if el2.atomic_radius:
                    r2 = el2.average_ionic_radius * ionic_mix + \
                         el2.atomic_radius * (1-ionic_mix) if el2.average_ionic_radius else el2.atomic_radius
                
                    bond_name = '-'.join(sorted([el1.symbol, el2.symbol]))
                    if bond_name in bonds:
                        expected_dist = bonds[bond_name]['estimates'][-1]
                    else:
                        expected_dist = float(r1+r2)

                    if not smallest_ratio or dist/expected_dist < smallest_ratio:
                        smallest_dist = dist
                        smallest_expected_dist = expected_dist
                        smallest_ratio = dist/expected_dist
                        smallest_e1 = el1
                        smallest_e2 = el2
        else:
            warnings.warn("VolumePredictor: no atomic radius data for "
                          "{}".format(el1))

    if smallest_ratio is None:
        print structure.composition.reduced_formula, doc['_id'], 'FAILED'
        return {'name':structure.composition.reduced_formula, '_id':doc['_id'],
                'status': 'FAILED'}
    
    volume_factor = (1/smallest_ratio)**3
            
    return {'name': structure.composition.reduced_formula, 'actual_volume': structure.volume,
            'predicted_volume': structure.volume * volume_factor, 'num_atoms': structure.composition.num_atoms,
            'worst_predicted_bond': '-'.join(sorted([smallest_e1.symbol, smallest_e2.symbol])),
            'actual_distance': smallest_dist, 'expected_distance': smallest_expected_dist,
            '_id':doc['_id'], 'material_id':doc['material_id']}

### Functions to import previous data

In [76]:
def import_pred_dfs(file_list):
    """
    Import prediction DataFrames based on the list of filenames given
    
    Args:
        file_list (list) : ex. ['bl_pred_df_iter_0.csv', 'bl_pred_df_iter_1.csv']
        
    Returns:
        list of DataFrames
    """
    bl_pred_dfs = []
    for f in file_list:
        df_csv = pd.read_csv(f)
        df_csv.drop('Unnamed: 0', axis=1, inplace=True)
        bl_pred_df.append(df_csv)
        
    return bl_pred_dfs


def import_bond_data(f):
    """
    Import database of bond lengths from json 
    
    Args:
        f (json) : ex. 'bonds_0.json'
        
    Returns:
        dict of bonds (see bonds)
    """
    with open(f) as data_file:    
        bonds_from_json = json.load(data_file)
    return bonds_from_json

### Functions for initializing the bonds database (these functions run during iteration 0)

In [60]:
def create_bonds_dict(bl_pred_df_0):
    """
    Initializes the bonds dictionary based the DataFrame corresponding to iteration 0
    """
    
    for b in bl_pred_df_0['worst_predicted_bond'].unique():
        if b not in bonds:
            bonds[b]={'mp_values':{}, 'estimates':[]}

            
def get_mp_values(c):
    """
    Given a material, adds the worst_predicted_bond to the bonds database.
    This function is run on a DataFrame:
        ex. bl_pred_dfs[i].apply(get_mp_values,axis=1)
    
    Args:
        c (Series) : Series corresponding to a single structure
    """
    
    if c['worst_predicted_bond'] in bonds:
        bonds[c['worst_predicted_bond']]['mp_values'][c['material_id']] = c['actual_distance']
    else:
        print c['worst_predicted_bond'], 'DNE. Creating entry in bonds dict..'
        bonds[c['worst_predicted_bond']]={'mp_values':{c['material_id']:c['actual_distance']}, 'estimates':[]}

### Functions to retrieve n% of the the worst bond predictions for an iteration

In [16]:
# def get_worst_predicted_bonds_avg_minus_median():
#     bond_estimate_data = []
#     percent_to_return = 0.1
#     num_bonds_to_change = int(percent_to_return*len(bonds))
#     for b in bonds:
#         b_median = np.median(bonds[b]['mp_values'].values())
#         b_est = bonds[b]['estimates'][-1]
#         b_dist = abs(b_median-b_est)

#         bond_estimate_data.append({'bond_name':b, 'median-estimate': b_dist})

#     bond_estimate_data_df = pd.DataFrame(bond_estimate_data)

#     return bond_estimate_data_df.sort_values('median-estimate',ascending=False)[:num_bonds_to_change]['bond_name'].tolist()


def get_worst_predicted_bonds_avg_score(bl_pred_df, percent_to_return=10):
    """
    Given a dataframe corresponding to an iteration, retrieves n% of the worst performing bond
    length predictions, based on which bonds had the highest average percent error.
    
    Args:
        bl_pred_df (DataFrame) : ex. corresponds to material data
        
    Returns:
        list of worst performing bond names
    """    
    bond_estimate_data = []
    percent_to_return = float(percent_to_return)/100.0
    num_bonds_to_change = int(percent_to_return*len(bonds))
    for b in bl_pred_df['worst_predicted_bond'].unique():
        df =  bl_pred_df[bl_pred_df['worst_predicted_bond']==b]
        mean_score = df['score'].mean()
        bond_estimate_data.append({'bond_name':b, 'mean_score': np.mean(mean_score)})

    bond_estimate_data_df = pd.DataFrame(bond_estimate_data)

    return bond_estimate_data_df.sort_values('mean_score',ascending=False)[:num_bonds_to_change]['bond_name'].tolist()
    

### Update bond length estimates for a specified list of bonds

In [17]:
def update_estimates(bl_pred_df, iter_num, bonds_to_update,weight=0.5):
    """
    Given a dataframe corresponding to an iteration and a list of worst performing bonds,
    move the bond length estimates closer to the Materials Project median by a specified weight
    
    Args:
        bl_pred_df (DataFrame) : ex. corresponds to material data
        iter_num (int): the iteration number
        bonds_to_update(list): list of bond names that performed the worst
        weight(float): weight the movement from old prediction to MP median
    """   
    
    for b in bonds_to_update:
        if len(bonds[b]['estimates'])<iter_num+2: # after iter 0, want at most 2 estimate entries, 3 after iter 1, etc.
            if len(bonds[b]['estimates']) == 0: # first estimate is always the OG ionic mix prediction
                df = bl_pred_df[bl_pred_df['worst_predicted_bond']==b]
                new_estimate = df['expected_distance'].mean()
            else:
                new_estimate = weight*(np.median(bonds[b]['mp_values'].values())) + (1-weight)*(bonds[b]['estimates'][-1])
                
            bonds[b]['estimates'].append(new_estimate)
            

### Scoring functions, distance and percent error

In [18]:
def dist_df(c, pred_vol='predicted_volume', act_vol='actual_volume', num_atoms='num_atoms'):
    """
    Returns a score based on distance from the y=x line
    
    Args:
        c (Series) : Series corresponding to a single structure
        pred_vol (str) : column name that contains the predicted volume
        act_vol (str) : column name that contains the actual volume
        num_atoms (str) : column name that contains the number of atoms
                            
    Returns:
        a float value of the distance away from the optimal line
    """
    
    # define the optimal line here using two points
    # line is y = x
    x1 = 0
    y1 = 0
    x2 = 100
    y2 = 100
    
    # set x3, y3 as the point
    x3 = c[pred_vol]/c[num_atoms]
    y3 = c[act_vol]/c[num_atoms]
        
    return score(x1, y1, x2, y2, x3,y3)


def dist(x1, y1, x2, y2, x3,y3):
    """
    Helper function
    """
    px = x2-x1
    py = y2-y1

    temp = px*px + py*py
    u = ((x3 - x1)*px + (y3 - y1)*py)/float(temp)

    if u > 1:
        u = 1
    elif u < 0:
        u = 0

    x = x1 + u*px
    y = y1 + u*py
    dx = x - x3
    dy = y - y3
    
    dist = math.sqrt(dx*dx + dy*dy)
    return dist


def percent_error_df(c, pred_vol='predicted_volume', act_vol='actual_volume', num_atoms='num_atoms'):
    """
    Returns a score based on percent error between actual and predicted
    
    Args:
        c (Series) : Series corresponding to a single structure
        pred_vol (str) : column name that contains the predicted volume
        act_vol (str) : column name that contains the actual volume
        num_atoms (str) : column name that contains the number of atoms
                            
    Returns:
        a float value corresponding to percent error
    """
    return percent_error(c[pred_vol]/c[num_atoms], c[act_vol]/c[num_atoms])
    

def percent_error(calculcated, actual):
    """
    Helper function
    """
    return abs(calculcated-actual)/actual

### Some functions to help analyze data

In [61]:
def display_bond_data(b):
    """
    Prints out bond information in the bonds dictionary and all materials that used that bond for
    volume prediction in each iteration
    
    Args:
        b (str): bond name ex. 'O-P'
        bonds (dict): dict of bonds
        bl_pred_dfs (list): list of DataFrames
    """
    pprint.pprint(bonds[b])
    for df in bl_pred_dfs:
        display(df[df['worst_predicted_bond']==b])
    
    
def display_material_data(c):
    """
    Prints out a material's predictions/informations in all of the iterations run so far
    
    Args:
        c (str): material name ex. 'IZr' # might want to change this to using mat_id down the line
        bl_pred_dfs (list): list of DataFrames
    """
    for df in bl_pred_dfs:
        display(df[df['name']==c])

        
def get_material(mat_id, collection=db.materials_project):
    """
    Given a material id, find it in the mongo database. This is useful for editing or removing values
    
    Args:
        mat_id (str): Materials Project id
        collection: collection name
    """
    cursor = collection.find({"_id": ObjectId(obj_id_to_find)})
    print cursor.count(), 'results found'

    
def delete_material(mat_id, collection=db.materials_project):
    """
    Given a material id, delete it in the mongo database.
    
    Args:
        mat_id (str): Materials Project id
        collection: collection name
    """
    result = collection.delete_one({'_id': ObjectId(mat_id)})
    print 'deleted', result.deleted_count
    

## MAIN

In [20]:
# to test changes, run on a small section of the data set
# acquire a random sample from the materials_project db, and add it to a temporary db
# materials_project_test_set

# cursor = db.materials_project.aggregate(
#     [
#         {"$sample": {"size": 1000}}
#     ]
# )
# for doc in cursor:
#     db.mp_test.insert_one(doc)
cursor = db.mp_test.find()
cursor.count()

1000

In [33]:
# Careful!! Running this section will wipe all previous data and start from scratch

"""
bonds is a dictionary organized in the following manner:
    key: 'element1-element2' ex. 'C-Cl'
    'mp_values': dictionary of bonds lengths calculated from the structures in Materials Project
                 ex. bonds['C-Cl'][mp_values] = {'mp-12345':1.0, 'mp-67890':2.0}
    'estimates': array of the last estimated value used (entry 0 should be estimate from old vol predictor)
                 ex. bonds['C-Cl']['estimates'] = [1, 1.5, 2, 2.25, 2.4]
"""
bonds = {}


"""
bl_pred_dfs is a list of pandas DataFrames, where each index corresponds to an iteration.
    bl_pred_dfs[0] is the initial iteration DataFrame
    bl_pred_dfs[-1] is the most recent iteration DataFrame
"""
bl_pred_dfs = []


"""
bonds_updated_this_iter is a list of lists containing the names of the bonds that were updated in
an iteration. bonds_updated_this_iter[0] returns the bonds that were updated after the initial iteration, etc.
"""
bonds_updated_this_iter = []

"""
scores is a list containing the average score of the iteration (percent error)
iter_run_times is a list containing the times it took to run each iteration
"""
scores = []
iter_run_times = []

In [53]:
"""
Main function. This section runs through all of the given database, estimating the bond lengths for
each structure, and storing the worst estimate in bond_length_predictions[iter_index]. After each iteration
through the database, updates 10% worst performing bond by moving the estimate closer
to the median of the values found.

Run this section after setting the number of iterations and how many bonds to change per iteration.

"""

def run_predictor(weight_decay_per_iter, weight, collection=db.materials_project,
                  init_iter=0, num_iter=1, percent_bonds_to_change_per_iter=10):
    cursor = collection.find()

    for i in range(init_iter, init_iter+num_iter):
        iter_start_time = time.time()
        print '\n################### ROUND', i, '#######################'
        bond_length_predictions = []

        lim = 1000
        iter_num = 0
        d = []
        print len(bond_length_predictions), 'structures processed'
        while len(bond_length_predictions) < cursor.count():
            temp_cursor = cursor.clone()
            temp_cursor.skip(lim*iter_num)
            temp_cursor.limit(lim)
            d = lview.map_sync(predict_volume, [[doc,bonds] for doc in temp_cursor])
            bond_length_predictions += d
            iter_num+=1
            print len(bond_length_predictions), 'structures processed'

        print 'making dataframe'
        cols = ['name', 'material_id', 'worst_predicted_bond', 'expected_distance', 'actual_distance',
                'predicted_volume', 'actual_volume','num_atoms', '_id']
        bl_pred_dfs.append(pd.DataFrame(bond_length_predictions, columns=cols))


        print 'calculating score'
        bl_pred_dfs[i]['score'] = bl_pred_dfs[i].apply(percent_error_df, args=('predicted_volume','actual_volume','num_atoms'), axis=1)
        score_this_iter = bl_pred_dfs[i]['score'].mean()    
        scores.append(score_this_iter)


        if i == 0: # initialization of bonds dict estimates
            print 'initializing bonds dict (iteration 0)'
            # create a dictionary bonds
            print 'creating bonds dict'
            create_bonds_dict(bl_pred_dfs[0])
            # gets all of the materials project data for bond lengths and puts it in 'mp_values'
            print 'collecting Materials Project bond lengths'
            bl_pred_dfs[0].apply(get_mp_values,axis=1)
            # initialize our first estimate
            print 'initializing estimates'
            update_estimates(bl_pred_dfs[i], i, bonds.keys(), weight)

        print 'adding new bonds'
        # if a new bond shows up, add it to the dict!
        for b in bl_pred_dfs[i]['worst_predicted_bond'].unique():
            if b not in bonds:
                bonds[b]={'mp_values':{}, 'estimates':[]}
                df = bl_pred_dfs[i][bl_pred_dfs[i]['worst_predicted_bond']==b]
                for index, row in df.iterrows():
                    bonds[b]['mp_values'][row['material_id']] = row['actual_distance']
                new_estimate = df['expected_distance'].median()
                bonds[b]['estimates'] = [new_estimate]
                print 'NEW BOND', b, 'added to bonds dict'

        print 'updating MP values'
        # for a bond, if a new material now uses it, add its MP length into the dict
        for b in bl_pred_dfs[i]['worst_predicted_bond'].unique():
            df = bl_pred_dfs[i][bl_pred_dfs[i]['worst_predicted_bond']==b]
            for index, row in df.iterrows():
                if row['material_id'] not in bonds[b]['mp_values']:
                    bonds[b]['mp_values'][row['material_id']] = row['actual_distance']


        print 'updating bonds dict and adding new estimates for 10%'
        bonds_updated_this_iter.append(get_worst_predicted_bonds_avg_score(bl_pred_dfs[i])) # get back a list of bond length estimates updated
        update_estimates(bl_pred_dfs[i], i, bonds_updated_this_iter[i], weight)
        weight = weight * weight_decay_per_iter


        # write bl_pred_dfs[i], bonds_updated_this_iter[i], bonds to csv
        print 'saving dataframe and bonds'
        filepath = 'bl_pred_df_iter_'+str(i)+'.csv'
        bl_pred_dfs[i].to_csv(filepath)
        with open('bonds_'+str(i)+'.json', 'w') as fp:
            json.dump(bonds, fp, sort_keys=True)

        iter_end_time = time.time()
        iter_run_times.append(iter_end_time-iter_start_time)

    

In [55]:
"""
Before running the predictor, set these variables each time! This allows you to space out iterations
instead of having to run them all at once.
"""

init_iter = 0
num_iter = 2
percent_bonds_to_change_per_iter = 10
weight_decay_per_iter = 0.9
weight = 0.5 * pow(weight_decay_per_iter, init_iter)
collection = db.mp_test

In [56]:
run_predictor(weight_decay_per_iter=weight_decay_per_iter, weight=weight, collection=collection,
              init_iter=init_iter, num_iter=num_iter,
              percent_bonds_to_change_per_iter=percent_bonds_to_change_per_iter)


################### ROUND 0 #######################
0 structures processed
1000 structures processed
making dataframe
calculating score
initializing bonds dict (iteration 0)
creating bonds dict
collecting Materials Project bond lengths
initializing estimates
adding new bonds
updating MP values
updating bonds dict and adding new estimates for 10%
saving dataframe and bonds

################### ROUND 1 #######################
0 structures processed
1000 structures processed
making dataframe
calculating score
adding new bonds
updating MP values
updating bonds dict and adding new estimates for 10%
saving dataframe and bonds


In [None]:
print 'score per iteration:', scores
print 'hours to run each iteration:', np.array(iter_run_times)/3600
print 'num bonds in database:', len(bonds)

## Plots

In [None]:
d1 = bl_pred_dfs[0]

x = (d1['predicted_volume']/d1['num_atoms']).tolist()
y = (d1['actual_volume']/d1['num_atoms']).tolist()

a1 = np.arange(150)
a2 = a1*0.8

plt.scatter(x,y,s=2,color='#285296')
plt.plot(range(0,150),range(0,150), '#FFA74D')

plt.xlabel('Predicted Volume/Atom',fontsize=18)
plt.ylabel('Actual Volume/Atom',fontsize=18)
plt.title('Volume Prediction (Iteration 0)',fontsize=24)
axes = plt.gca()
axes.set_xlim([0,150])
axes.set_ylim([0,150])
plt.grid()
plt.show()

In [None]:
d1 = bl_pred_dfs[-1]

x = (d1['predicted_volume']/d1['num_atoms']).tolist()
y = (d1['actual_volume']/d1['num_atoms']).tolist()

plt.scatter(x,y,s=2,color='#285296')
plt.plot(range(0,150), '#FFA74D')

plt.xlabel('Predicted Volume/Atom',fontsize=18)
plt.ylabel('Actual Volume/Atom',fontsize=18)
plt.title('Volume Prediction (Iteration 9)',fontsize=24)
axes = plt.gca()
axes.set_xlim([0,150])
axes.set_ylim([0,150])
plt.grid()
plt.show()

In [None]:
p = PlotlyFig(x_title='Initial Prediction (sum of atomic radii)', y_title='Final Prediction',
          plot_title='Initial vs Final Predictions', height=600, width=800,
          plot_mode='notebook', margin_left=150, textsize=25,)

p.xy_plot(x_col=x, y_col=y, marker_outline_width=1,
            text=bonds.keys(), 
            add_xy_plot=[{'x_col': [0, 5], 'y_col': [0, 5],
                          'color': 'green', 'mode': 'lines',
                          'legend': None,'text': None, 'size': None},
                        ],)



p = PlotlyFig(x_title='Initial Prediction (sum of atomic radii)', y_title='Final Prediction Delta',
          plot_title='Initial Predictions vs Final Delta', height=600, width=800,
          plot_mode='notebook', margin_left=150, textsize=25,)


p.xy_plot(x_col=x, y_col=z, marker_outline_width=1,
            text=bonds.keys(), 
            add_xy_plot=[{'x_col': [0, 5], 'y_col': [1, 1],
                          'color': 'black', 'mode': 'lines',
                          'legend': None,'text': None, 'size': None},
                         {'x_col': [0, 5], 'y_col': [0, 0],
                          'color': 'black', 'mode': 'lines',
                          'legend': None,'text': None, 'size': None},
                         {'x_col': [0, 5], 'y_col': [-1, -1],
                          'color': 'black', 'mode': 'lines',
                          'legend': None,'text': None, 'size': None}
                        ],
             )

## Analysis

In [None]:
e1 = Element('O')
e2 = Element('Xe')
print e1.atomic_radius, e1.average_ionic_radius
print e2.atomic_radius, e2.average_ionic_radius

In [66]:
display_material_data('Lu5SbPd2')

Unnamed: 0,name,material_id,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,actual_volume,num_atoms,_id,score
0,Lu5SbPd2,mp-15846,Lu-Pd,2.98264,2.929129,408.673988,387.070649,16.0,58f024467f079f2488b33660,0.055812


Unnamed: 0,name,material_id,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,actual_volume,num_atoms,_id,score
0,Lu5SbPd2,mp-15846,Lu-Pd,2.98264,2.929129,408.673988,387.070649,16.0,58f024467f079f2488b33660,0.055812


Unnamed: 0,name,material_id,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,actual_volume,num_atoms,_id
0,Lu5SbPd2,mp-15846,Lu-Pd,2.98264,2.929129,408.673988,387.070649,16.0,58f024467f079f2488b33660


In [70]:
display_bond_data('Cs-Cs')

{'estimates': [4.762915944059146, 4.5323698742877978, 4.4286241428906914],
 'mp_values': {u'mp-11124': 4.48917234,
               u'mp-13992': 4.284522225200528,
               u'mp-14069': 3.5536567552417084,
               u'mp-14429': 3.7847149943385747,
               u'mp-22865': 4.20905509,
               u'mp-555302': 4.30182380451645,
               u'mp-560387': 4.1846245886435955,
               u'mp-570957': 4.375585078748178,
               u'mp-571409': 4.423072152760527,
               u'mp-574620': 4.18110004,
               u'mp-675010': 4.546934141410505,
               u'mp-675287': 5.180813273709207,
               u'mp-7152': 3.9463610899999995,
               u'mp-8890': 4.379723251396561,
               u'mp-998433': 4.607277}}


Unnamed: 0,name,material_id,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,actual_volume,num_atoms,_id,score
103,CsK4GaO4,mp-14429,Cs-Cs,4.612713,3.784715,3049.314349,1684.34991,80.0,58f021b87f079f2488b32604,0.810381
355,Cs2Pt3S4,mp-13992,Cs-Cs,4.866768,4.284522,378.266921,258.097448,9.0,58f0189c7f079f2488b2ed83,0.465597
375,CsSiTe3,mp-570957,Cs-Cs,4.958299,4.375585,1211.653668,832.700359,20.0,58f02b557f079f2488b36487,0.45509
420,Cs(SbS2)2,mp-8890,Cs-Cs,4.910813,4.379723,627.555155,445.175914,14.0,58f020ef7f079f2488b320fb,0.409679
427,CsYZnSe3,mp-574620,Cs-Cs,4.863541,4.1811,588.831856,374.115168,12.0,58f024a57f079f2488b338bf,0.573932
498,CsAlO2,mp-14069,Cs-Cs,4.651623,3.553657,309.921398,138.186155,8.0,58f0205b7f079f2488b31d4c,1.242782
570,CsSrBr3,mp-998433,Cs-Cs,4.714085,4.607277,441.410163,412.081327,10.0,58f019037f079f2488b2f010,0.071172
575,Cs2Al2P2O9,mp-560387,Cs-Cs,4.724228,4.184625,376.483218,261.651106,15.0,58f01f837f079f2488b317c9,0.438875
585,CsZrCuSe3,mp-7152,Cs-Cs,4.875483,3.946361,623.797402,330.811862,12.0,58f024ac7f079f2488b338f0,0.885656
622,CsHgCl3,mp-675010,Cs-Cs,4.75181,4.546934,416.969596,365.3283,10.0,58f020dc7f079f2488b3207e,0.141356


Unnamed: 0,name,material_id,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,actual_volume,num_atoms,_id,score
103,CsK4GaO4,mp-14429,Cs-Cs,4.612713,3.784715,3049.314349,1684.34991,80.0,58f021b87f079f2488b32604,0.810381
355,Cs2Pt3S4,mp-13992,Cs-Cs,4.866768,4.284522,378.266921,258.097448,9.0,58f0189c7f079f2488b2ed83,0.465597
375,CsSiTe3,mp-570957,Cs-Cs,4.958299,4.375585,1211.653668,832.700359,20.0,58f02b557f079f2488b36487,0.45509
420,Cs(SbS2)2,mp-8890,Cs-Cs,4.910813,4.379723,627.555155,445.175914,14.0,58f020ef7f079f2488b320fb,0.409679
427,CsYZnSe3,mp-574620,Cs-Cs,4.863541,4.1811,588.831856,374.115168,12.0,58f024a57f079f2488b338bf,0.573932
498,CsAlO2,mp-14069,Cs-Cs,4.651623,3.553657,309.921398,138.186155,8.0,58f0205b7f079f2488b31d4c,1.242782
570,CsSrBr3,mp-998433,Cs-Cs,4.714085,4.607277,441.410163,412.081327,10.0,58f019037f079f2488b2f010,0.071172
575,Cs2Al2P2O9,mp-560387,Cs-Cs,4.724228,4.184625,376.483218,261.651106,15.0,58f01f837f079f2488b317c9,0.438875
585,CsZrCuSe3,mp-7152,Cs-Cs,4.875483,3.946361,623.797402,330.811862,12.0,58f024ac7f079f2488b338f0,0.885656
622,CsHgCl3,mp-675010,Cs-Cs,4.75181,4.546934,416.969596,365.3283,10.0,58f020dc7f079f2488b3207e,0.141356


Unnamed: 0,name,material_id,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,actual_volume,num_atoms,_id
355,Cs2Pt3S4,mp-13992,Cs-Cs,4.53237,4.284522,305.529073,258.097448,9.0,58f0189c7f079f2488b2ed83
375,CsSiTe3,mp-570957,Cs-Cs,4.53237,4.375585,925.457302,832.700359,20.0,58f02b557f079f2488b36487
420,Cs(SbS2)2,mp-8890,Cs-Cs,4.53237,4.379723,493.364259,445.175914,14.0,58f020ef7f079f2488b320fb
427,CsYZnSe3,mp-574620,Cs-Cs,4.53237,4.1811,476.551315,374.115168,12.0,58f024a57f079f2488b338bf
498,CsAlO2,mp-14069,Cs-Cs,4.53237,3.553657,286.691049,138.186155,8.0,58f0205b7f079f2488b31d4c
570,CsSrBr3,mp-998433,Cs-Cs,4.53237,4.607277,392.306945,412.081327,10.0,58f019037f079f2488b2f010
585,CsZrCuSe3,mp-7152,Cs-Cs,4.53237,3.946361,501.148742,330.811862,12.0,58f024ac7f079f2488b338f0
688,CsCl,mp-22865,Cs-Cs,4.53237,4.209055,93.105649,74.568229,2.0,58f02e627f079f2488b3795b
701,CsLaHgSe3,mp-11124,Cs-Cs,4.53237,4.489172,436.225397,423.871044,12.0,58f0217c7f079f2488b32489
884,CsFeF3,mp-555302,Cs-Cs,4.53237,4.301824,597.039594,510.487316,30.0,58f031df7f079f2488b39152


### getting the coefficients of variance of MP data - shows how spread the bond lengths are

In [None]:
test_df_data = []
for b in bonds:
    x = bonds[b]['mp_values'].values()
    test_df_data.append({'bond_name':b, 'CV': np.std(x)/np.mean(x), 'std_dev':np.std(x), 'mean':np.mean(x),
                         'variance':np.var(x)})

bond_cv_scores = pd.DataFrame(test_df_data)
display(bond_cv_scores.sort_values('CV',ascending=False).head(25))