## Imports

In [1]:
from pymongo import MongoClient
client = MongoClient()
db = client.structure_sets

from IPython.display import display
import pandas as pd
pd.options.display.max_rows = 9999

from pymatgen import Element, Structure
import math
import time
import pprint
import math
import warnings
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 8.0)

In [2]:
# setup our parallel processing units..
import ipyparallel as ipp
c = ipp.Client()
print c.ids
# dview = c[:]
lview = c.load_balanced_view()

# with c[:].sync_imports():
#     from pymatgen import Structure
#     from matminer.learners.volume_predictor import VolumePredictor
#     from vp_funcs import get_id_serial_f

[0, 1, 2, 3]


            Controller appears to be listening on localhost, but not on this machine.
            If this is true, you should specify Client(...,sshserver='you@198.128.197.28')
            or instruct your controller to listen on an external IP.


## Define bond length prediction functions

In [8]:
def predict_bond_length(structure):
    cutoff=4
    ionic_factor = 0.30
    
    smallest_dist = None
    smallest_expected_dist = None
    smallest_ratio = None
    smallest_e1 = None
    smallest_e2 = None
    
    ionic_mix = min(np.std(get_pymatgen_descriptor(structure.composition, 'X')) * ionic_factor, 1)
    
    for site in structure:
        el1 = site.specie

        if el1.atomic_radius:
            r1 = el1.average_ionic_radius * ionic_mix + \
                 el1.atomic_radius * (1-ionic_mix) if el1.average_ionic_radius else el1.atomic_radius
                
            neighbors = structure.get_neighbors(site, el1.atomic_radius + cutoff)
            
            for site2, dist in neighbors:
                el2 = site2.specie
                
                if el2.atomic_radius or el2.average_ionic_radius:
                    bond_name = '-'.join(sorted([el1.symbol, el2.symbol]))
                    if bond_name in bonds.keys():
                        expected_dist = bonds[bond_name]['estimates'][-1]
                    else:
                        expected_dist = el1.atomic_radius + el2.atomic_radius
                        r2 = el2.average_ionic_radius * ionic_mix + \
                             el2.atomic_radius * (1-ionic_mix) if el2.average_ionic_radius else el2.atomic_radius

                    if not smallest_ratio or dist/expected_dist < smallest_ratio:
                        smallest_dist = dist
                        smallest_expected_dist = expected_dist
                        smallest_ratio = dist/expected_dist
                        smallest_e1 = el1
                        smallest_e2 = el2
        else:
            warnings.warn("VolumePredictor: no atomic radius data for "
                          "{}".format(el1))

    if smallest_ratio is None:
        return None
            
    volume_factor = (1/smallest_ratio)**3
            
    return {'name': structure.composition.reduced_formula, 'actual_volume': structure.volume,
            'predicted_volume': structure.volume * volume_factor, 'num_atoms': structure.composition.num_atoms,
            'worst_predicted_bond': '-'.join(sorted([smallest_e1.symbol, smallest_e2.symbol])),
            'actual_distance': smallest_dist, 'expected_distance': smallest_expected_dist}

In [15]:
def get_worst_bond_length_prediction(inp):
    doc = inp[0]
    bonds = inp[1]
#     bonds = {}
    
    import numpy as np
    from matminer.descriptors.composition_features import get_pymatgen_descriptor
    from pymatgen import Element, Structure
    import warnings
    structure = Structure.from_dict(doc)
    cutoff=4
    ionic_factor = 0.30
    
    smallest_dist = None
    smallest_expected_dist = None
    smallest_ratio = None
    smallest_e1 = None
    smallest_e2 = None
    
    ionic_mix = min(np.std(get_pymatgen_descriptor(structure.composition, 'X')) * ionic_factor, 1)
    
    for site in structure:
        el1 = site.specie

        if el1.atomic_radius:
            r1 = el1.average_ionic_radius * ionic_mix + \
                 el1.atomic_radius * (1-ionic_mix) if el1.average_ionic_radius else el1.atomic_radius
                
            neighbors = structure.get_neighbors(site, el1.atomic_radius + cutoff)
            
            for site2, dist in neighbors:
                el2 = site2.specie
                

                if el2.atomic_radius:
                    r2 = el2.average_ionic_radius * ionic_mix + \
                         el2.atomic_radius * (1-ionic_mix) if el2.average_ionic_radius else el2.atomic_radius
                
                    bond_name = '-'.join(sorted([el1.symbol, el2.symbol]))
                    if bond_name in bonds:
                        expected_dist = bonds[bond_name]['estimates'][-1]
                    else:
                        expected_dist = float(r1+r2)

                    if not smallest_ratio or dist/expected_dist < smallest_ratio:
                        smallest_dist = dist
                        smallest_expected_dist = expected_dist
                        smallest_ratio = dist/expected_dist
                        smallest_e1 = el1
                        smallest_e2 = el2
        else:
            warnings.warn("VolumePredictor: no atomic radius data for "
                          "{}".format(el1))

    if smallest_ratio is None:
        print structure.composition.reduced_formula, doc['_id'], 'FAILED'
        return {'name':structure.composition.reduced_formula, '_id':doc['_id'],
                'status': 'FAILED'}
    
    volume_factor = (1/smallest_ratio)**3
            
    return {'name': structure.composition.reduced_formula, 'actual_volume': structure.volume,
            'predicted_volume': structure.volume * volume_factor, 'num_atoms': structure.composition.num_atoms,
            'worst_predicted_bond': '-'.join(sorted([smallest_e1.symbol, smallest_e2.symbol])),
            'actual_distance': smallest_dist, 'expected_distance': smallest_expected_dist,
            '_id':doc['_id']}

In [66]:
def create_bonds_dict(bl_pred_0):
    for b in bl_pred_0['worst_predicted_bond'].unique():
        if b not in bonds:
            bonds[b]={'mp_values':[], 'estimates':[]}

def get_mp_values(c):
    if c['worst_predicted_bond'] in bonds:
        bonds[c['worst_predicted_bond']]['mp_values'].append(c['actual_distance'])
    else:
        print c['worst_predicted_bond'], 'DNE. Creating entry in bonds dict..'
        bonds[c['worst_predicted_bond']]={'mp_values':[c['actual_distance']], 'estimates':[]}

def get_worst_predicted_bonds_avg_minus_median():
    bond_estimate_data = []
    percent_to_return = 0.1
    num_bonds_to_change = int(percent_to_return*len(bonds))
    for b in bonds:
        b_median = np.median(bonds[b]['mp_values'])
        b_est = bonds[b]['estimates'][-1]
        b_dist = abs(b_median-b_est)

        bond_estimate_data.append({'bond_name':b, 'median-estimate': b_dist})

    bond_estimate_data_df = pd.DataFrame(bond_estimate_data)

    return bond_estimate_data_df.sort_values('median-estimate',ascending=False)[:num_bonds_to_change]['bond_name'].tolist()
        
def get_worst_predicted_bonds_avg_score(bl_pred_0):
    bond_estimate_data = []
    percent_to_return = 0.1
    num_bonds_to_change = int(percent_to_return*len(bonds))
    for b in bl_pred_0['worst_predicted_bond'].unique():
        df =  bl_pred_0[bl_pred_0['worst_predicted_bond']==b]
        mean_score = df['score'].mean()
        bond_estimate_data.append({'bond_name':b, 'mean_score': np.mean(mean_score)})

    bond_estimate_data_df = pd.DataFrame(bond_estimate_data)

    return bond_estimate_data_df.sort_values('mean_score',ascending=False)[:num_bonds_to_change]['bond_name'].tolist()
    
    
def update_estimates(bl_pred_df, iter_num, bonds_to_update,weight=0.5):
    for b in bonds_to_update:
        if len(bonds[b]['estimates'])<iter_num+2: # after iteration 0, we want at most 2 estimate entries (old prediction and new)
            if len(bonds[b]['estimates']) == 0: # first iteration estimate is always the OG prediction
                df = bl_pred_df[bl_pred_df['worst_predicted_bond']==b]
                new_estimate = df['expected_distance'].mean()
            else:
                new_estimate = weight*(np.median(bonds[b]['mp_values'])) + (1-weight)*(bonds[b]['estimates'][-1])
                
            bonds[b]['estimates'].append(new_estimate)
            

In [79]:
def score(c, pred_vol='predicted_volume', act_vol='actual_volume', num_atoms='num_atoms'):
    """
    Found that the % error isn't a great measurement of error in our case.
    Our worst cases occur the farthest away from the optimal line (y=x),
    so define a function that finds distance between the optimal line and the point.
    
        Args:
        c (Series) : Series corresponding to a single structure
        structures_path (str) : file path to set of structures
        predictor (str) : string defining which predictor to use - 
                            "mm" : mat miner
                            "dev" : dev predictor
                            "okeeffe" : O'Keefffe predictor
        print_info (bool) : set to True to print out smallest ratio info for a structure
                            
    Returns:
        a float value of the distance away from the optimal line
    """
    
    # define the optimal line here using two points
    # line is y = x
    x1 = 0
    y1 = 0
    x2 = 100
    y2 = 100
    
    # set x3, y3 as the point
    try:
        x3 = c[pred_vol]/c[num_atoms]
        y3 = c[act_vol]/c[num_atoms]
    except:
        sys.exit("Error: specified column does not exist")
        
        
    # run the actual calculation here..
    px = x2-x1
    py = y2-y1

    temp = px*px + py*py

    u =  ((x3 - x1) * px + (y3 - y1) * py) / float(temp)

    if u > 1:
        u = 1
    elif u < 0:
        u = 0

    x = x1 + u * px
    y = y1 + u * py

    dx = x - x3
    dy = y - y3

    dist = math.sqrt(dx*dx + dy*dy)

    return dist


def score_test(x1, y1, x2, y2, x3,y3):
    px = x2-x1
    py = y2-y1

    temp = px*px + py*py

    u =  ((x3 - x1) * px + (y3 - y1) * py) / float(temp)

    if u > 1:
        u = 1
    elif u < 0:
        u = 0

    x = x1 + u * px
    y = y1 + u * py

    dx = x - x3
    dy = y - y3

    dist = math.sqrt(dx*dx + dy*dy)

    return dist


def percent_error_df(c, pred_vol='predicted_volume', act_vol='actual_volume', num_atoms='num_atoms'):
    return percent_error(c['predicted_volume']/c[num_atoms], c['actual_volume']/c[num_atoms])
    

def percent_error(calculcated, actual):
    return abs(calculcated+actual)/actual*100

score_test(0,0,100,100,45,50)

3.5355339059327378

## MAIN

In [9]:
"""
bonds is a dictionary organized in the following manner:
    key: 'element1-element2' ex. 'C-Cl'
    'mp_values': array of values calculated from the structures in Materials Project
    'estimates': array of the last estimated value used (entry 0 should be r_atomic+r_atomic)
"""
bonds = {}

In [8]:
# For each structure, get the worst bond length estimate
# The estimate is made from the value found in the bonds dictionary
# If it is not there, set bond length prediction to r_atomic_e1 + r_atomic_e2
# DON'T MAKE THIS MISTAKE AGAIN :(
bonds = {}
bond_length_predictions = []
bl_pred_dfs = []
scores = []
bonds_updated_this_iter = []

In [26]:
"""

Main block. This section runs through all of the Materials Project database, estimating the bond lengths for
each structure, and storing the worst estimate in bond_length_predictions[iter_index]. After each iteration
through the Materials Proejct database, updates the n worst performing bond by moving the estimate closer
to the median of the values found.

Run this section after setting the number of iterations and how many bonds to change per iteration.

"""


cursor = db.mp_final.find()

init_iter = 1
num_iter = 1
num_bonds_to_change_per_iter = 10
weight = 0.5
weight_decay_per_iter = 0.9


for i in range (init_iter, init_iter+num_iter):
    print '\n################### ROUND', i, '#######################'
    bond_length_predictions.append([])
#     num_processed = 0
#     new_bonds_added_this_iter = []

    lim = 1000
    iter_num = 0
    d = []
    print len(bond_length_predictions[i]), 'structures processed'
    while len(bond_length_predictions[i]) < cursor.count():
        temp_cursor = cursor.clone()
        temp_cursor.skip(lim*iter_num)
        temp_cursor.limit(lim)
        d = lview.map_sync(get_worst_bond_length_prediction, [[doc,bonds] for doc in temp_cursor])
        bond_length_predictions[i] += d
        iter_num+=1
        print len(bond_length_predictions[i]), 'structures processed'

    print 'making dataframe'
    cols = ['name', 'worst_predicted_bond', 'expected_distance', 'actual_distance', 'predicted_volume', 'actual_volume',
        'num_atoms', '_id']
    bl_pred_dfs.append(pd.DataFrame(bond_length_predictions[i], columns=cols))


# Mess with the scores stuff 
#     if scores[-1] < score_this_iter:
#         print 'Optimal score found at iteration', i
#         print 'Print "scores" for list of values'
#         break
#     else:
#         scores.append(score_this_iter)


    print 'calculating score'
    bl_pred_dfs[i]['score'] = bl_pred_dfs[i].apply(score, args=('predicted_volume','actual_volume','num_atoms'), axis=1)
    score_this_iter = bl_pred_dfs[i]['score'].mean()    
    scores.append(score_this_iter)

    print 'initializing bonds dict (iteration 0). initial estimate original vol predictor'
    if i == 0: # initialization of bonds dict estimates
        # create a dictionary bonds
        print 'creating bonds dict'
        create_bonds_dict(bl_pred_dfs[0])
        # gets all of the materials project data for bond lengths and puts it in 'mp_values'
        print 'collecting Materials Project bond lengths'
        bl_pred_dfs[i].apply(get_mp_values,axis=1)
        # initialize our first estimate
        print 'initializing estimates'
        update_estimates(bl_pred_dfs[i], i, bonds.keys(), weight)

    
    # if a new bond shows up, add it to the dict!
    for b in bl_pred_dfs[i]['worst_predicted_bond'].unique():
        if b not in bonds:
            bonds[b]={'mp_values':[], 'estimates':[]}
            df =  bl_pred_dfs[i][bl_pred_dfs[i]['worst_predicted_bond']==b]
            bonds[b]['mp_values'] = df['actual_distance'].tolist()
            new_estimate = df['expected_distance'].mean()
            bonds[b]['estimates'] = [new_estimate]
            print 'NEW BOND', b, 'added to bonds dict'
    
    print 'updating bonds dict and adding new estimates for 10%'
    bonds_updated_this_iter.append(get_worst_predicted_bonds()) # get back a list of bond length estimates updated
    update_estimates(bl_pred_dfs[i], i, bonds_updated_this_iter[i], weight)
    weight = weight * weight_decay_per_iter
        

    


################### ROUND 0 #######################
67466 structures processed
making dataframe
calculating score
initializing bonds dict (iteration 0). initial estimate original vol predictor
updating bonds dict and adding new estimates for 10%

################### ROUND 1 #######################
0 structures processed


KeyboardInterrupt: 

In [86]:
# if i == 0: # initialization of bonds dict estimates
#     # create a dictionary bonds
#     create_bonds_dict(bl_pred_dfs[0])
#     # gets all of the materials project data for bond lengths and puts it in 'mp_values'
#     bl_pred_dfs[i].apply(get_mp_values,axis=1)
#     # initialize our first estimate
#   update_estimates(bl_pred_dfs[i], i, bonds.keys(), weight)
len1 = 0
len2 = 0
lenelse= 0
for b in bonds:
    if len(bonds[b]['estimates']) == 1:
        len1 += 1
        #bonds[b]['estimates'] = [bonds[b]['estimates'][0]]
    elif len(bonds[b]['estimates']) == 2:
        len2 += 1
    else:
        lenelse +=1
        
print len1, len2, lenelse

# update_estimates(bl_pred_dfs[i], i, bonds_updated_this_iter[0], weight)

# bonds['F-F']
# display(bl_pred_dfs[0][bl_pred_dfs[0]['worst_predicted_bond']=='Ar-Ar'])
# bond_to_inspect = 'F-F'

# print bonds[bond_to_inspect]
# display(bl_pred_dfs[0][bl_pred_dfs[0]['worst_predicted_bond']==bond_to_inspect])

# bl_pred_dfs[i]['score'] = bl_pred_dfs[i].apply(percent_error_df, args=('predicted_volume','actual_volume','num_atoms'), axis=1)

# test_bonds_to_change_pe = get_worst_predicted_bonds_avg_score(bl_pred_dfs[0])
# test_bonds_to_change == test_bonds_to_change_pe

2250 249 0


## some code to delete specific objects from our database

In [47]:
# bond_length_predictions[0].index(None)
# for i, j in enumerate(bond_length_predictions[0]):
#     if j == None:
#         print i
        
indices_to_delete = []
for i in bond_length_predictions[0]:
    if 'status' in i:
        indices_to_delete.append(i)
        print i, 'to be deleted'
        bond_length_predictions[0].remove(i)

# for j in bond_length_predictions[0]:
#     if j['_id'] in indices_to_delete
        

In [70]:
doc
structure.composition.reduced_formula

u'SbXeF9'

In [42]:
from bson.objectid import ObjectId

for i in indices_to_delete:
#     result = db.mp_final.delete_one({'_id': ObjectId(i['_id'])})
#     print i['name'], 'deleted', result.deleted_count

Ne deleted 1
Xe deleted 1
He deleted 1
Kr deleted 1
He deleted 1
Kr deleted 1
Xe deleted 1
He deleted 1
Kr deleted 1
P deleted 1
H2 deleted 1
EuAg deleted 1
Xe deleted 1
He deleted 1
Kr deleted 1
Xe deleted 1
Kr deleted 1


## Analysis Functions

In [123]:
print 'number of unique bonds used to predict volume:', len(bonds)
print 'number of structures predicted:', len(bond_length_predictions[0])

# in iteration 0, we predicted on 67466 compounds. we found 2609 unique bonds used

number of unique bonds used to predict volume: 2609
number of structures predicted: 67466


In [200]:
def get_worst_bond_predictions(bonds_dict, iter_index, num_bonds_to_change=10):
    """
    Retrieves a list of the bonds whose estimates were the furthest the median of the
    bond length found in Materials Project
    Args:
        bonds_dict (dict): dictionary containing the full set of bonds
        iter_index (int): index of the iteration
    Returns:
        list containing the worst bond estimates (bond name, previous estimate, new estimate)
    """
#     num_bonds_to_change = 10
    avg_bonds = []
    worst_bonds_predicted = []
    for key in bonds_dict.keys():
        mp_values_median = np.median(np.array(bonds_dict[key]['mp_values']))
        bond_length_estimate = bonds_dict[key]['estimates'][iter_index]
        num_mp_values = len(bonds_dict[key]['mp_values'])
        
        avg_bonds.append([key,
                          bond_length_estimate,
                          mp_values_median,
                          abs(mp_values_median - bond_length_estimate),
                          num_mp_values])
        
    ab = np.array(avg_bonds)
    display_worst_performing_bonds = ab[ab[:,3].astype(float).argsort()][-num_bonds_to_change:]
    worst_bonds_predicted = display_worst_performing_bonds[:,0].tolist()
    
    print display_worst_performing_bonds
    # return something
    return worst_bonds_predicted

In [296]:
def analyze_bond(bond_to_inspect):
    for b in bond_to_inspect.split('-'):
        e = Element(b)
        print e.symbol, 'atomic radius', e.atomic_radius, 'ionic_radius', e.average_ionic_radius
#     print bonds[bond_to_inspect]
    display(bl_pred_0[bl_pred_0['worst_predicted_bond']==bond_to_inspect])

In [211]:
def fix_bond_estimates(bonds_dict, worst_bonds_list, iter_index, weight):
    """
    Given a dictionary of bonds and a list of worst performing bonds, change the estimated bond length.
    Initial bond length estimate is r_atomic_e1+r_atomic_e2
    New estimate is adding a weighted difference between the median of bond length in Materials Project
    Args:
        bonds_dict (dict): dictionary containing the full set of bonds
        worst_bonds_list (list): list containing the bonds that we want to change
        iter_index (int): index of the iteration
        weight (float): how much to move the estimate by
    Returns:
        None
    """
    for key in bonds_dict.keys():
        if len(bonds_dict[key]['estimates']) <= iter_index+1:
            # if it's a bond we want to change, update its estimate
            if key in worst_bonds_list:
                mp_values_median = np.median(np.array(bonds_dict[key]['mp_values']))
                prev_bond_length_estimate = bonds_dict[key]['estimates'][-1]

                median_vs_estimate_diff = mp_values_median - prev_bond_length_estimate

                new_bond_length_estimate = prev_bond_length_estimate + weight*median_vs_estimate_diff

                bonds_dict[key]['estimates'].append(new_bond_length_estimate)

                print key, 'changed. Previous estimate:', prev_bond_length_estimate, 'New estimate:', new_bond_length_estimate
                print 'frequency in MP:', len(bonds_dict[key]['mp_values'])
            
            # if this bond does not need to be changed, keep the same estimate
            else:
                bonds_dict[key]['estimates'].append(bonds_dict[key]['estimates'][-1])

## Run analysis for initial iteration

### create dataFrama object, score the predictions

In [54]:
cols = ['name', 'worst_predicted_bond', 'expected_distance', 'actual_distance', 'predicted_volume', 'actual_volume',
        'num_atoms', '_id']
bl_pred_0 = pd.DataFrame(bond_length_predictions[0], columns=cols)
bl_pred_0['worst_predicted_bond'].value_counts()

In [133]:
bl_pred_0['worst_predicted_bond'].value_counts()

O-P      5992
Li-O     2631
O-V      1965
Mn-O     1503
B-O      1474
O-Si     1449
Fe-O     1397
C-O      1307
O-Ti     1303
O-S      1221
Cr-O     1174
K-K      1032
Mo-O      815
Cs-Cs     809
Rb-Rb     803
Co-O      717
O-W       672
Nb-O      649
O-Te      595
F-Li      555
Na-Na     529
Cu-O      526
H-O       518
Ca-O      513
C-N       487
O-Ta      486
Ge-O      479
La-O      457
Li-Li     444
O-Se      440
N-O       434
Mg-O      430
Al-O      412
O-U       393
Bi-O      381
O-Sb      379
As-O      346
O-Y       341
Ba-Ba     328
Ni-O      311
O-Sr      292
Fe-Fe     280
Na-O      244
O-O       231
Ag-Ag     228
Ni-Ni     224
Cu-S      216
Ba-O      205
Sr-Sr     205
Tl-Tl     191
Nd-O      190
Cu-Cu     188
Ag-O      187
Ca-Ca     183
I-O       178
O-Tl      173
O-Sm      173
O-Re      162
Cl-O      161
O-Pb      160
Mo-Mo     159
Fe-S      156
O-Sn      154
O-Zr      149
Ce-O      145
Ba-F      144
Zn-Zn     144
O-Zn      138
Co-Co     137
La-La     136
Cu-Se     135
Cd-I  

In [60]:
bl_pred_0['score'] = bl_pred_0.apply(score, args=('predicted_volume','actual_volume','num_atoms'), axis=1)

In [64]:
display(bl_pred_0.head())
print bl_pred_0['score'].mean()

Unnamed: 0,name,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,actual_volume,num_atoms,_id,score
0,Ni3O4,Ni-O,1.961357,1.880527,81.539999,71.86857,7.0,58a781ba7f079f2fd3b86cfe,0.976962
1,Li2Fe2Si2O7,Li-O,2.08216,1.917591,259.914669,203.027985,13.0,58a781bb7f079f2fd3b86cff,3.094228
2,YbZn,Yb-Zn,2.81884,3.127887,34.484139,47.115209,2.0,58a781bc7f079f2fd3b86d00,4.465758
3,La2SiO5,O-Si,1.730899,1.620021,584.63803,479.329349,32.0,58a781bc7f079f2fd3b86d01,2.327015
4,HfCr2,Cr-Cr,2.753161,2.493218,240.716605,178.768942,12.0,58a781bd7f079f2fd3b86d02,3.650301


2.86690634626


### Create a bonds dictionary

In [183]:
# create_bonds_dict()
# len(bonds)
# bl_pred_0.apply(get_mp_values,axis=1)
# update_estimates(0)

bond_to_inspect = 'Ar-Ar'
display(bl_pred_dfs[0][bl_pred_dfs[0]['worst_predicted_bond']==bond_to_inspect])
display(bl_pred_dfs[1][bl_pred_dfs[0]['worst_predicted_bond']==bond_to_inspect])
pprint.pprint(bonds[bond_to_inspect])

Unnamed: 0,name,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,actual_volume,num_atoms,_id,score
1771,Ar,Ar-Ar,1.42,3.988628,2.02465,44.869952,1.0,58a783987f079f2fd3b873eb,30.296203
11783,Ar,Ar-Ar,1.42,3.987594,4.065892,90.037547,2.0,58a78cee7f079f2fd3b89b0d,30.39557


Unnamed: 0,name,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,actual_volume,num_atoms,_id,score
1771,Ar,Ar-Ar,1.42,3.988628,2.02465,44.869952,1.0,58a783987f079f2fd3b873eb,30.296203
11783,Ar,Ar-Ar,1.42,3.987594,4.065892,90.037547,2.0,58a78cee7f079f2fd3b89b0d,30.39557


{'estimates': [1.4199999999999999, 2.7040555086763436, 3.3460832630145152],
 'mp_values': [3.988628004705375, 3.98759403]}


In [158]:
df = bl_pred_0[bl_pred_0['worst_predicted_bond']=='I-P']
df['expected_distance'].tolist()

[2.364791737209144,
 2.361670098414481,
 2.3633727125266653,
 2.36458204490361,
 2.3633727125266653,
 2.36458204490361]

In [191]:
# bl_pred_dfs[1].head()
# len(bond_length_predictions[1])
# bl_pred_dfs[0]['score'] == bl_pred_dfs[1]['score']
len(bond_length_predictions)
# bond_length_predictions = bond_length_predictions[0:1]
bl_pred_dfs = bl_pred_dfs[0:1]
scores = scores[0:1]
bonds_updated_this_iter = bonds_updated_this_iter[0:1]

### Fix the bonds baseed on median

In [201]:
print '[name, estimate, median (in MP), difference, freq in MP]'
# bonds_list_to_change_0 = get_worst_bond_predictions(bonds, 0)

[name, estimate, median (in MP), difference, freq in MP]
[[u'Cs-K' u'4.8 ang' u'3.97073516816' u'0.829264831842' u'32']
 [u'K-Ta' u'3.65 ang' u'2.80447186293' u'0.845528137065' u'1']
 [u'K-Mo' u'3.65 ang' u'2.78237719929' u'0.867622800705' u'1']
 [u'Ir-K' u'3.55 ang' u'2.66783199419' u'0.882168005807' u'1']
 [u'Cs-Cs' u'5.2 ang' u'4.29233469875' u'0.907665301252' u'949']
 [u'Cs-Na' u'4.4 ang' u'3.4780854159' u'0.921914584101' u'10']
 [u'K-Zr' u'3.75 ang' u'2.82134257631' u'0.928657423689' u'3']
 [u'Hf-K' u'3.75 ang' u'2.81135453055' u'0.938645469451' u'1']
 [u'F-F' u'1.0 ang' u'2.4894341553' u'1.4894341553' u'12']
 [u'Ar-Ar' u'1.42 ang' u'3.98811101735' u'2.56811101735' u'2']]


In [319]:
e = Element('Sm')
print e.symbol, e.atomic_radius, e.average_ionic_radius
analyze_bond('F-F')

Sm 1.85 ang 1.229 ang
F atomic radius 0.5 ang ionic_radius 0.705 ang
F atomic radius 0.5 ang ionic_radius 0.705 ang


Unnamed: 0,name,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,num_atoms,_id
11424,KrF2,F-F,1.0,2.795223,2.933135,3.0,58a78cb57f079f2fd3b899a6
13895,KrF2,F-F,1.0,2.818688,6.03498,6.0,58a78e227f079f2fd3b8a350
14070,F2,F-F,1.0,1.427373,22.006291,4.0,58a78e3b7f079f2fd3b8a3ff
14572,XeF2,F-F,1.0,3.067668,2.515601,3.0,58a78e8a7f079f2fd3b8a5f5
16495,F2,F-F,1.0,1.426752,26.27155,4.0,58a78fb87f079f2fd3b8ad78
28509,XeF6,F-F,1.0,2.535056,115.301076,112.0,58a797b67f079f2fd3b8dc67
33189,XeF6,F-F,1.0,2.443812,135.696252,112.0,58a79ad27f079f2fd3b8eeb0
48741,XeF3,F-F,1.0,2.877965,14.110571,16.0,58a7a4a77f079f2fd3b92b74
51434,F2,F-F,1.0,1.426354,32.541423,4.0,58a7a63e7f079f2fd3b935f9
52290,F2,F-F,1.0,2.23105,8.0,8.0,58a7a6bb7f079f2fd3b93951


In [212]:
fix_bond_estimates(bonds, bonds_list_to_change_0, 1.0)
# fixed these 10 bonds!

K-Zr changed. Previous estimate: 3.75 ang New estimate: 2.82134257631
K-Ta changed. Previous estimate: 3.65 ang New estimate: 2.80447186293
Ir-K changed. Previous estimate: 3.55 ang New estimate: 2.66783199419
Cs-Cs changed. Previous estimate: 5.2 ang New estimate: 4.29233469875
F-F changed. Previous estimate: 1.0 ang New estimate: 2.4894341553
Ar-Ar changed. Previous estimate: 1.42 ang New estimate: 3.98811101735
K-Mo changed. Previous estimate: 3.65 ang New estimate: 2.78237719929
Hf-K changed. Previous estimate: 3.75 ang New estimate: 2.81135453055
Cs-Na changed. Previous estimate: 4.4 ang New estimate: 3.4780854159
Cs-K changed. Previous estimate: 4.8 ang New estimate: 3.97073516816


In [214]:
bonds['K-Zr']

{'estimates': [3.75, 2.8213425763106366],
 'mp_values': [3.1258710635627232, 2.6362394492076864, 2.8213425763106366]}

In [220]:
for b in bonds_list_to_change_0:
    analyze_bond(b)

{'estimates': [4.800000000000001, 3.9707351681583622], 'mp_values': [4.0060860595722891, 4.0733989799999994, 3.8854681026626752, 4.2174241468943876, 3.4887997033647276, 3.507288149327807, 3.7920796517142858, 3.5133968693683966, 4.2119310923908291, 3.6486326412019006, 3.2259987157600718, 3.8387646345688387, 3.7926367591751529, 3.9914597714269573, 3.9947854646414789, 3.9490350101165119, 3.9930559519758604, 3.9762693531885849, 3.7590593757785005, 3.987167418979126, 3.9375295857046937, 3.913329450928845, 3.9968981228822145, 4.0766969400000077, 4.0767141814463965, 4.1426638431417162, 3.2323603137520758, 3.8620127052675932, 3.9652009831281392, 4.093135434607091, 4.0576823631847043, 4.1604729855405491]}


Unnamed: 0,name,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,num_atoms,_id
416,Cs2KTiF6,Cs-K,4.8,4.006086,340.535045,10.0,58a7826c7f079f2fd3b86ea0
3886,Cs2KAgF6,Cs-K,4.8,4.073399,340.535046,10.0,58a7860c7f079f2fd3b87c2f
4232,Cs2KNF6,Cs-K,4.8,3.885468,340.535045,10.0,58a786c47f079f2fd3b87d89
4834,Cs2KBiF6,Cs-K,4.8,4.217424,340.535078,10.0,58a788b47f079f2fd3b87fe3
4904,Cs2K2TeO5,Cs-K,4.8,3.4888,2216.674767,40.0,58a788c07f079f2fd3b88029
5274,CsK2AuO2,Cs-K,4.8,3.507288,1547.246516,24.0,58a788f87f079f2fd3b8819b
5535,CsK2Sc(PO4)2,Cs-K,4.8,3.79208,435.891541,14.0,58a789227f079f2fd3b882a0
6046,Cs2K2Cd3O5,Cs-K,4.8,3.513397,1322.085341,24.0,58a789707f079f2fd3b884a0
7089,Cs2KPbF6,Cs-K,4.8,4.211931,340.535045,10.0,58a78a117f079f2fd3b888b3
7275,CsK2CoO2,Cs-K,4.8,3.648633,322.949037,6.0,58a78a2c7f079f2fd3b8896d


{'estimates': [3.6500000000000004, 2.8044718629345606], 'mp_values': [2.8044718629345606]}


Unnamed: 0,name,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,num_atoms,_id
36955,KTaPt,K-Ta,3.65,2.804472,149.732713,3.0,58a79d127f079f2fd3b8fd68


{'estimates': [3.6500000000000004, 2.7823771992949213], 'mp_values': [2.7823771992949213]}


Unnamed: 0,name,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,num_atoms,_id
37004,KMoRu2,K-Mo,3.65,2.782377,149.732713,4.0,58a79d197f079f2fd3b8fd9a


{'estimates': [3.5500000000000003, 2.667831994192957], 'mp_values': [2.6678319941929569]}


Unnamed: 0,name,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,num_atoms,_id
36956,KBIr2,Ir-K,3.55,2.667832,137.760008,4.0,58a79d127f079f2fd3b8fd69


{'estimates': [5.2, 4.292334698748215], 'mp_values': [4.3681630233027704, 5.2684252218804755, 4.0306191821977659, 4.1689329454589252, 4.5212977701928549, 4.4907040534736016, 4.3525417060582221, 5.1444081194336464, 5.209288413309447, 4.4574234567414424, 4.10694538, 3.868386689372147, 5.4027922999999998, 4.5230806264491399, 4.3810608200000001, 4.3336577542018508, 4.30828363, 4.8787137500000002, 4.3775288405855681, 4.2084409725714025, 5.0457946436555501, 3.813594042075052, 5.0205493627617432, 4.067269972102971, 4.1063385535298256, 4.1917402403909172, 4.0668137535714157, 3.890137699806119, 4.1497268885946612, 4.029138726162631, 4.2941247583377944, 4.3240096860368631, 4.5024703669534576, 4.3010350857079072, 4.1292052065523253, 4.183623627432608, 4.2919948700026112, 4.1210710500000003, 3.9912306062981138, 3.8622674599945594, 3.9829091063289361, 4.22603872, 4.9843506169709126, 4.8303920517563661, 4.4990470190152179, 4.3163184699999997, 4.8417994223167664, 3.9270312799999996, 4.118330109999999

Unnamed: 0,name,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,num_atoms,_id
10,CsAg3Se2,Cs-Cs,5.2,4.368163,554.044388,12.0,58a781bf7f079f2fd3b86d08
87,CsCrCl3,Cs-Cs,5.2,5.268425,281.124227,10.0,58a781d87f079f2fd3b86d55
99,CsTiF4,Cs-Cs,5.2,4.030619,920.166919,24.0,58a781dd7f079f2fd3b86d61
118,Cs2PtC2(OF)4,Cs-Cs,5.2,4.168933,916.136987,26.0,58a781e27f079f2fd3b86d74
184,CsSi2SbO7,Cs-Cs,5.2,4.521298,3250.651998,132.0,58a782027f079f2fd3b86db6
401,Cs2NaScF6,Cs-Cs,5.2,4.490704,281.216,10.0,58a782677f079f2fd3b86e91
421,Cs2NaGaF6,Cs-Cs,5.2,4.352542,611.217251,20.0,58a7826d7f079f2fd3b86ea5
448,Cs,Cs-Cs,5.2,5.144408,254.315,2.0,58a782777f079f2fd3b86ec0
466,CsAuCl3,Cs-Cs,5.2,5.209288,281.216017,10.0,58a7827e7f079f2fd3b86ed2
477,CsPH3O3F,Cs-Cs,5.2,4.457423,845.375818,36.0,58a782817f079f2fd3b86edd


{'estimates': [4.4, 3.478085415898742], 'mp_values': [3.5941814145540349, 3.8101765393874909, 3.7249919112212879, 3.3619894172434499, 3.2440140995640974, 3.207992153917373, 3.6524354672926775, 3.2440362925885275, 3.8345990681626017, 3.2807020072381063]}


Unnamed: 0,name,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,num_atoms,_id
74,Cs2NaAl(PO4)2,Cs-Na,4.4,3.594181,3697.078095,112.0,58a781d37f079f2fd3b86d48
1849,Cs6Na2Nb6H36O37,Cs-Na,4.4,3.810177,1660.445386,87.0,58a783bc7f079f2fd3b87439
4380,Cs2NaMnF6,Cs-Na,4.4,3.724992,604.342495,20.0,58a788667f079f2fd3b87e1d
7635,CsNaTiO3,Cs-Na,4.4,3.361989,440.009839,12.0,58a78a607f079f2fd3b88ad5
8800,CsNa5(WN3)2,Cs-Na,4.4,3.244014,4816.156782,112.0,58a78b0c7f079f2fd3b88f62
19165,Cs2Na3InO4,Cs-Na,4.4,3.207992,2029.423546,40.0,58a7915e7f079f2fd3b8b7e6
21742,CsNaSi2O5,Cs-Na,4.4,3.652435,2057.578476,72.0,58a793147f079f2fd3b8c1f7
33653,CsNa2BO3,Cs-Na,4.4,3.244036,525.309051,14.0,58a79b197f079f2fd3b8f080
44903,CsNaF3,Cs-Na,4.4,3.834599,144.824926,5.0,58a7a2577f079f2fd3b91c76
49063,CsNa3Li12(GeO4)4,Cs-Na,4.4,3.280702,1016.244134,36.0,58a7a4d77f079f2fd3b92cb6


{'estimates': [3.75, 2.8213425763106366], 'mp_values': [3.1258710635627232, 2.6362394492076864, 2.8213425763106366]}


Unnamed: 0,name,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,num_atoms,_id
36958,KZr2Nb,K-Zr,3.75,3.125871,162.379763,4.0,58a79d137f079f2fd3b8fd6b
38017,K(Zr3I7)4,K-Zr,3.75,2.636239,19665.30524,164.0,58a79db67f079f2fd3b9018f
48153,K(Zr3I7)2,K-Zr,3.75,2.821343,3893.736364,42.0,58a7a44f7f079f2fd3b92928


{'estimates': [3.75, 2.811354530549105], 'mp_values': [2.8113545305491048]}


Unnamed: 0,name,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,num_atoms,_id
37052,KHfNi,Hf-K,3.75,2.811355,162.379763,3.0,58a79d217f079f2fd3b8fdca


{'estimates': [1.0, 2.489434155303493], 'mp_values': [2.7952226578376052, 2.8186876691665743, 1.4273733345844997, 3.0676676257244013, 1.4267515944663114, 2.5350562783550425, 2.4438120322519432, 2.8779649777328848, 1.4263539833775516, 2.231050025, 1.4267712785023376, 2.5507755033773321]}


Unnamed: 0,name,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,num_atoms,_id
11424,KrF2,F-F,1.0,2.795223,2.933135,3.0,58a78cb57f079f2fd3b899a6
13895,KrF2,F-F,1.0,2.818688,6.03498,6.0,58a78e227f079f2fd3b8a350
14070,F2,F-F,1.0,1.427373,22.006291,4.0,58a78e3b7f079f2fd3b8a3ff
14572,XeF2,F-F,1.0,3.067668,2.515601,3.0,58a78e8a7f079f2fd3b8a5f5
16495,F2,F-F,1.0,1.426752,26.27155,4.0,58a78fb87f079f2fd3b8ad78
28509,XeF6,F-F,1.0,2.535056,115.301076,112.0,58a797b67f079f2fd3b8dc67
33189,XeF6,F-F,1.0,2.443812,135.696252,112.0,58a79ad27f079f2fd3b8eeb0
48741,XeF3,F-F,1.0,2.877965,14.110571,16.0,58a7a4a77f079f2fd3b92b74
51434,F2,F-F,1.0,1.426354,32.541423,4.0,58a7a63e7f079f2fd3b935f9
52290,F2,F-F,1.0,2.23105,8.0,8.0,58a7a6bb7f079f2fd3b93951


{'estimates': [1.42, 3.9881110173526872], 'mp_values': [3.988628004705375, 3.9875940299999999]}


Unnamed: 0,name,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,num_atoms,_id
1771,Ar,Ar-Ar,1.42,3.988628,2.02465,1.0,58a783987f079f2fd3b873eb
11783,Ar,Ar-Ar,1.42,3.987594,4.065892,2.0,58a78cee7f079f2fd3b89b0d


## Test bench

In [223]:
len(bond_length_predictions)

1

In [250]:
len(new_bonds_added_this_iter), new_bonds_added_this_iter[0]

(812, u'Ni-O')

In [245]:
for b in new_bonds_added_this_iter:
    del bonds[b]

In [251]:
bonds['Ni-O']

{'estimates': [1.9500000000000002],
 'mp_values': [1.8805266150531745,
  2.0558223685412003,
  2.1098317445106076,
  1.8724391943498615,
  1.8826315845081569,
  1.8143699626794569]}

In [259]:
cols = ['name', 'worst_predicted_bond', 'expected_distance', 'actual_distance', 'predicted_volume',
        'num_atoms', '_id']
bl_pred_2 = pd.DataFrame(bond_length_predictions[0], columns=cols)

In [253]:
display(bl_pred_1[bl_pred_1['worst_predicted_bond']=='Ni-O'])

Unnamed: 0,name,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,num_atoms,_id
0,Ni3O4,Ni-O,1.95,1.880527,80.131705,7.0,58a781ba7f079f2fd3b86cfe
201,NiH44C12N8(ClO5)2,Ni-O,1.95,2.055822,603.257097,77.0,58a782087f079f2fd3b86dc7
1420,NiO,Ni-O,1.95,2.109832,14.829795,2.0,58a7834a7f079f2fd3b8728c
2073,Co2NiO6,Ni-O,1.95,1.872439,108.184064,9.0,58a784037f079f2fd3b87519
2328,Li2MnNi3O8,Ni-O,1.95,1.882632,153.899446,14.0,58a7847f7f079f2fd3b87619
2343,NiO2,Ni-O,1.95,1.81437,56.513197,3.0,58a784847f079f2fd3b87628


In [254]:
display(bl_pred_0[bl_pred_0['name']=='NiO'])

Unnamed: 0,name,worst_predicted_bond,expected_distance,actual_distance,predicted_volume,num_atoms,_id
1420,NiO,Ni-O,1.95,2.109832,14.829795,2.0,58a7834a7f079f2fd3b8728c
38479,NiO,Ni-O,1.95,2.102112,29.66013,4.0,58a79e087f079f2fd3b9035e


In [4]:
import multiprocessing as mp
import threading

In [5]:
# pool = mp.Pool(processes=4)
# results = [pool.apply(FUNCTION, args=(STRUCTURE,)) for x in range(1,7)]
# print(results)

# results = pool.map(cube, range(1,7))
# print(results)

def print_hi(cursor):
    id_endings = []
    for doc in cursor:
        id_endings.append(str(doc['_id'])[-1])
        
    print set(id_endings)

cursors = db.mp_final.parallel_scan(8)
threads = [
    threading.Thread(target=print_hi, args=(cursor,))
    for cursor in cursors]

for thread in threads:
    thread.start()
    
for thread in threads:
    thread.join()


set(['a', 'c', 'b', 'e', 'd', 'f', '1', '0', '3', '2', '5', '4', '7', '6', '9', '8'])


In [1]:
doc

NameError: name 'doc' is not defined

In [31]:
cursor = db.mp_final.find()
cursor.count()

67483

In [38]:
batch_num = 0
batch_size = 10
cursor.limit(95)

<pymongo.cursor.Cursor at 0x11e5e2450>

In [39]:
# pool = mp.Pool(processes=4)
cursor_set = []
while (batch_num*batch_size) < cursor.count():
    c = cursor.clone().skip(batch_num*batch_size).limit(batch_size)
    cursor_set.append(c)
    batch_num += 1
    
# results = [pool.apply(print_obj_id, args=(c,)) for c in cursor_set]
# print 'obj_ids',

In [40]:
def get_obj_ids(curs):
    out = []
    for doc in curs:
        out.append[doc['_id']]
    return out

pool = mp.Pool(processes=4)
results = [pool.apply(get_obj_ids, args=(c,)) for c in cursor_set]
print results

TypeError: can't pickle thread.lock objects

In [18]:
i=0
for doc in cursor:
    i+=1
    print i, doc['_id']

1 58a781ba7f079f2fd3b86cfe
2 58a781bb7f079f2fd3b86cff
3 58a781bc7f079f2fd3b86d00
4 58a781bc7f079f2fd3b86d01
5 58a781bd7f079f2fd3b86d02
6 58a781bd7f079f2fd3b86d03
7 58a781bd7f079f2fd3b86d04
8 58a781be7f079f2fd3b86d05
9 58a781be7f079f2fd3b86d06
10 58a781bf7f079f2fd3b86d07


In [19]:
cursor_clone = cursor.clone()

In [20]:
i=0
for doc in cursor_clone:
    i+=1
    print i, doc['_id']

1 58a781ba7f079f2fd3b86cfe
2 58a781bb7f079f2fd3b86cff
3 58a781bc7f079f2fd3b86d00
4 58a781bc7f079f2fd3b86d01
5 58a781bd7f079f2fd3b86d02
6 58a781bd7f079f2fd3b86d03
7 58a781bd7f079f2fd3b86d04
8 58a781be7f079f2fd3b86d05
9 58a781be7f079f2fd3b86d06
10 58a781bf7f079f2fd3b86d07


In [23]:
cursor.alive

False

In [34]:
Structure.from_dict(doc)

Structure Summary
Lattice
    abc : 5.1165439476774068 10.321597155914025 9.9088226497680711
 angles : 16.258432635698291 29.376595063230173 33.220824340184052
 volume : 71.868569702060682
      A : 2.5056169399999999 0.0 4.4610431200000003
      B : -0.0095372100000000008 2.8897407400000001 9.9088180599999998
      C : -0.0095372100000000008 0.0 9.9088180599999998
PeriodicSite: Ni (0.0000, 0.0000, 0.0000) [0.0000, 0.0000, 0.0000]
PeriodicSite: Ni (1.2480, 0.0000, 7.1849) [0.5000, 0.0000, 0.5000]
PeriodicSite: Ni (-0.0048, 1.4449, 4.9544) [0.0000, 0.5000, 0.0000]
PeriodicSite: O (0.5115, 0.0000, 3.8357) [0.2053, 0.0000, 0.2947]
PeriodicSite: O (0.7933, 1.4449, 8.6276) [0.3194, 0.5000, 0.2269]
PeriodicSite: O (1.9846, 0.0000, 10.5341) [0.7947, 0.0000, 0.7053]
PeriodicSite: O (1.6932, 1.4449, 15.6510) [0.6806, 0.5000, 0.7731]