In [3]:
from urllib.request import urlopen
import json
import os
import time
import pandas as pd
import datetime
from multiprocessing.dummy import Pool as ThreadPool 

# Run the ICSD_entries script first: it creates the per-group entry files in the ENTRIES directory that this notebook reads.

In [4]:
def get_properties(url, max_tries=15, retry_delay=30):
    '''Fetches ``url`` (with "?format=json" appended) and returns the decoded JSON.

    Any failure while opening the URL, reading the body or decoding the
    JSON counts as one failed attempt and triggers a retry.

    Parameters
    ----------
    url : str
        Address to query; "?format=json" is appended to it.
    max_tries : int
        Maximum number of attempts before giving up.
    retry_delay : float
        Seconds to wait between two consecutive attempts.

    Returns
    -------
    The object produced by ``json.loads`` on the response body, or None
    when every attempt failed (or when ``max_tries`` is not positive).
    '''
    url += "?format=json"

    # number of attempts still allowed
    remaining_tries = max_tries

    while remaining_tries > 0:
        try:
            with urlopen(url) as response:
                source = response.read()

            # convert the raw response (source) to Python objects
            return json.loads(source)

        except Exception:
            # Exception instead of a bare except: KeyboardInterrupt and
            # SystemExit must still be able to stop the retry loop.
            remaining_tries = remaining_tries - 1

            # wait (and announce a retry) only if another attempt follows
            if remaining_tries > 0:
                time.sleep(retry_delay)
                print('Trying again.')

    # case after we tried the url max_tries times with no usable response
    return None

In [5]:
def convert_to_df(prop, entry):
    '''Converts the decoded json ``prop`` into a single row of data.

    Parameters
    ----------
    prop : object
        Data handed to ``pd.DataFrame`` (the decoded json of one entry).
    entry : str
        Name of the entry; only used when the conversion fails.

    Returns
    -------
    pd.Series
        The row labelled 0 of ``pd.DataFrame(prop)`` when the conversion
        succeeds.
    pd.DataFrame
        A one-row frame with the single column 'compound' holding
        ``entry`` when the conversion fails.
    '''
    try:
        first_row = pd.DataFrame(prop).loc[0]
    except Exception:
        # if we get any error with the json to DataFrame conversion
        # only a blank line with the compound's name is added.
        # (Exception, not a bare except, so Ctrl-C is not swallowed.)
        return pd.DataFrame({'compound': [entry]})

    # .loc[0] already yields a Series; transposing a Series returns the
    # Series itself, so no transposition is applied here.
    return first_row

In [6]:
def save_to_csv(df, group):
    '''Saves the group's DataFrame df in PROPERTIES directory
    in csv format.

    Parameters
    ----------
    df : pd.DataFrame
        Data to write.
    group : str
        Group name; the file is written as ./PROPERTIES/{group}_prop.csv.
    '''
    folder = './PROPERTIES'

    # Creates PROPERTIES if needed. exist_ok=True (instead of checking
    # os.path.exists first) avoids a FileExistsError when several threads
    # reach this point at the same time.
    os.makedirs(folder, exist_ok=True)

    # Saving
    df.to_csv(f'{folder}/{group}_prop.csv', sep=',')

In [7]:
def save_group_prop(group):
    '''Gets properties data for the group and saves it as group_prop.csv.

    Reads the entries of ``group`` from ./ENTRIES/{group}.csv (first line
    skipped, blank lines ignored), requests the properties of each entry
    with get_properties, converts them with convert_to_df and hands the
    combined table to save_to_csv.

    Parameters
    ----------
    group : str
        Name of the group; used for the entries file, the url and the
        output file.

    Returns
    -------
    tuple
        ``(group, missing_entries)`` where ``missing_entries`` is a list of
        ``(index, entry)`` pairs for every entry whose properties could not
        be obtained. ``index`` is the position of the entry in the list of
        lines that follow the first line of the entries file.
    '''
    # getting the entries from the group files
    # these files were created with ICSD_entries.py
    with open(f'./ENTRIES/{group}.csv') as f:
        content = f.read()

    # each line --> one element; the first line/element is dropped
    entries = content.split('\n')[1:]

    # One-row frames collected per entry and concatenated once at the end
    row_frames = []
    n_rows = 0
    # dict used as an ordered set of every column name seen so far
    seen_columns = {}

    # Must live outside the loop, otherwise it is emptied at every entry
    missing_entries = []

    # Printing just for logging
    print(f"{group} >> Getting {group} properties from ICSD_WEB.\n")
    t_i = time.time()

    # Iterating through the list of entries
    for index, raw_entry in enumerate(entries):

        # ignore blank lines (e.g. after the final newline) and stray '\r'
        entry = raw_entry.strip()
        if not entry:
            continue

        # adding entry to the url
        url = f'http://aflowlib.duke.edu/AFLOWDATA/ICSD_WEB/{group}/{entry}'

        properties = get_properties(url)

        # no properties: keep the index and the entry
        # for future verification
        if not properties:
            missing_entries.append((index, entry))
            print(f'{group} >> MISSED ENTRY: {index}, {entry}')
            continue

        # Converting the json to a pandas structure
        prop_df = convert_to_df(properties, entry)

        # convert_to_df may return a Series (one row of values); turn it
        # into a one-row DataFrame so that every collected piece is a frame
        if isinstance(prop_df, pd.Series):
            prop_df = prop_df.to_frame().T.infer_objects()

        row_frames.append(prop_df)
        n_rows += len(prop_df)
        seen_columns.update(dict.fromkeys(prop_df.columns))

        pid = os.getpid()
        st = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        size = (n_rows, len(seen_columns))
        print(f'Process {pid} > {st} - {group}/{entry}\n{group} > SIZE: {size}')

    # sort=False keeps the columns in order of first appearance
    if row_frames:
        group_df = pd.concat(row_frames, sort=False)
    else:
        group_df = pd.DataFrame()

    save_to_csv(group_df, group)

    # Logging
    delta = round((time.time() - t_i)/60, 2)
    print(f"{group} >> Done. It took {delta} minutes.\n")

    # Returning tuple info about missing entries
    return (group, missing_entries)

In [None]:
if __name__ == '__main__':

    # Base url of ICSD_WEB; not referenced again in this cell
    server = "http://aflowlib.duke.edu/AFLOWDATA/ICSD_WEB"
    group_list = "BCC BCT CUB FCC HEX MCL MCLC ORC ORCC ORCF ORCI RHL TET TRI".split()

    print('>>> GETTING ALL DATA FROM ICSD_WEB <<<\n\n')

    t_i = time.time()

    # Parallelization stuff to get things faster: one thread per group
    # you should test for the optimal number of threads
    # The with-block releases the pool's worker threads once map returns.
    with ThreadPool(len(group_list)) as pool:
        # This line starts everything
        # pool.map({function you want to apply}, {list of objects used as args})
        # returns a list of outputs corresponding to the list of inputs
        missed_entries = pool.map(save_group_prop, group_list)

    delta = round((time.time() - t_i)/60, 2)
    print(f'\n\n >>> FINISHED. IT TOOK {delta} minutes. <<<\n')
    print(missed_entries)