# Clean DFT Jobs
---

Run on the computer cluster to perform a variety of job cleanup and processing tasks

Currently the following things are done:

1. Process large `job.out` files: if `job.out` is larger than `job_out_size_limit`, a new `job.out.new` file is created that drops the middle section of the file and keeps only the beginning and end of the original file
1. Copy the job directories to the Stanford Google Drive with rclone

## TODO
* Remove large files if they are newer revisions (the only time you need large VASP files is when starting a new job, which needs the WAVECAR or charge files)

# Import Modules

In [1]:
import os
print(os.getcwd())
import sys
import time; ti = time.time()

import copy
import shutil
from pathlib import Path
import subprocess
import pickle

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm


# #########################################################
from IPython.display import display

# #########################################################
from methods import (
    get_df_jobs,
    get_df_jobs_paths,
    get_df_jobs_anal,
    )
from methods import (
    get_other_job_ids_in_set,
    )

# #########################################################
from local_methods import (
    cwd, process_large_job_out,
    rclone_sync_job_dir,
    parse_job_state,
    local_dir_matches_remote,
    )

/mnt/f/Dropbox/01_norskov/00_git_repos/PROJ_IrOx_OER/dft_workflow/job_processing


In [2]:
from methods import isnotebook

# Choose the tqdm front end that matches how this file is being run and
# derive the default verbosity from the same check: verbose when
# isnotebook() is truthy, quiet otherwise.
isnotebook_i = isnotebook()
verbose = bool(isnotebook_i)
if verbose:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

# Script Inputs

In [3]:
# verbose = False

# Size threshold for `job.out`, in MB. It is handed to
# process_large_job_out as `job_out_size_limit` and compared against the
# on-disk size of `job.out` (bytes / 1000 / 1000) in the WSL cleanup loop.
job_out_size_limit = 5  # MB

In [4]:
# Compute-environment name and project root directory, both taken from the
# shell environment; each one is None when its variable is not set.
compenv, proj_dir = (
    os.environ.get(var_name) for var_name in ("COMPENV", "PROJ_irox_oer")
    )

# Read Data

In [5]:
# #########################################################
# Job table. On a cluster only the rows whose `compenv` equals this
# machine's compute environment are processed; under WSL every row is kept.
df_jobs = get_df_jobs(exclude_wsl_paths=False)

if compenv != "wsl":
    df_i = df_jobs[df_jobs.compenv == compenv]
else:
    df_i = df_jobs

# #########################################################
# Path table, restricted to the rows of this compute environment.
# NOTE(review): unlike df_i, this filter is not relaxed under WSL, so the
# directory listing printed below only shows rows whose compenv is
# literally "wsl" -- confirm that this is intended.
df_jobs_paths = get_df_jobs_paths()
df_jobs_paths_i = df_jobs_paths[df_jobs_paths.compenv == compenv]

# #########################################################
df_jobs_anal = get_df_jobs_anal()

if verbose:
    print(60 * "-")
    print("Directories being parsed")
    # A plain loop: printing is a side effect, so there is no list to build.
    for path_rel_j in df_jobs_paths_i.path_rel_to_proj.tolist():
        print(path_rel_j)
    print("")

------------------------------------------------------------
Directories being parsed



# Iterate through rows

In [6]:
# df_i.job_type == ""

In [7]:
def _strip_compenv_dir(path_rel, compenv_names):
    """Drop the compute-environment directory from a project-relative path.

    Args:
        path_rel: "/"-separated path relative to the project root.
        compenv_names: Iterable of compute-environment names to look for.

    Returns:
        `path_rel` unchanged when none of the names occurs in it as a
        substring. Otherwise `path_rel` with every path component equal to
        the matching name removed. When several names match, the last one
        in iteration order is used.
    """
    matched = None
    for name in compenv_names:
        if name in path_rel:
            matched = name
    if matched is None:
        return path_rel
    return "/".join(part for part in path_rel.split("/") if part != matched)


# Cluster-side pass (skipped entirely under WSL): for every job directory
# that exists on disk, shrink an oversized job.out unless the job is still
# running, then call rclone_sync_job_dir on it.
# The analysis table (df_jobs_anal) is not consulted here; completion
# status only matters for the deletion pass further down.
if compenv != "wsl":
    from proj_data import compenvs

    # On this code path df_i only holds rows whose compenv equals `compenv`,
    # so the path table is filtered once instead of once per job.
    df_jobs_paths_i = df_jobs_paths[df_jobs_paths.compenv == compenv]

    iterator = tqdm(df_i.index.tolist(), desc="1st loop")
    for index_i in iterator:
        # #####################################################
        row_jobs_paths_i = df_jobs_paths_i.loc[index_i]
        # #####################################################
        path_job_root_w_att_rev = row_jobs_paths_i.path_job_root_w_att_rev
        path_rel_to_proj = _strip_compenv_dir(
            row_jobs_paths_i.path_rel_to_proj, compenvs)
        # #####################################################

        path_i = os.path.join(
            os.environ["PROJ_irox_oer"],
            path_rel_to_proj)

        # Jobs whose directory is not present locally are skipped.
        if not Path(path_i).is_dir():
            continue

        job_state_dict = parse_job_state(path_i)
        job_state_i = job_state_dict["job_state"]

        if verbose:
            print("job_state_i:", job_state_i)

        # #########################################
        # job.out is only rewritten for jobs that are not running; the
        # sync below is performed regardless of the job state.
        if job_state_i != "RUNNING":
            process_large_job_out(
                path_i, job_out_size_limit=job_out_size_limit)

        # #########################################
        rclone_sync_job_dir(
            path_job_root_w_att_rev=path_job_root_w_att_rev,
            path_rel_to_proj=path_rel_to_proj,
            verbose=False,
            )

# Remove leftover large job.out files
For some reason, some are left over

In [8]:
# WSL-side pass over the Google Drive copy of every job directory:
#   * job.out.short present -> the full job.out, if any, is deleted
#   * job.out.short missing -> a job.out above the size limit is handed to
#     process_large_job_out
if compenv == "wsl":
    for index_i in tqdm(df_i.index.tolist(), desc="1st loop"):
        # #####################################################
        compenv_i = df_i.loc[index_i].compenv
        paths_for_compenv = df_jobs_paths[df_jobs_paths.compenv == compenv_i]
        gdrive_path_i = paths_for_compenv.loc[index_i].gdrive_path
        # #####################################################

        path_i = os.path.join(
            os.environ["PROJ_irox_oer_gdrive"],
            gdrive_path_i)
        if not Path(path_i).is_dir():
            continue

        path_job = os.path.join(path_i, "job.out")
        path_job_short = os.path.join(path_i, "job.out.short")
        has_job_out = Path(path_job).is_file()

        if Path(path_job_short).is_file():
            # #############################################
            # The shortened copy exists, so the original is redundant
            if has_job_out:
                print("Removing job.out", path_i)
                os.remove(path_job)

        elif has_job_out:
            # #############################################
            # No shortened copy yet: create one if job.out is too big
            file_size_mb = os.path.getsize(path_job) / 1000 / 1000

            if file_size_mb > job_out_size_limit:
                print("Large job.out, but no job.out.short", path_i)
                process_large_job_out(
                    path_i, job_out_size_limit=job_out_size_limit)

HBox(children=(FloatProgress(value=0.0, description='1st loop', max=5633.0, style=ProgressStyle(description_wi…




In [9]:
# print(
#     10 * "NOT REMOVING JOBS AFTER RCLONE SYNC | TESTING DOS CALCULATIONS FIRST \n",
#     sep="")

In [10]:
# assert False

# Remove systems that are completely done

In [11]:
# Eye-catching separator in the log before the deletion pass starts.
if verbose:
    banner_rule = 80 * "*"
    banner_text = "Removing job folders/data that are no longer needed"

    print(5 * "\n")
    for _ in range(4):
        print(banner_rule)
    for _ in range(6):
        print(banner_text)
    print(2 * "\n")







********************************************************************************
********************************************************************************
********************************************************************************
********************************************************************************
Removing job folders/data that are no longer needed
Removing job folders/data that are no longer needed
Removing job folders/data that are no longer needed
Removing job folders/data that are no longer needed
Removing job folders/data that are no longer needed
Removing job folders/data that are no longer needed





In [12]:
# Deletion pass: remove local job directories that are no longer needed.
#
# A job is a deletion candidate when its row in df_jobs_anal is flagged
# `job_completely_done`, or when it is an older revision of its job set
# (rev_num < num_revs - 1). Jobs without a row in df_jobs_anal are skipped.
#
# Deletion is two-staged and never happens under WSL:
#   1. The first time a candidate directory is seen, an empty `.dft_clean`
#      marker file is written into it and nothing is removed.
#   2. On a later run, when the marker is already present, the directory is
#      removed only if local_dir_matches_remote() returns a truthy value.

# One filtered view of the path table per compenv, built on first use
# instead of re-filtering the whole table for every job.
paths_by_compenv = {}

iterator = tqdm(df_i.index.tolist(), desc="1st loop")
for job_id_i in iterator:
    # #####################################################
    row_i = df_i.loc[job_id_i]
    # #####################################################
    job_type_i = row_i.job_type
    compenv_i = row_i.compenv
    slab_id_i = row_i.slab_id
    ads_i = row_i.ads
    att_num_i = row_i.att_num
    active_site_i = row_i.active_site
    # #####################################################

    # A missing active site is replaced by the string "NaN" before the index
    # lookup (presumably how df_jobs_anal's index encodes it -- TODO confirm).
    # pd.isna is used because it also accepts non-float values.
    if active_site_i != "NaN" and pd.isna(active_site_i):
        active_site_i = "NaN"

    # #####################################################
    if compenv_i not in paths_by_compenv:
        paths_by_compenv[compenv_i] = df_jobs_paths[
            df_jobs_paths.compenv == compenv_i]
    df_jobs_paths_i = paths_by_compenv[compenv_i]
    row_jobs_paths_i = df_jobs_paths_i.loc[job_id_i]
    # #####################################################
    path_rel_to_proj = row_jobs_paths_i.path_rel_to_proj
    gdrive_path_i = row_jobs_paths_i.gdrive_path
    # #####################################################

    # #####################################################
    name_new_i = (job_type_i, compenv_i, slab_id_i, ads_i, active_site_i, att_num_i)
    in_index = df_jobs_anal.index.isin([name_new_i]).any()
    if not in_index:
        continue

    row_anal_i = df_jobs_anal.loc[name_new_i]
    # #####################################################
    job_completely_done_i = row_anal_i.job_completely_done
    # #####################################################

    # NOTE(review): unlike the sync pass, the compenv directory component is
    # not stripped from path_rel_to_proj here -- confirm this is intended.
    path_i = os.path.join(os.environ["PROJ_irox_oer"], path_rel_to_proj)

    # #####################################################
    # Decide whether this job's directory may be deleted
    if job_completely_done_i:
        delete_job = True
    else:
        df_job_set_i = get_other_job_ids_in_set(job_id_i, df_jobs=df_jobs)

        num_revs_list = df_job_set_i.num_revs.unique()
        assert len(num_revs_list) == 1, (
            "Expected a single num_revs value for the job set of "
            + str(job_id_i) + ", got " + str(list(num_revs_list)))
        num_revs = num_revs_list[0]

        # Every revision except the last two of an unfinished job set
        df_jobs_to_delete = df_job_set_i[df_job_set_i.rev_num < num_revs - 1]
        delete_job = job_id_i in df_jobs_to_delete.index

    if not delete_job:
        continue

    # #####################################################
    # Nothing is written or removed under WSL, and there is nothing to do
    # when the directory is not present locally.
    if compenv == "wsl" or not Path(path_i).is_dir():
        continue

    # #####################################################
    # Stage 1: mark the directory and leave it alone for this run
    dft_clean_file_path = os.path.join(path_i, ".dft_clean")
    if not Path(dft_clean_file_path).is_file():
        with open(dft_clean_file_path, "w") as file:
            file.write("")
        continue

    # #####################################################
    # Stage 2: marker was already there, remove the directory if the remote
    # copy matches
    local_dir_matches_remote_i = local_dir_matches_remote(
        path_i=path_i,
        gdrive_path_i=gdrive_path_i,
        )
    print(40 * "*")
    print(path_i)
    if local_dir_matches_remote_i:
        print("Removing:")
        shutil.rmtree(path_i)
    else:
        print("Gdrive doesn't match local")
    print("")

HBox(children=(FloatProgress(value=0.0, description='1st loop', max=5633.0, style=ProgressStyle(description_wi…




In [13]:
# #########################################################
# Closing summary: wall-clock time since `ti` was taken, in minutes.
footer_rule = 20 * "# # "

print(footer_rule)
print("All done!")
elapsed_min = np.round((time.time() - ti) / 60, 3)
print("Run time:", elapsed_min, "min")
print("clean_dft_dirs.ipynb")
print(footer_rule)
# #########################################################

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 
All done!
Run time: 1.126 min
clean_dft_dirs.ipynb
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 


In [14]:
# df_jobs.job_type.unique()

In [15]:
# df_ind = df_jobs_anal.index.to_frame()
# df_jobs_anal = df_jobs_anal.loc[
#     df_ind[df_ind.job_type == "oer_adsorbate"].index
#     ]
# df_jobs_anal = df_jobs_anal.droplevel(level=0)


# df_ind = df_atoms_sorted_ind.index.to_frame()
# df_atoms_sorted_ind = df_atoms_sorted_ind.loc[
#     df_ind[df_ind.job_type == "oer_adsorbate"].index
#     ]
# df_atoms_sorted_ind = df_atoms_sorted_ind.droplevel(level=0)

In [16]:
# print("COMBAK I'M STOPPING ALL SCRIPTS UNTIL DOS_BADER WF GET'S CORRECTED")
# assert False