# Read a file-list and generate shell script commands

In [1]:
# basic packages
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree
#import databricks.koalas as ks

# koalas' default `sequence` index type can pull all partitions onto a single node, which is catastrophic for large data
#ks.set_option('compute.default_index_type', 'distributed') 

# plot settings: serif text in Times New Roman at 16 pt, applied in one update
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Times New Roman',
    'font.size': 16,
})

## Read data 

In [2]:
!pwd

/home/shong/work/gaia/notebook


In [3]:
!ls /home/shong/work/gaia/data/

GaiaSource.header  gaia-csv.list  show3cols.html


In [2]:
# Load the list of input file names, one entry per line of the list file.
with open('./L2.datalist') as f:
    # Iterating the handle yields one line at a time; strip() drops the
    # trailing newline and any surrounding whitespace from each entry.
    content = [line.strip() for line in f]

In [3]:
len(content)

602

In [4]:
content[:6]

['L2dx12_v3_smica_cmb_005a_2048_cold.csv',
 'L2dx12_v3_smica_cmb_005a_2048_hot.csv',
 'L2dx12_v3_smica_cmb_005a_2048_sim0000_cold.csv',
 'L2dx12_v3_smica_cmb_005a_2048_sim0000_hot.csv',
 'L2dx12_v3_smica_cmb_005a_2048_sim0001_cold.csv',
 'L2dx12_v3_smica_cmb_005a_2048_sim0001_hot.csv']

In [5]:
content[-1]

'L2dx12_v3_smica_cmb_005a_2048_sim0299_hot.csv'

In [6]:
def outname(infilename, headstr='', tmodestr='', tailstr='.pkl'):
    """Build an output file name of the form ``<headstr>.<code><tailstr>``.

    The code is taken from the input file name: the text that follows the
    first ``'sim'`` up to the next underscore (e.g. ``'0049'`` for
    ``'..._sim0049_cold.csv'``).  File names that do not contain ``'sim'``
    get the code ``'-1'``.

    Parameters
    ----------
    infilename : str
        Input file name to extract the code from.
    headstr : str
        Prefix placed before the ``'.'`` separator.
    tmodestr : str
        Accepted for call compatibility; not used when building the name.
    tailstr : str
        Suffix (typically the extension) appended after the code.

    Returns
    -------
    str
        The assembled output file name.
    """
    if 'sim' in infilename:
        # text between the first 'sim' and the following underscore
        mcode = infilename.split('sim')[1].split('_')[0]
    else:
        mcode = '-1'

    return headstr + '.' + mcode + tailstr

In [7]:
outname(content[0],headstr='L2')

'L2.-1.pkl'

In [8]:
outname(content[100],headstr='L2',tmodestr='',tailstr='.pkl')

'L2.0049.pkl'

In [9]:
content[2].split('sim')[1].split('_')[0]

'0000'

## Generate the script commands

> The generated commands need full (absolute) file paths for their inputs and outputs, for example the HDFS path `hdfs://master:54310/data/cosmo/cmb/`.

In [10]:
!pwd

/home/shong/work/bigdata/cmb/inspect471/scripts


In [11]:
# Directory prefix that is prepended to every output file name in the generated commands.
# NOTE(review): absolute, user-specific path -- adjust before running on another machine.
outpath = '/home/shong/work/bigdata/cmb/inspect471/scripts/results/nocap/'

In [12]:
# Total number of file names read from the list (each cold and each hot file is its own entry).
lencontent = len(content)

In [13]:
print(lencontent)

602


### For L2 (nocap)

In [17]:
# Build one spark-submit command line per (cold, hot) pair of input files.
# `content` lists the files as consecutive pairs: the cold file at an even
# index, its hot counterpart right after it.
if len(content) % 2 != 0:
    # An unpaired trailing entry would otherwise surface as an opaque
    # IndexError on content[idx+1] after part of the work is already done.
    raise ValueError('expected an even number of entries (cold/hot pairs) in the file list, got '
                     + str(len(content)))

# Pieces shared by every command line (hoisted out of the loop).
hdfspath = 'hdfs://master:54310/data/cosmo/cmb/L2/'
header = 'spark-submit --master spark://master:7077 --driver-memory 20g --executor-memory 58g --conf spark.driver.maxResultSize=8g --packages graphframes:graphframes:0.7.0-spark2.4-s_2.11 cmb-inspect471-v2-nocap.py '

commands = []
for idx in range(0, len(content), 2):
    coldfilename = content[idx]
    hotfilename = content[idx+1]
    # Arguments after the script name: cold input, hot input, output pickle path.
    # The output name is derived from the cold file name.
    body = hdfspath+coldfilename+' '+hdfspath+hotfilename+' '+outpath+outname(coldfilename,headstr='L2nocap',tmodestr='',tailstr='.pkl')
    commands.append(header+body+'\n')

# Join once at the end instead of growing the string inside the loop.
finalstr = ''.join(commands)


In [18]:
# Save every generated command line into a single shell script
# (created in the current working directory, overwriting any existing file).
script_name = "run-inspect-new-smica-nocap.sh"
with open(script_name, "w") as script_file:
    script_file.write(finalstr)

In [19]:
print(finalstr[:3000])

spark-submit --master spark://master:7077 --driver-memory 20g --executor-memory 58g --conf spark.driver.maxResultSize=8g --packages graphframes:graphframes:0.7.0-spark2.4-s_2.11 cmb-inspect471-v2-nocap.py hdfs://master:54310/data/cosmo/cmb/L2/L2dx12_v3_smica_cmb_005a_2048_cold.csv hdfs://master:54310/data/cosmo/cmb/L2/L2dx12_v3_smica_cmb_005a_2048_hot.csv /home/shong/work/bigdata/cmb/inspect471/scripts/results/nocap/L2nocap.-1.pkl
spark-submit --master spark://master:7077 --driver-memory 20g --executor-memory 58g --conf spark.driver.maxResultSize=8g --packages graphframes:graphframes:0.7.0-spark2.4-s_2.11 cmb-inspect471-v2-nocap.py hdfs://master:54310/data/cosmo/cmb/L2/L2dx12_v3_smica_cmb_005a_2048_sim0000_cold.csv hdfs://master:54310/data/cosmo/cmb/L2/L2dx12_v3_smica_cmb_005a_2048_sim0000_hot.csv /home/shong/work/bigdata/cmb/inspect471/scripts/results/nocap/L2nocap.0000.pkl
spark-submit --master spark://master:7077 --driver-memory 20g --executor-memory 58g --conf spark.driver.maxResul