Import common packages

In [1]:
import sys
import os
import datetime
import errno

Import findspark and import&init spark

In [2]:
import findspark


os.environ["SPARK_HOME"] = "/home/ser/Dev/Spark/spark-2.1.0-bin-hadoop2.7"
os.environ["PYSPARK_PYTHON"] = "/home/ser/Dev/Python/2/anaconda2/bin/python"
#os.environ['PYSPARK_SUBMIT_ARGS'] = "--master local[4] pyspark-shell"
os.environ['PYSPARK_SUBMIT_ARGS'] = "--master yarn \
                                     --deploy-mode cluster \
                                     --conf \"spark.yarn.am.port\"=8025\
                                     pyspark-shell"
print os.environ['PYSPARK_SUBMIT_ARGS']
os.environ['HADOOP_CONF_DIR'] = "/home/ser/Dev/Hadoop/hadoop-2.7.3/etc/hadoop"
findspark.init()

#import numpy as np
#import sep
#from operator import add

import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row

--master yarn                                      --deploy-mode cluster                                      --conf "spark.yarn.am.port"=8025                                     pyspark-shell


Import task-oriented packages

In [3]:
import astromatic_wrapper as aw
from astropy.io import fits
#import pydoop

Constants (in future go to config file)

In [4]:
exec_mode = 'test'
wrk_path = '/home/ser/Dev/Notebooks/spark_pipeline_1'

In [5]:
class AstroConfigurer:
    
    def __init__(self, check_dirs_existance=False, **kwargs):
        
        # declare class attributes and first init
        self._exec_mode = ''
        self._wrk_path = None
        self._cur_path = None
        #self._cur_timestamp = None
        self._logs_timestamp = None
        self._attr_prefix = '_'
        self._dirname_delim = '_'
        self._paths = None
        self._log_paths = []
        self._check_dirs_existance = check_dirs_existance
        
        # check not None kwargs
        for kwarg in kwargs:
            if kwargs[kwarg] is not None:
                # check if str attrs are actually str typed
                if kwarg.endswith('_mode') \
                or kwarg.endswith('_path') \
                or kwarg.endswith('_timestamp') \
                or kwarg.endswith('_prefix') \
                or kwarg.endswith('_delim'):
                    if not isinstance(kwargs[kwarg], str):
                        raise TypeError(kwarg + ' argument has str type, but '
                                       + type(kwargs[kwarg]).__name__ + ' typed value specified')
                # check if path attrs are valid pathnames
                if kwarg.endswith('_path') and not self._is_pathname_valid(kwargs[kwarg]):
                    raise ValueError('value ' + kwargs[kwarg] + ' for argument ' + kwarg
                                         + ' is not a valid pathname')
        
        # init class attrs if it set directly
        for kwarg in kwargs:
            if hasattr(self, self._attr_prefix + kwarg) and kwargs[kwarg] is not None:
                setattr(self, self._attr_prefix + kwarg, kwargs[kwarg])
                
        # init strictly valued attrs
        self._cur_path = os.getcwd()
        #self._update_cur_timestamp()
        self._update_logs_timestamp()
        
        # build attributes
        if not self._wrk_path:
            self._wrk_path = self._cur_path
        else:
            self._wrk_path = self._build_path(self._cur_path, self._wrk_path)
            
        # set paths to work dirs
        self._paths = {
            'temp': os.path.join(self._wrk_path, 'temp'),
            'logs': os.path.join(self._wrk_path, 'logs'),
            'config': os.path.join(self._wrk_path, 'config'),
            'catalogs': os.path.join(self._wrk_path, 'catalogs'),
            'stacks': os.path.join(self._wrk_path, 'stacks'),
            'images': os.path.join(self._wrk_path, 'images')
                }
        
        # set paths kwargs if specified
        for kwarg in kwargs:
            paths_dir = kwarg.rstrip('_path')
            if paths_dir in self._paths:
                self._paths[paths_dir] = self._build_path(self._wrk_path, kwargs[kwarg])
                
        # create dirs tree for application according to paths
        for path in self._paths:
            self._create_dir(self._paths[path], False, self._check_dirs_existance)

        # check if paths are ok
        for val in self._paths.values():
            print val
        
    def _update_logs_timestamp(self):
        self._logs_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    
    def _build_dir_name(self, *args, **kwargs):
        delim = '_'
        if 'delim' in kwargs:
            delim = kwargs['delim']
        res = ''
        for arg in args:
            if arg:
                res += arg + delim
        return res.rstrip(delim)
    
    def _is_pathname_valid(self, pathname):
        try:
            if not isinstance(pathname, str) or not pathname:
                return False

            _, pathname = os.path.splitdrive(pathname)
            root_dirname = os.environ.get('HOMEDRIVE', 'C:') if sys.platform == 'win32' else os.path.sep
            assert os.path.isdir(root_dirname)
            root_dirname = root_dirname.rstrip(os.path.sep) + os.path.sep

            for pathname_part in pathname.split(os.path.sep):
                try:
                    os.lstat(root_dirname + pathname_part)
                except OSError as os_err:
                    if hasattr(os_err, 'winerror'):
                        if os_err.winerror == ERROR_INVALID_NAME:
                            return False
                    elif os_err.errno in {errno.ENAMETOOLONG, errno.ERANGE}:
                        return False
        except TypeError:
            return False
        else:
            return True
        
    def _is_dir_writeable(self, pathname, base=None):
        dirname = os.path.dirname(pathname) or (base if base else self._wrk_path)
        return os.access(dirname, os.W_OK)
    
    def _is_dir_exists(self, pathname, check_pathname=True):
        try:
            return (True if not check_pathname else self._is_pathname_valid(pathname)) \
                    and os.path.isdir(pathname)
        except OSError:
            return False
    
    def _create_dir(self, pathname, check_pathname=True, check_exist=False):
        try:
            if check_pathname:
                if not self._is_pathname_valid(pathname):
                    raise ValueError('cannot create dir: pathname is not valid')
            os.makedirs(pathname)
        except OSError as os_err:
            if check_exist and os_err.errno == errno.EEXIST or os_err.errno != errno.EEXIST:
                raise

    def _build_path(self, base, pathname):
        return pathname if os.path.isabs(pathname) else os.path.join(base, pathname)
    
    def get_exec_mode(self):
        return self._exec_mode
    
    def set_exec_mode(self, new_exec_mode):
        if isinstance(new_exec_mode, str):
            self._exec_mode = new_exec_mode
        else:
            raise ValueError('Execution mode must have str type, not ' + type(new_exec_mode).__name__)
        return self._exec_mode
    
    def get_paths(self):
        return self._paths
    
    def get_paths_dir_names(self):
        return [dir_name for dir_name in self._paths]
    
    def new_log_dir(self, check_exist=True):
        self._update_logs_timestamp()
        self._log_paths.append(os.path.join(self._paths['logs'], 
                                            self._build_dir_name(self._exec_mode, 'log', 
                                                                 self._logs_timestamp, 
                                                                 delim=self._dirname_delim)))
        self._create_dir(self._log_paths[-1], False, check_exist)
        return self._log_paths[-1]
    
    def get_last_log_dir(self):
        return self._log_paths[-1]

In [6]:
cfg = AstroConfigurer(False, wrk_path=wrk_path)
cfg.new_log_dir()
print cfg.get_last_log_dir()
print cfg.get_paths_dir_names()

/home/ser/Dev/Notebooks/spark_pipeline_1/catalogs
/home/ser/Dev/Notebooks/spark_pipeline_1/logs
/home/ser/Dev/Notebooks/spark_pipeline_1/temp
/home/ser/Dev/Notebooks/spark_pipeline_1/images
/home/ser/Dev/Notebooks/spark_pipeline_1/config
/home/ser/Dev/Notebooks/spark_pipeline_1/stacks
/home/ser/Dev/Notebooks/spark_pipeline_1/logs/log_2017-04-05_06-20-54
['catalogs', 'logs', 'temp', 'images', 'config', 'stacks']


Spark code separately yet

In [7]:
def extract_1(input_item):
    file_name = input_item[0].lstrip('file:')
    catalog_name = os.path.join(cfg.get_paths()['catalogs'], os.path.basename(file_name.replace('.fit', '.cat')))
    sex_kwargs_1 = {'code': 'SExtractor'}
    sex_kwargs_1['config_file'] = os.path.join(cfg.get_paths()['config'], 'default.sex')
    sex_kwargs_1['config'] = {'CATALOG_NAME': catalog_name}
    sex_kwargs_1['config']['CATALOG_TYPE'] = 'FITS_LDAC'
    sex_kwargs_1['config']['FILTER'] = 'N'
    sex_kwargs_1['temp_path'] = cfg.get_paths()['temp']
    sex_kwargs_1['params'] = ['NUMBER', 'EXT_NUMBER', 'XWIN_IMAGE', 'YWIN_IMAGE', 'AWIN_IMAGE', 'BWIN_IMAGE',
                              'ERRAWIN_IMAGE','ERRBWIN_IMAGE', 'ERRTHETAWIN_IMAGE', 'ERRA_WORLD', 'ERRB_WORLD', 
                              'ERRTHETA_WORLD', 'X_WORLD', 'Y_WORLD', 'XWIN_WORLD', 'YWIN_WORLD', 
                              'FLUX_AUTO', 'FLUX_MAX', 'MAG_AUTO', 'FLUXERR_AUTO', 
                              'FLAGS', 'FLUX_RADIUS', 'ELONGATION']
    sextractor = aw.api.Astromatic(**sex_kwargs_1)
    sextractor.run(file_name)
    #with open(catalog_name, "r") as catalog:
        #output_item = ('file:' + catalog_name, catalog)
    return catalog_name

In [8]:
def extract_2(input_item):
    file_name = input_item[0]
    return file_name

In [7]:
def extract_3(input_item):
    file_name = os.path.basename(input_item[0])
    tmp_file_name = os.path.join(cfg.get_paths()['temp'], file_name)
    catalog_name = os.path.join(cfg.get_paths()['catalogs'], file_name.replace('.fit', '.cat'))
    with open(tmp_file_name, "w+") as fits_file:
        fits_file.write(input_item[1])
    sex_kwargs_1 = {'code': 'SExtractor'}
    sex_kwargs_1['config_file'] = os.path.join(cfg.get_paths()['config'], 'default.sex')
    sex_kwargs_1['config'] = {'CATALOG_NAME': catalog_name}
    sex_kwargs_1['config']['CATALOG_TYPE'] = 'FITS_LDAC'
    sex_kwargs_1['config']['FILTER'] = 'N'
    sex_kwargs_1['temp_path'] = cfg.get_paths()['temp']
    sex_kwargs_1['params'] = ['NUMBER', 'EXT_NUMBER', 'XWIN_IMAGE', 'YWIN_IMAGE', 'AWIN_IMAGE', 'BWIN_IMAGE',
                              'ERRAWIN_IMAGE','ERRBWIN_IMAGE', 'ERRTHETAWIN_IMAGE', 'ERRA_WORLD', 'ERRB_WORLD', 
                              'ERRTHETA_WORLD', 'X_WORLD', 'Y_WORLD', 'XWIN_WORLD', 'YWIN_WORLD', 
                              'FLUX_AUTO', 'FLUX_MAX', 'MAG_AUTO', 'FLUXERR_AUTO', 
                              'FLAGS', 'FLUX_RADIUS', 'ELONGATION']
    sextractor = aw.api.Astromatic(**sex_kwargs_1)
    sextractor.run(tmp_file_name)
    with open(catalog_name, "r") as catalog:
        output_item = ('file:' + catalog_name, catalog)
    return output_item

In [8]:
if __name__ == "__main__":
    sc = SparkContext(appName="SourceExtractor")
    #files = sc.binaryFiles(cfg.get_paths()['images'])
    files = sc.binaryFiles("hdfs://localhost:9000/ser/images/images")
    print files

Exception: Java gateway process exited before sending the driver its port number

In [24]:
files = files.filter(lambda x: x[0].endswith('.fit'))

In [25]:
#print files.map(extract).reduce(join_cats)
#files.map(extract_1).map(lambda x: [x]).reduce(lambda a, b: a + b)
catalogs = files.map(extract_3).collect()
#len(catalogs)
#catalogs.map(lambda x: [x]).reduce(lambda a, b: a + b)
#catalogs.saveAsSequenceFile(cfg.get_paths()['catalogs'])

In [29]:
catalogs[0]

(u'file:/home/ser/Dev/Notebooks/spark_pipeline_1/catalogs/GRB130427_R60_001_001.cat',
 <closed file '<uninitialized file>', mode '<uninitialized file>' at 0x7f573ba3e6f0>)

In [30]:
sc.stop()