In [1]:
import pandas as pd
import numpy as np

from os import listdir
from fnmatch import fnmatch

from astropy import wcs
from astropy.io import fits

In [2]:
def prepareData(path):
    data = pd.read_csv(path)
    data.drop(0, inplace=True)
    data.reset_index(drop=True, inplace=True)

    data["plate"] = np.nan
    data["path"] = np.nan
    data["dx"] = np.zeros(data.shape[0])
    data["dy"] = np.zeros(data.shape[0])
    data[['_RAJ2000', '_DEJ2000']] = data[['_RAJ2000', '_DEJ2000']].astype(float)

    # print(data.head())
    return data

def prepareFits(headers_path, fits_path, headers_pattern, fits_pattern):
    headers_folder = listdir(headers_path)
    fits_folder = listdir(fits_path)

    fits_headers = []
    fits_files = []

    headers_pattern = headers_pattern
    fits_pattern = fits_pattern

    for entry in headers_folder:
        if fnmatch(entry, headers_pattern):
                fits_headers.append('./data/fits_headers/' + entry)

    for entry in fits_folder:
        if fnmatch(entry, fits_pattern):
                fits_files.append('./data/fits_files/' + entry)

    # print(fits_headers[:5])
    # print('Files in headers folder:', len(headers_folder))
    # print('Headers in headers folder:', len(fits_headers))
    # print()
    # print(fits_files[:5])
    # print('Files in fits folder:', len(fits_folder))
    # print('Fits files in fits folder:', len(fits_files))

    fits_headers = np.array(fits_headers)
    fits_files = np.array(fits_files)
    fits_set = set(map(lambda x: x.split('/')[-1].split('.')[0], fits_files))

    return fits_headers, fits_files, fits_set

def getCoordinates(fits_headers, data):
    coordinates = np.ones((len(fits_headers), data.shape[0], 2)) * (-1)

    for i in range(len(fits_headers)):
        hdulist = fits.open(fits_headers[i])
        w = wcs.WCS(hdulist[0].header)

        xy = w.all_world2pix(data[['_RAJ2000', '_DEJ2000']], 1, quiet=True)

        matching_indices = np.where((xy[:,0] >= 0) & (xy[:,0] <= 9601) & (xy[:,1] >= 0) & (xy[:,1] <= 9601))[0]

        coordinates[i][matching_indices] = xy[matching_indices]

    return coordinates

In [3]:
data = prepareData(
    path='data/DFBS.csv')
data.head()

Unnamed: 0,_Glon,_Glat,_RAJ2000,_DEJ2000,Cl,Name,Vmag,z,plate,path,dx,dy
0,100.174423,-55.203358,0.04875,5.388056,Sy1,RXS J00001+0523,16.4,0.04,,,0.0,0.0
1,99.844434,-57.30727,0.61,3.351667,Sy1,MARK 543,14.68,0.026,,,0.0,0.0
2,86.112841,-70.112882,0.88375,-10.744722,Sy1,NGC 7808,15.4,0.029,,,0.0,0.0
3,114.304767,-16.638006,1.039583,45.440278,Sy1,RXS J00041+4526,16.9,0.12,,,0.0,0.0
4,104.972206,-50.897341,1.45625,10.376944,Sy1,RXS J00058+1022,16.7,0.095,,,0.0,0.0


In [4]:
data['Cl'].value_counts()

sd     1668
WD     1161
M      1136
QSO     978
Mrk     858
Sy1     839
PN      409
cv      272
C        36
Name: Cl, dtype: int64

In [5]:
mean_number_of_class_samples = data['Cl'].value_counts().mean()
mean_number_of_class_samples

817.4444444444445

In [6]:
aug_classes = []
for c in data['Cl'].unique():
    if mean_number_of_class_samples/(data['Cl']==c).sum() > 1.5:
        aug_classes.append(c)

aug_classes

['C', 'cv', 'PN']

In [7]:
data = data[data['Cl'].isin(aug_classes)]
data.reset_index(inplace=True)

In [8]:
fits_headers, fits_files, fits_set = prepareFits(
    headers_path='data/fits_headers',
    fits_path='data/fits_files',
    headers_pattern="*.hdr",
    fits_pattern="*.fits")

coordinates = getCoordinates(
    fits_headers=fits_headers,
    data=data)
np.save('data/small_class_coordinates.csv', coordinates+1)
# coordinates = np.load('data/small_class_coordinates.csv.npy') - 1



In [34]:
# np.where((coordinates[:,:,0]>=0))[0]
# .sum(axis=1).argsort()[-31]

In [35]:
for aug_c in aug_classes:
    indices = data[data['Cl'] == aug_c].index
    where = np.where((coordinates[:,indices,0]>=0))
    header_indices = where[0]
    print(header_indices)
    break

[   4   30   30  119  193  224  244  376  407  408  415  415  455  605
  659  692  747  749  762  811  901  916  921  944 1159 1256 1303 1304
 1329 1330 1331 1338 1364 1381 1404 1420 1433 1447 1474 1479 1552 1584]


In [36]:
fits_headers[header_indices]

array(['./data/fits_headers/fbs0013_cor.fits.hdr',
       './data/fits_headers/fbs0047_cor.fits.hdr',
       './data/fits_headers/fbs0047_cor.fits.hdr',
       './data/fits_headers/fbs0138a_cor.fits.hdr',
       './data/fits_headers/fbs0214_cor.fits.hdr',
       './data/fits_headers/fbs0248_cor.fits.hdr',
       './data/fits_headers/fbs0281_cor.fits.hdr',
       './data/fits_headers/fbs0424_cor.fits.hdr',
       './data/fits_headers/fbs0462_cor.fits.hdr',
       './data/fits_headers/fbs0463_cor.fits.hdr',
       './data/fits_headers/fbs0473_cor.fits.hdr',
       './data/fits_headers/fbs0473_cor.fits.hdr',
       './data/fits_headers/fbs0519_cor.fits.hdr',
       './data/fits_headers/fbs0695_cor.fits.hdr',
       './data/fits_headers/fbs0752_cor.fits.hdr',
       './data/fits_headers/fbs0779_cor.fits.hdr',
       './data/fits_headers/fbs0814M_cor.fits.hdr',
       './data/fits_headers/fbs0815M_cor.fits.hdr',
       './data/fits_headers/fbs0824M_cor.fits.hdr',
       './data/fits_headers

In [72]:
coordinates[where[0], where[1]]

array([[3032.07126066, 3537.96057995],
       [7712.99113625, 2310.31818115],
       [7390.02859932, 2700.50219885],
       [2716.53633868, 7883.53162908],
       [ 663.9076957 , 3654.76869535],
       [4972.8344308 , 5494.6883189 ],
       [2917.67632742, 4580.38040568],
       [8195.35088564, 3972.50653914],
       [2964.49113115, 3675.71395803],
       [2970.06612048, 3620.99688778],
       [7698.047972  , 2390.00302107],
       [7375.78215878, 2780.74364225],
       [7497.26974577, 6532.66062903],
       [1915.50901787, 3028.17465342],
       [9253.88655179,  315.12788116],
       [7271.01693821, 3288.78578884],
       [ 255.39412035,  951.38083506],
       [9037.78949679,  696.08344086],
       [5557.19456276, 7719.60729292],
       [4602.26409214,  435.95297966],
       [2846.29010793, 7952.43892295],
       [1331.81523462,  877.07288299],
       [ 483.11699807,  526.99071842],
       [2998.8054555 , 4133.70732508],
       [ 391.70514175, 4609.98326429],
       [8125.55153232, 50

In [24]:
# for aug_c in aug_classes:
#     coef = (np.round(mean_number_of_class_samples/(data['Cl']==aug_c).sum())).astype(int) - 1
    

22
2
1


In [10]:
# data['fname'] = data.path.str.split('/', expand=True).iloc[:,-1]

In [None]:
# 1) Find the plates where are C-type stars
# 2) Extract them from plates into DFBS_extracted
# 3) Augment C-type stars and add to the data
# 4) Retrain the model on augmented data
# 5) Infer the model's results with current