In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from pathlib import Path
import pandas as pd
import re

In [None]:
ROOT_DIR = Path().resolve().parent
DATA_DIR = ROOT_DIR / 'data'
SMOG_DATA_DIR = DATA_DIR / 'external' / 'dataset-SMOG'
RGB_DATA_DIR = DATA_DIR / 'raw' / 'galaxies-rgb'

# Scan and parse dataset-SMOG from circulo de astrofisica

In [None]:
# List zone folders Galaxias/10450_0145	No_Galaxias/10450_0145

pattern_5_4 = re.compile(r'\d{5}_\d{4}')
def is5_4(name):
    return bool(pattern_5_4.fullmatch(name))

folder_path = SMOG_DATA_DIR / 'Galaxias'
folders = [f.name for f in folder_path.iterdir() if f.is_dir() and is5_4(f.name)]
items = []
items.extend( [dict(folder='Galaxias/'+f, galaxy=1) for f in folders] )

folder_path = SMOG_DATA_DIR / 'No_Galaxias'
folders = [f.name for f in folder_path.iterdir() if f.is_dir() and is5_4(f.name)]
items.extend( [dict(folder='No_Galaxias/'+f, galaxy=0) for f in folders] )

df_topdirs = pd.DataFrame(items, columns=['folder', 'galaxy'])
#df_topdirs.to_dict(orient='records')
df_topdirs

Unnamed: 0,folder,galaxy
0,Galaxias/10450_0145,1
1,Galaxias/10550_0270,1
2,Galaxias/10350_0020,1
3,Galaxias/10350_0145,1
4,Galaxias/10550_0020,1
5,Galaxias/10250_0270,1
6,Galaxias/10250_0020,1
7,Galaxias/10250_0145,1
8,Galaxias/10150_0145,1
9,Galaxias/10550_0145,1


In [None]:
# List subdirs by imagetype Galaxias/10450_0145/Blue_10450_0145

pattern_color = re.compile(r'^[A-Za-z0-9]+')
def extract_imagetype(name):
  match = pattern_color.match(name)
  return match.group(0) if match else None

items=[]
for _,item in df_topdirs.iterrows():
  folder = item['folder']
  galaxy = item['galaxy']
  folder_path = SMOG_DATA_DIR / folder
  subfolders = [f.name for f in folder_path.iterdir() if f.is_dir()]

  items.extend( [dict(folder=folder, subfolder=f, imagetype=extract_imagetype(f), galaxy=galaxy) for f in subfolders] )

df_imagedirs = pd.DataFrame(items, columns=['folder', 'subfolder', 'imagetype','galaxy'])
df_imagedirs

Unnamed: 0,folder,subfolder,imagetype,galaxy
0,Galaxias/10450_0145,Blue_10450_0145,Blue,1
1,Galaxias/10450_0145,I3_10450_0145,I3,1
2,Galaxias/10450_0145,Red_10450_0145,Red,1
3,Galaxias/10450_0145,Green_10450_0145,Green,1
4,Galaxias/10450_0145,RGB_10450_0145,RGB,1
...,...,...,...,...
126,No_Galaxias/10450_0270,Blue_10450_0270,Blue,0
127,No_Galaxias/10450_0270,I3_10450_0270,I3,0
128,No_Galaxias/10450_0270,Green_10450_0270,Green,0
129,No_Galaxias/10450_0270,Red_10450_0270,Red,0


In [None]:
# List all images Galaxias/10450_0145/RGB_10450_0145/104.7263830,2.0560720_10450_0145_RGB-composite.jpeg
# And extract metadata

def parse_filename(filename):
  match = re.match(r'^(-?\d+\.\d+),(-?\d+\.\d+)_([0-9]+_[0-9]+)_(.*)\.([\w]+)$', filename)
  if match:
    return dict(ra=match.group(1), dec=match.group(2), zone=match.group(3), type=match.group(4), suffix=match.group(5))
  return dict(ra=None, dec=None, zone=None, type=None, suffix=None)

items=[]
for _,item in df_imagedirs.iterrows():
  D = item.to_dict()
  folder = D['folder']
  subfolder = D['subfolder']
  folder_path = SMOG_DATA_DIR / folder / subfolder
  imagefiles = [f.name for f in folder_path.iterdir() if f.is_file()]
  items.extend( [dict(**D, file=f, **parse_filename(f)) for f in imagefiles] )

df_images = pd.DataFrame(items, columns=['folder', 'subfolder', 'file', 'imagetype','galaxy', 'ra', 'dec', 'zone', 'type', 'suffix'])	
df_images

Unnamed: 0,folder,subfolder,file,imagetype,galaxy,ra,dec,zone,type,suffix
0,Galaxias/10450_0145,Blue_10450_0145,"104.8347729,2.0068920_10450_0145_Azul-3.5micra...",Blue,1,104.8347729,2.0068920,10450_0145,Azul-3.5micras,jpeg
1,Galaxias/10450_0145,Blue_10450_0145,"104.1032257,1.8147153_10450_0145_Azul-3.5micra...",Blue,1,104.1032257,1.8147153,10450_0145,Azul-3.5micras,jpeg
2,Galaxias/10450_0145,Blue_10450_0145,"104.8214547,0.9927273_10450_0145_Azul-3.5micra...",Blue,1,104.8214547,0.9927273,10450_0145,Azul-3.5micras,jpeg
3,Galaxias/10450_0145,Blue_10450_0145,"103.9853656,1.3372564_10450_0145_Azul-3.5micra...",Blue,1,103.9853656,1.3372564,10450_0145,Azul-3.5micras,jpeg
4,Galaxias/10450_0145,Blue_10450_0145,"104.6244403,1.6596911_10450_0145_Azul-3.5micra...",Blue,1,104.6244403,1.6596911,10450_0145,Azul-3.5micras,jpeg
...,...,...,...,...,...,...,...,...,...,...
7120,No_Galaxias/10450_0270,RGB_10450_0270,"104.6175231,2.4517444_10450_0270_RGB-composite...",RGB,0,104.6175231,2.4517444,10450_0270,RGB-composite,jpeg
7121,No_Galaxias/10450_0270,RGB_10450_0270,"104.8877737,2.9923550_10450_0270_RGB-composite...",RGB,0,104.8877737,2.9923550,10450_0270,RGB-composite,jpeg
7122,No_Galaxias/10450_0270,RGB_10450_0270,"104.5104268,2.5284167_10450_0270_RGB-composite...",RGB,0,104.5104268,2.5284167,10450_0270,RGB-composite,jpeg
7123,No_Galaxias/10450_0270,RGB_10450_0270,"104.9372738,2.9354354_10450_0270_RGB-composite...",RGB,0,104.9372738,2.9354354,10450_0270,RGB-composite,jpeg


In [25]:
def move_columns_to_front(df, columns_to_move):
  """
  Move specific columns to the front of a DataFrame.

  Parameters:
  df (pd.DataFrame): The DataFrame to modify.
  columns_to_move (list): List of column names to move to the front.

  Returns:
  pd.DataFrame: The modified DataFrame with specified columns at the front.
  """
  return df[columns_to_move + [col for col in df.columns if col not in columns_to_move]]

In [27]:
df_images['id_str'] = df_images.apply(lambda x: f"{x.ra},{x.dec}_{x.zone}" if x.ra is not None else None, axis=1)
df_images['file_loc'] = df_images.apply(lambda x: f"{x.folder}/{x.subfolder}/{x.file}", axis=1)
df_images = move_columns_to_front(df_images, ['file_loc', 'galaxy', 'id_str', 'imagetype', 'file'])
df_images.head()

Unnamed: 0,file_loc,galaxy,id_str,imagetype,file,folder,subfolder,ra,dec,zone,type,suffix
0,Galaxias/10450_0145/Blue_10450_0145/104.834772...,1,"104.8347729,2.0068920_10450_0145",Blue,"104.8347729,2.0068920_10450_0145_Azul-3.5micra...",Galaxias/10450_0145,Blue_10450_0145,104.8347729,2.006892,10450_0145,Azul-3.5micras,jpeg
1,Galaxias/10450_0145/Blue_10450_0145/104.103225...,1,"104.1032257,1.8147153_10450_0145",Blue,"104.1032257,1.8147153_10450_0145_Azul-3.5micra...",Galaxias/10450_0145,Blue_10450_0145,104.1032257,1.8147153,10450_0145,Azul-3.5micras,jpeg
2,Galaxias/10450_0145/Blue_10450_0145/104.821454...,1,"104.8214547,0.9927273_10450_0145",Blue,"104.8214547,0.9927273_10450_0145_Azul-3.5micra...",Galaxias/10450_0145,Blue_10450_0145,104.8214547,0.9927273,10450_0145,Azul-3.5micras,jpeg
3,Galaxias/10450_0145/Blue_10450_0145/103.985365...,1,"103.9853656,1.3372564_10450_0145",Blue,"103.9853656,1.3372564_10450_0145_Azul-3.5micra...",Galaxias/10450_0145,Blue_10450_0145,103.9853656,1.3372564,10450_0145,Azul-3.5micras,jpeg
4,Galaxias/10450_0145/Blue_10450_0145/104.624440...,1,"104.6244403,1.6596911_10450_0145",Blue,"104.6244403,1.6596911_10450_0145_Azul-3.5micra...",Galaxias/10450_0145,Blue_10450_0145,104.6244403,1.6596911,10450_0145,Azul-3.5micras,jpeg


In [28]:
# Check all images have proper id
df_images[df_images.id_str.isnull()]

Unnamed: 0,file_loc,galaxy,id_str,imagetype,file,folder,subfolder,ra,dec,zone,type,suffix


In [None]:
# Save full catalog
df_images.to_csv(SMOG_DATA_DIR / 'catalog.csv', index=False)

# Prepare RGB subset for ML training

In [None]:
df_images = pd.read_csv(SMOG_DATA_DIR / 'catalog.csv')
df_images.head()

Unnamed: 0,folder,subfolder,file,imagetype,galaxy,ra,dec,zone,type,suffix,id_str
0,Galaxias/10450_0145,Blue_10450_0145,"104.8347729,2.0068920_10450_0145_Azul-3.5micra...",Blue,1,104.834773,2.006892,10450_0145,Azul-3.5micras,jpeg,"104.8347729,2.0068920_10450_0145"
1,Galaxias/10450_0145,Blue_10450_0145,"104.1032257,1.8147153_10450_0145_Azul-3.5micra...",Blue,1,104.103226,1.814715,10450_0145,Azul-3.5micras,jpeg,"104.1032257,1.8147153_10450_0145"
2,Galaxias/10450_0145,Blue_10450_0145,"104.8214547,0.9927273_10450_0145_Azul-3.5micra...",Blue,1,104.821455,0.992727,10450_0145,Azul-3.5micras,jpeg,"104.8214547,0.9927273_10450_0145"
3,Galaxias/10450_0145,Blue_10450_0145,"103.9853656,1.3372564_10450_0145_Azul-3.5micra...",Blue,1,103.985366,1.337256,10450_0145,Azul-3.5micras,jpeg,"103.9853656,1.3372564_10450_0145"
4,Galaxias/10450_0145,Blue_10450_0145,"104.6244403,1.6596911_10450_0145_Azul-3.5micra...",Blue,1,104.62444,1.659691,10450_0145,Azul-3.5micras,jpeg,"104.6244403,1.6596911_10450_0145"


In [41]:
# dataset of only RGB images

df = df_images[df_images.imagetype=='RGB'].copy()

# Change file_locfor new organization of dataset
df['smog_file_loc'] = df['file_loc']
df['file_loc'] = df.apply(lambda x: f"{'galaxy' if x.galaxy else 'no_galaxy'}/{x.file}", axis=1)
df.reset_index(inplace=True) # Get a fresh index for the dataset

df = move_columns_to_front(df, ['file_loc', 'galaxy', 'id_str', 'file'])
df.head()

Unnamed: 0,file_loc,galaxy,id_str,file,index,imagetype,folder,subfolder,ra,dec,zone,type,suffix,smog_file_loc
0,"galaxy/104.7263830,2.0560720_10450_0145_RGB-co...",1,"104.7263830,2.0560720_10450_0145","104.7263830,2.0560720_10450_0145_RGB-composite...",240,RGB,Galaxias/10450_0145,RGB_10450_0145,104.726383,2.056072,10450_0145,RGB-composite,jpeg,Galaxias/10450_0145/RGB_10450_0145/104.7263830...
1,"galaxy/104.1504793,1.3987231_10450_0145_RGB-co...",1,"104.1504793,1.3987231_10450_0145","104.1504793,1.3987231_10450_0145_RGB-composite...",241,RGB,Galaxias/10450_0145,RGB_10450_0145,104.1504793,1.3987231,10450_0145,RGB-composite,jpeg,Galaxias/10450_0145/RGB_10450_0145/104.1504793...
2,"galaxy/104.9237340,1.5302104_10450_0145_RGB-co...",1,"104.9237340,1.5302104_10450_0145","104.9237340,1.5302104_10450_0145_RGB-composite...",242,RGB,Galaxias/10450_0145,RGB_10450_0145,104.923734,1.5302104,10450_0145,RGB-composite,jpeg,Galaxias/10450_0145/RGB_10450_0145/104.9237340...
3,"galaxy/104.7788009,1.1385662_10450_0145_RGB-co...",1,"104.7788009,1.1385662_10450_0145","104.7788009,1.1385662_10450_0145_RGB-composite...",243,RGB,Galaxias/10450_0145,RGB_10450_0145,104.7788009,1.1385662,10450_0145,RGB-composite,jpeg,Galaxias/10450_0145/RGB_10450_0145/104.7788009...
4,"galaxy/104.3578369,1.5539122_10450_0145_RGB-co...",1,"104.3578369,1.5539122_10450_0145","104.3578369,1.5539122_10450_0145_RGB-composite...",244,RGB,Galaxias/10450_0145,RGB_10450_0145,104.3578369,1.5539122,10450_0145,RGB-composite,jpeg,Galaxias/10450_0145/RGB_10450_0145/104.3578369...


In [34]:
# Create directories if they do not exist
!mkdir -p '{RGB_DATA_DIR}' '{RGB_DATA_DIR}/galaxy' '{RGB_DATA_DIR}/no_galaxy'

In [42]:
# Save RGB catalog
df.to_csv(RGB_DATA_DIR / 'catalog.csv', index=True) # Keep index for easier image identification

In [37]:
from tqdm import tqdm
from shutil import copyfile

# Do the copying
for _,item in tqdm(df.iterrows()):
  D = item.to_dict()
  folder = D['folder']
  subfolder = D['subfolder']
  file = D['file']
  smog_file_loc = D['smog_file_loc']
  file_loc = D['file_loc']
  inpath = SMOG_DATA_DIR / smog_file_loc
  outpath = RGB_DATA_DIR / file_loc
  copyfile(inpath, outpath)

0it [00:00, ?it/s]

1425it [00:00, 1882.27it/s]


# Prepare splits of RGB dataset

In [38]:
from sklearn.model_selection import train_test_split

In [44]:
RGB_DATA_DIR = DATA_DIR / 'raw' / 'galaxies-rgb'
catalog = pd.read_csv(RGB_DATA_DIR / 'catalog.csv', index_col=0)
catalog.head()

Unnamed: 0,file_loc,galaxy,id_str,file,index,imagetype,folder,subfolder,ra,dec,zone,type,suffix,smog_file_loc
0,"galaxy/104.7263830,2.0560720_10450_0145_RGB-co...",1,"104.7263830,2.0560720_10450_0145","104.7263830,2.0560720_10450_0145_RGB-composite...",240,RGB,Galaxias/10450_0145,RGB_10450_0145,104.726383,2.056072,10450_0145,RGB-composite,jpeg,Galaxias/10450_0145/RGB_10450_0145/104.7263830...
1,"galaxy/104.1504793,1.3987231_10450_0145_RGB-co...",1,"104.1504793,1.3987231_10450_0145","104.1504793,1.3987231_10450_0145_RGB-composite...",241,RGB,Galaxias/10450_0145,RGB_10450_0145,104.150479,1.398723,10450_0145,RGB-composite,jpeg,Galaxias/10450_0145/RGB_10450_0145/104.1504793...
2,"galaxy/104.9237340,1.5302104_10450_0145_RGB-co...",1,"104.9237340,1.5302104_10450_0145","104.9237340,1.5302104_10450_0145_RGB-composite...",242,RGB,Galaxias/10450_0145,RGB_10450_0145,104.923734,1.53021,10450_0145,RGB-composite,jpeg,Galaxias/10450_0145/RGB_10450_0145/104.9237340...
3,"galaxy/104.7788009,1.1385662_10450_0145_RGB-co...",1,"104.7788009,1.1385662_10450_0145","104.7788009,1.1385662_10450_0145_RGB-composite...",243,RGB,Galaxias/10450_0145,RGB_10450_0145,104.778801,1.138566,10450_0145,RGB-composite,jpeg,Galaxias/10450_0145/RGB_10450_0145/104.7788009...
4,"galaxy/104.3578369,1.5539122_10450_0145_RGB-co...",1,"104.3578369,1.5539122_10450_0145","104.3578369,1.5539122_10450_0145_RGB-composite...",244,RGB,Galaxias/10450_0145,RGB_10450_0145,104.357837,1.553912,10450_0145,RGB-composite,jpeg,Galaxias/10450_0145/RGB_10450_0145/104.3578369...


In [45]:
# Column(s) to use as labels
label_cols = ['galaxy']

train_catalog, test_catalog = train_test_split(catalog, test_size=0.2, random_state=42, stratify=catalog[label_cols])

# Add train columns to tyrack the splits
train_catalog['train']=1
test_catalog['train']=0
catalog['train'] = 0
catalog.loc[train_catalog.index,'train'] = 1

catalog = move_columns_to_front(catalog, ['file_loc', 'galaxy', 'train', 'id_str', 'file'])

print(f"Training set size: {len(train_catalog)}")
print(f"Testing set size: {len(test_catalog)}")

Training set size: 1140
Testing set size: 285


In [46]:
train_catalog.to_csv( RGB_DATA_DIR / 'train_catalog.csv', index=True)
test_catalog.to_csv( RGB_DATA_DIR / 'test_catalog.csv', index=True)
catalog.to_csv( RGB_DATA_DIR / 'catalog_with_split.csv', index=True)

In [None]:
# Pack dataset for easy upload to server
!tar -czvf {DATA_DIR}/galaxies-rgb.tar.gz -C {RGB_DATA_DIR.parent} galaxies-rgb

In [None]:
# Delete if not used
#!rm {DATA_DIR}/galaxies-rgb.tar.gz

In [None]:
# List content
!tar -tzvf {DATA_DIR}/galaxies-rgb.tar.gz

In [None]:
# Unpack
!tar -xzvf {DATA_DIR}/galaxies-rgb.tar.gz -C {DATA_DIR / 'raw'}