In [43]:
from os.path import join
from glob import glob
from utils.custom_utils import RetrieveData,new_dataset
from tqdm import tqdm
import numpy as np
import pandas as pd

from astropy.io import fits
from photutils.segmentation import detect_sources, detect_threshold
import statmorph

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss,roc_auc_score, precision_recall_fscore_support

In [44]:
# Shortened link passed to RetrieveData below.
URL='https://tinyurl.com/hdd4kwva'

# NOTE(review): RetrieveData and new_dataset come from utils.custom_utils and are not
# visible here. Presumably the first downloads the archive behind URL and the second
# builds the ./data/new_data/ tree read by the following cell -- confirm in the module.
RetrieveData(URL)
new_dataset()

In [3]:
root_dir='./data/new_data/'
# Fixed seed so the shuffled order (and every subset/split derived from it) is reproducible.
SEED=42

# Glob each directory exactly once and sort the result: glob order depends on the
# filesystem, and globbing twice could make paths and labels disagree.
lensed_paths=sorted(glob(join(root_dir,'lensed/*.fits')))
non_lensed_paths=sorted(glob(join(root_dir,'non_lensed/*.fits')))

data_path=lensed_paths+non_lensed_paths
# Label 1 = file from lensed/, label 0 = file from non_lensed/.
category=[1]*len(lensed_paths)+[0]*len(non_lensed_paths)

# One row per file: [path, label]. Stacking str paths with int labels in a single
# array coerces the labels to the strings '1' / '0'; later cells compare against '0'.
data_mapping=np.transpose([data_path,category])
rng = np.random.default_rng(SEED)
# Shuffles the rows in place (axis 0), keeping each path paired with its label.
rng.shuffle(data_mapping)

In [4]:
def corner_value(image,size=5):
    """Collect the pixels of the four ``size`` x ``size`` corners of a 2D image.

    Parameters
    ----------
    image : array-like
        Image indexed as ``image[row, column]``.
    size : int, optional
        Side length of each corner patch. When ``size`` exceeds a dimension the
        patch is clipped to the image, so patches may then overlap.

    Returns
    -------
    numpy.ndarray
        1D array holding, in order, the flattened top-left, top-right,
        bottom-left and bottom-right patches.
    """
    image=np.asarray(image)
    n_rows,n_cols=image.shape[:2]

    # First row / column of the bottom / right patches. The column offset must be
    # derived from the number of columns (not rows), otherwise non-square images
    # pick the wrong region. Clamped at 0 so an oversized `size` cannot wrap around.
    row_start=max(n_rows-size,0)
    col_start=max(n_cols-size,0)

    corners=(
        image[:size,:size],            # top-left
        image[:size,col_start:],       # top-right
        image[row_start:,:size],       # bottom-left
        image[row_start:,col_start:],  # bottom-right
    )
    return np.concatenate([patch.ravel() for patch in corners])

def galaxy_map(segm):
    """Return an integer 0/1 map of the segment with the largest area.

    `segm` must expose `areas` (one area per label) and `data` (the label image).
    The label is taken as position-in-`areas` + 1, i.e. labels are assumed to be
    the consecutive integers 1..N -- TODO confirm for the segmentation in use.
    """
    largest_label = int(np.argmax(segm.areas)) + 1
    is_largest = segm.data == largest_label
    return is_largest.astype(int)

def mask(segm):
    """Return a boolean map of every detected pixel that is NOT in the largest segment.

    True where `segm.data` is non-zero and the pixel does not belong to the
    segment selected by `galaxy_map`; False everywhere else.
    """
    main_source = galaxy_map(segm)
    detected = segm.data != 0
    # galaxy_map yields 0/1, so "detected but not main source" is detected AND map == 0.
    return detected & (main_source == 0)

In [5]:
def compute_parameters(file_path,n_sigma=1.5,npixels=5):
    """Segment one FITS image and run statmorph on its largest source.

    The RMS and PSF images are read from the `RMS/` and `PSF/` sub-directories
    located next to `file_path`, under the same file name.

    Parameters
    ----------
    file_path : str
        Path of the science image.
    n_sigma : float, optional
        Passed to `detect_threshold` as the number of sigmas above background.
    npixels : int, optional
        Passed to `detect_sources` as the minimum number of connected pixels.

    Returns
    -------
    object or None
        The first element returned by `statmorph.source_morphology`, or None when
        no source is detected or statmorph raises.

    Raises
    ------
    Whatever `fits.getdata` raises when one of the three files cannot be read.
    """
    # Local import: the notebook's import cell only brings in `join` from os.path.
    from os.path import basename, dirname

    # dirname/basename keep absolute paths absolute and accept a bare file name,
    # unlike splitting on '/' and re-joining the pieces.
    directory=dirname(file_path)
    filename=basename(file_path)
    rms_path=join(directory,'RMS',filename)
    psf_path=join(directory,'PSF',filename)

    image=fits.getdata(file_path)
    rms=fits.getdata(rms_path)
    psf=fits.getdata(psf_path)

    # Background level and scatter are estimated from the four image corners.
    corners=corner_value(image)
    mean_background=np.mean(corners)
    std_background=np.std(corners)

    threshold=detect_threshold(image,n_sigma,background=mean_background,error=std_background)
    segm = detect_sources(image, threshold, npixels=npixels)
    if segm is None:
        # Nothing above threshold: report a failed measurement instead of crashing
        # on the attribute access inside mask()/galaxy_map().
        return None

    Mask=mask(segm)
    segm_map=galaxy_map(segm)

    # NOTE(review): statmorph documents `weightmap` as the 1-sigma ("sigma") image,
    # not an inverse weight. If the RMS file already holds per-pixel sigma it should
    # probably be passed as-is -- confirm before changing; 1/rms is kept unchanged here.
    weight_map=1/rms

    try:
        source_morphs = statmorph.source_morphology(image, segm_map, weightmap=weight_map, mask=Mask,psf=psf)
        return source_morphs[0]
    except Exception:
        # Best effort: a failed fit is reported as None. `Exception` (rather than a
        # bare except) lets KeyboardInterrupt/SystemExit stop a long run.
        return None

In [6]:
# Number of (already shuffled) files to process.
MAX_FILES=2000

# Attributes of the statmorph result stored as integers.
flag_fields=['flag_sersic','flag']
# Attributes of the statmorph result copied verbatim.
morph_fields=['sersic_amplitude','sersic_rhalf','sersic_n','sersic_xc','sersic_yc','sersic_ellip','sersic_theta',
              'asymmetry','concentration','deviation','gini','intensity','m20','rpetro_circ','smoothness']

# One list per output column; key order defines the column order of the DataFrame.
data={'file':[],'category':[],'flag_sersic':[],'flag':[],'working':[]}
data.update({name:[] for name in morph_fields})

for path,label in tqdm(data_mapping[:MAX_FILES]):
    params=compute_parameters(path)
    # compute_parameters returns None on failure; test identity rather than truthiness.
    succeeded=params is not None

    data['file'].append(path)
    data['category'].append(label)
    data['working'].append(1 if succeeded else 0)

    # Failed files keep a row (working == 0) with None in every measurement column.
    for name in flag_fields:
        data[name].append(int(getattr(params,name)) if succeeded else None)
    for name in morph_fields:
        data[name].append(getattr(params,name) if succeeded else None)
    

  return amplitude * np.exp(-bn * (z ** (1 / n) - 1))
  return amplitude * np.exp(-bn * (z ** (1 / n) - 1))
  return amplitude * np.exp(-bn * (z ** (1 / n) - 1))
  return amplitude * np.exp(-bn * (z ** (1 / n) - 1))
  return amplitude * np.exp(-bn * (z ** (1 / n) - 1))
  return amplitude * np.exp(-bn * (z ** (1 / n) - 1))
  return amplitude * np.exp(-bn * (z ** (1 / n) - 1))
  return amplitude * np.exp(-bn * (z ** (1 / n) - 1))
  return amplitude * np.exp(-bn * (z ** (1 / n) - 1))
  return amplitude * np.exp(-bn * (z ** (1 / n) - 1))
100%|███████████████████████████████████████████████████████████████████████████████| 2000/2000 [13:12<00:00,  2.52it/s]


In [7]:
# One column per key of `data`, one row per processed file.
df = pd.DataFrame.from_dict(data)

In [8]:
# Persist the measurements next to the images, without the row index.
csv_path = join(root_dir, 'data.csv')
df.to_csv(csv_path, index=False)

In [35]:
#some statistics
def _percent(selection,frame):
    """Return the percentage of rows of `frame` for which the boolean Series `selection` is True."""
    return int(selection.sum())/len(frame)*100

#ratio of category
# Labels are compared as text so the count is right whether the column holds
# the strings '0'/'1' or the integers 0/1 (e.g. after reloading the CSV).
raw_ratio=_percent(df['category'].astype(str)=='0',df)
print(f'dans le dataset il y a {raw_ratio} % qui sont non_lentillé et {100-raw_ratio} % de lentillé')

# Rows for which compute_parameters returned a result.
df_working=df.loc[df['working']==1]
working_ratio=len(df_working)/len(df)*100
print(f'le ratio de cas qui marche est de {working_ratio} % et de cas qui ne marche pas est {100-working_ratio} %.')

cat_working_ratio=_percent(df_working['category'].astype(str)=='0',df_working)
print(f'au sein des cas qui marchent, il y a {cat_working_ratio} % de non_lentillés et {100-cat_working_ratio} % de lentillés')

print('\nOn ne prend plus que les cas qui marchent\n\n')
#flag ratio
# Boolean selections over df_working, combined with & instead of chained .loc calls.
sersic_flagged=df_working['flag_sersic']==1
sersic_clean=df_working['flag_sersic']==0
general_flag=df_working['flag']

perfect_working=_percent(sersic_clean&(general_flag==0),df_working)
print(f'il y a {perfect_working} % de cas qui marche parfaitement')
global_sersic=_percent(sersic_flagged,df_working)
print(f'il y a {global_sersic} % de flag sersic\n')

# Sersic flag raised, broken down by the value of the general flag.
pur_sersic=_percent(sersic_flagged&(general_flag==0),df_working)
print(f'nombre de cas ou il y a seulement un flag sersic : {pur_sersic} %')
flag1_sersic=_percent(sersic_flagged&(general_flag==1),df_working)
flag2_sersic=_percent(sersic_flagged&(general_flag==2),df_working)
flag4_sersic=_percent(sersic_flagged&(general_flag==4),df_working)
print(f'sersic + flag=1 :{flag1_sersic} %')
print(f'sersic + flag=2 :{flag2_sersic} %')
print(f'sersic + flag=4 :{flag4_sersic} %\n')

# Sersic flag clear, general flag raised.
pur_flag=_percent(sersic_clean&(general_flag!=0),df_working)
print(f'nombre de cas ou il y a seulement un flag : {pur_flag} %')
flag1=_percent(sersic_clean&(general_flag==1),df_working)
flag2=_percent(sersic_clean&(general_flag==2),df_working)
flag4=_percent(sersic_clean&(general_flag==4),df_working)
print(f'nombre de cas flag=1 : {flag1} %')
print(f'nombre de cas flag=2 : {flag2} %')
print(f'nombre de cas flag=4 : {flag4} %')

dans le dataset il y a 48.449999999999996 % qui sont non_lentillé et 51.550000000000004 % de lentillé
le ratio de cas qui marche est de 88.7 % et de cas qui ne marche pas est 11.299999999999997 %.
au sein des cas qui marchent, il y a 52.8184892897407 % de non_lentillés et 47.1815107102593 % de lentillés

On ne prend plus que les cas qui marchent


il y a 52.536640360766626 % de cas qui marche parfaitement
il y a 38.50056369785795 % de flag sersic

nombre de cas ou il y a seulement un flag sersic : 29.143179255918827 %
sersic + flag=1 :1.4656144306651635 %
sersic + flag=2 :7.891770011273957 %
sersic + flag=4 :0.0 %

nombre de cas ou il y a seulement un flag : 8.962795941375424 %
nombre de cas flag=1 : 1.5783540022547913 %
nombre de cas flag=2 : 7.384441939120631 %
nombre de cas flag=4 : 0.0 %


In [21]:
# Display df_working, i.e. the rows of df whose `working` column equals 1.
df_working

Unnamed: 0,file,category,flag_sersic,flag,working,sersic_amplitude,sersic_rhalf,sersic_n,sersic_xc,sersic_yc,sersic_ellip,sersic_theta,asymmetry,concentration,deviation,gini,intensity,m20,rpetro_circ,smoothness
0,./data/new_data/non_lensed/7356.fits,0,1.0,2.0,1,139.480509,2.667820,53.547793,21.709884,22.723896,0.173040,2.846994,0.028386,3.511263,0.034894,0.582110,0.002019,-2.132467,20.969019,-0.044056
2,./data/new_data/non_lensed/64515.fits,0,0.0,0.0,1,24.110644,10.295667,5.138294,20.977286,21.985942,0.949935,1.058480,0.042432,3.344042,0.011201,0.531248,0.000000,-1.972364,12.440231,0.005108
3,./data/new_data/lensed/126769.fits,1,0.0,0.0,1,77.194436,5.271907,1.052158,22.585768,22.107724,0.365200,3.001088,0.113418,2.297448,0.061063,0.388610,0.265074,-1.659978,12.044792,0.007073
4,./data/new_data/lensed/127920.fits,1,1.0,0.0,1,0.539090,44.344960,28.811880,22.598053,22.246463,0.022398,0.624617,0.102597,2.693456,0.040105,0.374677,0.193068,-1.871922,13.784706,0.003875
5,./data/new_data/lensed/137664.fits,1,0.0,0.0,1,9.678310,18.714956,3.869475,23.001873,21.993020,0.549188,2.103453,0.314246,3.143232,0.184678,0.490289,0.352723,-1.452069,23.869446,-0.072409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,./data/new_data/non_lensed/127442.fits,0,1.0,2.0,1,5.303144,17.294794,69.926877,21.976482,21.763607,0.664109,1.731764,0.026656,3.748557,0.065274,0.580749,0.074119,-1.997179,22.860423,-0.088106
1996,./data/new_data/non_lensed/62736.fits,0,1.0,0.0,1,24.964284,15.981744,15.126223,22.027556,19.854263,0.988194,1.680654,0.098419,3.538059,0.201677,0.585768,0.264671,-1.472602,14.827051,-0.034859
1997,./data/new_data/non_lensed/63442.fits,0,0.0,2.0,1,21.308942,9.589825,3.679732,21.000088,21.998738,0.105148,2.477426,0.014144,3.262965,0.014380,0.554529,0.000000,-2.026665,22.004891,-0.037043
1998,./data/new_data/non_lensed/14760.fits,0,0.0,0.0,1,127.209037,14.254807,5.504594,20.930838,21.000176,0.993009,0.089264,-0.036488,3.132175,0.036590,0.583838,0.057218,-1.873377,10.541174,0.036042


In [47]:
# Baseline test: feed every column except the file path to the classifier.
mat=df_working.drop(columns='file')
y=np.array(mat['category'])
X=np.array(mat.drop(columns='category'))

# Sequential 60 % / 20 % / 20 % split into train / validation / test
# (the rows were already shuffled when the file list was built).
n_samples=len(X)
cut1,cut2=int(0.6*n_samples),int(0.8*n_samples)
X_train,X_valid,X_test=np.split(X,[cut1,cut2])
y_train,y_valid,y_test=np.split(y,[cut1,cut2])

# Standardise the features, then fit an SVC with gamma='auto'.
clf=make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(X_train, y_train)

# Mean accuracy on the validation part.
score=clf.score(X_valid,y_valid)
print(f'score={score}')

# Everything below is measured on the held-out test part.
pred=clf.predict(X_test)

conf_matrix=confusion_matrix(y_test,pred)
print(f'confusion matrix: \n{conf_matrix}')
print(f'len(X_test)={len(X_test)}\n')

acc=accuracy_score(y_test,pred)
precision,recall,fscore,support=precision_recall_fscore_support(y_test,pred,average='weighted')
report_lines=[
    f'Accuracy: {acc*100} %',
    f'Precision: {precision*100} %',
    f'Recall: {recall*100} %',
    f'F-score: {fscore*100} %',
]
print('\n'.join(report_lines))



score=0.9126760563380282
confusion matrix: 
[[182  16]
 [ 10 147]]
len(X_test)=355

Accuracy: 92.67605633802816 %
Precision: 92.75393156484922 %
Recall: 92.67605633802816 %
F-score: 92.68838028169016 %


conf matrix = [[TN,FP],
               [FN,TP]]

Accuracy = (TP+TN)/total #Fraction of all predictions that are correct
Precision = TP/(TP+FP) = TP/(predicted positives) #Fraction of predicted positives that are truly positive
Recall = TP/(TP+FN) #Fraction of truly positive cases that are predicted positive
F-score = 2*(precision*recall)/(precision+recall) #Harmonic mean of the precision and the recall

precision=1 ---> the classifier makes no false-positive errors
recall=1 ---> the classifier finds all the relevant (truly positive) cases
F-score=1 ---> the best possible classifier (precision and recall are both 1)