In [1]:
from os.path import join
from glob import glob
from utils.custom_utils import RetrieveData,new_dataset
from tqdm import tqdm
import numpy as np
import pandas as pd

from astropy.io import fits
from photutils.segmentation import detect_sources, detect_threshold
import statmorph

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss,roc_auc_score, precision_recall_fscore_support

In [2]:
URL='https://tinyurl.com/hdd4kwva'

RetrieveData(URL)
new_dataset()

In [3]:
root_dir='./data/new_data/'

data_path=glob(join(root_dir,'lensed/*.fits'))+glob(join(root_dir,'non_lensed/*.fits'))
category=[1 for _ in range(len(glob(join(root_dir,'lensed/*.fits'))))]+[0 for _ in range(len(glob(join(root_dir,'non_lensed/*.fits'))))]

data_mapping=np.transpose([data_path,category])
rng = np.random.default_rng()
rng.shuffle(data_mapping)

In [4]:
def corner_value(image,size=5):
    dimensions=np.shape(image)
    
    data=image[:size,:size] #up left corner
    data=np.append(data,image[:size,dimensions[0]-size:]) #up right corner
    data=np.append(data,image[dimensions[0]-size:,:size]) #down left corner
    data=np.append(data,image[dimensions[0]-size:,dimensions[1]-size:]) #down right corner
    
    return data

def galaxy_map(segm):
    label = np.argmax(segm.areas)+1
    segm_map=1*(segm.data==label)
    return segm_map

def mask(segm):
    galaxy=(galaxy_map(segm))
    segm_map=segm.data!=0
    mask=segm_map-galaxy
    bool_mask=mask==1
    return bool_mask

In [5]:
def compute_parameters(file_path,n_sigma=1.5,npixels=5):
    directory=join(*file_path.split('/')[:-1])
    filename=file_path.split('/')[-1]
    rms_path=join(directory,'RMS',filename)
    psf_path=join(directory,'PSF',filename)
    
    image, header = fits.getdata(file_path, header=True)
    rms,rms_header=fits.getdata(rms_path,header=True)
    psf,psf_header=fits.getdata(psf_path,header=True)
    
    mean_background=np.mean(corner_value(image))
    std_background=np.std(corner_value(image))
    
    threshold=detect_threshold(image,n_sigma,background=mean_background,error=std_background)
    segm = detect_sources(image, threshold, npixels=npixels)
    
    Mask=mask(segm)
    segm_map=galaxy_map(segm)
    
    weight_map=1/rms
    
    try:
        source_morphs = statmorph.source_morphology(image, segm_map, weightmap=weight_map, mask=Mask,psf=psf)
        morph = source_morphs[0]
        return morph
    except:
        return None

In [8]:
data={'file':[],'category':[],'flag_sersic':[],'flag_good':[],'flag_suspect':[],'flag_bad':[],'flag_catastrophic':[],'working':[],
     'sersic_amplitude':[],'sersic_rhalf':[],'sersic_n':[],'sersic_xc':[],'sersic_yc':[],'sersic_ellip':[],'sersic_theta':[],
     'asymmetry':[],'concentration':[],'deviation':[],'gini':[],'intensity':[],'m20':[],'rpetro_circ':[],'smoothness':[],'Z':[],'SNR':[]}
for file,category in tqdm(data_mapping[:2000]):
    params=compute_parameters(file)
    image, header = fits.getdata(file, header=True)
    if params:
        try:
            data['Z'].append(header['ZL'])
            data['SNR'].append(params.sn_per_pixel)
        except:
            data['Z'].append(header['REDSHIFT'])
            data['SNR'].append(params.sn_per_pixel)
        
        data['file'].append(file)
        data['category'].append(category)
        data['flag_sersic'].append(int(params.flag_sersic))
        data['flag_good'].append(1 if int(params.flag)==0 else 0)
        data['flag_suspect'].append(1 if int(params.flag)==1 else 0)
        data['flag_bad'].append(1 if int(params.flag)==2 else 0)
        data['flag_catastrophic'].append(1 if int(params.flag)==4 else 0)
        data['working'].append(1)
        
        data['sersic_amplitude'].append(params.sersic_amplitude)
        data['sersic_rhalf'].append(params.sersic_rhalf)
        data['sersic_n'].append(params.sersic_n)
        data['sersic_xc'].append(params.sersic_xc)
        data['sersic_yc'].append(params.sersic_yc)
        data['sersic_ellip'].append(params.sersic_ellip)
        data['sersic_theta'].append(params.sersic_theta)
        
        data['asymmetry'].append(params.asymmetry)
        data['concentration'].append(params.concentration)
        data['deviation'].append(params.deviation)
        data['gini'].append(params.gini)
        data['intensity'].append(params.intensity)
        data['m20'].append(params.m20)
        data['rpetro_circ'].append(params.rpetro_circ)
        data['smoothness'].append(params.smoothness)
        
    else:
        try:
            data['Z'].append(header['ZL'])
            data['SNR'].append(header['SN'])
        except:
            data['Z'].append(header['REDSHIFT'])
            data['SNR'].append(None)
        
        data['file'].append(file)
        data['category'].append(category)
        data['flag_sersic'].append(None)
        data['flag_good'].append(None)
        data['flag_suspect'].append(None)
        data['flag_bad'].append(None)
        data['flag_catastrophic'].append(None)
        data['working'].append(0)
        
        data['sersic_amplitude'].append(None)
        data['sersic_rhalf'].append(None)
        data['sersic_n'].append(None)
        data['sersic_xc'].append(None)
        data['sersic_yc'].append(None)
        data['sersic_ellip'].append(None)
        data['sersic_theta'].append(None)
        
        data['asymmetry'].append(None)
        data['concentration'].append(None)
        data['deviation'].append(None)
        data['gini'].append(None)
        data['intensity'].append(None)
        data['m20'].append(None)
        data['rpetro_circ'].append(None)
        data['smoothness'].append(None)
    

  return amplitude * np.exp(-bn * (z ** (1 / n) - 1))
  return amplitude * np.exp(-bn * (z ** (1 / n) - 1))
  return amplitude * np.exp(-bn * (z ** (1 / n) - 1))
  return amplitude * np.exp(-bn * (z ** (1 / n) - 1))
  return amplitude * np.exp(-bn * (z ** (1 / n) - 1))
100%|███████████████████████████████████████████████████████████████████████████████| 2000/2000 [14:01<00:00,  2.38it/s]


In [9]:
df=pd.DataFrame(data)

In [10]:
df.to_csv(join(root_dir,'data.csv'),index=False)

In [11]:
#some statistics
#ratio of category
raw_ratio=len(df.loc[df['category']=='0'])/len(df)*100
print(f'dans le dataset il y a {raw_ratio} % qui sont non_lentillé et {100-raw_ratio} % de lentillé')

df_working=df.loc[df['working']==1]
working_ratio=len(df_working)/len(df)*100
print(f'le ratio de cas qui marche est de {working_ratio} % et de cas qui ne marche pas est {100-working_ratio} %.')

cat_working_ratio=len(df_working.loc[df_working['category']=='0'])/len(df_working)*100
print(f'au sein des cas qui marchent, il y a {cat_working_ratio} % de non_lentillés et {100-cat_working_ratio} % de lentillés')

print('\nOn ne prend plus que les cas qui marchent\n\n')
#flag ratio
perfect_working=len(df_working.loc[(df_working['flag_sersic']==0)].loc[df_working['flag']==0])/len(df_working)*100
print(f'il y a {perfect_working} % de cas qui marche parfaitement')
global_sersic=len(df_working.loc[df_working['flag_sersic']==1])/len(df_working)*100
print(f'il y a {global_sersic} % de flag sersic\n')

pur_sersic=len(df_working.loc[df_working['flag_sersic']==1].loc[df_working['flag']==0])/len(df_working)*100
print(f'nombre de cas ou il y a seulement un flag sersic : {pur_sersic} %')
flag1_sersic=len(df_working.loc[df_working['flag_sersic']==1].loc[df_working['flag']==1])/len(df_working)*100
flag2_sersic=len(df_working.loc[df_working['flag_sersic']==1].loc[df_working['flag']==2])/len(df_working)*100
flag4_sersic=len(df_working.loc[df_working['flag_sersic']==1].loc[df_working['flag']==4])/len(df_working)*100
print(f'sersic + flag=1 :{flag1_sersic} %')
print(f'sersic + flag=2 :{flag2_sersic} %')
print(f'sersic + flag=4 :{flag4_sersic} %\n')

pur_flag=len(df_working.loc[df_working['flag_sersic']==0].loc[df_working['flag']!=0])/len(df_working)*100
print(f'nombre de cas ou il y a seulement un flag : {pur_flag} %')
flag1=len(df_working.loc[df_working['flag_sersic']==0].loc[df_working['flag']==1])/len(df_working)*100
flag2=len(df_working.loc[df_working['flag_sersic']==0].loc[df_working['flag']==2])/len(df_working)*100
flag4=len(df_working.loc[df_working['flag_sersic']==0].loc[df_working['flag']==4])/len(df_working)*100
print(f'nombre de cas flag=1 : {flag1} %')
print(f'nombre de cas flag=2 : {flag2} %')
print(f'nombre de cas flag=4 : {flag4} %')

dans le dataset il y a 48.699999999999996 % qui sont non_lentillé et 51.300000000000004 % de lentillé
le ratio de cas qui marche est de 89.9 % et de cas qui ne marche pas est 10.099999999999994 %.
au sein des cas qui marchent, il y a 52.72525027808677 % de non_lentillés et 47.27474972191323 % de lentillés

On ne prend plus que les cas qui marchent




KeyError: 'flag'

In [12]:
df_working

Unnamed: 0,file,category,flag_sersic,flag_good,flag_suspect,flag_bad,flag_catastrophic,working,sersic_amplitude,sersic_rhalf,...,asymmetry,concentration,deviation,gini,intensity,m20,rpetro_circ,smoothness,Z,SNR
0,./data/new_data/lensed/12099.fits,1,1.0,0.0,0.0,1.0,0.0,1,524.416108,1.440874,...,0.146327,2.777350,0.045681,0.439069,0.149210,-1.852272,12.900855,0.030842,0.483737,53.129640
1,./data/new_data/lensed/35483.fits,1,1.0,0.0,1.0,0.0,0.0,1,3.046317,28.395130,...,0.075931,2.538504,0.019805,0.420948,0.000000,-1.761930,13.327979,0.002933,0.340375,139.564673
2,./data/new_data/lensed/72554.fits,1,1.0,1.0,0.0,0.0,0.0,1,4.392755,20.744494,...,0.065317,2.576134,0.041377,0.420794,0.000000,-1.797446,14.443681,0.009249,0.473475,54.733937
3,./data/new_data/lensed/3986.fits,1,0.0,1.0,0.0,0.0,0.0,1,24.442520,9.307224,...,0.221113,2.155435,0.004062,0.403194,0.730937,-1.195620,14.355088,0.010990,0.644127,54.813584
5,./data/new_data/non_lensed/45287.fits,0,0.0,1.0,0.0,0.0,0.0,1,63.258823,11.646850,...,0.018803,3.124333,0.050756,0.545883,0.000000,-1.838669,12.220252,0.006409,0.576639,36.169921
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,./data/new_data/non_lensed/96624.fits,0,0.0,1.0,0.0,0.0,0.0,1,35.908321,3.089651,...,0.013012,3.528122,0.038966,0.571116,0.000000,-1.924582,12.201862,0.009323,0.463051,88.143209
1996,./data/new_data/non_lensed/113408.fits,0,0.0,1.0,0.0,0.0,0.0,1,26.888571,3.520800,...,0.023090,3.252241,0.067976,0.562345,0.041887,-1.918069,10.650697,0.055754,0.462582,43.825410
1997,./data/new_data/lensed/45324.fits,1,0.0,1.0,0.0,0.0,0.0,1,13.263579,9.269161,...,0.188681,2.731005,0.032227,0.403825,0.313178,-1.886952,14.169639,0.002623,0.499422,52.325542
1998,./data/new_data/lensed/27793.fits,1,0.0,1.0,0.0,0.0,0.0,1,61.074683,5.374431,...,0.010525,2.233231,0.052506,0.382839,0.140667,-1.649299,13.187438,-0.007134,0.401208,60.356273


In [13]:
#test en injectant tout
mat=df_working.loc[:,df.columns!='file']
y=np.array(mat.category)
X=np.array(mat.loc[:,mat.columns!='category'])

cut1,cut2=int(0.6*len(X)),int(0.8*len((X)))
X_train,X_valid,X_test=X[:cut1],X[cut1:cut2],X[cut2:]
y_train,y_valid,y_test=y[:cut1],y[cut1:cut2],y[cut2:]

clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf=clf.fit(X_train, y_train)
    
score=clf.score(X_valid,y_valid)
print(f'score={score}')
    
pred=clf.predict(X_test)
    
conf_matrix=confusion_matrix(y_test,pred)
print(f'confusion matrix: \n{conf_matrix}')
print(f'len(X_test)={len(X_test)}\n')
acc=accuracy_score(y_test,pred)
precision,recall,fscore,support=precision_recall_fscore_support(y_test,pred,average='weighted')
print(f'Accuracy: {acc*100} %',f'Precision: {precision*100} %', f'Recall: {recall*100} %',f'F-score: {fscore*100} %',sep='\n')



score=0.9333333333333333
confusion matrix: 
[[167  17]
 [ 13 163]]
len(X_test)=360

Accuracy: 91.66666666666666 %
Precision: 91.69135802469135 %
Recall: 91.66666666666666 %
F-score: 91.66769560027986 %


conf matrix = [[TN,FP],
               [FN,TP]]

Accuracy= (TP+TN)/total #Number of correct prediction
Precision= TP/(TP+FP) = TP/positiv #Number of correct positive predictions out of all predicted positive
Recall= TP/(TP+FN) #Number of correct positive prediction out of really positive cases
F-score= 2*(pressicion*recall)/(precision+recall) #Harmonique mean of the precision and the recall

precision=1 ---> classifier don't make errors
recall=1 ---> classifier find all the relevant cases
F-score = 1 the best classifier