# Chapter 5: Astronomical Data Analysis

## Machine learning

### Image classification

The package ```tensorflow``` is not included by default in most Python distributions. 
If you use Anaconda, see [docs.anaconda.com/anaconda/user-guide/tasks/tensorflow/](https://docs.anaconda.com/anaconda/user-guide/tasks/tensorflow/) for an installation guide. Other options are explained here: [www.tensorflow.org/install](https://www.tensorflow.org/install).

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image as image
import tensorflow as tf
from tensorflow import keras

#### Preparing the data

The data file ``efigi.dat`` loaded below is a pre-selected subset of the [EFIGI survey dataset](https://www.astromatic.net/projects/efigi).

In [None]:
# Read the pre-selected EFIGI catalogue: one galaxy per line, with the
# galaxy name in the first column and its class in the second column.
names = []
types = []

# The context manager guarantees that the file handle is closed again.
with open("data_files/galaxies/efigi.dat", "r") as data:
    for line in data:
        # split() without argument tolerates repeated blanks and strips
        # the trailing newline from the last column
        fields = line.split()
        if not fields:
            # ignore blank lines (e.g. an empty line at the end of the file)
            continue
        names.append(fields[0])
        types.append(fields[1])

nData = len(names)   # number of catalogue entries
imgSize = 64         # edge length in pixels used for the resized images

Elliptical galaxies belong to class 0, spirals to class 1 and irregulars to class 2.

In [None]:
# Containers for the image data (pixel values divided by 255) and the
# integer class label of every galaxy in the catalogue
galaxies = np.zeros((nData, imgSize, imgSize, 3))
labels = np.zeros(nData, dtype='int')

for i, (galaxyName, galaxyType) in enumerate(zip(names, types)):
    # open the PNG file belonging to this catalogue entry
    img = image.open("data_files/galaxies/png/" + str(galaxyName) + ".png")

    # rescale to imgSize x imgSize pixels
    imgResized = img.resize(size=(imgSize, imgSize))

    galaxies[i] = np.array(imgResized)/255
    labels[i] = galaxyType

In [None]:
# Display the total number of labelled galaxies.
labels.size

Split the full dataset into training, validation, and test sets:

In [None]:
import random

In [None]:
# Randomly pick 30% of all indices; these galaxies form the pool from
# which the validation and test sets are taken.
size = labels.size
sample = random.sample(range(size), int(0.3*size))

# boolean mask that is True for every galaxy that was NOT picked
inTraining = np.ones(size, dtype=bool)
inTraining[sample] = False

# split in training and other set
otherLabels, otherGalaxies = labels[sample], galaxies[sample]
trainLabels, trainGalaxies = labels[inTraining], galaxies[inTraining]

print(otherLabels.size, trainLabels.size)
print(otherGalaxies.shape, trainGalaxies.shape)

In [None]:
# Display the container type returned by random.sample.
type(sample)

In [None]:
# Randomly pick half of the non-training galaxies for validation;
# the remaining half becomes the test set.
size = otherLabels.size
subsample = random.sample(range(size), int(size/2))

# boolean mask that is True for every galaxy that was NOT picked
inTest = np.ones(size, dtype=bool)
inTest[subsample] = False

# split into validation and test sets
valdLabels, valdGalaxies = otherLabels[subsample], otherGalaxies[subsample]
testLabels, testGalaxies = otherLabels[inTest], otherGalaxies[inTest]

print(valdLabels.size, testLabels.size)
print(valdGalaxies.shape, testGalaxies.shape)

In [None]:
# Class distribution of the three subsets, one bin per class (0, 1, 2).
classBins = [-0.5, 0.5, 1.5, 2.5]

# each subset together with the extra keyword arguments of its outline
subsets = (
    (trainLabels, {}),
    (valdLabels, {'ls': '--'}),
    (testLabels, {'ls': ':'}),
)

for subsetLabels, lineStyle in subsets:
    plt.hist(subsetLabels, bins=classBins, histtype='step', lw=2, **lineStyle)

plt.show()

#### Training of the network

In [None]:
# Convolutional stage: 96 feature maps from 8x8 kernels, followed by
# 4x4 max pooling.
featureLayers = [
    keras.layers.Conv2D(96, (8,8), activation='relu',
                        input_shape=(imgSize,imgSize,3)),
    keras.layers.MaxPooling2D(pool_size=(4,4)),
]

# Classification stage: flatten, one hidden dense layer, and a softmax
# output with one unit per galaxy class.
classifierLayers = [
    keras.layers.Flatten(),
    keras.layers.Dense(30, activation='relu'),
    keras.layers.Dense(3, activation='softmax'),
]

galNet = keras.Sequential(featureLayers + classifierLayers)

In [None]:
# Print the layer overview of the network.
galNet.summary()

In [None]:
# Configure the training: Adam optimizer, sparse categorical cross-entropy
# as loss (the labels are integer class indices), accuracy as metric.
galNet.compile(optimizer='adam', 
               loss='sparse_categorical_crossentropy',
               metrics=['accuracy'])

In [None]:
# Train for 40 epochs on the training set while monitoring the validation
# set; the returned object keeps the history that is plotted below.
results = galNet.fit(trainGalaxies, trainLabels, epochs = 40, 
                     validation_data=(valdGalaxies, valdLabels))

Training vs validation data:

In [None]:
# Loss as a function of the epoch for training and validation data.
plt.figure(figsize=(6,4), dpi=100)

# (history key, line colour, legend entry) of each curve
lossCurves = (
    ('loss', 'green', 'training'),
    ('val_loss', 'red', 'validation'),
)
for historyKey, lineColor, legendEntry in lossCurves:
    plt.plot(results.history[historyKey], color=lineColor, label=legendEntry)

plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.ylim(0,1)
plt.legend()
plt.savefig("galnet_loss.pdf")

In [None]:
# Accuracy after the last epoch: training value first, then validation.
print(*(f"{results.history[metric][-1]:.4f}"
        for metric in ('accuracy', 'val_accuracy')))

Modified network (smaller number of feature maps, dropout layer):

In [None]:
# Convolutional stage: only 32 feature maps from 8x8 kernels, followed
# by 4x4 max pooling.
featureLayers = [
    keras.layers.Conv2D(32, (8,8), activation='relu',
                        input_shape=(imgSize,imgSize,3)),
    keras.layers.MaxPooling2D(pool_size=(4,4)),
]

# Classification stage: a dropout layer (rate 0.3) is inserted between
# the flattened feature maps and the hidden dense layer.
classifierLayers = [
    keras.layers.Flatten(),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(3, activation='softmax'),
]

galNet = keras.Sequential(featureLayers + classifierLayers)

In [None]:
# Print the layer overview of the modified network.
galNet.summary()

In [None]:
# Same training configuration as for the first network: Adam optimizer,
# sparse categorical cross-entropy loss, accuracy as metric.
galNet.compile(optimizer='adam', 
               loss='sparse_categorical_crossentropy',
               metrics=['accuracy'])

In [None]:
# Train the modified network for 40 epochs with the same training and
# validation data; `results` is overwritten with the new history.
results = galNet.fit(trainGalaxies, trainLabels, epochs = 40, 
                     validation_data=(valdGalaxies, valdLabels))

Training vs validation data (modified network):

In [None]:
# Loss as a function of the epoch for the modified network.
plt.figure(figsize=(6,4), dpi=100)

# (history key, line colour, legend entry) of each curve
lossCurves = (
    ('loss', 'green', 'training'),
    ('val_loss', 'red', 'validation'),
)
for historyKey, lineColor, legendEntry in lossCurves:
    plt.plot(results.history[historyKey], color=lineColor, label=legendEntry)

plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.ylim(0,1)
plt.legend()

plt.savefig("galnet_loss2.pdf")

In [None]:
# Accuracy as a function of the epoch for the modified network.
plt.figure(figsize=(6,4), dpi=100)

# (history key, line colour, legend entry) of each curve
accuracyCurves = (
    ('accuracy', 'green', 'training'),
    ('val_accuracy', 'red', 'validation'),
)
for historyKey, lineColor, legendEntry in accuracyCurves:
    plt.plot(results.history[historyKey], color=lineColor, label=legendEntry)

plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.ylim(0,1)
plt.legend()
plt.show()

In [None]:
# Accuracy after the last epoch: training value first, then validation.
print(*(f"{results.history[metric][-1]:.4f}"
        for metric in ('accuracy', 'val_accuracy')))

#### Evaluation of test dataset

In [None]:
# Loss and accuracy of the trained network on the test set, which was
# used neither for training nor for validation.
loss, acc = galNet.evaluate(testGalaxies, testLabels) 

In [None]:
# Test accuracy with four decimal places.
print(f"{acc:.4f}")

#### Classification of a galaxy (NGC 1232)

In [None]:
# Load the image of NGC 1232.
img = image.open("data_files/galaxies/NGC_1232.jpg")

# Rescale to the imgSize x imgSize input resolution of the network.
imgResized = img.resize(size=(imgSize,imgSize))

# Divide the pixel values by 255, as was done for the training images.
imgArr = np.array(imgResized)/255

In [None]:
# Prepend an axis of length one so that the single image is passed to
# the network as a batch containing one element.
imgArrExp = imgArr[np.newaxis, ...]
print(imgArrExp.shape)

In [None]:
# Class probabilities predicted for the image
pred = galNet.predict(imgArrExp)

# class names in the order of the class indices 0, 1, 2
label = ["elliptical", "spiral", "irregular"]
for className, prob in zip(label, pred.flatten()):
    print(f"{className:10s} {prob:.4e}")

### Spectral classification

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from os import listdir
from os.path import isfile, join

#### Preparing the data

Load dataset (not included in zip archive; can be requested from authors)

In [None]:
# Directory containing the training spectra. The absolute path is
# site-specific; the commented line is a relative alternative for a
# local copy of the dataset.
path = "/hs/fs06/data/AG_Schmidt/specnet/training"
#path = "specnet/training"

In [None]:
# Names of all regular files in the training directory (subdirectories
# are skipped); each file is treated as one spectrum.
specnames = [f for f in listdir(path) if isfile(join(path, f))]

n_spectra = len(specnames)
print("Total number of training spectra:", n_spectra)

Determine labels from filenames

In [None]:
# The first four characters of every file name are read as the
# temperature of the spectrum.
temp = np.array([int(name[0:4]) for name in specnames], dtype='int')

# distinct temperatures in ascending order
temp_class = sorted(set(temp))
n_labels = len(temp_class)

print("Total number of temperature classes:", n_labels)
print("List of temperatures:", temp_class)

Example spectrum

In [None]:
# Open one training spectrum as an example and list the names of the
# arrays stored in the npz archive.
spectrum_file = join(path, "5800_1_65999_177.97.npz") 

spec_arr = np.load(spectrum_file)
print(spec_arr.files)

In [None]:
# Fetch the two-column table once: column 0 is used as wavelength,
# column 1 as flux.
spectrum_table = spec_arr["arr_0"]
wave = spectrum_table[:, 0]
flux = spectrum_table[:, 1]

print("Wavelength range:", wave.min(), wave.max())

# number of flux values of a single spectrum
spec_size = flux.size

print("Number of values per spectrum:", spec_size)

In [None]:
%matplotlib inline 

# Plot the example spectrum in the window 650-660 (the wavelengths are
# multiplied by 0.1 and labelled in nm).
plt.plot(0.1*wave, flux, color='navy')
# raw string: "\l" is not a valid escape sequence in a normal string
# literal and triggers a warning in recent Python versions
plt.xlabel(r"$\lambda$ / nm")
plt.xlim(650,660)
plt.ylabel("Normalized flux")
plt.ylim(0,1.05)

plt.savefig("synth_spect.pdf")

In [None]:
# Every spectrum is divided into 20 channels; integer division gives
# the number of values per channel.
n_channels = 20
channel_length = spec_size // n_channels

print("Values per channel:", channel_length)

Create training data array (this may take quite a while)

In [None]:
# Training data: one (channel_length, n_channels) array per spectrum and
# the index of its temperature in temp_class as label.
spectra = np.zeros((n_spectra, channel_length, n_channels), 
                   dtype='float64')
labels = np.zeros(n_spectra, dtype='int')

for i, specname in enumerate(specnames):
    labels[i] = temp_class.index(temp[i])
    
    spectrum_file = join(path, specname)
    # the context manager closes the npz archive as soon as the flux
    # column has been read, so no file handles pile up during the loop
    with np.load(spectrum_file) as spec_arr:
        flux = spec_arr["arr_0"][:,1]

    # rearrange the flux values into rows of n_channels values each
    flux_2d = np.reshape(flux, (-1,n_channels))
    
    spectra[i,:,:] = flux_2d
    
print(spectra.shape)

#### Training of the network

In [None]:
# Network for the spectra: two one-dimensional convolution layers,
# dropout (rate 0.3) on the flattened features, and a softmax output
# with one unit per temperature class.
SpecNet = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(24, 4, activation='relu',
                           input_shape=(channel_length, n_channels)),
    tf.keras.layers.Conv1D(120, 10, activation='relu'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(n_labels, activation='softmax'),
])

# summary() prints the overview itself and returns None; wrapping it in
# print() would add a spurious "None" line to the output
SpecNet.summary()

In [None]:
# Configure the training: Adam optimizer, sparse categorical cross-entropy
# as loss (the labels are integer class indices), accuracy as metric.
SpecNet.compile(optimizer='adam', 
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

In [None]:
# Train for 4 epochs on the complete training set (no validation data).
SpecNet.fit(spectra, labels, epochs=4)

Alternatively, use the following statement for GPU offloading (run it instead of the previous cell, not in addition to it).

In [None]:
# Same training call, placed on the device '/gpu:0'.
# NOTE(review): if the plain fit call was executed before, this presumably
# continues training for 4 further epochs instead of starting over — confirm.
with tf.device('/gpu:0'):
    SpecNet.fit(spectra, labels, epochs=4)

#### Evaluation of test dataset

In [None]:
# Directory containing the test spectra (site-specific absolute path).
# Note that this rebinds `path`, which pointed to the training directory.
path = "/hs/fs06/data/AG_Schmidt/specnet/test"
specnames_test = [f for f in listdir(path) if isfile(join(path, f))]
    
n_spectra_test = len(specnames_test)

print("Total number of test spectra:", n_spectra_test)

# The first four characters of every file name are read as temperature.
temp_test = np.array([int(specname[0:4]) for specname in specnames_test],
                     dtype='int')

# Label = index of the temperature in the list of training classes;
# list.index raises ValueError for a temperature that is not in temp_class.
labels_test = np.array([temp_class.index(t) for t in temp_test],
                       dtype='int')

spectra_test = np.zeros((n_spectra_test,channel_length, n_channels), dtype='float64')

for i, specname in enumerate(specnames_test):
    spectrum_file = join(path, specname)
    # the context manager closes the npz archive as soon as the flux
    # column has been read, so no file handles pile up during the loop
    with np.load(spectrum_file) as spec_arr:
        flux = spec_arr["arr_0"][:,1]

    # rearrange the flux values into rows of n_channels values each
    flux_2d = np.reshape(flux, (-1,n_channels))
    
    spectra_test[i,:,:] = flux_2d
    
print(spectra_test.shape)

In [None]:
# Loss and accuracy of the trained network on the test spectra.
test_loss, test_acc = SpecNet.evaluate(spectra_test, labels_test)
print("Accuracy on the test data:",test_acc)

#### Test of a single spectrum

In [None]:
# Index of the test spectrum that is classified individually
i_test = 4000
print("Name of the spectrum:", specnames_test[i_test], "\n")

spec = spectra_test[i_test]

# the network expects a batch, hence the additional leading axis
guess = SpecNet.predict(np.expand_dims(spec, axis=0))

# predicted probability (in percent) of every temperature class
for temperature, probability in zip(temp_class, guess[0]):
    print("{:4d} K  {:6.2f} %".format(temperature, 100*probability))

In [None]:
# Plot the selected test spectrum (flattened back to one dimension) in
# the window 650-660; the wavelengths are multiplied by 0.1 and labelled
# in nm.
plt.plot(wave*0.1, spec.flatten(), color='navy')
# raw string: "\l" is not a valid escape sequence in a normal string
# literal and triggers a warning in recent Python versions
plt.xlabel(r"$\lambda$ / nm")
plt.xlim(650,660)
plt.ylabel("Normalized flux")
plt.ylim(0,1.05)

plt.savefig("test_spect.pdf")

Changed format to tf since h5 results in an error for recent versions of ```h5py``` (see https://github.com/tensorflow/tensorflow/issues/44467)

In [None]:
# Store the trained network with save_format='tf'; the commented line is
# the former HDF5 variant, kept for reference only.
#SpecNet.save('specnet_model.h5')
SpecNet.save('data_files/specnet_model.tf',save_format='tf')

#### Application to spectrum of the Sun

Restore network from file

In [None]:
# Reload the network from the file written by the save call; the
# commented line is the former HDF5 variant, kept for reference only.
#SpecNet = tf.keras.models.load_model('specnet_model.h5')
SpecNet = tf.keras.models.load_model('data_files/specnet_model.tf')

You need the following definitions if the training data were not processed

In [None]:
# Temperature classes from 4000 to 6000 in steps of 200
temp_class = list(range(4000, 6001, 200))
n_labels = len(temp_class)

# number of channels each spectrum is divided into
n_channels = 20

Read and plot spectrum of the Sun

In [None]:
# Spectrum of the Sun, stored in the same two-column layout as the
# synthetic spectra (column 0 is used as wavelength, column 1 as flux).
spectrum_file = "data_files/sun_spec.npz"

# read the table once and close the npz archive right afterwards
with np.load(spectrum_file) as spec_arr:
    sun_table = spec_arr["arr_0"]

wave = sun_table[:,0]
flux = sun_table[:,1]

# rearrange the flux values into rows of n_channels values each
flux_2d = np.reshape(flux, (-1,n_channels))

In [None]:
# Plot the solar spectrum in the window 650-660 (the wavelengths are
# multiplied by 0.1 and labelled in nm).
plt.plot(wave*0.1, flux, color='navy')
# raw string: "\l" is not a valid escape sequence in a normal string
# literal and triggers a warning in recent Python versions
plt.xlabel(r"$\lambda$ / nm")
plt.xlim(650,660)
plt.ylabel("Normalized flux")
plt.ylim(0,1.05)

plt.savefig("solar_spect.pdf")

Classification

In [None]:
# the network expects a batch, hence the additional leading axis
guess = SpecNet.predict(np.expand_dims(flux_2d, axis=0))

# predicted probability (in percent) of every temperature class
for temperature, probability in zip(temp_class, guess[0]):
    print("{:4d} K  {:6.2f} %".format(temperature, 100*probability))

# mean of the class temperatures, weighted with the predicted probabilities
weighted_temperature = np.average(temp_class, weights=guess.flatten())
print("\nEffective temperature estimate: {:.0f} K".
      format(weighted_temperature))