### **Machine Learning Star Classification**
Classification of stars based on their spectral characteristics

**Authors:**
- *Stefano Quaggio 866504*
- *Stefano Andreotti 851596*
- *Alberto Varisco 866109*

**Classification models used:**
- <u>Neural Networks</u>
- <u>SVM</u>
- <u>K-Means Algorithm</u>

## <u>Initial Analysis</u>

In [None]:
# All libraries imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Keras imports
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import Adam

# Grant access to our Google Drive, which will hold the dataset used in this lab
# from google.colab import drive

# drive.mount('/content/drive/')

In [None]:
# Load the star-classification table from the local dataset folder
DATASET_PATH = '../dataset/star_classification.csv'
full_df = pd.read_csv(DATASET_PATH)

# Preview the first rows (shown as the cell output)
full_df.head()

In [None]:
# Summary of the DataFrame as printed by pandas
full_df.info()

# Count the missing values in each column (shown as the cell output)
missing_per_column = full_df.isnull().sum()
missing_per_column

In [None]:
# Per-column histograms of the DataFrame on one large 18x18 figure
HIST_FIGSIZE = (18, 18)
full_df.hist(figsize=HIST_FIGSIZE)

In [None]:
# Encode the 'class' target column as integers
# (0 = Galaxy, 1 = Quasar, 2 = Star)
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(full_df['class'])
full_df['class'] = encoded_target

# { label: value } lookup used for the report headings.
# It is hard-coded so that the notebook does not have to be re-executed from
# the start: deriving it from the encoder only works on the first run,
# afterwards it becomes { value: value }. The derived form would be:
#label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
label_mapping = {'Galaxy': 0, 'Quasar': 1, 'Star': 2}

# Distribution of the (encoded) target variable
sns.countplot(x=full_df['class'])
plt.show()

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap
correlations = full_df.corr()
plt.figure(figsize=(15, 8))
sns.heatmap(correlations, annot=True, cmap='YlGnBu')

In [None]:
# Remove the 'rerun_ID' column: it has only one value, so it is not useful for
# classification. errors='ignore' keeps this cell re-runnable once the column
# has already been dropped (otherwise the second run raises KeyError).
full_df.drop(['rerun_ID'], axis=1, inplace=True, errors='ignore')

# Separate the features from the target
x = full_df.drop(['class'], axis=1)
y = full_df['class']

# 80/20 train/test split.
# random_state fixes the shuffle so every downstream metric is reproducible
# between runs; stratify=y keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)


## Neural Network

In [None]:
# Standardize the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Show the scaled training matrix as the cell output
X_train

In [None]:
# The problem is multi-class, so the integer targets of both splits are
# converted to categorical (one-hot) form for the network
y_train_neural, y_test_neural = (
    keras.utils.to_categorical(labels) for labels in (y_train, y_test)
)

# Sanity check: shapes of the test inputs and of the encoded test targets
print(X_test.shape, y_test_neural.shape)

print(y_train_neural)

In [None]:
# Number of input columns fed to the first layer
input_features = X_train.shape[1]

# Fully connected network: two hidden ReLU layers of 32 units each and a
# 3-unit softmax output
model=Sequential()
model.add(Dense(32, input_dim=input_features, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
# Train the model; the test split is passed as validation data, so the
# 'val_*' history entries plotted below are computed on it
history = model.fit(X_train, y_train_neural, epochs=20, batch_size=100, verbose=1, validation_data=(X_test, y_test_neural))

# Evaluate the trained model on both the training and the test split
_, train_acc = model.evaluate(X_train, y_train_neural, verbose=0)
_, test_acc = model.evaluate(X_test, y_test_neural, verbose=0)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))

# The two panels are stacked in a 2-row, 1-column grid so that they do not
# overlap (positions 111 and 222 would place the second axes on top of the
# first one).
# plot loss during training
plt.subplot(211)
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
# plot accuracy during training
plt.subplot(212)
plt.title('Accuracy')
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='test')
plt.legend()
# Keep the lower panel's title clear of the upper panel
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# predict class probabilities for the test set
yhat_probs = model.predict(X_test, verbose=0)
# predict crisp classes for the test set: index of the highest probability
yhat_classes = np.argmax(yhat_probs, axis=1)
# reduce to 1d array (keeps only the first probability column)
# NOTE(review): nothing in this cell reads yhat_probs after this line — confirm it is still needed
yhat_probs = yhat_probs[:, 0]

# Decode the one-hot test targets back to integer labels. argmax works for
# any number of classes, unlike a hand-written per-column check.
y_test_unidimension = np.argmax(y_test_neural, axis=1).tolist()

# Per-class precision / recall / F1, headed by the names in label_mapping
print(classification_report(y_test_unidimension, yhat_classes, target_names=list(label_mapping)))

## SVM

In [None]:
from sklearn import svm

# Create the SVM classifier (linear kernel).
# C = 1 is the default; C=100 was tried and changes very little, so it is not
# worth it. probability=True is set because predict_proba is called on this
# classifier later in the notebook.
sv = svm.SVC(C=1, kernel='linear', probability=True)
sv.fit(X_train, y_train)
y_pred = sv.predict(X_test)

# Accuracy, per-class report and confusion matrix on the test split
# label_mapping dict: Galaxy = 0, Quasar = 1, Star = 2
accuracy = accuracy_score(y_test, y_pred)
svm_report = classification_report(y_test, y_pred, target_names=label_mapping.keys())
svm_confusion = confusion_matrix(y_test, y_pred)

print(f"Accuratezza: {accuracy}\n\n")
print(svm_report)
print("\n\nConfusion matrix:\n", svm_confusion)

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score, roc_curve

# from sklearn.metrics import roc_curve, roc_auc_score
# 
# y_pred_prob = sv.predict_proba(X_test)[:, 1]
# 
# # Calcola la curva ROC
# fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# 
# # Calcola l'AUC della curva ROC
# roc_auc = roc_auc_score(y_test, y_pred_prob)
# 
# # Disegna la curva ROC
# plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
# plt.plot([0, 1], [0, 1], 'k--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver operating characteristic')
# plt.legend(loc="lower right")
# plt.show()
 
# Class probabilities predicted by the SVM; column i is used for class i below
y_pred_prob = sv.predict_proba(X_test)

# ROC metrics for each class, keyed by class index
fpr, tpr, thresh = {}, {}, {}

# (line colour, legend label) for class 0, 1 and 2
roc_styles = [
    ('yellow', 'Galaxy vs Rest'),
    ('green', 'Quasar vs Rest'),
    ('red', 'Star vs Rest'),
]

for i, (colour, legend_label) in enumerate(roc_styles):
    # Class i is the positive label, every other class counts as negative
    fpr[i], tpr[i], thresh[i] = roc_curve(y_test, y_pred_prob[:, i], pos_label=i)
    plt.plot(fpr[i], tpr[i], linestyle='--', color=colour, label=legend_label)

# Decorate and show the ROC chart
plt.title('Multiclass ROC curve SVM')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.show()


## Decision Tree


In [None]:
# Decision tree with a fixed seed, fitted on the training split.
# NOTE: this rebinds `model`, which held the Keras network up to this point.
model = DecisionTreeClassifier(random_state=30).fit(X_train, y_train)
y_pred3 = model.predict(X_test)

# Weighted-average recall of the tree on the test split
dtree_score = recall_score(y_test, y_pred3, average='weighted')
print(dtree_score)

In [None]:
# Draw the fitted tree held in `model` on a very large 150x100 canvas
fig, ax = plt.subplots(figsize=(150, 100))
plot_tree(model, filled=True, ax=ax)
# Render the figure (plt.plot() without arguments would draw nothing)
plt.show()
# To save the tree, call this before plt.show():
# plt.savefig("quattr.svg", format="svg")

In [None]:
# Prediction with the new model (the classifier currently bound to `model`)
y_pred = model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)

# Accuracy = sum of the diagonal (correct predictions) over all predictions
correct_predictions = cm.diagonal().sum()
total_predictions = cm.sum()

print("Confusion matrix:\n", cm)
print("\nAccuratezza:", correct_predictions / total_predictions)

In [None]:
# Complexity values to try: the alphas of the cost-complexity pruning path
# computed by `model` on the training split
path = model.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas

# Train a decision tree for each complexity value and record its accuracy on
# both splits
# NOTE(review): the alphas come from `model`, while these trees are capped at max_depth=3 — confirm this is intended
train_accuracy = []
test_accuracy = []
for complexity in ccp_alphas:
    clf = DecisionTreeClassifier(ccp_alpha=complexity, max_depth=3)
    clf.fit(X_train, y_train)
    train_accuracy.append(clf.score(X_train, y_train))
    test_accuracy.append(clf.score(X_test, y_test))

# Plot the model accuracy as a function of the complexity parameter
accuracy_series = (
    (train_accuracy, 'Training Accuracy'),
    (test_accuracy, 'Test Accuracy'),
)
for scores, series_label in accuracy_series:
    plt.plot(ccp_alphas, scores, label=series_label)
plt.title('Accuracy vs. Complexity Parameter')
plt.xlabel('Complexity Parameter')
plt.ylabel('Accuracy')
plt.xscale('log')
plt.legend()
plt.show()

In [ ]:
# Fit one decision tree (fixed seed) for every alpha in ccp_alphas and keep
# them all, in the same order as the alphas
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(ccp_alpha=ccp_alpha, random_state=0).fit(X_train, y_train)
    clfs.append(clf)

# Report the node count of the tree fitted with the last alpha
last_node_count = clfs[-1].tree_.node_count
last_alpha = ccp_alphas[-1]
print(
    "Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        last_node_count, last_alpha
    )
)

In [ ]:
# Leave out the tree fitted with the last alpha, and that alpha itself.
# clfs is trimmed only while it still has one entry per alpha, so re-running
# this cell does not drop a further tree each time (which would leave clfs
# shorter than complexity_values and make the plots below fail).
if len(clfs) == len(ccp_alphas):
    clfs = clfs[:-1]
complexity_values = ccp_alphas[:-1]

# Size and depth of each remaining tree
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]

# Two stacked step plots: node count vs alpha, then tree depth vs alpha
fig, ax = plt.subplots(2, 1)
ax[0].plot(complexity_values, node_counts, marker="o", drawstyle="steps-post")
ax[0].set_xlabel("Alpha")
ax[0].set_ylabel("Numero di nodi")
ax[0].set_title("Numero nodi vs Alpha")
ax[1].plot(complexity_values, depth, marker="o", drawstyle="steps-post")
ax[1].set_xlabel("Alpha")
ax[1].set_ylabel("Profondità albero")
ax[1].set_title("Profondità albero vs Alpha")
fig.tight_layout()