# UCI MAGIC GAMMA TELESCOPE DATASET

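Data source: https://archive.ics.uci.edu/ml/machine-learning-databases/magic/

This notebook trains several classifiers (KNN, Naive Bayes, Logistic Regression, SVM and a small neural network) to separate gamma events from hadron events using the ten numeric features of the dataset.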

## Dataset

In [None]:
# Download the raw data file from the UCI repository, then rename it to
# "magic.data", which is the file name the loading cell reads.
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data
!mv magic04.data magic.data

--2023-03-04 03:59:43--  https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1477391 (1.4M) [application/x-httpd-php]
Saving to: ‘magic04.data’


2023-03-04 03:59:44 (3.66 MB/s) - ‘magic04.data’ saved [1477391/1477391]



## Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report

## Preprocessing

In [None]:
# Column names are supplied explicitly via `names=`; the last column is the
# class label. Names follow the dataset's attribute list ("fLength" and
# "fConc1" were previously misspelled as "flength" and "fConcl", which also
# showed up in the histogram titles below).
cols = ["fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", "class"]
df = pd.read_csv("magic.data", names=cols)
df.head()

Unnamed: 0,flength,fWidth,fSize,fConc,fConcl,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [None]:
df["class"].unique()

array(['g', 'h'], dtype=object)

In [None]:
# Encode the label as an integer: gamma ("g") -> 1, hadron ("h") -> 0.
# Guarded so that re-running this cell is a no-op: without the guard a second
# run would compare the already-encoded integers against "g" and silently turn
# every label into 0.
if not pd.api.types.is_integer_dtype(df["class"]):
  df["class"] = (df["class"] == "g").astype(int)

In [None]:
df.head()

Unnamed: 0,flength,fWidth,fSize,fConc,fConcl,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,1
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,1
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,1
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,1
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,1


## Visualizing

In [None]:
# One figure per feature: overlay the density-normalised histograms of the two
# classes so their distributions can be compared directly.
gamma_rows = df[df["class"] == 1]
hadron_rows = df[df["class"] == 0]

for feature in cols[:-1]:
  plt.hist(gamma_rows[feature], color="blue", label="gamma", alpha=0.7, density=True)
  plt.hist(hadron_rows[feature], color="red", label="hadron", alpha=0.7, density=True)
  plt.title(feature)
  plt.xlabel(feature)
  plt.ylabel("Probability")
  plt.legend()
  plt.show()
  print()

## Train, Validation and Test data

In [None]:
# Shuffle once with a fixed seed so the split is reproducible across kernel
# restarts, then cut it 60% / 20% / 20% by position.
# Plain .iloc slicing is used instead of np.split: np.split on a DataFrame goes
# through DataFrame.swapaxes, which pandas has deprecated.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(df))
valid_end = int(0.8 * len(df))

train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [None]:
len(df) , len(train), len(valid), len(test)

(19020, 11412, 3804, 3804)

In [None]:
def scale_dataset(dataframe, overSample=False, scaler=None, random_state=None):
  """Standardise the features of `dataframe` and optionally balance the classes.

  All columns except the last are treated as features; the last column is the
  label.

  Args:
    dataframe: DataFrame whose last column holds the label.
    overSample: when True, randomly oversample so that every class has as many
      rows as the largest one. Intended for the training split only.
    scaler: optional, already fitted scaler exposing `transform`. When given,
      it is applied as-is so that validation/test data can be standardised with
      the statistics learned on the training data. When None (the default), a
      new StandardScaler is fitted on `dataframe` itself.
    random_state: seed forwarded to RandomOverSampler to make the resampling
      reproducible; None keeps the unseeded behaviour.

  Returns:
    (data, X, y): `X` is the scaled feature matrix, `y` the label vector, and
    `data` is `X` with `y` appended as the last column.
  """
  X = dataframe[dataframe.columns[:-1]].values
  y = dataframe[dataframe.columns[-1]].values

  if scaler is None:
    X = StandardScaler().fit_transform(X)
  else:
    # Reuse the caller's fitted scaler; do not re-fit on this split.
    X = scaler.transform(X)

  if overSample:
    ros = RandomOverSampler(random_state=random_state)
    X, y = ros.fit_resample(X, y)

  # y is 1-D; reshape it into a column so it can be stacked next to X.
  data = np.hstack((X, np.reshape(y, (-1, 1))))

  return data, X, y

In [None]:
# Class balance of the training split before any oversampling is applied.
print(int((train["class"] == 1).sum())) # gamma
print(int((train["class"] == 0).sum())) # hadron

7486
3926


In [None]:
# Only the training split is oversampled; validation and test keep their
# original class balance.
# NOTE(review): as called here, scale_dataset fits a fresh StandardScaler on
# each frame it receives, so every split is standardised with its own
# mean/std rather than with the training statistics - confirm this is intended.
# NOTE(review): these assignments rebind train/valid/test from DataFrames to
# arrays, so this cell cannot be re-run without re-creating the split first.
train, x_train, y_train = scale_dataset(dataframe=train, overSample=True)
valid, x_valid, y_valid = scale_dataset(dataframe=valid)
test, x_test, y_test = scale_dataset(dataframe=test)

## *KNN* (K Nearest Neighbours)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Fit a 7-nearest-neighbours classifier on the (oversampled) training data.
# The bare fit(...) call is left as the last expression so the fitted
# estimator is displayed as the cell output.
knn_model = KNeighborsClassifier(n_neighbors=7)
knn_model.fit(X=x_train, y=y_train)

KNeighborsClassifier(n_neighbors=7)

In [None]:
# Predict labels for the held-out test features with the fitted KNN model.
y_pred = knn_model.predict(x_test)

In [None]:
y_pred

array([1, 1, 0, ..., 1, 1, 1])

In [None]:
y_test

array([1, 1, 0, ..., 1, 1, 1])

In [None]:
# Per-class precision / recall / F1 of the KNN predictions on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.75      0.73      0.74      1312
           1       0.86      0.87      0.87      2492

    accuracy                           0.82      3804
   macro avg       0.81      0.80      0.80      3804
weighted avg       0.82      0.82      0.82      3804



## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes on the training data; fit() returns the estimator
# itself, so construction and fitting can be chained.
nb_model = GaussianNB().fit(x_train, y_train)

In [None]:
# Predict labels for the held-out test features with the fitted Naive Bayes model.
y_pred = nb_model.predict(x_test)

In [None]:
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
y_test

array([1, 1, 0, ..., 1, 1, 1])

In [None]:
# Per-class precision / recall / F1 of the Naive Bayes predictions on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.68      0.41      0.51      1312
           1       0.74      0.90      0.81      2492

    accuracy                           0.73      3804
   macro avg       0.71      0.66      0.66      3804
weighted avg       0.72      0.73      0.71      3804



## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings; fit() returns the estimator
# itself, so construction and fitting can be chained.
lgr_model = LogisticRegression().fit(x_train, y_train)

In [None]:
# Score the logistic-regression model on the test split.
y_pred = lgr_model.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.69      0.70      0.70      1387
           1       0.83      0.82      0.83      2417

    accuracy                           0.78      3804
   macro avg       0.76      0.76      0.76      3804
weighted avg       0.78      0.78      0.78      3804



## SVM (Support Vector Machine)

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with default settings; fit() returns the
# estimator itself, so construction and fitting can be chained.
svm_model = SVC().fit(x_train, y_train)

In [None]:
# Score the SVM on the test split.
y_pred = svm_model.predict(x_test)
# classification_report expects (y_true, y_pred). The arguments were
# previously swapped, which exchanges precision with recall and reports the
# predicted class counts as "support" instead of the true ones.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.81      0.79      1317
           1       0.90      0.87      0.89      2487

    accuracy                           0.85      3804
   macro avg       0.84      0.84      0.84      3804
weighted avg       0.85      0.85      0.85      3804



## NN (Neural Networks)

In [None]:
import tensorflow as tf

In [None]:
def plot_loss(history):
  """Plot training and validation loss per epoch from a Keras-style history.

  `history.history` must contain the keys 'loss' and 'val_loss'.
  """
  for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve], label=curve)
  plt.xlabel('Epoch')
  plt.ylabel('Binary Cross Entropy')
  plt.legend()
  plt.grid(True)
  plt.show()

def plot_accuracy(history):
  """Plot training and validation accuracy per epoch from a Keras-style history.

  `history.history` must contain the keys 'accuracy' and 'val_accuracy'.
  """
  for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve], label=curve)
  plt.xlabel('Epoch')
  plt.ylabel('Accuracy')
  plt.legend()
  plt.grid(True)
  plt.show()

def plot_history(history):
  """Show loss (left) and accuracy (right) curves side by side.

  `history.history` must contain the keys 'loss', 'val_loss', 'accuracy' and
  'val_accuracy'. Each panel plots the training and validation curve.
  """
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

  panels = (
      (ax1, ('loss', 'val_loss'), 'Binary Cross Entropy'),
      (ax2, ('accuracy', 'val_accuracy'), 'Accuracy'),
  )
  for ax, curves, ylabel in panels:
    for curve in curves:
      ax.plot(history.history[curve], label=curve)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    # The curves carry labels, but they are only drawn once legend() is
    # called; without it the two lines in each panel cannot be told apart.
    ax.legend()
    ax.grid(True)

  plt.show()

In [None]:
def train_model(x_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs):
  """Build, compile and fit a small binary classifier.

  The network is two hidden ReLU layers of `num_nodes` units, each followed by
  dropout, and a single sigmoid output unit.

  Args:
    x_train: 2-D feature matrix of shape (n_samples, n_features).
    y_train: binary labels, one per row of `x_train`.
    num_nodes: number of units in each hidden layer.
    dropout_prob: dropout rate applied after each hidden layer.
    lr: learning rate passed to the Adam optimizer.
    batch_size: mini-batch size used by `fit`.
    epochs: number of training epochs.

  Returns:
    (nn_model, history): the fitted model and the object returned by `fit`.
  """
  # Take the input width from the data instead of hard-coding 10, so the
  # function keeps working if the feature set changes.
  n_features = np.shape(x_train)[1]

  nn_model = tf.keras.Sequential([
      tf.keras.layers.Dense(num_nodes, activation='relu', input_shape=(n_features,)),
      tf.keras.layers.Dropout(dropout_prob),
      tf.keras.layers.Dense(num_nodes, activation='relu'),
      tf.keras.layers.Dropout(dropout_prob),
      tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  nn_model.compile(optimizer=tf.keras.optimizers.Adam(lr), loss='binary_crossentropy', metrics=['accuracy'])

  # validation_split holds out 20% of the rows passed in, which is what
  # produces the val_* entries in the returned history.
  history = nn_model.fit(
    x_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0
  )

  return nn_model, history

In [None]:
epochs = 100

least_val_loss = float('inf')
least_loss_model = None

# Every (nodes, dropout, learning rate, batch size) combination, enumerated in
# the same order as four nested loops would visit them.
search_space = [
    (num_nodes, dropout_prob, lr, batch_size)
    for num_nodes in [16, 32, 64]
    for dropout_prob in [0, 0.2]
    for lr in [0.01, 0.005, 0.001]
    for batch_size in [32, 64, 128]
]

for num_nodes, dropout_prob, lr, batch_size in search_space:
  print(f"{num_nodes} nodes, dropout probability = {dropout_prob}, learning rate = {lr}, batch size = {batch_size}")
  model, history = train_model(x_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs)

  plot_history(history)

  # Element 0 of evaluate()'s result is taken as the loss; the model with the
  # lowest loss on the separate validation split is kept.
  val_loss = model.evaluate(x_valid, y_valid)[0]
  if val_loss < least_val_loss:
    least_val_loss, least_loss_model = val_loss, model

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# The network ends in a single sigmoid unit, so predict() yields one score per
# row; threshold at 0.5 and flatten to a 1-D vector of 0/1 labels.
scores = least_loss_model.predict(x_test)
y_pred = (scores > 0.5).astype(int).reshape(-1,)
y_pred



array([1, 1, 1, ..., 1, 1, 1])

In [None]:
y_test

array([1, 1, 1, ..., 1, 1, 0])

In [None]:
# classification_report expects (y_true, y_pred). The arguments were
# previously swapped, which exchanges precision with recall and reports the
# predicted class counts as "support" instead of the true ones.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.89      0.81      1140
           1       0.95      0.86      0.90      2664

    accuracy                           0.87      3804
   macro avg       0.84      0.88      0.85      3804
weighted avg       0.88      0.87      0.87      3804

