In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [2]:
df = pd.read_csv("data.csv")
df

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.0110,-8.2027,40.0920,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.2610,g
2,162.0520,136.0310,4.0612,0.0374,0.0187,116.7410,-64.8580,-45.2160,76.9600,256.7880,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.4490,116.7370,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.6480,356.4620,g
...,...,...,...,...,...,...,...,...,...,...,...
19015,21.3846,10.9170,2.6161,0.5857,0.3934,15.2618,11.5245,2.8766,2.4229,106.8258,h
19016,28.9452,6.7020,2.2672,0.5351,0.2784,37.0816,13.1853,-2.9632,86.7975,247.4560,h
19017,75.4455,47.5305,3.4483,0.1417,0.0549,-9.3561,41.0562,-9.4662,30.2987,256.5166,h
19018,120.5135,76.9018,3.9939,0.0944,0.0683,5.8043,-93.5224,-63.8389,84.6874,408.3166,h


In [3]:
df["class"] = (df["class"] == "g").astype(int) 

In [4]:
# for column in df.columns:
#     plt.hist(df[df["class"] == 1][column], label = "Gamma", alpha = 0.7, density = True)
#     plt.hist(df[df["class"] == 0][column], label = "Hadron", alpha = 0.7, density = True)
#     plt.title(column)
#     plt.xlabel(column)
#     plt.ylabel('Probability')
#     plt.legend()
#     plt.show()

# Train, Validation, Test Dataset

In [5]:
train, valid, test = np.split(df.sample(frac = 1), [int(0.6 * len(df)), int(0.8 * len(df))])

In [6]:
def scaleDataset(dataframe, oversample=False):
  X = dataframe[dataframe.columns[:-1]].values
  y = dataframe[dataframe.columns[-1]].values

  scaler = StandardScaler()
  X = scaler.fit_transform(X)

  if oversample:
    ros = RandomOverSampler()
    X, y = ros.fit_resample(X, y)

  data = np.hstack((X, np.reshape(y, (-1, 1))))

  return data, X, y

In [7]:
train, XTrain, yTrain = scaleDataset(train, oversample=True)
valid, XValid, yValid = scaleDataset(valid, oversample=False)
test, XTest, yTest = scaleDataset(test, oversample=False)

# kNN - K-Nearest Neighbours

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [9]:
knnModel = KNeighborsClassifier(n_neighbors=1)

In [10]:
knnModel.fit(XTrain, yTrain)

In [11]:
yPreds = knnModel.predict(XTest)

In [12]:
print(classification_report(yPreds, yTest))

              precision    recall  f1-score   support

           0       0.69      0.77      0.73      1187
           1       0.89      0.84      0.87      2617

    accuracy                           0.82      3804
   macro avg       0.79      0.81      0.80      3804
weighted avg       0.83      0.82      0.82      3804



# Naive Bayes

In [13]:
from sklearn.naive_bayes import GaussianNB

In [14]:
nbModel = GaussianNB()
nbModel.fit(XTrain, yTrain)

In [15]:
yPreds = nbModel.predict(XTest)

In [16]:
print(classification_report(yPreds, yTest))

              precision    recall  f1-score   support

           0       0.42      0.71      0.53       786
           1       0.91      0.74      0.82      3018

    accuracy                           0.74      3804
   macro avg       0.66      0.73      0.67      3804
weighted avg       0.81      0.74      0.76      3804



# Logisitic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
lrModel = LogisticRegression()
lrModel.fit(XTrain, yTrain)

In [19]:
yPreds = lrModel.predict(XTest)

In [20]:
print(classification_report(yPreds, yTest))

              precision    recall  f1-score   support

           0       0.73      0.68      0.71      1418
           1       0.82      0.85      0.83      2386

    accuracy                           0.79      3804
   macro avg       0.78      0.77      0.77      3804
weighted avg       0.79      0.79      0.79      3804



# SVMs - Support Vector Machines

In [21]:
from sklearn.svm import SVC # SVC - Support Vector Classifier

In [22]:
svmModel = SVC()
svmModel.fit(XTrain, yTrain)

In [23]:
yPreds = svmModel.predict(XTest)

In [24]:
print(classification_report(yPreds, yTest))

              precision    recall  f1-score   support

           0       0.79      0.81      0.80      1296
           1       0.90      0.89      0.90      2508

    accuracy                           0.86      3804
   macro avg       0.85      0.85      0.85      3804
weighted avg       0.86      0.86      0.86      3804



# Neural Network

In [25]:
import tensorflow as tf

In [26]:
def plotHistory(history):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (10, 4))
    ax1.plot(history.history["loss"], label = "loss")
    ax1.plot(history.history["val_loss"], label = "val_loss")
    ax1.set_xlabel("Epoch")
    ax1.set_ylabel("Binary Crossentropy")
    ax1.grid(True)

    ax2.plot(history.history["accuracy"], label = "accuracy")
    ax2.plot(history.history["val_accuracy"], label = "val_accuracy")
    ax2.set_xlabel("Epoch")
    ax2.set_ylabel("Accuracy")
    ax2.grid(True)
    
    plt.show()

In [27]:
def trainModel(Xtrain, yTrain, numOfNodes, dropoutProbability, learningRate, batchSize, epochs):
    nnModel = tf.keras.Sequential([
        tf.keras.layers.Dense(numOfNodes, activation = "relu", input_shape = (10,)),
        tf.keras.layers.Dropout(dropoutProbability),
        tf.keras.layers.Dense(numOfNodes, activation = "relu"),
        tf.keras.layers.Dropout(dropoutProbability),
        tf.keras.layers.Dense(1, activation = "sigmoid")
    ])
    nnModel.compile(optimizer = tf.keras.optimizers.Adam(learningRate), loss = "binary_crossentropy", metrics = ["accuracy"])
    history = nnModel.fit(XTrain, yTrain, epochs = epochs, batch_size = batchSize, validation_data = 0.2, verbose = 0)
    return nnModel, history

In [29]:
# leastValLoss = float("inf")
# leastLossModel = None
# epochs = 100
# for numOfNodes in [16, 32, 64]:
#     for dropoutProbability in [0, 0.2]:
#         for learningRate in [0.01, 0.005, 0.001]:
#             for batchSize in [32, 64, 128]:
#                 print(f"{numOfNodes} nodes, probability is {dropoutProbability}, learning rate is {learningRate}, and batch size is {batchSize}")
#                 model, history = trainModel(XTrain, yTrain, numOfNodes, dropoutProbability, learningRate, batchSize, epochs)
#                 plotHistory(history)
#                 valLoss = model.evaluate(XValid, yValid)
#                 if valLoss < leastValLoss:
#                     leastValLoss = valLoss
#                     leastLossModel = model