In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [13]:
# Column names assigned to the data; the last one ("class") is the label.
cols = [
    "fLength", "fWidth", "fSize", "fConc", "fConc1",
    "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
    "class",
]
gamma_data_path = "../data/processed/magic04.data"

# Column names are supplied explicitly, so every row of the file is read as data.
gamma_data = pd.read_csv(gamma_data_path, header=None, names=cols)
gamma_data.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [14]:
# Inspect the distinct label values before they are encoded.
pd.unique(gamma_data["class"])

array(['g', 'h'], dtype=object)

In [15]:
# Encode the label as an integer: 1 for "g" (gamma), 0 for anything else ("h", hadron).
# The dtype guard keeps the cell safe to re-run: without it, a second run would compare
# the already-encoded integers against "g" and silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(gamma_data["class"]):
  gamma_data["class"] = (gamma_data["class"] == "g").astype(int)
print("Making class into a binary variable")

Making class into a binary variable


In [16]:
# Confirm which label values remain after the encoding step.
pd.unique(gamma_data["class"])

array([1, 0])

In [None]:
# Overlay the gamma (class 1) and hadron (class 0) distribution of every feature,
# one figure per feature. The class masks do not depend on the feature, so they
# are built once before the loop.
gamma_rows = gamma_data["class"] == 1
hadron_rows = gamma_data["class"] == 0
series_style = ((gamma_rows, 'blue', 'gamma'), (hadron_rows, 'red', 'hadron'))

for feature in cols[:-1]:
  for rows, colour, name in series_style:
    # density=True normalises each histogram, so the two classes can be compared
    # even though they contain different numbers of rows.
    plt.hist(gamma_data.loc[rows, feature], color=colour, label=name, alpha=0.7, density=True)
  plt.title(feature)
  plt.ylabel("Probability")
  plt.xlabel(feature)
  plt.legend()
  plt.show()

In [32]:

# Shuffle once with a fixed seed so the split (and every metric computed from it)
# is reproducible, then cut the shuffled frame 60% / 20% / 20% into
# train / validation / test.
# Plain iloc slicing is used instead of np.split, which emits a deprecation
# warning when handed a DataFrame on recent pandas versions.
shuffled = gamma_data.sample(frac=1, random_state=42)
train_end = int(.6 * len(shuffled))
valid_end = int(.8 * len(shuffled))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [33]:
def scale_dataset(dataframe, oversample=False, scaler=None, random_state=None):
  """Scale the features of `dataframe` and optionally oversample the result.

  The last column of `dataframe` is treated as the label; every other column
  is treated as a feature.

  Args:
    dataframe: pandas DataFrame whose final column holds the label.
    oversample: if True, the scaled features and labels are resampled with
      RandomOverSampler before being returned.
    scaler: optional, already-fitted object exposing `transform` (for example
      a StandardScaler fitted on the training features). When given, it is
      applied as-is, which lets validation/test data be scaled with the same
      statistics as the training data. When None (default), a new
      StandardScaler is fitted on `dataframe` itself.
    random_state: seed forwarded to RandomOverSampler. None (default) leaves
      the resampling unseeded.

  Returns:
    A tuple `(data, features, label)`: `features` is the scaled 2-D feature
    array, `label` is the 1-D label array, and `data` is the two stacked
    horizontally with the label as the last column.
  """
  features = dataframe[dataframe.columns[:-1]].values
  label = dataframe[dataframe.columns[-1]].values

  if scaler is None:
    # No scaler supplied: fit a fresh one on this data.
    features = StandardScaler().fit_transform(features)
  else:
    # Reuse the caller's fitted scaler so every split shares one feature scale.
    features = scaler.transform(features)

  if oversample:
    ros = RandomOverSampler(random_state=random_state)
    features, label = ros.fit_resample(features, label)

  # The label must be a column vector to be stacked next to the 2-D features.
  data = np.hstack((features, np.reshape(label, (-1, 1))))

  return data, features, label

In [34]:
# Naming: X_* are feature matrices, y_* are label vectors.
# Only the training split is oversampled; validation and test are just scaled.
train, X_train, y_train = scale_dataset(train, oversample=True)
# NOTE: train/valid/test are rebound here from DataFrames to NumPy arrays, so this
# cell cannot be re-run without first re-running the split cell.
valid, X_valid, y_valid = scale_dataset(valid)
test, X_test, y_test = scale_dataset(test)

# KNN

In [35]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [36]:
# Fit a k-nearest-neighbours classifier (k = 5) on the training features and labels.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X=X_train, y=y_train)

In [39]:
# Predict labels for the test features with the fitted KNN model.
y_pred = knn_model.predict(X=X_test)

In [40]:
# Per-class precision / recall / F1 of the KNN predictions on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.73      0.73      0.73      1305
           1       0.86      0.86      0.86      2499

    accuracy                           0.81      3804
   macro avg       0.79      0.79      0.79      3804
weighted avg       0.81      0.81      0.81      3804



> ### precision
> - tp / (tp + fp)
> - The number of instances that are relevant, out of the total instances the model retrieved.
> - How many retrieved items are relevant?
> ### recall
> - tp / (tp + fn)
> - The number of instances which the model correctly identified as relevant out of the total relevant instances.
> - How many relevant items are retrieved?
- [precision and recall medium article](https://towardsdatascience.com/precision-and-recall-88a3776c8007)

# Naive Bayes

In [41]:
from sklearn.naive_bayes import GaussianNB

In [42]:
# Gaussian Naive Bayes with default settings; fit() returns the fitted estimator.
nb_model = GaussianNB().fit(X_train, y_train)

In [43]:
# Evaluate the Naive Bayes model on the test split.
y_pred = nb_model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.67      0.41      0.51      1305
           1       0.74      0.89      0.81      2499

    accuracy                           0.73      3804
   macro avg       0.70      0.65      0.66      3804
weighted avg       0.72      0.73      0.71      3804



### Naive Bayes
- [Naive Bayes explanation](https://www.youtube.com/watch?v=HZGCoVF3YvM)

# Logistic Regression

In [45]:
from sklearn.linear_model import LogisticRegression

In [46]:
# Logistic regression with default settings; fit() returns the fitted estimator.
lg_model = LogisticRegression().fit(X_train, y_train)

In [47]:
# Evaluate the logistic regression model on the test split.
y_pred = lg_model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.67      0.72      0.69      1305
           1       0.85      0.81      0.83      2499

    accuracy                           0.78      3804
   macro avg       0.76      0.77      0.76      3804
weighted avg       0.79      0.78      0.78      3804



### Logistic Regression
- Logistic regression is a classification algorithm that models the probability of the positive class by passing a linear combination of the features through the sigmoid function.

# Support Vector Machine (SVM)

In [48]:
from sklearn.svm import SVC

In [49]:
# Support vector classifier with default settings; fit() returns the fitted estimator.
svm_model = SVC().fit(X_train, y_train)

In [50]:
# Evaluate the SVM on the test split.
y_pred = svm_model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.81      0.79      0.80      1305
           1       0.89      0.90      0.90      2499

    accuracy                           0.86      3804
   macro avg       0.85      0.84      0.85      3804
weighted avg       0.86      0.86      0.86      3804



### Support Vector Machine (SVM)
- An SVM finds the hyperplane that separates the classes with the largest possible margin.