In [1]:
#Makes files in your google drive accessible
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
#Checks if google colab is running on GPU
#If yes, output should be: '/device:GPU:0'
#Choose to run on GPU on Runtime->Change runtime type->Hardware accelarator
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [0]:
#IMPORTS NEEDED PACKAGES
import pandas as pd
import numpy as np
from time import time
import matplotlib.pyplot as plt

from scipy import ndimage, fft
from scipy.ndimage.filters import uniform_filter1d, gaussian_filter
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import make_scorer, fbeta_score
from sklearn.preprocessing import normalize, MinMaxScaler

In [0]:
#CLASS FOR PREPROCESSING LIGHT CURVES
class LightFluxProcessor:
  def __init__(self, fourier=True, normalize=True, gaussian=True, standardize=True):
    self.fourier = fourier
    self.normalize = normalize
    self.gaussian = gaussian
    self.standardize = standardize

  ##FUNDERA PÅ ORDNINGEN PÅ PROCESS-GREJSET
  def process(self, X, Xd):
    # Normalize
    if self.normalize:
      print("Normalizing...")
      X = normalize(X)
      Xd = normalize(Xd)

    if self.fourier:
      print("Applying Fourier...")
      X = np.abs(fft(X))
      Xd= np.abs(fft(Xd))

      # Keep first half of data as it is symmetrical after previous steps
      X = X[:,:(X.shape[1]//2)]
      Xd= Xd[:,:(Xd.shape[1]//2)]

    # Gaussian filter to smooth out data
    if self.gaussian:
      print("Applying Gaussian Filter...")
      X = ndimage.filters.gaussian_filter1d(X, sigma=100) #--_filter1d eller inte?
      Xd = ndimage.filters.gaussian_filter1d(Xd, sigma=100)

      #Standardize doesn't really work as it should, check axis if implemented
    if self.standardize:
        # Standardize X data
        print("Standardizing...")
        std_scaler = StandardScaler()
        X = std_scaler.fit_transform(X)
        Xd = std_scaler.transform(Xd)

    print("Finished Processing!")
    return X, Xd

In [0]:
#FUNCTION FOR PRESENTING GRID SEARCH RESULTS
def report(results, n_top=3):
    for i in range(1,n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [6]:
#LOADING DATA AND CONVERTING TO NUMPY ARRAYS
trainSetPath = "gdrive/My Drive/colab_kexoplanets_SVM/datasets/exoTrain.csv" #Make sure the folder fits on your drive
testSetPath = "gdrive/My Drive/colab_kexoplanets_SVM/datasets/exoTest.csv"
print("Loading datasets...")
df_train = pd.read_csv(trainSetPath, encoding = "ISO-8859-1") #Reads CSV-file with features
df_dev = pd.read_csv(testSetPath, encoding = "ISO-8859-1")    #


print("Finished loading!")

df_train_x = df_train.drop('LABEL', axis=1) #axis=1 chooses to drop columns
df_dev_x = df_dev.drop('LABEL', axis=1)
df_train_y = df_train.LABEL              #Y-values has the column name 'LABEL'
df_dev_y = df_dev.LABEL
X = np.array(df_train_x)  #
Xd = np.array(df_dev_x)   #
Y = np.array(df_train_y)  # Converts to numpy arrays
Yd= np.array(df_dev_y)    #

#Adds the backwards time series for training data
extra = np.flip(X[0:37,:], axis=-1)
extraY = Y[0:37]
X = np.append(X,extra, axis=0)
Y = np.append(Y,extraY,axis=0) 

#Adds the backwards timeseries to testing data
dextra = np.flip(Xd[0:5,:], axis=-1)
dextraY = Yd[0:5]
Xd = np.append(Xd, dextra, axis=0) #adding flipped data
Yd = np.append(Yd,dextraY,axis=0)

Y=Y-1     #To get postives to 1 and negatives to 0, original labels are
Yd=Yd-1   #    2 for positive and 1 for negatives

Loading datasets...
Finished loading!


In [7]:
#PREPROCESSING
LFP = LightFluxProcessor(
    fourier=False,
    normalize=False,             
    gaussian=False,              
    standardize=False)     #Standardize går kanske längs båda axlar - dåligt, ha inte på
X, Xd = LFP.process(X, Xd)

#Nornmalizing
X = ((X - np.mean(X, axis=1).reshape(-1,1)) / np.std(X, axis=1).reshape(-1,1))
Xd = ((Xd - np.mean(Xd, axis=1).reshape(-1,1)) / np.std(Xd, axis=1).reshape(-1,1))

Finished Processing!


In [0]:
#Parameters for GridSearch:
#Classweights
c_w0 = {0: 1/10000,
      1: 1}
c_w1 = {0: 1/1000,
      1: 1}
c_w2 = {0: 1/500,
      1: 1}
c_w3 = {0: 1/200,
      1: 1}    
c_w4 = {0: 1/100,
      1: 1}
c_w5 = {0: 1/50,
      1: 1}
c_w6 = {0: 1/20,
      1: 1}
c_w7 = {0: 1,
      1: 1}    
c_w8 = {0: 1,
      1: 1000}
c_w9 = {0: 1,
      1: 500}
c_w10= {0: 1,
      1: 200}    
c_w11= {0: 1,
      1: 100}
c_w12= {0: 1,
      1: 50}
c_w13= {0: 1,
      1: 20}

#GRID SEARCH WITH DIFFERENT KERNELS
gs1 = GridSearchCV(SVC(),
                scoring=make_scorer(fbeta_score,beta=2),
                n_jobs=12,                                 #n_jobs - Number of cores to run on, -1 is all possible (lots on GPU) but seems to generate errors
                cv=3,                                     #cv - How many cross validation splits are made
                param_grid={"kernel":['rbf'],             #Param grid is the parameters to input in the chosen classifer
                "class_weight":['balanced',c_w1,c_w2,c_w3,c_w4,c_w5,c_w6,c_w8,c_w9,c_w10,c_w11,c_w12,c_w13], 
                "gamma":['scale'],
                "max_iter":[1500]} )

gs2 = GridSearchCV(SVC(),
                scoring=make_scorer(fbeta_score,beta=2),
                n_jobs=12,
                cv=3, #check how splits are made
                param_grid={"kernel":['poly', 'linear'],
                "class_weight":['balanced',c_w1,c_w2,c_w3,c_w4,c_w5,c_w6,c_w8,c_w9,c_w10,c_w11,c_w12,c_w13], 
                "gamma":['scale'],
                "degree":[2,3],
                "max_iter":[1500]} )

gs3 = GridSearchCV(LinearSVC(),
                scoring=make_scorer(fbeta_score,beta=2),
                n_jobs=12,
                cv=3, #check how splits are made
                param_grid={"class_weight":['balanced' , c_w3, c_w10], 
                "max_iter":[1500,5000],
                "dual":[False, True]           } )

In [9]:
#PARAMETER SEARCH AND EVALUATION + BEST MODEL TRAINING #1
print("Starting grid search 1 fit..")
start = time()
gs1.fit(X,Y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
% (time() - start, len(gs1.cv_results_['params'])))
report(gs1.cv_results_)

print(' ')
print("Training best model 1...")
gs1.fit(X, Y)               #Choose which data to train on, features osv
print("Finished training!")
train_outputs1=gs1.predict(X)   
test_outputs1 =gs1.predict(Xd)
print(' ')
print('______________________________________________________________________')
print(' ')

#PARAMETER SEARCH AND EVALUATION + BEST MODEL TRAINING #2
print("Starting grid search 2 fit..")
start = time()
gs2.fit(X,Y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
% (time() - start, len(gs2.cv_results_['params'])))
report(gs2.cv_results_)

print(' ')
print("Training best model 2...")
gs2.fit(X, Y)               #Choose which data to train on, features osv
print("Finished training!")
train_outputs2=gs2.predict(X)   
test_outputs2 =gs2.predict(Xd)

#PARAMETER SEARCH AND EVALUATION + BEST MODEL TRAINING #3
print("Starting grid search 3 fit..")
start = time()
gs3.fit(X,Y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
  % (time() - start, len(gs3.cv_results_['params'])))
report(gs3.cv_results_)

print(' ')
print("Training best linear-kernel model 3...")
gs3.fit(X, Y)               #Choose which data to train on, features osv
print("Finished training!")
train_outputs3=gs3.predict(X)   
test_outputs3 =gs3.predict(Xd)

Starting grid search 1 fit..




GridSearchCV took 754.50 seconds for 13 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.068 (std: 0.001)
Parameters: {'class_weight': {0: 0.001, 1: 1}, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': 1500}

Model with rank: 1
Mean validation score: 0.068 (std: 0.001)
Parameters: {'class_weight': {0: 0.002, 1: 1}, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': 1500}

Model with rank: 1
Mean validation score: 0.068 (std: 0.001)
Parameters: {'class_weight': {0: 0.005, 1: 1}, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': 1500}

Model with rank: 1
Mean validation score: 0.068 (std: 0.001)
Parameters: {'class_weight': {0: 0.01, 1: 1}, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': 1500}

Model with rank: 1
Mean validation score: 0.068 (std: 0.001)
Parameters: {'class_weight': {0: 0.02, 1: 1}, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': 1500}

 
Training best model 1...




Finished training!
 
______________________________________________________________________
 
Starting grid search 2 fit..




GridSearchCV took 2196.53 seconds for 52 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.168 (std: 0.028)
Parameters: {'class_weight': 'balanced', 'degree': 2, 'gamma': 'scale', 'kernel': 'poly', 'max_iter': 1500}

Model with rank: 2
Mean validation score: 0.123 (std: 0.016)
Parameters: {'class_weight': {0: 1, 1: 1000}, 'degree': 2, 'gamma': 'scale', 'kernel': 'poly', 'max_iter': 1500}

Model with rank: 2
Mean validation score: 0.123 (std: 0.016)
Parameters: {'class_weight': {0: 1, 1: 500}, 'degree': 2, 'gamma': 'scale', 'kernel': 'poly', 'max_iter': 1500}

Model with rank: 2
Mean validation score: 0.123 (std: 0.016)
Parameters: {'class_weight': {0: 1, 1: 200}, 'degree': 2, 'gamma': 'scale', 'kernel': 'poly', 'max_iter': 1500}

Model with rank: 2
Mean validation score: 0.123 (std: 0.016)
Parameters: {'class_weight': {0: 1, 1: 100}, 'degree': 2, 'gamma': 'scale', 'kernel': 'poly', 'max_iter': 1500}

Model with rank: 2
Mean validation score: 0.123 (std: 0.016)
P



Finished training!
Starting grid search 3 fit..




GridSearchCV took 3846.01 seconds for 12 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.086 (std: 0.016)
Parameters: {'class_weight': {0: 0.005, 1: 1}, 'dual': True, 'max_iter': 5000}

Model with rank: 2
Mean validation score: 0.085 (std: 0.015)
Parameters: {'class_weight': {0: 0.005, 1: 1}, 'dual': False, 'max_iter': 5000}

Model with rank: 3
Mean validation score: 0.083 (std: 0.014)
Parameters: {'class_weight': {0: 0.005, 1: 1}, 'dual': True, 'max_iter': 1500}

 
Training best linear-kernel model 3...




Finished training!




In [10]:
#METRICS AND EVALUATION
print("#1")
precision_train=precision_score(Y,train_outputs1)
precision_test=precision_score(Yd,test_outputs1)
recall_train = recall_score(Y, train_outputs1)
recall_test = recall_score(Yd,test_outputs1)
fbeta_train = fbeta_score(Y,train_outputs1, beta=2)
fbeta_test = fbeta_score(Yd,test_outputs1,beta=2)
print("Precision training set: %.2f" %precision_train)
print("Precision test set: %.2f" %precision_test)
print("Recall training set: %.2f" %recall_train)
print("Recall test set: %.2f" %recall_test)
print("FB score training set: %.2f" %fbeta_train)
print("FB score test set: %.2f" %fbeta_test)
print(' ')
confM=confusion_matrix(Y,train_outputs1)
print("Confusion Matrix - Train Set")
print(confM)

confMd=confusion_matrix(Yd,test_outputs1)
print("Confusion Matrix - Test Set")
print(confMd)

#METRICS AND EVALUATION
print("#2")
precision_train=precision_score(Y,train_outputs2)
precision_test=precision_score(Yd,test_outputs2)
recall_train = recall_score(Y, train_outputs2)
recall_test = recall_score(Yd,test_outputs2)
fbeta_train = fbeta_score(Y,train_outputs2, beta=2)
fbeta_test = fbeta_score(Yd,test_outputs2,beta=2)
print("Precision training set: %.2f" %precision_train)
print("Precision test set: %.2f" %precision_test)
print("Recall training set: %.2f" %recall_train)
print("Recall test set: %.2f" %recall_test)
print("FB score training set: %.2f" %fbeta_train)
print("FB score test set: %.2f" %fbeta_test)
print(' ')
confM=confusion_matrix(Y,train_outputs2)
print("Confusion Matrix - Train Set")
print(confM)

confMd=confusion_matrix(Yd,test_outputs2)
print("Confusion Matrix - Test Set")
print(confMd)

#METRICS AND EVALUATION
print("#3")
precision_train=precision_score(Y,train_outputs3)
precision_test=precision_score(Yd,test_outputs3)
recall_train = recall_score(Y, train_outputs3)
recall_test = recall_score(Yd,test_outputs3)
fbeta_train = fbeta_score(Y,train_outputs3, beta=2)
fbeta_test = fbeta_score(Yd,test_outputs3,beta=2)
print("Precision training set: %.2f" %precision_train)
print("Precision test set: %.2f" %precision_test)
print("Recall training set: %.2f" %recall_train)
print("Recall test set: %.2f" %recall_test)
print("FB score training set: %.2f" %fbeta_train)
print("FB score test set: %.2f" %fbeta_test)
print(' ')
confM=confusion_matrix(Y,train_outputs3)
print("Confusion Matrix - Train Set")
print(confM)

confMd=confusion_matrix(Yd,test_outputs3)
print("Confusion Matrix - Test Set")
print(confMd)

#1
Precision training set: 0.01
Precision test set: 0.02
Recall training set: 1.00
Recall test set: 1.00
FB score training set: 0.07
FB score test set: 0.08
 
Confusion Matrix - Train Set
[[   0 5050]
 [   0   74]]
Confusion Matrix - Test Set
[[  0 565]
 [  0  10]]
#2
Precision training set: 0.77
Precision test set: 0.00
Recall training set: 1.00
Recall test set: 0.00
FB score training set: 0.94
FB score test set: 0.00
 
Confusion Matrix - Train Set
[[5028   22]
 [   0   74]]
Confusion Matrix - Test Set
[[563   2]
 [ 10   0]]
#3
Precision training set: 0.63
Precision test set: 0.02
Recall training set: 1.00
Recall test set: 0.20
FB score training set: 0.90
FB score test set: 0.06
 
Confusion Matrix - Train Set
[[5007   43]
 [   0   74]]
Confusion Matrix - Test Set
[[442 123]
 [  8   2]]
