In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [0]:
import pandas as pd
import numpy as np
from time import time
import matplotlib.pyplot as plt

from scipy import ndimage, fft
from scipy.ndimage.filters import uniform_filter1d, gaussian_filter
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import make_scorer, fbeta_score


In [0]:
#FUNCTION FOR PRESENTING GRID SEARCH RESULTS
def report(results, n_top=3):
    for i in range(1,n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [6]:
#LOADING DATA AND CONVERTING TO NUMPY ARRAYS
trainSetPath = "gdrive/My Drive/colab_kexoplanets_SVM/datasets/exoTrain.csv"
testSetPath = "gdrive/My Drive/colab_kexoplanets_SVM/datasets/exoTest.csv"
print("Loading datasets...")
df_train = pd.read_csv(trainSetPath, encoding = "ISO-8859-1")
df_dev = pd.read_csv(testSetPath, encoding = "ISO-8859-1")

feat_train = "gdrive/My Drive/colab_kexoplanets_SVM/datasets/+train_features_n_s100.csv"
feat_test = "gdrive/My Drive/colab_kexoplanets_SVM/datasets/+test_features_n_s100.csv"
# Generate X and Y dataframe sets
X_feat = pd.read_csv(feat_train, encoding = "ISO-8859-1")
Xd_feat = pd.read_csv(feat_test, encoding = "ISO-8859-1")
print("Finished loading!")

#Converting to numpy arrays
X_feat = np.array(X_feat)
Xd_feat = np.array(Xd_feat)
df_train_y = df_train.LABEL
df_dev_y = df_dev.LABEL
Y = np.array(df_train_y)
Yd= np.array(df_dev_y)

extraY = Y[0:37]
Y = np.append(Y,extraY,axis=0) #Kan vara bra att lägga till fler exempel för test-setet också
dextraY = Yd[0:5]
Yd = np.append(Yd,dextraY,axis=0)
Y=Y-1
Yd=Yd-1     #To get postives to 1 and negatives to 0


#Scaling each feautre for RBF-kernel
X_rbf = ((X_feat - np.mean(X_feat, axis=0).reshape(1,-1)) / np.std(X_feat, axis=0).reshape(1,-1))
Xd_rbf = ((Xd_feat - np.mean(Xd_feat, axis=0).reshape(1,-1)) / np.std(Xd_feat, axis=0).reshape(1,-1))

Loading datasets...
Finished loading!


In [0]:
#Parameters for GridSearch:
#region Classweights
c_w0 = {0: 1/25,
      1: 1}
c_w1 = {0: 1/25,
      1: 3}
c_w2 = {0: 1/30,
      1: 1}
c_w3 = {0: 1/40,
      1: 2.5}    
c_w4 = {0: 1/50,
      1: 2}
c_w5 = {0: 1/50,
      1: 3}
c_w6 = {0: 1/40,
      1: 3}
c_w7 = {0: 1/70,
      1: 5}    
c_w8 = {0: 1/10,
      1: 16}
c_w9 = {0: 1/35,
      1: 2}
c_w10= {0: 1/30,
      1: 2}    
c_w11= {0: 1/40,
      1: 1.5}
c_w12= {0: 1/10,
      1: 8}
c_w13= {0: 1/10,
      1: 6}
#endregion

gs1 = GridSearchCV(SVC(),
                scoring=make_scorer(fbeta_score,beta=2),
                n_jobs=-1,
                cv=3, #check how splits are made
                param_grid={"kernel":['rbf'],
                "class_weight":['balanced',c_w0,c_w1,c_w2,c_w3,c_w4,c_w5,c_w6,c_w7,c_w8,c_w9,c_w10,c_w11,c_w12,c_w13], 
                "gamma":['scale','auto'],
                "cache_size":[6000],
                "max_iter":[500,1000,1500, 10000]} )

gs2 = GridSearchCV(SVC(),
                scoring=make_scorer(fbeta_score,beta=2),
                n_jobs=-1,
                cv=3, #check how splits are made
                param_grid={"kernel":['poly','linear'],
                "class_weight":['balanced',c_w0,c_w1,c_w2,c_w3,c_w4,c_w5,c_w6,c_w7,c_w8,c_w9,c_w10,c_w11,c_w12,c_w13], 
                "gamma":['scale',"auto"],
                "degree":[2,3],
                "cache_size":[6000],
                "max_iter":[10000]} )

gs3 = GridSearchCV(LinearSVC(),
                scoring=make_scorer(fbeta_score,beta=2),
                n_jobs=-1,
                cv=3, #check how splits are made
                param_grid={"class_weight":['balanced',c_w0,c_w1,c_w2,c_w3,c_w4,c_w5,c_w6,c_w7,c_w8,c_w9,c_w10,c_w11,c_w12,c_w13], 
                "max_iter":[10000],
                "dual":[False,True]           } )


In [12]:
#PARAMETER SEARCH AND EVALUATION + BEST MODEL TRAINING #1
print("Starting grid search 1 fit..")
start = time()
gs1.fit(X_rbf,Y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
  % (time() - start, len(gs1.cv_results_['params'])))
report(gs1.cv_results_)

print(' ')
print("Training best RBF-kernel model 1...")
gs1.fit(X_rbf, Y)               #Choose which data to train on, features osv
print("Finished training!")
train_outputs1=gs1.predict(X_rbf)   
test_outputs1 =gs1.predict(Xd_rbf)
print(' ')
print('______________________________________________________________________')
print(' ')

#PARAMETER SEARCH AND EVALUATION + BEST MODEL TRAINING #2
print("Starting grid search 2 fit..")
start = time()
gs2.fit(X_feat,Y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
  % (time() - start, len(gs2.cv_results_['params'])))
report(gs2.cv_results_)

print(' ')
print("Training best poly-kernel model 2...")
gs2.fit(X_feat, Y)               #Choose which data to train on, features osv
print("Finished training!")
train_outputs2=gs2.predict(X_feat)   
test_outputs2 =gs2.predict(Xd_feat)


print(' ')
print('______________________________________________________________________')
print(' ')

#PARAMETER SEARCH AND EVALUATION + BEST MODEL TRAINING #3
print("Starting grid search 3 fit..")
start = time()
gs3.fit(X_feat,Y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
  % (time() - start, len(gs3.cv_results_['params'])))
report(gs3.cv_results_)

print(' ')
print("Training best linear-kernel model 3...")
gs3.fit(X_feat, Y)               #Choose which data to train on, features osv
print("Finished training!")
train_outputs3=gs3.predict(X_feat)   
test_outputs3 =gs3.predict(Xd_feat)

Starting grid search 1 fit..
GridSearchCV took 808.55 seconds for 120 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.264 (std: 0.030)
Parameters: {'cache_size': 6000, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': 10000}

Model with rank: 2
Mean validation score: 0.264 (std: 0.029)
Parameters: {'cache_size': 6000, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': 1500}

Model with rank: 3
Mean validation score: 0.264 (std: 0.032)
Parameters: {'cache_size': 6000, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf', 'max_iter': 1500}

 
Training best RBF-kernel model 1...
Finished training!
 
______________________________________________________________________
 
Starting grid search 2 fit..
GridSearchCV took 695.01 seconds for 120 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.201 (std: 0.051)
Parameters: {'cache_size': 6000, 'class_weight': {0: 0.1, 1: 6}, 'degree': 2, '

In [13]:
# #METRICS AND EVALUATION
print("#1")
precision_train=precision_score(Y,train_outputs1)
precision_test=precision_score(Yd,test_outputs1)
recall_train = recall_score(Y, train_outputs1)
recall_test = recall_score(Yd,test_outputs1)
fbeta_train = fbeta_score(Y,train_outputs1, beta=2)
fbeta_test = fbeta_score(Yd,test_outputs1,beta=2)
print("Precision training set: %.2f" %precision_train)
print("Precision test set: %.2f" %precision_test)
print("Recall training set: %.2f" %recall_train)
print("Recall test set: %.2f" %recall_test)
print("FB score training set: %.2f" %fbeta_train)
print("FB score test set: %.2f" %fbeta_test)
print(' ')
confM=confusion_matrix(Y,train_outputs1)
print("Confusion Matrix - Train Set")
print(confM)

confMd=confusion_matrix(Yd,test_outputs1)
print("Confusion Matrix - Test Set")
print(confMd)

#METRICS AND EVALUATION
print("#2")
precision_train=precision_score(Y,train_outputs2)
precision_test=precision_score(Yd,test_outputs2)
recall_train = recall_score(Y, train_outputs2)
recall_test = recall_score(Yd,test_outputs2)
fbeta_train = fbeta_score(Y,train_outputs2, beta=2)
fbeta_test = fbeta_score(Yd,test_outputs2,beta=2)
print("Precision training set: %.2f" %precision_train)
print("Precision test set: %.2f" %precision_test)
print("Recall training set: %.2f" %recall_train)
print("Recall test set: %.2f" %recall_test)
print("FB score training set: %.2f" %fbeta_train)
print("FB score test set: %.2f" %fbeta_test)
print(' ')
confM=confusion_matrix(Y,train_outputs2)
print("Confusion Matrix - Train Set")
print(confM)

confMd=confusion_matrix(Yd,test_outputs2)
print("Confusion Matrix - Test Set")
print(confMd)

#METRICS AND EVALUATION
print("#3")
precision_train=precision_score(Y,train_outputs3)
precision_test=precision_score(Yd,test_outputs3)
recall_train = recall_score(Y, train_outputs3)
recall_test = recall_score(Yd,test_outputs3)
fbeta_train = fbeta_score(Y,train_outputs3, beta=2)
fbeta_test = fbeta_score(Yd,test_outputs3,beta=2)
print("Precision training set: %.2f" %precision_train)
print("Precision test set: %.2f" %precision_test)
print("Recall training set: %.2f" %recall_train)
print("Recall test set: %.2f" %recall_test)
print("FB score training set: %.2f" %fbeta_train)
print("FB score test set: %.2f" %fbeta_test)
print(' ')
confM=confusion_matrix(Y,train_outputs3)
print("Confusion Matrix - Train Set")
print(confM)

confMd=confusion_matrix(Yd,test_outputs3)
print("Confusion Matrix - Test Set")
print(confMd)

#1
Precision training set: 0.17
Precision test set: 0.00
Recall training set: 0.99
Recall test set: 0.00
FB score training set: 0.50
FB score test set: 0.00
 
Confusion Matrix - Train Set
[[4695  355]
 [   1   73]]
Confusion Matrix - Test Set
[[520  45]
 [ 10   0]]
#2
Precision training set: 0.07
Precision test set: 0.07
Recall training set: 0.36
Recall test set: 0.20
FB score training set: 0.20
FB score test set: 0.14
 
Confusion Matrix - Train Set
[[4707  343]
 [  47   27]]
Confusion Matrix - Test Set
[[538  27]
 [  8   2]]
#3
Precision training set: 0.09
Precision test set: 0.09
Recall training set: 0.23
Recall test set: 0.10
FB score training set: 0.18
FB score test set: 0.10
 
Confusion Matrix - Train Set
[[4883  167]
 [  57   17]]
Confusion Matrix - Test Set
[[555  10]
 [  9   1]]
