In [1]:
#imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn import svm

### Preprocessing and set up

In [2]:
brc = pd.read_csv('brc.txt')

In [3]:
#Set column names
headers = ['record_id', 'clump_thickness', 'uniform_cellsize', 'uniform_cellshape', 'marginal_adhesion', 'cell_size_epit_sing', 'bare_nuclei', 'bland_chromatin', 'normal_nuclei', 'mitoses', 'malignancy']
brc.columns = headers

In [4]:
#display the headers
brc.head()

Unnamed: 0,record_id,clump_thickness,uniform_cellsize,uniform_cellshape,marginal_adhesion,cell_size_epit_sing,bare_nuclei,bland_chromatin,normal_nuclei,mitoses,malignancy
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


In [5]:
#create the X set, which contains the features
xset = brc[['clump_thickness', 'uniform_cellsize', 'uniform_cellshape', 'marginal_adhesion', 'cell_size_epit_sing', 'bare_nuclei', 'bland_chromatin', 'normal_nuclei', 'mitoses']]
#create the Y set, which contains the labels
yset = brc['malignancy']

In [6]:
#split testing and training sets
xtrain, xtest, ytrain, ytest = train_test_split(xset, yset, random_state = 13)

### Question 1 polynomial grid search

In [7]:
#define polynomial kernel grid search
def poly_grid_search(costs = [0.001, 0.01, 0.1, 1, 10, 100, 1000], degree = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]):
    
    bestCost = -1
    bestDeg = -1
    bestScore = -1
    
    #iterate over all costs
    for cost in costs:
        #iterate over all degrees
        for deg in degree:
            #instantiate SVC with the current loops' values
            psvmc = svm.SVC(kernel = 'poly', C = cost, degree = deg)
            #fit the data
            psvmc.fit(xtrain, ytrain)
            #store the local score of this combination of hyperparameters
            localScore = psvmc.score(xtest, ytest)
            
            #print this iteration's peformance and hyperparameters
            #use the string format/token placeholder/{} replacement method to format the output so it is uniform
            print('C = {0: <5} || Degree = {1: <4} || Accuracy: {2: <20}'.format(cost, deg, localScore))
            
            #selects the best combination:
            #if the current combination did a better job (scored better in accuracy), 
            if(localScore > bestScore):
                #replace the current best values and score for output
                bestCost = cost
                bestDeg = deg
                bestScore = localScore
    
    #string formatted output, using positional replacement in tandem with positional arguments
    print('\nBest Score: {}, with Cost = {} and Degree = {}'.format(bestScore, bestCost, bestDeg))
    return bestCost, bestDeg, bestScore

In [8]:
poly_grid_search([0.01, 0.01, 0.1, 1, 10], [2, 3, 4])

C = 0.01  || Degree = 2    || Accuracy: 0.9590643274853801  
C = 0.01  || Degree = 3    || Accuracy: 0.9590643274853801  
C = 0.01  || Degree = 4    || Accuracy: 0.9590643274853801  
C = 0.01  || Degree = 2    || Accuracy: 0.9590643274853801  
C = 0.01  || Degree = 3    || Accuracy: 0.9590643274853801  
C = 0.01  || Degree = 4    || Accuracy: 0.9590643274853801  
C = 0.1   || Degree = 2    || Accuracy: 0.9532163742690059  
C = 0.1   || Degree = 3    || Accuracy: 0.9532163742690059  
C = 0.1   || Degree = 4    || Accuracy: 0.9532163742690059  
C = 1     || Degree = 2    || Accuracy: 0.9590643274853801  
C = 1     || Degree = 3    || Accuracy: 0.9532163742690059  
C = 1     || Degree = 4    || Accuracy: 0.935672514619883   
C = 10    || Degree = 2    || Accuracy: 0.9532163742690059  
C = 10    || Degree = 3    || Accuracy: 0.935672514619883   
C = 10    || Degree = 4    || Accuracy: 0.935672514619883   

Best Score: 0.9590643274853801, with Cost = 0.01 and Degree = 2


(0.01, 2, 0.9590643274853801)

### Question 2 rbf grid search

In [9]:
def rbf_grid_search(costs = [0.001, 0.01, 0.1, 1, 10, 100, 1000], gamma = [0.001, 0.01, 0.1, 1, 10, 100, 1000]):
    
    bestCost = -1
    bestGam = -1
    bestScore = -1
    
    for cost in costs:
        for gam in gamma:
            rbf_svmc = svm.SVC(kernel = 'rbf', C = cost, gamma = gam)
            rbf_svmc.fit(xtrain, ytrain)
            localScore = rbf_svmc.score(xtest, ytest)
            
            print('C = {0: <5} || Gamma = {1: <5} || Accuracy: {2: <20}'.format(cost, gam, localScore))
            
            #selects the best combination
            if (localScore > bestScore):
                bestCost = cost
                bestGam = gam
                bestScore = localScore
            
    print('\nBest Score: {}, with Cost = {} and Gamma = {}'.format(bestScore, bestCost, bestGam))
    return bestCost, bestGam, bestScore

In [10]:
rbf_grid_search([0.01, 0.1, 1, 10], [0.001, 0.01, 0.1])

C = 0.01  || Gamma = 0.001 || Accuracy: 0.672514619883041   
C = 0.01  || Gamma = 0.01  || Accuracy: 0.9532163742690059  
C = 0.01  || Gamma = 0.1   || Accuracy: 0.672514619883041   
C = 0.1   || Gamma = 0.001 || Accuracy: 0.9532163742690059  
C = 0.1   || Gamma = 0.01  || Accuracy: 0.9532163742690059  
C = 0.1   || Gamma = 0.1   || Accuracy: 0.9239766081871345  
C = 1     || Gamma = 0.001 || Accuracy: 0.9532163742690059  
C = 1     || Gamma = 0.01  || Accuracy: 0.9590643274853801  
C = 1     || Gamma = 0.1   || Accuracy: 0.9532163742690059  
C = 10    || Gamma = 0.001 || Accuracy: 0.9590643274853801  
C = 10    || Gamma = 0.01  || Accuracy: 0.9590643274853801  
C = 10    || Gamma = 0.1   || Accuracy: 0.9649122807017544  

Best Score: 0.9649122807017544, with Cost = 10 and Gamma = 0.1


(10, 0.1, 0.9649122807017544)