# Training and Optimization

## Import Important Libraries

In [2]:
import pandas as pd  
import numpy as np  

import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

## Load data to work with

In [3]:
train_data = pd.read_csv("train.csv")
train_data.drop('Unnamed: 0',axis=1, inplace=True)

cv_data = pd.read_csv("CV.csv")

cv_data.drop('Unnamed: 0',axis=1, inplace=True)

## Data Analysis

In [4]:
print(train_data.shape)
print(cv_data.shape)

(443, 10)
(120, 10)


In [5]:
train_data.head(5)

Unnamed: 0,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
0,4,1,1,1,2,1,2,1,1,2
1,10,10,10,10,7,10,7,10,4,4
2,4,1,1,2,2,1,2,1,1,2
3,9,7,7,5,5,10,7,8,3,4
4,6,10,10,10,8,10,7,10,7,4


In [6]:
cv_data.head(5)

Unnamed: 0,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
0,1,1,1,1,2,1,1,1,1,2
1,1,1,1,1,2,5,5,1,1,2
2,4,1,1,1,2,1,1,2,1,2
3,4,5,5,10,4,10,7,5,8,4
4,2,1,1,1,2,1,1,1,1,2


## Sorting data for training and Cross Validation

In [26]:


train_data = pd.read_csv('train.csv',nrows=443)

cv_data = pd.read_csv('CV.csv', nrows=443)
#Load Training data
X_train = np.matrix(train_data[['clump_thickness','size_uniformity','shape_uniformity','marginal_adhesion','epithelial_size','bland_chromatin','normal_nucleoli','mitoses']])
y_train = np.matrix(train_data['class'])
y_train= y_train.reshape(443,1)


#load cross validation data[
X_cv = np.matrix(cv_data[['clump_thickness','size_uniformity','shape_uniformity','marginal_adhesion','epithelial_size','bland_chromatin','normal_nucleoli','mitoses']])
y_cv = np.matrix(cv_data['class'])
y_cv= y_cv.reshape(120,1)
print(y_cv.shape)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_cv = scaler.fit_transform(X_cv)



(120, 1)


## Training SVM classifier

In [13]:
C_range = np.logspace(-3, 11, 8, base = 2.0)        #  range of C values : 2^-3<C< 2^11
gamma_range = np.logspace(-11, 3, 8, base=2.0)      # range of gamma values : 2^-11 to 2^3
combinations = []
best_score = 0
best = ()

param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
y_train = np.ravel(y_train, order = 'C')
grid.fit(X_train, y_train)
for C in C_range:
    row = [C]
    for gamma in gamma_range:
        com = SVC(C=C, gamma=gamma)
        com.fit(X_train, y_train)
        score = com.score(X_cv, y_cv)
        if(score > best_score):
            best_score = score
            best =(score,com)
        row.append(score)
    combinations.append(row)



## Tabulate trained classifiers

In [14]:
table = pd.DataFrame(combinations, index=None, columns=["C","2^-11", "2^-9", "2^-7", "2^-5", "2^-3", "2^-1", "2^1", "2^3"])
table

Unnamed: 0,C,2^-11,2^-9,2^-7,2^-5,2^-3,2^-1,2^1,2^3
0,0.125,0.925,0.933333,0.933333,0.933333,0.933333,0.85,0.65,0.65
1,0.5,0.933333,0.933333,0.933333,0.941667,0.941667,0.891667,0.85,0.65
2,2.0,0.933333,0.933333,0.933333,0.941667,0.941667,0.908333,0.866667,0.733333
3,8.0,0.933333,0.925,0.933333,0.933333,0.941667,0.908333,0.866667,0.733333
4,32.0,0.933333,0.933333,0.941667,0.933333,0.941667,0.908333,0.866667,0.733333
5,128.0,0.933333,0.933333,0.933333,0.941667,0.941667,0.908333,0.866667,0.733333
6,512.0,0.933333,0.941667,0.933333,0.933333,0.941667,0.908333,0.866667,0.733333
7,2048.0,0.933333,0.916667,0.941667,0.933333,0.941667,0.908333,0.866667,0.733333


In [15]:
best_score      #  best score

0.9416666666666667

In [16]:
best            #  best Score and Classifier list

(0.9416666666666667,
 SVC(C=0.5, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma=0.03125, kernel='rbf',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=False))

In [17]:
best[1]         # best Classifier

SVC(C=0.5, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.03125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [31]:
from sklearn.metrics import classification_report, confusion_matrix 

test_data = pd.read_csv("test.csv", nrows=120)
X_test = np.matrix(test_data[['clump_thickness','size_uniformity','shape_uniformity','marginal_adhesion','epithelial_size','bland_chromatin','normal_nucleoli','mitoses']])
y_test = np.matrix(test_data['class'])
y_test = y_test.reshape(120,1)


print(test_data.shape)
test_data.head(5)

print("")
print("Scaled data")
#Scale features
scaler = StandardScaler()
X_test = scaler.fit_transform(X_test)
X_test.shape

(120, 11)

Scaled data


(120, 8)

In [32]:
y_pred = best[1].predict(X_test)

In [33]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)  
cr =classification_report(y_test, y_pred)
aa =(cm[0][0]+cm[1][1])/(cm[0][0]+cm[0][1]+cm[1][0]+cm[1][1])
print("Confusin_Matrix\n",cm,"\n\n")
print("Classification_report\n",cr,"\n\n")
print("overall accurancy\n",aa,"\n\n")

Confusin_Matrix
 [[ 2 76]
 [42  0]] 


Classification_report
               precision    recall  f1-score   support

           2       0.05      0.03      0.03        78
           4       0.00      0.00      0.00        42

    accuracy                           0.02       120
   macro avg       0.02      0.01      0.02       120
weighted avg       0.03      0.02      0.02       120
 


overall accurancy
 0.016666666666666666 




## Save best classifier

In [23]:
from sklearn.externals import joblib
filename = 'mymodel.joblib'
joblib.dump(best,filename)

['mymodel.joblib']