In [8]:
import numpy as np
from sklearn import preprocessing
import pandas as pd
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn import model_selection

In [9]:
df = pd.read_csv('breast-cancer-wisconsin.data')
diagnoses={2:'benign', 4:'malignant'} #2 for benign, 4 for malignant

df.replace('?',-99999, inplace=True) # to replace '?' values into -99999
df.drop(['sample_code'], 1, inplace=True) # to drop the first column as it's the id

X = np.array(df.drop(['class'], 1))
y = np.array(df['class'])

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)


In [10]:
def fit(model):
    
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    print('accuracy= ',accuracy)

    #%%from sklearn.metrics import classification_report
    from sklearn.metrics import classification_report
    y_predict = model.predict(X_test)
    print(classification_report(y_test, y_predict))
    n_samples = X.shape[0]
    cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
    print(cross_val_score(model, X, y, cv=cv))
    
    return model


In [16]:
# svm model
from sklearn.svm import SVC
svm_model = SVC( C=3,
                    kernel='rbf',
                    gamma='auto',
                    shrinking=True,
                    probability=False,
                    tol=0.001,
                    cache_size=1000,
                    verbose=False,
                    max_iter=-1,
                    decision_function_shape='ovo')
svm_model=fit(svm_model)

accuracy=  0.9571428571428572
              precision    recall  f1-score   support

           2       1.00      0.93      0.96        86
           4       0.90      1.00      0.95        54

    accuracy                           0.96       140
   macro avg       0.95      0.97      0.96       140
weighted avg       0.96      0.96      0.96       140

[0.94285714 0.96666667 0.95238095 0.93809524 0.95238095]


In [26]:
# using ensemble method
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
estimator=SVC( C=2,
                    kernel='rbf',
                    gamma='auto',
                    shrinking=True)
boosted_svm = AdaBoostClassifier(base_estimator = estimator , n_estimators = 100 , learning_rate=0.1 , algorithm='SAMME')
boosted_svm=fit(boosted_svm)


accuracy=  0.6142857142857143


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           2       0.61      1.00      0.76        86
           4       0.00      0.00      0.00        54

    accuracy                           0.61       140
   macro avg       0.31      0.50      0.38       140
weighted avg       0.38      0.61      0.47       140

[0.64285714 0.64285714 0.63809524 0.61904762 0.67142857]


In [27]:
# you only fit one Vector to Kmeans and let it do it's magic

from sklearn.cluster import KMeans

kmeans_model= KMeans(n_clusters=2)
kmeans_model.fit(X_test)

#%%from sklearn.metrics import classification_report
from sklearn.metrics import classification_report
y_predict = kmeans_model.predict(X_test)
print(classification_report(y_test, y_predict))
n_samples = X.shape[0]
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
print(cross_val_score(kmeans_model, X_train, y_train, cv=cv))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       0.0
           2       0.00      0.00      0.00      86.0
           4       0.00      0.00      0.00      54.0

    accuracy                           0.00     140.0
   macro avg       0.00      0.00      0.00     140.0
weighted avg       0.00      0.00      0.00     140.0

[-12408.45103183 -11820.77921735 -11712.95540821 -11649.05872981
 -10158.19772686]


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [44]:
# mean shift goes the same as kmeans
from sklearn.metrics import classification_report
from sklearn.cluster import MeanShift
meanshift_model = MeanShift(n_jobs=-1)
meanshift_model.fit(X)
# %%from sklearn.metrics import classification_report
y_predict = kmeans_model.predict(X_test)
print(classification_report(y_test, y_predict))
n_samples = X.shape[0]
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
print(cross_val_score(kmeans_model, X_train, y_train, cv=cv))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       0.0
           2       0.00      0.00      0.00      85.0
           4       0.00      0.00      0.00      55.0

    accuracy                           0.00     140.0
   macro avg       0.00      0.00      0.00     140.0
weighted avg       0.00      0.00      0.00     140.0

[-11616.52336885  -9980.07292599 -11847.90256342 -12442.86584176
 -12345.9730353 ]


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [45]:
from sklearn import neighbors
Knearest_model =neighbors.KNeighborsClassifier(n_neighbors=9 ) 
Knearest_model=fit(Knearest_model)

accuracy=  0.9642857142857143
              precision    recall  f1-score   support

           2       0.95      0.99      0.97        85
           4       0.98      0.93      0.95        55

    accuracy                           0.96       140
   macro avg       0.97      0.96      0.96       140
weighted avg       0.96      0.96      0.96       140

[0.95714286 0.97142857 0.97619048 0.94761905 0.98095238]


In [None]:
#%% to save the model
import pickle
with open('SVM.pickle','wb') as f:
    pickle.dump(model, f)

#%% to load the model
pickle_in = open('SVM.pickle','rb')
model = pickle.load(pickle_in)