In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

import seaborn as sns
import matplotlib.pyplot as plt


import warnings
warnings.filterwarnings('ignore')



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _dirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the dataset CSV from the Kaggle input directory and preview the first rows.
DATA_CSV = "/kaggle/input/breast-cancer-wisconsin-data/data.csv"
df = pd.read_csv(DATA_CSV)
df.head(5)

In [None]:
# Work on an independent (deep) copy so the frame loaded from disk is not modified.
df1 = df.copy(deep=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the text diagnosis labels into integer codes. LabelEncoder numbers the
# classes in sorted order; a later cell reads the result as 0 = Benign, 1 = Malignant.
le_diagnosis = LabelEncoder()
diagnosis_codes = le_diagnosis.fit_transform(df['diagnosis'])

# Store the codes in a helper column, then overwrite the text column with them.
df1['le_diagnosis'] = diagnosis_codes
df1['diagnosis'] = df1['le_diagnosis']

# Remove the helper column plus the two columns that are not used as features
# (the notebook's own notes: 'id' is unrelated to the data, 'Unnamed: 32' is all NaN).
columns_to_drop = ['le_diagnosis', 'Unnamed: 32', 'id']
clean_data = df1.drop(columns=columns_to_drop)
clean_data.head()

In [None]:
# Split the cleaned frame into the target and the feature matrix.
# 'id' (unrelated to the measurements) and 'Unnamed: 32' (all NaN) were already
# removed when clean_data was built, so every remaining column except
# 'diagnosis' is a feature.
y = clean_data['diagnosis']  # Dependent variable: encoded diagnosis
x = clean_data.drop(columns=['diagnosis'])  # Independent variables: all other columns
x.head()

In [None]:
# Class balance of the target, most frequent class first (0 = Benign, 1 = Malignant).
y.value_counts(sort=True, ascending=False)

In [None]:
# Pie chart of the class balance in the encoded diagnosis column.
size = clean_data['diagnosis'].value_counts()

# value_counts() orders its rows by frequency, not by class code, so look up
# each slice's label / colour / offset from the index instead of assuming that
# code 0 is always the first slice.  (0 = Benign, 1 = Malignant)
slice_style = {
    0: ("Benign", 'lightblue', 0),
    1: ('Malignant', 'orange', 0.1),
}
labels = [slice_style[code][0] for code in size.index]
colors = [slice_style[code][1] for code in size.index]
explode = [slice_style[code][2] for code in size.index]

plt.rcParams['figure.figsize'] = (9,5)
plt.pie(size, colors=colors, explode=explode, labels=labels,
        shadow=True, autopct='%.2f%%')
plt.title('Diagnosis', fontsize=20)
plt.axis('off')
plt.legend()
plt.show()  # must be called; a bare `plt.show` only echoes the function object

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing.
# random_state makes the split (and every score computed from it) repeatable
# across kernel restarts; stratify=y keeps the class ratio the same in the
# training and test partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Fit an RBF-kernel SVM on the training split and show its score on the held-out split.
# NOTE(review): the features are passed in unscaled — confirm that is intended for an RBF kernel.
model = svm.SVC(C=30, kernel='rbf', gamma='auto').fit(x_train, y_train)
model.score(x_test, y_test)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validation on the full data for three SVM settings;
# the mean fold score of each is printed in the same order.
one = cross_val_score(svm.SVC(C=10, kernel='linear'), x, y, cv=5)
two = cross_val_score(svm.SVC(C=10, kernel='rbf'), x, y, cv=5)
three = cross_val_score(svm.SVC(C=20, kernel='rbf'), x, y, cv=5)
for fold_scores in (one, two, three):
    print(np.mean(fold_scores))

In [None]:
import numpy as np

# Mean 5-fold CV score for every kernel / C combination, keyed as "<kernel>_<C>".
kernels = ['rbf', 'linear']
C = [1,10,20]
avg_scores = {
    f"{kval}_{cval}": np.average(
        cross_val_score(svm.SVC(kernel=kval, C=cval, gamma='auto'), x, y, cv=5)
    )
    for kval in kernels
    for cval in C
}

print(avg_scores)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold grid search over C and kernel for an SVM (gamma fixed to 'auto').
svm_param_grid = {
    'C': [1,10,20],
    'kernel': ['rbf','linear'],
}
clf = GridSearchCV(
    svm.SVC(gamma='auto'),
    svm_param_grid,
    cv=5,
    return_train_score=False,
)
clf.fit(x, y)
print(clf.cv_results_)

In [None]:
df = pd.DataFrame(clf.cv_results_)
df

In [None]:
df[['param_C','param_kernel','mean_test_score']]


In [None]:
# Best mean cross-validated score among the parameter combinations the search tried.
clf.best_score_

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
clf.best_params_

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: evaluate only n_iter=2 of the 6 possible C / kernel combinations.
# random_state pins which combinations are drawn, so a re-run shows the same table.
rs = RandomizedSearchCV(
    svm.SVC(gamma='auto'),
    {
        'C': [1,10,20],
        'kernel': ['rbf','linear']
    },
    cv=5,
    return_train_score=False,
    n_iter=2,
    random_state=42,
)
rs.fit(x,y)
pd.DataFrame(rs.cv_results_)[['param_C','param_kernel','mean_test_score']]

In [None]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Candidate estimators and the hyper-parameter grid to search for each one.
# An empty grid means the estimator is evaluated once with its default settings.
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }
    },
    'random_forest': {
        # Seeded so the forest (and therefore its CV score) is repeatable between runs.
        'model': RandomForestClassifier(random_state=42),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        # multi_class is left out: 'auto' is already the default, and the
        # argument is deprecated in recent scikit-learn releases.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1,5,10]
        }
    },
    'MultinomialNB' : {
        'model': MultinomialNB(),
        'params': {}
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {}
    }
}


In [None]:
# Run a 5-fold grid search for every candidate model and record each winner.
scores = []

for model_name, mp in model_params.items():
    # Use a dedicated name so the SVM search stored in `clf` by an earlier cell
    # is not overwritten (its best_score_/best_params_ are displayed above).
    search = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    search.fit(x, y)
    scores.append({
        'model': model_name,
        'best_score': search.best_score_,
        'best_params': search.best_params_
    })

# Keep the summary out of `df`, the name the raw dataset was loaded under.
model_scores = pd.DataFrame(scores, columns=['model','best_score','best_params'])
model_scores