In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the dataset CSV from the Kaggle input mount into `df`,
# then preview its first five rows as the cell output.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/breast-cancer-wisconsin-data/data.csv'
)
df.head(n=5)

In [None]:
# Number of rows per diagnosis label.
df['diagnosis'].value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count each diagnosis label once and reuse the result for both the bar
# positions and the bar heights (the original computed value_counts() twice).
diagnosis_counts = df['diagnosis'].value_counts()

plt.figure(figsize=(12,9))
plt.bar(diagnosis_counts.index, diagnosis_counts.values)
plt.title("Diagnosis Count by Type")
plt.xlabel("Diagnosis")
# Label the y-axis so the figure can be read on its own.
plt.ylabel("Count")
plt.show()

In [None]:
# Display the column labels of `df`.
df.columns

In [None]:
# Remove the `id` column and the `Unnamed: 32` column (presumably an empty
# artefact column from the CSV export -- TODO confirm) before modelling.
# Rebinding `df` instead of using `inplace=True`, together with
# `errors='ignore'`, keeps this cell safe to re-run: a second execution no
# longer raises KeyError because the columns are already gone.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

In [None]:
# Integer codes for the diagnosis labels: 'B' -> 0, 'M' -> 1.
encoding = {
    'B': 0,
    'M': 1
}
# Assign the encoded Series back to the column. The previous
# `df.diagnosis.replace(encoding, inplace=True)` mutated an intermediate
# Series obtained via attribute access; pandas warns about that pattern and,
# with Copy-on-Write enabled, it leaves `df` unchanged.
# Values that are not keys of `encoding` (e.g. already-encoded 0/1 on a
# re-run) pass through untouched, and `astype(int)` fails loudly if any
# unexpected non-numeric label is present.
df['diagnosis'] = (
    df['diagnosis']
    .map(lambda label: encoding.get(label, label))
    .astype(int)
)

In [None]:
# Number of rows per diagnosis value, shown again after the encoding cell above.
df['diagnosis'].value_counts()

In [None]:
# Annotated heatmap of the pairwise correlation matrix of every column in `df`.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(df.corr(), annot=True, ax=ax)
plt.show()

In [None]:
# Correlation of every column with `diagnosis`, listed from highest to lowest.
target_corr = df.corr()['diagnosis']
target_corr.sort_values(ascending=False)

In [None]:
# Absolute correlation above which one feature of a pair is treated as redundant.
CORR_THRESHOLD = 0.92

# Absolute pairwise correlations between the feature columns only. The target
# is left out so the collinearity filter can never drop `diagnosis` itself.
corr_matrix = df.drop(columns=['diagnosis']).corr().abs()

# Mask the upper triangle and the diagonal: each pair is then inspected once
# and a column's correlation with itself (always 1) is ignored.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
tri_df = corr_matrix.mask(mask)

# A column is flagged when it exceeds the threshold against any column that
# comes after it in the frame (the remaining lower-triangle entries).
to_drop = [col for col in tri_df.columns if (tri_df[col] > CORR_THRESHOLD).any()]

df_reduced = df.drop(columns=to_drop)

print(f"The reduced dataframe has {df_reduced.shape[1]} columns.")

In [None]:
# Print the column index of the full frame, then that of the reduced frame.
print(df.columns, df_reduced.columns, sep='\n')

In [None]:
# Columns kept for modelling: the `diagnosis` target followed by 26 feature
# columns. NOTE(review): the ordering looks like the correlation ranking
# computed earlier -- confirm how this list was derived.
selected_columns = [
    'diagnosis',
    'concave points_worst', 'perimeter_worst', 'concave points_mean',
    'radius_worst', 'perimeter_mean', 'area_worst', 'radius_mean',
    'area_mean', 'concavity_mean', 'concavity_worst', 'compactness_mean',
    'compactness_worst', 'radius_se', 'perimeter_se', 'area_se',
    'texture_worst', 'smoothness_worst', 'symmetry_worst', 'texture_mean',
    'concave points_se', 'smoothness_mean', 'symmetry_mean',
    'fractal_dimension_worst', 'compactness_se', 'concavity_se',
    'fractal_dimension_se',
]
df_new = df[selected_columns]

In [None]:
# Feature matrix: every selected column except the target, as a NumPy array.
X = df_new.drop(columns=['diagnosis']).values
# Target as a 1-D array. Selecting with a single label (not a list) avoids the
# (n_samples, 1) column vector that scikit-learn estimators warn about and
# silently ravel when fitting.
y = df_new['diagnosis'].values

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.3, random_state=42)

In [None]:
# Show the (rows, columns) shape of the train, validation and test feature arrays.
print(*(part.shape for part in (X_train, X_valid, X_test)))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Search over two kernels and two values of C for an SVC, using
# GridSearchCV's default cross-validation settings, fitted on the training split.
parameters = dict(kernel=('linear', 'rbf'), C=[1, 10])
svc = SVC()
clf = GridSearchCV(estimator=svc, param_grid=parameters)
clf.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search fitted above.
clf.best_params_

In [None]:
# Mean cross-validated score obtained with the selected parameter combination.
clf.best_score_

In [None]:
# Build the final classifier from whatever the grid search actually selected,
# rather than hard-coding C and kernel: a hard-coded copy silently goes stale
# as soon as the data, the split or the parameter grid changes.
svc_best = SVC(**clf.best_params_)
svc_best.fit(X_train, y_train)

# Mean accuracy on the training split and on the held-out validation split.
print("Train acc : ", svc_best.score(X_train, y_train))
print("Val acc : ", svc_best.score(X_valid, y_valid))

In [None]:
from sklearn.metrics import classification_report

# Predict on the held-out test split and report per-class precision/recall/F1.
pred_svc_best = svc_best.predict(X_test)
# classification_report expects (y_true, y_pred). Passing them the other way
# round, as before, swaps precision with recall and reports support counts of
# the predictions instead of the true labels.
print(classification_report(y_test, pred_svc_best))

In [None]:
# Take the first test row and its true label, show them, then predict that one row.
first_row, first_label = X_test[0], y_test[0]
print("Predict Label for data : ", first_row, "with true label : ", first_label)

# predict() expects a 2-D input, so the single row is wrapped in a list.
pred_svc_single = svc_best.predict([first_row])
print("Predicted Values : ", pred_svc_single)