Data Preprocessing

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Collection and Preprocessing

In [None]:
# Load the dataset and drop the extra columns ('Unnamed: 32' and 'id') in a
# single non-mutating step, so `data` is bound exactly once in this cell.
data = (
    pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')
    .drop(columns=['Unnamed: 32', 'id'])
)

In [None]:
# Show the first five rows of `data` (DataFrame.head default) as the cell output.
data.head()

# Exploratory Data Analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Correlation heatmap over all numeric columns of `data`.
#
# `data` still contains the 'diagnosis' column at this point; it is only
# label-encoded later in the notebook, so it is presumably string-valued.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns
# silently and raises instead, so restrict to numeric columns explicitly.
# select_dtypes works on every pandas version, unlike corr(numeric_only=True).
figure = plt.figure(figsize=(10,10))
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr())

In [None]:
# List the column labels of `data` as the cell output.
data.columns

In [None]:
# Subset of `data` holding only the ten `*_mean` columns, in this order.
mean_columns = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
]
data_mean = data.loc[:, mean_columns]

In [None]:
# Pairwise correlations between the `*_mean` columns, drawn as a heatmap.
mean_corr = data_mean.corr()
sns.heatmap(mean_corr)

In [None]:
# Scatter-matrix of every pair of `*_mean` columns.
sns.pairplot(data=data_mean)

In [None]:
# Subset of `data` holding only the ten `*_worst` columns, in this order.
worst_columns = [
    'radius_worst',
    'texture_worst',
    'perimeter_worst',
    'area_worst',
    'smoothness_worst',
    'compactness_worst',
    'concavity_worst',
    'concave points_worst',
    'symmetry_worst',
    'fractal_dimension_worst',
]
data_worst = data.loc[:, worst_columns]

In [None]:
# Pairwise correlations between the `*_worst` columns, drawn as a heatmap.
worst_corr = data_worst.corr()
sns.heatmap(worst_corr)

In [None]:
# Scatter-matrix of every pair of `*_worst` columns.
sns.pairplot(data=data_worst)

In [None]:
# Summary statistics for every column of `data` except the first one.
for column_name in data.columns[1:]:
    banner = '________{}_________'.format(column_name)
    print(banner)
    column_summary = data[column_name].describe()
    print(column_summary)
    

# Splitting the data into Training and Test set

In [None]:
# Feature matrix: the thirty measurement columns, grouped as `*_mean`,
# `*_se` and `*_worst`, in that order. 'diagnosis' is deliberately excluded.
feature_columns = [
    # *_mean
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
    # *_se
    'radius_se',
    'texture_se',
    'perimeter_se',
    'area_se',
    'smoothness_se',
    'compactness_se',
    'concavity_se',
    'concave points_se',
    'symmetry_se',
    'fractal_dimension_se',
    # *_worst
    'radius_worst',
    'texture_worst',
    'perimeter_worst',
    'area_worst',
    'smoothness_worst',
    'compactness_worst',
    'concavity_worst',
    'concave points_worst',
    'symmetry_worst',
    'fractal_dimension_worst',
]
X = data.loc[:, feature_columns]

In [None]:
# Target vector: the 'diagnosis' column of `data` (encoded to integers in a later cell).
y = data['diagnosis']

In [None]:
# Visual null check: heatmap of the boolean missing-value mask of X,
# with the row tick labels hidden.
null_mask = X.isnull()
sns.heatmap(null_mask, yticklabels=False, cmap='viridis')

# Encoding the dependent variable 'diagnosis'

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of distinct labels in `y`, then replace `y` with their integer
# codes. `le` is kept so the learned label set stays available.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [None]:
# Echo `y` as the cell output to inspect the encoded labels.
y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=100,
)
X_train, X_test, y_train, y_test = split_parts

# Model Selection and fitting the training data

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# RBF-kernel SVMs are not scale invariant, and nothing earlier in the notebook
# rescales the features: X_train is raw columns taken straight from `data`.
# The raw columns (e.g. area vs. smoothness measurements) presumably differ by
# orders of magnitude, so the largest-valued ones would dominate the kernel.
#
# Standardising inside a pipeline fixes this. The scaler is fitted on the
# training split only, so no test-set statistics leak in, and the same
# transform is applied automatically whenever `classifier.predict` is called.
classifier = make_pipeline(
    StandardScaler(),
    SVC(kernel = 'rbf', random_state = 0),
)
classifier.fit(X_train, y_train)

# Model Prediction

In [None]:
# Predict labels for the held-out test features with the fitted classifier.
y_pred = classifier.predict(X_test)

# Model Evaluation 
**Accuracy, Confusion Matrix and Classification Report**

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Compare the test-set predictions with the true labels: confusion matrix,
# per-class report, then overall accuracy as a percentage.
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
accuracy_percent = accuracy_score(y_test, y_pred) * 100

print(test_confusion)
print(test_report)
print('Model Accuracy : {:.2f}%'.format(accuracy_percent))