In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk '/kaggle/input' recursively and print the full path of every file found.
input_files = (
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for file_path in input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# imports
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [None]:
# Load the breast-cancer CSV into a DataFrame and preview its first rows.
DATA_PATH = '../input/breast-cancer-dataset/breast-cancer.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Count missing entries per column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Remove the 'id' column so it is not carried into the feature matrix.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: the in-place drop raised KeyError on a
# second execution because 'id' was already gone.
df = df.drop(columns='id', errors='ignore')
df.head(3)

In [None]:
# Separate the target from the features by column NAME rather than position.
# A positional slice (iloc[:, 1:]) only works if 'diagnosis' happens to be the
# first column and 'id' has already been removed; selecting by name gives the
# same result in that case and stays correct if the column order changes or
# the id-drop cell was skipped.
y = df['diagnosis']
X = df.drop(columns=['id', 'diagnosis'], errors='ignore')
print("X shape: ", X.shape)
print("y shape: ", y.shape)

In [None]:
# Convert the features to a plain NumPy array.
# np.asarray accepts a DataFrame as well as an ndarray, so re-running this
# cell is harmless; `X = X.values` raised AttributeError on a second run
# because X had already been rebound to an ndarray.
X = np.asarray(X)
X.shape

In [None]:
# Standardize the features: fit a StandardScaler on X, then replace X with
# the scaled values.
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split performed later in the notebook -- consider fitting on the training
# portion only so test data does not influence the scaling statistics.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
# Fit a PCA with 30 components and replace X with the projected data.
# From this point on X holds principal-component scores, not the
# standardized features.
# NOTE(review): 30 presumably equals the number of feature columns -- confirm
# against X.shape before relying on it.
pca = PCA(n_components=30, random_state=42)
X = pca.fit_transform(X)

In [None]:
# Running total of explained variance, in percent, over the fitted components.
explained_pct = pca.explained_variance_ratio_ * 100
explained_pct.cumsum()

In [None]:
# Second PCA, keeping only 5 components.
# Note: X was overwritten with the output of the 30-component PCA in the
# previous PCA cell, so this fit runs on those component scores rather than
# directly on the standardized features.
pca_5 = PCA(n_components=5, random_state=42)
X_5 = pca_5.fit_transform(X)
X_5.shape

In [None]:
# Assemble the 5 component scores into a labelled DataFrame.
df_new = pd.DataFrame(X_5, columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])
# Attach the target by POSITION. Assigning the Series `y` directly would align
# on index labels; df_new has a fresh 0..n-1 index, so any non-default index on
# `y` (e.g. after filtering rows) would silently yield NaN or shifted labels.
df_new['diagnosis'] = np.asarray(y)
df_new.head()

In [None]:
df_new.shape

In [None]:
# Replace the diagnosis labels with integer codes.
# LabelEncoder numbers the classes in sorted order; inspect lbn.classes_ to
# see which label maps to which integer.
lbn = LabelEncoder()
diagnosis_codes = lbn.fit_transform(df_new['diagnosis'])
df_new['diagnosis'] = diagnosis_codes

In [None]:
# Split into input features and output labels.
# The target column must be excluded from the features: the slice
# `iloc[:, 0:]` kept every column, including 'diagnosis', so the model was
# trained with the answer among its inputs (target leakage).
X_new = df_new.drop(columns='diagnosis')
y_new = df_new['diagnosis']
# Report the shapes of the objects just created (previously the stale
# X / y from earlier cells were printed).
print('X shape: ', X_new.shape)
print('y shape: ', y_new.shape)

In [None]:
# Hold out 30% of the rows for testing (fixed seed for reproducibility),
# then report the shape of each resulting piece.
split_parts = train_test_split(X_new, y_new, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_parts
for caption, part in (
    ('X train shape: ', X_train),
    ('y train shape: ', y_train),
    ('X test shape: ', X_test),
    ('y test shape: ', y_test),
):
    print(caption, part.shape)

In [None]:
# Define the classifier: a support vector classifier with a linear kernel
# and regularization parameter C=1.
svc = SVC(C=1, kernel="linear")

In [None]:
# Fit the classifier on the training split; as the last expression of the
# cell, the fitted estimator is also displayed.
svc.fit(X_train, y_train)

In [None]:
# Evaluate the fitted classifier on both splits and report the scores
# as percentages.
train_pct = svc.score(X_train, y_train) * 100
test_pct = svc.score(X_test, y_test) * 100
print("Training score: ", train_pct)
print("Testing score: ", test_pct)