In [None]:
import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation notices raised by the libraries used below.
warnings.simplefilter('ignore')

### Importing required libraries -

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Lift pandas' truncation limits so frames are rendered with all rows and columns.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

### Loading the dataset -

In [None]:
# Relative location of the CSV (looks like a Kaggle-style ../input layout —
# adjust the path when running in another environment).
filepath = '../input/breast-cancer-wisconsin-data/data.csv'
# Read the whole file into a DataFrame with pandas' default parsing options.
data = pd.read_csv(filepath)

### Performing EDA -

In [None]:
# Preview ten random rows. The seed is fixed so the displayed sample is the
# same after every "Restart & Run All".
data.sample(10, random_state=0)

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Column labels of the loaded frame.
data.columns

In [None]:
# Per-column dtype and non-null count summary.
data.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the numeric columns.
data.describe()

### Checking for null values -

In [None]:
# Number of missing values in each column.
data.isnull().sum()

### Dropping unnecessary columns -

In [None]:
# Remove the two columns that are dropped before modelling.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises KeyError because
# the columns are already gone.
data = data.drop(columns=['Unnamed: 32', 'id'], errors='ignore')

### Checking for duplicate rows -

In [None]:
# Keep the rows that repeat an earlier row, then report how many there are.
duplicate_rows = data.loc[data.duplicated()]
len(duplicate_rows)

### Checking the outcome labels -

In [None]:
# Number of rows per diagnosis label (class balance of the target).
data['diagnosis'].value_counts()

In [None]:
# Bar chart of the class frequencies, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=data, x='diagnosis', ax=ax)
plt.show()

### Encoding the target variable -

In [None]:
# Encode the target: malignant ('M') -> 0, benign ('B') -> 1.
# The explicit cast guarantees an integer column. Relying on `replace` to
# downcast the object column on its own is deprecated in pandas, and an
# object-typed target is rejected by scikit-learn classifiers.
# The statement is idempotent, so re-running the cell is safe.
data['diagnosis'] = data['diagnosis'].replace({'M': 0, 'B': 1}).astype(int)

### Checking the data distribution -

In [None]:
# One histogram per feature; the target column is left out of the grid.
feature_frame = data.drop(columns='diagnosis')
feature_frame.hist(figsize=(18, 15))
plt.show()

### Checking the correlation between variables -

In [None]:
# Pairwise Spearman rank correlation between all columns of the frame.
data.corr(method='spearman')

### Plotting the correlation matrix -

In [None]:
# Heatmap of the Spearman correlation matrix on an explicitly created axes.
spearman_corr = data.corr(method='spearman')
fig, ax = plt.subplots(figsize=(15, 12))
cormat = sns.heatmap(spearman_corr, linecolor='k', ax=ax)
cormat.set_title('Correlation Matrix')
plt.show()

### Performing PCA -

In [None]:
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA

# Standardize the *features only*. The target column must not take part in
# the projection, otherwise the label leaks into the components and the
# scatter plot (which is coloured by that same label) looks better separated
# than the features alone justify.
scaled_data = scale(data.drop(columns='diagnosis'))

# Project the standardized features onto the first two principal components.
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(scaled_data)

In [None]:
# Loadings of the fitted principal components (one row per component).
print(pca.components_)

### Visualizing the data points -

In [None]:
# Scatter of the two principal components, coloured by diagnosis label.
sns.set()
fig, ax = plt.subplots(figsize=(10, 6))
first_component, second_component = reduced_data[:, 0], reduced_data[:, 1]
sns.scatterplot(x=first_component, y=second_component, s=75, hue=data['diagnosis'], ax=ax)
plt.show()

### Separating dependent & independent variables -

In [None]:
# y holds the target column; X holds every remaining column.
target_column = 'diagnosis'
y = data[target_column]
X = data.drop(columns=[target_column])

### Performing train-test split -

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions identical in both partitions, which matters because the two
# diagnosis classes are not equally frequent.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0, stratify=y
)

### Scaling the data -

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training partition only, then apply
# the same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = (scaler.transform(partition) for partition in (X_train, X_test))

### Importing performance metrics for classification -

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score, roc_auc_score, roc_curve

## **Support Vector Classifier**

In [None]:
from sklearn.svm import SVC

# probability=True enables predict_proba through an internal cross-validated
# calibration step that shuffles the data. Fixing random_state makes the
# resulting probabilities (and the AUC / ROC curve built on them) reproducible.
svc = SVC(probability=True, random_state=0)
svc.fit(X_train, y_train)

In [None]:
# Hard class predictions for the held-out set.
y_pred_svc = svc.predict(X_test)
# Predicted probability of the second class column (index 1) for each test row.
y_pred_proba_svc = svc.predict_proba(X_test)[:, 1]

In [None]:
# Accuracy on the data the model was fitted on versus the held-out data.
train_accuracy = accuracy_score(y_train, svc.predict(X_train))
test_accuracy = accuracy_score(y_test, svc.predict(X_test))
print("Train accuracy :{:.2f}".format(train_accuracy))
print("Test accuracy :{:.2f}".format(test_accuracy))

In [None]:
# Rows of the matrix are the true labels, columns are the predicted labels.
conmat = confusion_matrix(y_test, y_pred_svc)
# fmt='d' prints the counts as plain integers; the default format switches to
# scientific notation (e.g. 1e+02) once a cell reaches three digits.
ax = sns.heatmap(conmat, annot=True, fmt='d', cbar=False)
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Per-class precision, recall, F1 and support on the held-out set.
print("Classification Report")
print(classification_report(y_test, y_pred_svc))

In [None]:
# Each metric is called with its defaults on the same predictions and printed
# with two decimals.
for template, metric in (
    ("F1 Score: {:.2f}", f1_score),
    ("Precision: {:.2f}", precision_score),
    ("Recall: {:.2f}", recall_score),
):
    print(template.format(metric(y_test, y_pred_svc)))

In [None]:
# scikit-learn lays a binary confusion matrix out as [[tn, fp], [fn, tp]]:
# rows are true labels, columns are predicted labels, and label 1 is the
# positive class (the same convention f1_score / precision_score /
# recall_score use by default).
tn, fp, fn, tp = conmat.ravel()

# Each rate is normalised by the number of samples that truly belong to the
# class in question.
tpr = tp/(tp+fn)   # sensitivity / recall
tnr = tn/(tn+fp)   # specificity
fpr = fp/(fp+tn)   # type 1 error rate (equals 1 - tnr)
fnr = fn/(fn+tp)   # type 2 error rate (equals 1 - tpr)

In [None]:
# Report the false-positive and false-negative rates with two decimals.
for template, rate in (("Type 1 Error: {:.2f}", fpr), ("Type 2 Error: {:.2f}", fnr)):
    print(template.format(rate))

In [None]:
# Sensitivity is the true-positive rate; specificity is reported as the
# complement of the false-positive rate.
sensitivity = tpr
specificity = 1-fpr
print("Sensitivity: {:.2f}".format(sensitivity))
print("Specificity: {:.2f}".format(specificity))

In [None]:
# Area under the ROC curve, computed from the predicted probabilities rather
# than from the hard class predictions.
print("AUC Score: {:.2f}".format(roc_auc_score(y_test, y_pred_proba_svc)))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# X is the raw, unscaled feature frame, while the classifier above was trained
# on standardized features. SVC is sensitive to feature scale, so the scaler
# is bundled with the classifier: it is then re-fitted on the training part of
# every fold and no statistics leak from the validation part.
# cross_val_score clones the estimator, so the already fitted `svc` is not
# modified.
scaled_svc = make_pipeline(StandardScaler(), svc)
svc_acc = np.mean(cross_val_score(scaled_svc, X, y, cv=10, scoring='accuracy'))
print("Cross Validation Score: {:.2f}".format(svc_acc))

### Plotting the ROC Curve -

In [None]:
# False-positive rate, true-positive rate and the thresholds they belong to.
fpr_svc, tpr_svc, threshold_svc = roc_curve(y_test, y_pred_proba_svc)

# Matplotlib renamed its bundled seaborn styles to 'seaborn-v0_8-*' in 3.6 and
# later removed the old names, so pick whichever spelling this installation
# provides and fall back to the default style if neither exists.
whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(whitegrid_style)
plt.figure(figsize=(8, 5))
plt.plot(fpr_svc, tpr_svc, label="Support Vector Classifier")
plt.legend(loc='lower right', frameon=True)
plt.title("ROC Curve")
plt.ylabel("TPR")
plt.xlabel("FPR")
plt.show()

### Plotting the Learning Curve -

In [None]:
from sklearn.model_selection import learning_curve
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# X is unscaled here, so the scaler is bundled with the classifier and fitted
# inside every training subset; this keeps the curve consistent with the
# standardized features the model was trained on.
# learning_curve clones the estimator, leaving the fitted `svc` untouched.
scaled_svc = make_pipeline(StandardScaler(), svc)
train_sizes, train_scores, test_scores = learning_curve(
    scaled_svc, X, y,
    train_sizes=np.linspace(0.1, 1, 10), cv=10, scoring='accuracy', n_jobs=-1, verbose=1)

In [None]:
# Average the per-fold scores (axis 1) for every training-set size.
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

In [None]:
# Matplotlib renamed its bundled seaborn styles to 'seaborn-v0_8-*' in 3.6 and
# later removed the old names, so pick whichever spelling this installation
# provides and fall back to the default style if neither exists.
whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(whitegrid_style)
plt.figure(figsize=(8, 5))
# Mean training and validation accuracy as a function of the training-set size.
plt.plot(train_sizes, train_scores_mean, '-o', label = "Training Score", color='red')
plt.plot(train_sizes, test_scores_mean, '-o', label = "Cross Validation Score", color='blue')
plt.legend(loc='best', frameon=True)
plt.title("Learning Curve")
plt.xlabel("Training Size")
plt.ylabel("Accuracy")
plt.show()

### Tuning the Hyperparameters -

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# X is unscaled, so the search runs over a scaler + classifier pipeline: the
# scaler is re-fitted inside every fold and the tuned model sees standardized
# features, like the model trained above. make_pipeline names the classifier
# step 'svc', hence the 'svc__' prefix on the parameter names.
scaled_svc = make_pipeline(StandardScaler(), svc)

params = { 'svc__C': [1, 10, 100, 1000, 10000],
           'svc__kernel': ['poly', 'rbf'],
         }

# random_state fixes which parameter combinations are sampled, making the
# search repeatable between runs.
random = RandomizedSearchCV(scaled_svc, param_distributions=params, cv=10,
                            scoring='accuracy', n_jobs=-1, random_state=0)
random.fit(X,y)

In [None]:
# Parameter combination that achieved the best cross-validated score in the search.
random.best_params_

In [None]:
# Mean cross-validated accuracy of the best parameter combination found.
print("Cross Validation Score after Hyperparameter Tuning: {:.2f}".format(random.best_score_))