### Packages import

In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os

### Importing the dataset

In [None]:
# Load the breast-cancer CSV from the Kaggle input directory into a DataFrame.
Data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/breast-cancer-wisconsin-data/data.csv"
)

### Understanding the data

In [None]:
# Preview the first five rows of the raw frame.
Data.head(n=5)

In [None]:
# Summary statistics of the columns.
Data.describe()

In [None]:
# Column names, non-null counts and dtypes.
Data.info()

In [None]:
# Remove the `id` and `Unnamed: 32` columns, which are not used as features
# (presumably `Unnamed: 32` is an empty CSV artefact -- confirm with Data.info()).
# Rebinding instead of inplace=True, together with errors="ignore", keeps this
# cell safe to re-run: the original raised KeyError once the columns were gone.
Data = Data.drop(columns=["id", "Unnamed: 32"], errors="ignore")

In [None]:
# Preview the first five rows after the column drop.
Data.head(n=5)

In [None]:
# Report, column by column, how many entries are NaN.
missing_per_column = Data.isna().sum()
for column_name, missing_count in missing_per_column.items():
    print(f"{column_name} has {missing_count} missing values")

There are no missing values in the data.

In [None]:
# Report, column by column, how many distinct (non-NaN) values occur.
unique_per_column = Data.nunique()
for column_name, unique_count in unique_per_column.items():
    print(f"{column_name} has {unique_count} unique values")

### Removing highly correlated variables using the Greedy Elimination Method

This process identifies the pairs of features that are highly correlated with each other and, from each pair, drops the feature that is less correlated with the target label.

In [None]:
# Numeric encoding of the diagnosis label: "B" -> 1, every other value -> 2.
Case_value = [1 if label == "B" else 2 for label in Data.diagnosis]


In [None]:
# Attach the numeric label (1 = "B", 2 = otherwise) as a new column.
Data = Data.assign(Case_severity=Case_value)

In [None]:
# For every numeric column, count how many OTHER columns it is highly
# (> 0.95) correlated with.
# - The correlation matrix is computed once instead of inside both loops.
# - Only numeric columns are used: the string `diagnosis` column makes a plain
#   Data.corr() raise on pandas >= 2.0.
# - The column's own self-correlation (always 1.0) is excluded, so the count
#   no longer includes the column itself.
corr_matrix = Data.select_dtypes(include="number").corr()
for i in corr_matrix.columns:
    counter = int((corr_matrix.loc[i].drop(labels=i) > 0.95).sum())
    print(f"{i} is highly correlated to {counter} variables")
            
            

In [None]:
# Finding the pairs of features with high correlation ( > 0.95)

In [None]:
# Collect every ordered pair of distinct numeric columns whose correlation
# exceeds 0.95.
# - The correlation matrix is computed once (the original recomputed it on every
#   iteration) and from numeric columns only, because the string `diagnosis`
#   column makes a plain Data.corr() raise on pandas >= 2.0.
# - NOTE: both (i, a) and (a, i) are visited, so each unordered pair is recorded
#   twice. That is kept as-is because later cells consume these lists.
corr_matrix = Data.select_dtypes(include="number").corr()

Highly_correlated = []
Highly_correlated_pairs = []
Variable_1 = []
Variable_2 = []
Coef = []
for i in corr_matrix.columns:
    for a in corr_matrix.columns:
        if i == a:
            # Skip the trivial self-correlation.
            continue
        coef = corr_matrix.loc[i, a]
        if coef > 0.95:
            print(f"{i} & {a} - {coef}")
            Highly_correlated.append(i)
            Highly_correlated.append(a)
            Highly_correlated_pairs.append((i, a))
            Variable_1.append(i)
            Variable_2.append(a)
            Coef.append(coef)

# Distinct names of all columns that take part in at least one such pair.
Highly_correlated_variables = set(Highly_correlated)
            

In [None]:
# Tabulate the collected pairs: one row per (Variable_1, Variable_2, Coef).
HighlyCorrelated = pd.DataFrame(
    {
        "Variable_1": Variable_1,
        "Variable_2": Variable_2,
        "Coef": Coef,
    }
)

In [None]:
# Picking the top 10 pairs

In [None]:
# Keep the ten rows with the largest coefficients, strongest first.
Top10 = HighlyCorrelated.sort_values(by="Coef", ascending=False).head(10)

In [None]:
# Renumber the rows 0..n-1 so later cells can look rows up by position.
# reset_index works for any number of rows, whereas assigning np.arange(0, 10)
# raises a length-mismatch error when fewer than ten pairs were found.
Top10 = Top10.reset_index(drop=True)

In [None]:
# Display the selected pairs.
Top10

In [None]:
# From each pair, pick the feature that is more highly correlated with the label

In [None]:
# For each pair in Top10, keep the feature that is more strongly correlated
# with the target column `Case_severity`.
# - The correlations with the target are computed once, from numeric columns
#   only (a plain Data.corr() raises on pandas >= 2.0 because of `diagnosis`).
# - Absolute values are compared so that a strong negative correlation counts
#   as strong; ties still fall to the second variable of the pair.
target_corr = (
    Data.select_dtypes(include="number").corr()["Case_severity"].abs()
)
Retained_Variables = [
    first if target_corr[first] > target_corr[second] else second
    for first, second in zip(Top10.Variable_1, Top10.Variable_2)
]

In [None]:
# Collapse repeated names; np.unique also returns them sorted.
Retained_Variables = list(np.unique(Retained_Variables))

In [None]:
# Display the de-duplicated list of retained feature names.
Retained_Variables

From this process, we arrive at 4 features to use as estimators. These 4 estimators are perimeter mean, perimeter worst, radius mean and radius worst.

From here we would just check how these features are correlated with the target label

### Data Exploration of estimators and relationship with the target labels

In [None]:
# Restrict the frame to the four retained features plus the two label columns.
Data = Data.loc[
    :,
    [
        "perimeter_mean",
        "perimeter_worst",
        "radius_mean",
        "radius_worst",
        "diagnosis",
        "Case_severity",
    ],
]

In [None]:
# Correlation matrix of the retained features and the numeric label.
# The string `diagnosis` column is excluded explicitly: a plain Data.corr()
# raises on pandas >= 2.0 when a non-numeric column is present.
Data.select_dtypes(include="number").corr()

In [None]:
# Summary statistics for the benign cases (every row not labelled "M").
Data.loc[Data["diagnosis"].ne("M")].describe()

In [None]:
# Summary statistics for the malignant cases (rows labelled "M").
Data.loc[Data["diagnosis"].eq("M")].describe()

In [None]:
# Pair plot of the retained features, coloured by diagnosis; the numeric label
# column is left out because it duplicates `diagnosis`.
sns.pairplot(data=Data.drop(columns="Case_severity"), hue="diagnosis")

##### NOTE - M stands for Malignant and B stands for Benign

Also, looking at the pair plots as a whole, the 4 chosen features do appear to influence the target variable: the radius and perimeter features differ significantly between the 2 diagnoses.

In [None]:
# Violin plots of each retained feature split by diagnosis, on a 2x2 grid.
# Changes compared with the four copy-pasted panels:
# - order=["M", "B"] pins the category order, so the hard-coded tick labels
#   below cannot end up attached to the wrong class.
# - Tick positions are fixed before relabelling, which avoids matplotlib's
#   warning about setting tick labels without fixed ticks.
# - One loop builds all panels and tight_layout is applied once, after every
#   axes exists (the original also called it on the still-empty figure).
violin_panels = [
    ("perimeter_mean", "Perimeter Mean"),
    ("perimeter_worst", "Perimeter Worst"),
    ("radius_mean", "Radius Mean"),
    ("radius_worst", "Radius Worst"),
]

fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for panel_ax, (column, title) in zip(axes.flat, violin_panels):
    sns.violinplot(data=Data, x="diagnosis", y=column, order=["M", "B"], ax=panel_ax)
    panel_ax.set_xticks([0, 1])
    panel_ax.set_xticklabels(["Malignant", "Benign"])
    panel_ax.set_title(title)

# Keep the per-panel names the original cell defined.
ax1, ax2, ax3, ax4 = axes.flat

fig.tight_layout(h_pad=4, w_pad=4)

Analysing the violin plots of the radius and perimeter values for malignant and benign breast cancer cases uncovered the following insights (Note: the units of the data are unknown):

1. In terms of size of the core tumor, in malignant cases, the mean size appears to be around 115 whereas in benign cases, the mean size appears to be around 78. In the worst case scenario, this mean size in malignant cases can be around 141 whereas in benign cases, it can be close to 87. 

2. In terms of the distances from the center to the points on the perimeter, the mean radius in malignant cases can be around 17 whereas in benign cases, the mean radius can be close to 12-13. In worst case scenarios, the mean radius in malignant cases appears to be close to 21 whereas in benign cases, the mean radius appears to be around 13. 

This shows that the estimators are quite significant in determining whether a particular breast cancer case is malignant or benign.

### Model Development

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [None]:
# Feature matrix (the four retained measurements) and numeric target label.
X_data = Data.loc[
    :, ["perimeter_mean", "perimeter_worst", "radius_mean", "radius_worst"]
]
y_data = Data.loc[:, "Case_severity"]

In [None]:
# Use 75% of the rows for training; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    train_size=0.75,
    random_state=2607,
)

In [None]:
# Baseline SVC with default hyper-parameters, fitted on the training split.
# probability=True enables predict_proba (needed for the ROC curve). Those
# probability estimates come from an internal, randomly shuffled
# cross-validation, so random_state is pinned (same seed as the train/test
# split) to make them reproducible across runs.
model = SVC(probability=True, random_state=2607).fit(X_train, y_train)

In [None]:
# Predicted class labels of the baseline model for the held-out test split.
ModelPredictions = model.predict(X_test)

### Model Evaluation for default model

In [None]:
from sklearn.metrics import accuracy_score,recall_score,precision_score, classification_report , roc_auc_score, confusion_matrix,roc_curve

In [None]:
# Fraction of test rows the baseline model labels correctly.
accuracy_score(y_true=y_test, y_pred=ModelPredictions)

In [None]:
# Precision for the malignant class.
# The labels are 1 ("B") and 2 (everything else, i.e. malignant). The default
# pos_label=1 would score the benign class, which disagrees with the ROC cells
# that use pos_label=2.
precision_score(y_test, ModelPredictions, pos_label=2)

In [None]:
# Recall for the malignant class.
# The labels are 1 ("B") and 2 (everything else, i.e. malignant). The default
# pos_label=1 would score the benign class, which disagrees with the ROC cells
# that use pos_label=2.
recall_score(y_test, ModelPredictions, pos_label=2)

In [None]:
# Confusion matrix of the baseline model on the test split.
# - labels=[1, 2] pins the row/column order to benign (1) then malignant (2),
#   matching the tick labels set below.
# - fmt="d" prints the counts as integers; seaborn's default annotation format
#   keeps two significant digits, so counts of 100 or more show as e.g. 1e+02.
# - ax=ax draws the heatmap on the axes that is labelled afterwards.
plt.figure()
ax = plt.subplot()
sns.heatmap(
    confusion_matrix(y_test, ModelPredictions, labels=[1, 2]),
    annot=True,
    fmt="d",
    cbar=False,
    ax=ax,
)
ax.set_xlabel("Predicted Values", labelpad=10)
ax.set_ylabel("True Values")
ax.xaxis.set_ticklabels(["Benign", "Malignant"])
ax.yaxis.set_ticklabels(["Benign", "Malignant"])
ax.set_title("CONFUSION MATRIX");  # trailing ';' hides the Text repr

In [None]:
# Per-class precision / recall / F1 summary for the baseline model.
print(classification_report(y_true=y_test, y_pred=ModelPredictions))

In [None]:
# Class-membership probabilities for the test split (one column per class).
model_predict_prob = model.predict_proba(X_test)

In [None]:
# Keep only the probability of the second class (label 2, malignant):
# predict_proba orders its columns like the sorted class labels [1, 2].
# The column is taken from a fresh predict_proba call rather than from the
# variable itself, so re-running this cell no longer fails with an IndexError
# on an already 1-D array.
model_predict_prob = model.predict_proba(X_test)[:, 1]

In [None]:
# ROC points for the baseline model, treating label 2 as the positive class.
fpr, tpr, threshold = roc_curve(
    y_true=y_test,
    y_score=model_predict_prob,
    pos_label=2,
)

In [None]:
# Draw the baseline model's ROC curve and print the area under it.
fig, roc_ax = plt.subplots(figsize=(10, 5))
roc_ax.plot(fpr, tpr, linestyle="--")
roc_ax.fill_between(fpr, tpr, alpha=0.1)
roc_ax.set_title("ROC for SVC")
roc_ax.set_xlabel("False Positive rate")
roc_ax.set_ylabel("True Positive rate")
print("ROC_AUC Score for SVC :", roc_auc_score(y_test, model_predict_prob))


### Searching for the best parameters

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Unfitted SVC with default settings; passed to GridSearchCV as the estimator.
empty = SVC()

In [None]:
# Hyper-parameter grid for the search.
# - The value 0.001 was listed twice for both C and gamma, which made the search
#   fit identical candidates repeatedly; each value now appears once.
#   NOTE(review): the repeated entry may have been meant as 0.0001 -- confirm.
# - gamma is the coefficient of the rbf/poly/sigmoid kernels and is ignored by
#   the linear kernel, so it is only swept for rbf. GridSearchCV accepts a list
#   of grids and searches their union.
params = [
    {
        "kernel": ["rbf"],
        "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        "gamma": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    },
    {
        "kernel": ["linear"],
        "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    },
]

In [None]:
# Cross-validated search over `params`; refit=True retrains the best candidate
# on the whole training split.
Grid = GridSearchCV(estimator=empty, param_grid=params, refit=True)
Grid = Grid.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best cross-validated score.
Grid.best_params_

In [None]:
# Mean cross-validated score of the best parameter combination.
Grid.best_score_

 ### Tuned Model

In [None]:
# Refit an SVC with the hyper-parameters chosen by the grid search.
# - Grid.best_params_ is used instead of hand-copied values, so this cell cannot
#   drift from the search result when the data, split or grid changes.
# - A fresh fit (rather than Grid.best_estimator_) is needed because the search
#   estimator was built without probability=True.
# - random_state pins the randomised internal cross-validation behind
#   predict_proba, keeping the ROC numbers reproducible.
tuned_model = SVC(**Grid.best_params_, probability=True, random_state=2607).fit(
    X_train, y_train
)

In [None]:
# Predicted class labels of the tuned model for the held-out test split.
tuned_model_predictions = tuned_model.predict(X_test)

 ### Evaluating the tuned model

In [None]:
# Per-class precision / recall / F1 summary for the tuned model.
print(classification_report(y_true=y_test, y_pred=tuned_model_predictions))

In [None]:
# Headline metrics of the tuned model on the test split.
# - The "Recall" line previously called accuracy_score and therefore repeated
#   the accuracy; it now calls recall_score.
# - Precision and recall are scored for label 2 (malignant), consistent with
#   the ROC cells; the default pos_label=1 would score the benign class.
print(f"Accuracy of tuned model = {accuracy_score(y_test,tuned_model_predictions)}")
print(f"Precision of tuned model = {precision_score(y_test,tuned_model_predictions,pos_label=2)}")
print(f"Recall of tuned model = {recall_score(y_test,tuned_model_predictions,pos_label=2)}")

In [None]:
# Confusion matrix of the tuned model on the test split.
# - labels=[1, 2] pins the row/column order to benign (1) then malignant (2),
#   matching the tick labels set below.
# - fmt="d" prints the counts as integers; seaborn's default annotation format
#   keeps two significant digits, so counts of 100 or more show as e.g. 1e+02.
# - ax=ax draws the heatmap on the axes that is labelled afterwards.
plt.figure()
ax = plt.subplot()
sns.heatmap(
    confusion_matrix(y_test, tuned_model_predictions, labels=[1, 2]),
    annot=True,
    fmt="d",
    cbar=False,
    ax=ax,
)
ax.set_xlabel("Predicted Values", labelpad=10)
ax.set_ylabel("True Values")
ax.xaxis.set_ticklabels(["Benign", "Malignant"])
ax.yaxis.set_ticklabels(["Benign", "Malignant"])
ax.set_title("CONFUSION MATRIX for tuned SVC");  # trailing ';' hides the Text repr

In [None]:
# Class-membership probabilities of the tuned model (one column per class).
tuned_model_predict_prob = tuned_model.predict_proba(X_test)

In [None]:
# Keep only the probability of the second class (label 2, malignant):
# predict_proba orders its columns like the sorted class labels [1, 2].
# The column is taken from a fresh predict_proba call rather than from the
# variable itself, so re-running this cell no longer fails with an IndexError
# on an already 1-D array.
tuned_model_predict_prob = tuned_model.predict_proba(X_test)[:, 1]

In [None]:
# ROC points for the tuned model, treating label 2 as the positive class.
fpr, tpr, threshold = roc_curve(
    y_true=y_test,
    y_score=tuned_model_predict_prob,
    pos_label=2,
)

In [None]:
# Draw the tuned model's ROC curve and print the area under it.
# The title now says "tuned SVC": it was copied unchanged from the baseline
# plot, which made the two figures indistinguishable.
plt.figure(figsize=(10, 5))
plt.plot(fpr, tpr, linestyle="--")
plt.fill_between(fpr, tpr, alpha=0.1)
plt.title("ROC for tuned SVC")
plt.xlabel("False Positive rate")
plt.ylabel("True Positive rate")
print("ROC_AUC Score for tuned SVC :", roc_auc_score(y_test, tuned_model_predict_prob))


Hey guys, this is one of my first tasks on Kaggle and in Data Science. I would be grateful if you could give me some feedback on my model, any mistakes I may have made, and how I could improve it. Your comments and feedback will be very valuable to me going forward.