In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
import warnings
warnings.filterwarnings('ignore')

## Load the data

In [None]:
# Location of the CSV inside the Kaggle input directory.
csv_path = "/kaggle/input/breast-cancer-wisconsin-data/data.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Preview the last rows of the frame.
df.tail()

In [None]:
# (number of rows, number of columns) of the frame.
df.shape

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

In [None]:
# Distinct values present in the diagnosis column.
df.diagnosis.unique()

In [None]:
# Number of rows for each diagnosis value.
df.diagnosis.value_counts()

In [None]:
# Bar chart of how many rows carry each diagnosis value.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, not `x`, so the positional form no longer
# produces the intended per-class count plot.
sns.countplot(x='diagnosis', data=df)

## Create and Prepare the Data

In [None]:
# Look at the frame again before preparing it for modelling.
df.head()

In [None]:
# Remove the two non-feature columns: the 'id' row identifier and
# 'Unnamed: 32' (presumably an empty artifact column from the CSV -- confirm).
# Rebinding `df` instead of mutating in place, together with errors='ignore',
# lets this cell be re-run without raising KeyError once the columns are gone.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

In [None]:
# Confirm that the 'id' and 'Unnamed: 32' columns are gone.
df.head()

In [None]:
# Encode the label numerically: 'M' becomes 1 and 'B' becomes 0.
diagnosis_codes = {'M':1,'B':0}
df['diagnosis'] = df['diagnosis'].map(diagnosis_codes)
df.head()

In [None]:
# Count the missing (null) values in each column.
df.isnull().sum()

In [None]:
# Pairwise correlation matrix of the columns.
df.corr()

In [None]:
# Histogram of the encoded diagnosis labels (1 = M, 0 = B).
diagnosis_values = df['diagnosis']
plt.hist(diagnosis_values, color='g')
plt.title('Plot Diagnosis (M=1 ,B=0)')
plt.show()

In [None]:
# Annotated heatmap of every pairwise correlation, drawn on a large canvas
# so the cell annotations remain legible.
plt.figure(figsize=(20, 20))
feature_correlations = df.corr()
sns.heatmap(feature_correlations, annot=True)

In [None]:
# Scatter-plot matrix restricted to the label and the "*_mean" measurements.
mean_columns = (
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
)
cols = ['diagnosis', *mean_columns]

# Colour each point by its diagnosis value.
sns.pairplot(data=df[cols], hue='diagnosis', palette='rocket')


Here `radius_mean`, `perimeter_mean`, and `area_mean` are highly correlated with one another, which leads to a problem known as multicollinearity.


The almost perfectly linear patterns between the radius, perimeter, and area attributes hint at the presence of multicollinearity between these variables (they are highly linearly related). Another set of variables that possibly implies multicollinearity is concavity, concave points, and compactness.

Multicollinearity is a problem because it undermines the statistical significance of the independent variables; we fix it by removing the highly correlated predictors from the model.

Alternatively, use Partial Least Squares (PLS) regression or Principal Component Analysis (PCA), methods that reduce the predictors to a smaller set of uncorrelated components.


In [None]:
# Generate and visualize the correlation matrix

# Pairwise correlations, rounded to 2 decimals so the annotations stay short.
corr = df.corr().round(2)

# Mask for the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` is used because the `np.bool` alias was removed in
# NumPy 1.24 and raises AttributeError there.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set figure size
f, ax = plt.subplots(figsize=(20, 20))

# Define custom colormap (also reused by the later heatmap cell)
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap on the axes created above, with the colour scale fixed to
# [-1, 1] and centred on 0.
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True,
            ax=ax)

plt.tight_layout()


From the heatmap we can verify the presence of multicollinearity between some of the variables. For instance, the radius_mean column has a correlation of 1 and 0.99 with the perimeter_mean and area_mean columns, respectively.

This is because the three columns essentially contain the same information, which is the physical size of the observation (the cell). Therefore we should only pick ONE of the three columns when we go into further analysis.



Another place where multicollinearity is apparent is between the "mean" columns and the "worst" columns.

For instance, the radius_mean column has a correlation of 0.97 with the radius_worst column.
There is also multicollinearity between the compactness, concavity, and concave points attributes,
so we can keep just ONE of these; I am going with compactness.

In [None]:
# Step 1: remove every "*_worst" measurement from the frame.
worst_columns = (
    'radius_worst',
    'texture_worst',
    'perimeter_worst',
    'area_worst',
    'smoothness_worst',
    'compactness_worst',
    'concavity_worst',
    'concave points_worst',
    'symmetry_worst',
    'fractal_dimension_worst',
)
cols = list(worst_columns)
df = df.drop(columns=cols)

In [None]:
# Step 2: remove the "perimeter" and "area" measurements.
size_columns = ('perimeter_mean', 'perimeter_se', 'area_mean', 'area_se')
cols = list(size_columns)
df = df.drop(columns=cols)

In [None]:
# Step 3: remove the "concavity" and "concave points" measurements.
concavity_columns = (
    'concavity_mean',
    'concavity_se',
    'concave points_mean',
    'concave points_se',
)
cols = list(concavity_columns)
df = df.drop(columns=cols)

In [None]:
# Verify which columns remain after the three drop steps above.
df.columns

In [None]:
# Draw the heatmap again, with the correlation matrix of the reduced frame.
corr = df.corr().round(2)

# Hide the upper triangle (diagonal included). The builtin `bool` replaces
# the `np.bool` alias, which was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(20, 20))
# `cmap` is the diverging palette defined in the earlier heatmap cell.
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True,
            ax=ax)
plt.tight_layout()

## Building a Model

In [None]:
# Scatter-plot matrix of every remaining column, coloured by diagnosis.
sns.pairplot(data=df, hue='diagnosis', palette='rocket')


In [None]:
# Separate the label (y) from the predictor columns (X).
target_column = 'diagnosis'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=40)
X_train, X_test, y_train, y_test = split_parts


### Feature Scaling


StandardScaler standardizes a feature by subtracting the mean and then scaling to unit variance, i.e. dividing all the values by the standard deviation. As a result, each feature has a mean of 0 and a standard deviation of 1.

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split
# only, then apply those same statistics to both splits.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
# transform (not fit_transform): re-fitting on the test split would scale it
# with its own statistics, leaking test information and putting the two
# splits on different scales.
X_test = ss.transform(X_test)

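It transforms the data so that each feature has a mean of 0 and a standard deviation of 1.
In short, it standardizes the data. Standardization puts features measured on different scales onto a comparable scale, which matters for distance- and gradient-based models such as KNN, SVM, and logistic regression.
Note that it only shifts and rescales each feature; it does not change the shape of the distribution, so the data is not made normally distributed.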

### Models and finding out the Best one


#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression on the scaled training data.
# `fit` returns the estimator itself, so `model1` refers to the fitted `lr`.
lr = LogisticRegression()
lr.fit(X_train, y_train)
model1 = lr

# Predicted labels for the held-out test rows.
prediction_model1 = model1.predict(X_test)


In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the true test labels against the logistic-regression predictions.
cm =confusion_matrix(y_test,prediction_model1)
# Per the scikit-learn docs, rows are true labels and columns are predicted labels,
# so with labels 0 (B) and 1 (M) the layout is [[TN, FP], [FN, TP]].
cm

In [None]:
# Plot the logistic-regression confusion matrix and save it to disk.
# fmt='d' prints the integer counts exactly; the default '.2g' format would
# render a count such as 105 as '1e+02'.
sns.heatmap(cm, annot=True, fmt='d')
plt.savefig('confusion_matrix_loR.png')

In [None]:
# scikit-learn lays a binary confusion matrix out as [[TN, FP], [FN, TP]]
# (rows = true label, columns = predicted label; 1 = malignant is the
# positive class). The previous version had TP and TN swapped, which left the
# accuracy unchanged but mislabelled the individual counts.
TN, FP, FN, TP = cm.ravel()
print('Testing Accuracy:',(TP+TN)/(TP+TN+FN+FP))

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test rows the logistic regression labelled correctly.
accuracy_score(y_test,prediction_model1)

In [None]:
from sklearn.metrics import classification_report
# Text report of per-class metrics for the logistic-regression predictions.
print(classification_report(y_test,prediction_model1))

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the scaled training data. The seed matches the one
# used for the train/test split so the tree, and therefore the reported
# metrics, are reproducible from run to run.
dtc = DecisionTreeClassifier(random_state=40)
model2 = dtc.fit(X_train, y_train)

# Predicted labels for the held-out test rows.
prediction_model2 = model2.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the true test labels against the decision-tree predictions.
cm2 = confusion_matrix(y_test,prediction_model2)
cm2

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test rows the decision tree labelled correctly.
accuracy_score(y_test,prediction_model2)

In [None]:
from sklearn.metrics import classification_report
# Text report of per-class metrics for the decision-tree predictions.
print(classification_report(y_test,prediction_model2))

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the scaled training data. The seed matches the one
# used for the train/test split so the forest, and therefore the reported
# metrics, are reproducible from run to run.
rfc = RandomForestClassifier(random_state=40)
model3 = rfc.fit(X_train, y_train)

# Predicted labels for the held-out test rows and their confusion matrix.
prediction_model3 = model3.predict(X_test)
cm3 = confusion_matrix(y_test, prediction_model3)
cm3

In [None]:
# Fraction of test rows the random forest labelled correctly.
accuracy_score(y_test,prediction_model3)

In [None]:
from sklearn.metrics import classification_report
# Text report of per-class metrics for the random-forest predictions.
print(classification_report(y_test,prediction_model3))

#### KNN  


#### SVM   

#### Naive Bayes

A technique for creating and evaluating multiple models at the same time

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [None]:
# (display name, unfitted estimator) pairs to compare with cross-validation.
# The tree-based estimators are seeded (same seed as the train/test split) so
# their cross-validation scores are reproducible between runs.
models = [
    ('KNN', KNeighborsClassifier()),
    ('SVC', SVC()),
    ('Naive Bayes', GaussianNB()),
    ('LogisticRegression', LogisticRegression()),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=40)),
    ('RandomForestClassifier', RandomForestClassifier(random_state=40)),
]


K-fold cross-validation is a model validation technique that does not use your already-trained model. Instead, it takes the model's hyper-parameters, trains a new model on k-1 folds of the data, and tests that model on the remaining k-th fold. The k models trained this way are used only for validation.

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [None]:
# Evaluate each candidate model with 10-fold cross-validation on the
# training split and print its mean accuracy and standard deviation.

results = []
names = []

# `random_state` only has an effect when shuffling is enabled; current
# scikit-learn raises ValueError for KFold(random_state=...) with
# shuffle=False. The splitter does not depend on the model, so it is built
# once and shared, which gives every model the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=40)

for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)

    msg = '%s:, %f, (%f)' % (name, cv_results.mean(), cv_results.std())
    print(msg)

#### Here SVC gives the best result, so we use it to make predictions on the test data

In [None]:
# Fit the selected SVC on the training split and score it on the test split.

SVM = SVC()
model4 = SVM.fit(X_train, y_train)
prediction_model4 = model4.predict(X_test)

# Print, in order: confusion matrix, accuracy, per-class report.
svm_summaries = (
    confusion_matrix(y_test, prediction_model4),
    accuracy_score(y_test, prediction_model4),
    classification_report(y_test, prediction_model4),
)
for svm_summary in svm_summaries:
    print(svm_summary)


With SVM we get the best accuracy: the model predicts with about 96.4% accuracy on our test data.