In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=Warning)

## Loading Dataset


In [None]:
dataset = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')

# Exploratory Data Analysis

In [None]:
dataset.head()


In [None]:
# 'Unnamed: 32' is not used as a feature; errors='ignore' keeps this cell safe
# to re-run after the column has already been removed.
dataset = dataset.drop(columns='Unnamed: 32', errors='ignore')


In [None]:
dataset.describe()

In [None]:
dataset.isnull().values.any()

In [None]:
dataset.isnull().values.sum()

In [None]:
dataset.isnull().sum()


### There are no NaN values in the dataset


In [None]:
dataset.shape

### The dataset has 569 rows and 32 columns

In [None]:
dataset['diagnosis'].agg(['count', 'size', 'nunique'])


In [None]:

# Series.value_counts() replaces the deprecated top-level pd.value_counts().
dataset['diagnosis'].value_counts()


In [None]:
# Class balance of the target column, drawn on an explicitly created axes.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x="diagnosis", data=dataset, palette='magma', ax=ax);


### The diagnosis column has 2 unique values, Malignant (M) and Benign (B), with counts of 212 and 357 respectively.

In [None]:
# Correlate only the numeric columns: 'diagnosis' holds strings, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr = dataset.select_dtypes(include='number').corr()

plt.figure(figsize=(20, 17))
# Boolean mask hiding the upper triangle (diagonal included), which only
# mirrors the lower one. A boolean mask also hides cells whose correlation
# happens to be exactly zero, unlike a mask built from the values themselves.
matrix = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, annot=True, linewidth=.8, mask=matrix, cmap="rocket");


As we can observe from the heatmap, there are many negative correlations in this dataset. Let's examine some of them by plotting them.

Negative Correlations
The column 'fractal_dimension_mean' is negatively correlated with many other attributes, such as 'area_mean' and 'area_worst'. We'll draw scatter plots for some of these pairs.

Fractal analysis of images of breast tissue specimens provides a numeric description of tumour growth patterns as a continuous number between 1 and 2. This number is known as the fractal dimension.

In [None]:
fig, ax = plt.subplots(2, 2, figsize=(15, 15))

# (x column, y column) for each panel, listed in row-major order so they line
# up with ax.flat: top-left, top-right, bottom-left, bottom-right.
panels = [
    ('fractal_dimension_mean', 'area_mean'),
    ('fractal_dimension_worst', 'area_worst'),
    ('smoothness_se', 'radius_worst'),
    ('symmetry_se', 'radius_worst'),
]
for axis, (x_col, y_col) in zip(ax.flat, panels):
    sns.scatterplot(x=x_col, y=y_col, hue="diagnosis",
                    data=dataset, ax=axis, palette='magma')


In [None]:
# The ten measurements that appear below with both a '_mean' and a '_worst'
# suffix.
base_features = ['radius', 'texture', 'perimeter', 'area', 'smoothness',
                 'compactness', 'concavity', 'concave points', 'symmetry',
                 'fractal_dimension']

# Each list starts with 'diagnosis' followed by one suffixed column per feature.
mean_columns = ['diagnosis'] + [f'{name}_mean' for name in base_features]
worst_columns = ['diagnosis'] + [f'{name}_worst' for name in base_features]


In [None]:
sns.pairplot(dataset[mean_columns], hue="diagnosis", palette='husl');


In [None]:
sns.pairplot(dataset[worst_columns], hue="diagnosis", palette='viridis');


# Data Preprocessing

In [None]:
# Select by column name rather than position so the split between features
# and target does not depend on the column order of the CSV.
# Features: every column except the identifier and the target label.
X = dataset.drop(columns=['id', 'diagnosis']).values
# Target kept as an (n_samples, 1) column; it is flattened when label-encoded.
y = dataset[['diagnosis']].values

In [None]:
X

In [None]:
y

In [None]:
X.shape

In [None]:
y.shape

## Encoding Dependent Variable with Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# y was sliced as a single-column 2-D array; ravel() flattens it to 1-D
# before the string labels are replaced by integer codes.
y = le.fit_transform(y.ravel())


In [None]:
y  # 1 - Malignant and 0 - benign

### Splitting Dataset into Training Set and Test Set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)


### Standardizing training data


In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training split only, then apply those same statistics
# to the test split so no test-set information enters the scaling.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# NOTE: X_train / X_test are overwritten here, so this cell is not idempotent.
# Re-run the train/test split cell before running this one again, otherwise
# the scaler is refit on already-scaled data.


In [None]:
X_train

# Training Models on the Training Set and Predicting Results

In [None]:
# Mean 10-fold cross-validation accuracy (percent) of every model trained so
# far, keyed by the fitted estimator object.
accuracy_scores = {}
def predictor(predictor, params):
    """Train one classifier, print its evaluation, and record its CV accuracy.

    Relies on the notebook-level ``X_train``, ``y_train``, ``X_test``,
    ``y_test`` and the fitted scaler ``sc`` defined in earlier cells. The
    target is label-encoded as 0 = benign (B) and 1 = malignant (M), and
    malignant is treated as the positive class in the printed metrics.

    Parameters
    ----------
    predictor : str
        Model key: 'lr', 'svm', 'knn', 'dt', 'nb' or 'rfc'.
    params : dict
        Keyword arguments forwarded to the estimator's constructor.

    Raises
    ------
    ValueError
        If ``predictor`` is not one of the supported keys.

    Side effects
    ------------
    Prints the single-sample prediction, the test-set predictions, the
    confusion matrix, the classification report and the accuracy, then stores
    the mean cross-validation accuracy (in percent) in ``accuracy_scores``.
    """
    if predictor == 'lr':
        print('Training Logistic Regression on Training Set')
        from sklearn.linear_model import LogisticRegression
        classifier = LogisticRegression(**params)

    elif predictor == 'svm':
        print('Training Support Vector Machine on Training Set')
        from sklearn.svm import SVC
        classifier = SVC(**params)

    elif predictor == 'knn':
        print('TrainingK-Nearest Neighbours on Training Set')
        from sklearn.neighbors import KNeighborsClassifier
        classifier = KNeighborsClassifier(**params)

    elif predictor == 'dt':
        print('Training LDecision Tree Classifier on Training Set')
        from sklearn.tree import DecisionTreeClassifier
        classifier = DecisionTreeClassifier(**params)

    elif predictor == 'nb':
        print('Training Naive Bayes Classifier on Training Set')
        from sklearn.naive_bayes import GaussianNB
        classifier = GaussianNB(**params)

    elif predictor == 'rfc':
        print('Training Random Forest Classifier on Training Set')
        from sklearn.ensemble import RandomForestClassifier
        classifier = RandomForestClassifier(**params)

    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            "Unknown predictor {!r}; expected one of "
            "'lr', 'svm', 'knn', 'dt', 'nb', 'rfc'".format(predictor))

    classifier.fit(X_train, y_train)

    print('''Predicting Single Cell Result''')
    # One hand-entered sample of the 30 features, scaled with the same scaler
    # that was fitted on the training data.
    single_predict = classifier.predict(sc.transform([[
        17.99, 10.38, 122.8, 1001, 0.1184, 0.2776, 0.3001, 0.1471, 0.2419,
        0.07871, 1.095, 0.9053, 8.589, 153.4, 0.006399, 0.04904, 0.05373, 0.01587,
        0.03003, 0.006193, 25.38, 17.33, 184.6, 2019, 0.1622, 0.6656, 0.7119, 0.2654, 0.4601, 0.1189
    ]]))
    # predict() returns a length-1 array holding the encoded label.
    if single_predict[0] == 1:
        print("Cancer is Malignant \n")
    else:
        print("Cancer is Benign \n")

    print('''Prediciting Test Set Result''')
    y_pred = classifier.predict(X_test)
    # Side-by-side view: predicted label in column 0, true label in column 1.
    result = np.column_stack((y_pred, y_test))
    print(result,'\n')

    print('''Making Confusion Matrix''')
    from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
    # Rows are true labels and columns are predicted labels, in the order
    # given by `labels`. Passing labels explicitly guarantees a 2x2 matrix
    # even if one class is absent from the test split.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    print(cm,'\n')
    # With malignant (1) as the positive class the layout is [[TN, FP], [FN, TP]].
    tn, fp, fn, tp = cm.ravel()
    print('True Positives :', tp)
    print('False Positives :', fp)
    print('False Negatives :', fn)
    print('True Negatives :', tn,'\n')

    print('''Classification Report''')
    # target_names must follow the encoded order: 0 -> 'B', 1 -> 'M'.
    print(classification_report(y_test, y_pred, labels=[0, 1],
          target_names=['B', 'M'], zero_division=1))

    print('''Evaluating Model Performance''')
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy,'\n')

    print('''Applying K-Fold Cross validation''')
    from sklearn.model_selection import cross_val_score
    accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    # Item assignment mutates the module-level dict; no `global` is needed.
    accuracy_scores[classifier] = accuracies.mean()*100
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100),'\n')


## Training Logistic Regression

In [None]:
predictor('lr', {'penalty': 'l1', 'solver': 'saga', 'max_iter': 5000})


## Training Support Vector Machine 

In [None]:
predictor('svm', {'C': 1, 'gamma': 0.8,
          'kernel': 'linear', 'random_state': 0})


## Training Kernel Support Vector Machine

In [None]:
predictor('svm', {'C': 1, 'gamma': 0.1, 'kernel': 'rbf', 'random_state': 0})


## Training K-Nearest Neighbours

In [None]:
predictor('knn', {'n_neighbors': 5, 'n_jobs':1})


## Training Decision Trees

In [None]:
# max_features='auto' was removed in scikit-learn 1.3; for classifiers it was
# an alias for 'sqrt', so this keeps the same behaviour on every version.
predictor('dt', {'criterion': 'gini', 'max_features': 'sqrt', 'splitter': 'random' ,'random_state': 0})


## Training Naive Bayes

In [None]:
predictor('nb', {})


## Training Random Forest

In [None]:
# max_features='auto' was removed in scikit-learn 1.3; for classifiers it was
# an alias for 'sqrt', so this keeps the same behaviour on every version.
predictor('rfc', {'criterion': 'entropy',
          'max_features': 'sqrt', 'n_estimators': 250,'random_state': 0})


In [None]:
# Pick the estimator whose recorded cross-validation accuracy is the largest.
maxKey = max(accuracy_scores, key=accuracy_scores.get)
print(f'The model with highest K-Fold Validation Accuracy score is  {maxKey} with an accuracy of  {accuracy_scores[maxKey]:.2f}')


As you can see, the model with the highest accuracy is the Support Vector Machine, with an accuracy of 97.80%.


In [None]:
def _model_label(model):
    """Return a display name for an estimator, adding the kernel when it has one."""
    name = type(model).__name__
    kernel = getattr(model, 'kernel', None)
    return name if kernel is None else f'{name} ({kernel})'


# Labels are derived from the recorded estimators so they always line up with
# the scores, even if a model cell was skipped or run more than once.
model_names = [_model_label(model) for model in accuracy_scores]
model_accuracies = list(accuracy_scores.values())

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=model_accuracies, y=model_names, palette='mako', ax=ax)
ax.set(xlabel='Mean K-Fold cross-validation accuracy (%)', ylabel='Model',
       title='Cross-validation accuracy by model');


# Summary
- SVM performed best on this dataset, with an accuracy of 97.80%.
- Logistic Regression is just behind, with an accuracy of 97.58%.


## **Please leave your valuable feedback in the comments; any improvements or suggestions are welcome!**