This notebook explores the Breast Cancer Wisconsin (Diagnostic) Data Set.
Here I analyze and visualize the data and use a Random Forest Classifier to predict the diagnosis.
I also try various feature selection methods.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
bc = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')

# Data Visualization

In [None]:
bc.info()

In [None]:
bc.describe()

In [None]:
bc.head()

We can drop the empty 'Unnamed: 32' column and 'id' column as they are not relevant to diagnosis.

In [None]:
# Rebind instead of mutating in place, and ignore already-missing columns,
# so this cell can be re-run without raising KeyError.
bc = bc.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

Check for any null values

In [None]:
bc.isnull().sum().sum()

In [None]:
bc.isnull().any(axis=1).sum()

In [None]:
bc['diagnosis'].value_counts()

In [None]:
# Name the column explicitly: the first positional argument of countplot is
# `data` in current seaborn, so a bare Series is not interpreted as `x`.
sns.countplot(x='diagnosis', data=bc)

In [None]:
bc.groupby('diagnosis').mean()

In [None]:
# Per-class feature means, computed once and reused for both classes.
class_means = bc.groupby('diagnosis').mean()
malignant_to_benign = class_means.loc['M'] / class_means.loc['B']

plt.figure(figsize=(15,5))
plt.plot(malignant_to_benign)
plt.title('Ratio of Malignant to Benign values')
plt.xticks(rotation=90)
plt.show()

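A ratio above 1 means the feature's mean is higher for Malignant than for Benign diagnoses; this holds for nearly all features, while ratios close to 1 mark features whose means barely differ between the two classes.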

Plotting Swarm plots for features

In [None]:
X = bc.iloc[:, 1:]
y = bc.iloc[:, :1]

In [None]:
X1 = X.iloc[:, :15]

In [None]:
X1 = (X1-X1.mean())/X1.std()

In [None]:
data = pd.concat([y, X1], axis=1)

In [None]:
data = pd.melt(data,id_vars="diagnosis",
                    var_name="features",
                    value_name='value')

In [None]:
plt.figure(figsize=(20,7))
sns.swarmplot(x='features', y='value', hue='diagnosis', data=data)
plt.xticks(rotation=45)
plt.show()

In [None]:
X2 = X.iloc[:, 15:]

In [None]:
X2 = (X2-X2.mean())/X2.std()

In [None]:
data = pd.concat([y, X2], axis=1)

In [None]:
data = pd.melt(data,id_vars="diagnosis",
                    var_name="features",
                    value_name='value')

In [None]:
plt.figure(figsize=(20,7))
sns.swarmplot(x='features', y='value', hue='diagnosis', data=data)
plt.xticks(rotation=45)
plt.show()

# Random Forest Classifier

In [None]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc, precision_recall_curve

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
def processing(df):
    """Split ``df`` into min-max-scaled train and test feature sets.

    Every column after the first is used as a feature and the ``'diagnosis'``
    column is the target. The split holds out 20% of the rows with a fixed
    ``random_state`` of 42. The scaler is fitted on the training rows only and
    then applied to the test rows.

    Returns ``(X_train, X_test, y_train, y_test)``. The feature sets are
    DataFrames that keep the original column names and row index, so they
    stay aligned with ``y_train`` / ``y_test``.
    """
    X = df.iloc[:, 1:]
    y = df['diagnosis']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scale = MinMaxScaler()
    # The scaler returns bare numpy arrays; re-attach the labels so the
    # feature names and the row alignment with the targets are not lost.
    X_train = pd.DataFrame(scale.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(scale.transform(X_test), columns=X_test.columns, index=X_test.index)
    return X_train, X_test, y_train, y_test

In [None]:
def random_forest(df, i):
    """Train a random forest on one train/test split of ``df``.

    Every column after the first is used as a feature and the ``'diagnosis'``
    column is the target. ``i`` seeds both the 80/20 train/test split and the
    forest itself, so the same ``(df, i)`` always yields the same model.
    Features are min-max scaled with a scaler fitted on the training rows only.

    Returns ``(rfc, X_test, y_test)``: the fitted classifier, the scaled test
    features (a numpy array) and the test targets.
    """
    X = df.iloc[:, 1:]
    y = df['diagnosis']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    scale = MinMaxScaler()
    X_train = scale.fit_transform(X_train)
    X_test = scale.transform(X_test)
    # Seed the forest too; otherwise scores and feature importances change
    # from run to run even for a fixed split.
    rfc = RandomForestClassifier(n_estimators=100, random_state=i)
    rfc.fit(X_train, y_train)
    return rfc, X_test, y_test

In [None]:
def scores(f, df, n):
    """
        Evaluate a model-building function over several random states.

        f  -- callable f(df, random_state) returning (fitted classifier, X_test, y_test)
        df -- dataframe passed unchanged to f
        n  -- number of random states to try; states 0, 1, ..., n-1 are used

        Returns a numpy array with classifier.score(X_test, y_test) for each
        random state, in order.
    """
    def score_for(state):
        model, features, labels = f(df, state)
        return model.score(features, labels)

    return np.array([score_for(state) for state in range(n)])

In [None]:
score_list = scores(random_forest, bc, 100)

In [None]:
plt.figure(figsize=(15,5))
plt.plot(score_list)
plt.xlabel('random state')
plt.ylabel('mean accuracy score')

In [None]:
score_list.mean()

The average accuracy of the Random Forest Classifier over the 100 random train/test splits is about 0.96.

In [None]:
rfc, X_test, y_test = random_forest(bc, 109)

In [None]:
predictions = rfc.predict(X_test)

In [None]:
accuracy_score(y_test, predictions)

In [None]:
print(classification_report(y_test, predictions))

In [None]:
fig, axes = plt.subplots(1,1, figsize=(15,5))
# Fix the class order explicitly and label the ticks with it, so the cells can
# be read without knowing scikit-learn's default label ordering.
# fmt='d' prints counts as plain integers (the default format switches to
# scientific notation for counts of 100 or more).
sns.heatmap(confusion_matrix(y_test, predictions, labels=rfc.classes_), annot=True, fmt='d',
            xticklabels=rfc.classes_, yticklabels=rfc.classes_, ax=axes)
# confusion_matrix puts true labels on rows and predicted labels on columns.
axes.set_xlabel('predicted diagnosis')
axes.set_ylabel('true diagnosis')

In [None]:
pred_prob = rfc.predict_proba(X_test)

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, pred_prob[:, 1], pos_label='M')

In [None]:
precision, recall, thresholds = precision_recall_curve(y_test, pred_prob[:, 1], pos_label='M')

In [None]:
roc_score = auc(fpr, tpr)

In [None]:
plt.plot(fpr, tpr, label='(area = %0.2f)' % roc_score)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")

Plotting probabilities for each case.

In [None]:
# Order the test cases by their probability in the first column (class 'B'),
# computing the sort order once for both panels.
order = np.argsort(pred_prob[:, 0])
sorted_prob = pred_prob[order]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,5), gridspec_kw={'width_ratios': [1, 2]})
sns.heatmap(sorted_prob, ax=ax1)
ax2.plot(sorted_prob)
ax2.set_xlabel('test case number')
ax2.set_ylabel('probability')
ax2.legend(['B', 'M'])

Predictions where neither class reaches a predicted probability of 80%

In [None]:
pred_prob[(pred_prob[:, 0] < .8) & (pred_prob[:, 1] < .8)]

In [None]:
percentage = pred_prob[(pred_prob[:, 0] < .8) & (pred_prob[:, 1] < .8)].shape[0]/pred_prob.shape[0]

In [None]:
# `percentage` is the share of test cases whose highest class probability is
# below 0.8, which measures model confidence rather than accuracy.
f'{np.round(percentage*100, 2)} percent of the predictions have a predicted class probability below 80%'

# Feature Selection

Since there are 30 features let us look into the correlations to see if we can reduce the number of features.

In [None]:
plt.figure(figsize=(15,10))
# 'diagnosis' holds the string labels 'M'/'B'. Drop it before computing
# correlations: recent pandas versions raise on non-numeric columns instead of
# silently skipping them.
sns.heatmap(bc.drop(columns='diagnosis').corr(), cmap='Spectral', annot=True, fmt='.1f')

1. 'radius_mean', 'perimeter_mean', 'area_mean' are correlated with each other with Correlation coefficient of 1.0
2. 'radius_worst', 'perimeter_worst', 'area_worst' are correlated with each other with Correlation coefficient of 1.0
3. 'radius_se', 'perimeter_se', 'area_se' are correlated with each other with Correlation coefficient of 0.9 or more
4. 'compactness_mean', 'concavity_mean', 'concave points_mean' are correlated with each other with Correlation coefficient of 0.8 or more
5. 'compactness_worst', 'concavity_worst', 'concave points_worst' are correlated with each other with Correlation coefficient of 0.8 or more
6. 'compactness_se', 'concavity_se', 'concave points_se' are correlated with each other with Correlation coefficient of 0.7 or more

Let us keep only one feature from each group of three correlated features.

In [None]:
lst = ['diagnosis', 'texture_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'symmetry_mean', 'fractal_dimension_mean','texture_se',
       'area_se', 'smoothness_se','compactness_se', 'symmetry_se','fractal_dimension_se', 'texture_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'symmetry_worst', 'fractal_dimension_worst']

In [None]:
bc_1 = bc.loc[:, lst]

Heat map of correlations with reduced number of features.

In [None]:
plt.figure(figsize=(15,10))
# bc_1 still contains the string column 'diagnosis'. Exclude it so corr() only
# sees numeric features (recent pandas versions raise on non-numeric columns).
sns.heatmap(bc_1.drop(columns='diagnosis').corr(), cmap='Spectral', annot=True)

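Violin and swarm plots for visualizing the selected features.
The features are standardized first (mean subtracted, then divided by the standard deviation) so that they share a common scale.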

In [None]:
data = bc_1.iloc[:, 1:]

In [None]:
data = (data-data.mean())/data.std()

In [None]:
data = pd.concat([bc['diagnosis'], data], axis=1)

In [None]:
data = pd.melt(data,id_vars="diagnosis",
                    var_name="features",
                    value_name='value')

In [None]:
plt.figure(figsize=(20,7))
sns.violinplot(x='features', y='value', hue='diagnosis', data=data, split=True, inner='quart')
plt.xticks(rotation=45)
plt.show()

In [None]:
plt.figure(figsize=(20,7))
sns.swarmplot(x='features', y='value', hue='diagnosis', data=data)
plt.xticks(rotation=45)
plt.show()

Random forest classifier for reduced number of features.

In [None]:
score_list_1 = scores(random_forest, bc_1, 100)

In [None]:
score_1 = np.round(score_list_1.mean(), 2)

In [None]:
score_1

In [None]:
rfc, X_test, y_test = random_forest(bc_1, 34)

In [None]:
y = bc['diagnosis']

In [None]:
rfc.score(X_test, y_test)

In [None]:
sns.heatmap(confusion_matrix(y_test, rfc.predict(X_test)), annot=True)

## Feature selection using random forest

In [None]:
feature_imp = pd.DataFrame(rfc.feature_importances_, bc_1.iloc[:, 1:].columns, columns=['importance']).sort_values(by='importance', ascending=False)

In [None]:
feature_imp.head()

In [None]:
rfc_df = pd.concat([bc['diagnosis'], bc_1[feature_imp.head().index]], axis=1)

In [None]:
score_list_fs_1 = scores(random_forest, rfc_df , 100)

In [None]:
score_fs_1 = np.round(score_list_fs_1.mean(), 2)

In [None]:
score_fs_1

In [None]:
F"With five important features we can get the score of {score_fs_1} compared to {score_1} with 18 features."

In [None]:
rfc, X_test, y_test = random_forest(rfc_df, 34)

In [None]:
sns.heatmap(confusion_matrix(y_test, rfc.predict(X_test)), annot=True)

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, RFE, RFECV

In [None]:
X_train, X_test, y_train, y_test = train_test_split(bc.iloc[:, 1:], bc['diagnosis'], test_size=0.2, random_state=23)

In [None]:
min_ = X_train.min()
max_ = X_train.max()

In [None]:
X_train = (X_train - min_)/(max_ - min_)

In [None]:
X_test = (X_test - min_)/(max_ - min_)

## Feature Selection using chi2

In [None]:
best_features = SelectKBest(chi2, k=5).fit(X_train, y_train)

In [None]:
X_train.columns[best_features.get_support(indices=True)]

In [None]:
# Columns kept by the chi2 selector, then a frame with the target first and
# those columns after it, as random_forest() expects.
chi2_columns = X_train.columns[best_features.get_support(indices=True)]
chi2_df = pd.concat([bc['diagnosis'], bc[chi2_columns]], axis=1)
score_list = scores(random_forest, chi2_df, 100)

In [None]:
score_list.mean()

In [None]:
f'Score with features selected by chi2 {np.round(score_list.mean(), 2)}'

## Feature Selection using RFECV

In [None]:
# Seed the estimator so the selected feature set is reproducible across runs.
rfecv = RFECV(RandomForestClassifier(random_state=42), min_features_to_select=5)

In [None]:
rfecv.fit(X_train, y_train)

In [None]:
X_train.columns[rfecv.support_]

In [None]:
rfecv.n_features_

## Feature Extraction by Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

In [None]:
X_train, X_test, y_train, y_test = train_test_split(bc.iloc[:, 1:], bc['diagnosis'], test_size=0.2, random_state=101)

In [None]:
X_train = (X_train - X_train.min())/(X_train.max() - X_train.min())

In [None]:
pca = PCA()

In [None]:
pca.fit(X_train)

In [None]:
pca.explained_variance_ratio_

In [None]:
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Entry k of the cumulative sum covers the first k+1 components, so plot it
# against 1..n rather than the default 0..n-1 index. Otherwise the
# 'n_components' axis reads one component too low.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(15,5))
plt.plot(component_counts, cumulative_variance)
plt.xticks(component_counts)
plt.grid()
plt.xlabel('n_components')
plt.ylabel('cumulative explained_variance_ratio_')

6 components can be used to explain more than 90% variance.