# Introduction

We will begin by getting a basic idea of the dataset (i.e., checking for nulls, univariate distributions, and distributions conditional on the target variable). Then we will look into which features are the most important for predicting whether the cancer is malignant or benign. To estimate feature importance we will use two techniques: the t-test and random forest feature importance. We will check whether the estimates given by these two techniques agree with each other. Finally, we will train some classification models to see how accurately we can predict whether the cancer is benign or malignant. Let's begin:

# Import relevant libraries

In [None]:
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import tensorflow as tf
import math
from scipy import special #comb, factorial
from keras import backend as K
from scipy.stats import uniform
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import MinMaxScaler, StandardScaler,LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, recall_score, make_scorer, plot_confusion_matrix, confusion_matrix, accuracy_score,f1_score

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the dataset CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')
# Preview the first rows (shown as the cell output).
df.head()

In [None]:
# Print a summary of the columns: dtypes and non-null counts.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

We can see that `Unnamed: 32` contains only null values, so we will remove this column.

In [None]:
# Remove the all-null column in place. `errors='ignore'` makes the cell safe
# to re-run: without it a second execution raises KeyError because the column
# is already gone.
df.drop(columns=['Unnamed: 32'], inplace=True, errors='ignore')
df.head()

Let's see the distribution of our target variable, i.e., the distribution of  `diagnosis`:

In [None]:
# Absolute number of rows per value of the target column.
df['diagnosis'].value_counts()

Normalized distribution of `diagnosis`:

In [None]:
# Same counts expressed as fractions of the total number of rows.
df['diagnosis'].value_counts(normalize=True)

We see that we are dealing with an imbalanced dataset, so when assessing the performance of our classification models, we will use metrics other than accuracy.

Now let's check the univariate distributions of our features (one can see that all of our features are numeric):

In [None]:
# Univariate distributions: one histogram per numeric feature, 3 per row.
WIDTH = 16
LENGTH = 40
cont_features = df.columns.drop(['id', 'diagnosis'])

# Enough rows of three panels to hold every feature.
rows = math.ceil(len(cont_features) / 3)
fig, ax = plt.subplots(rows, 3, figsize=(WIDTH, LENGTH))
ax = ax.flatten()
for panel, feature in zip(ax, cont_features):
    panel.hist(df[feature], alpha=0.6)
    panel.set_title(f'Distribution of a feature `{feature}`')


Now let's check the conditional distributions of each numeric feature (conditional on our target variable, `diagnosis`):

In [None]:
# Box plots: distribution of each numeric feature split by diagnosis class,
# laid out three panels per row.
cat_variable = 'diagnosis'
cont_features = df.columns.drop(['id', 'diagnosis'])
WIDTH = 16
LENGTH = 50

# Enough rows of three panels to hold every feature.
rows = math.ceil(len(cont_features) / 3)
fig, ax = plt.subplots(rows, 3, figsize=(WIDTH, LENGTH))
ax = ax.flatten()
for panel, feature in zip(ax, cont_features):
    sns.boxplot(x=cat_variable, y=feature, data=df, ax=panel)
    panel.set_title(f'Cond. dist. of feature `{feature}`')

Visually, it seems that most of the numeric features have decent predictive power for the target variable.

Now let's use a more rigorous approach to assess the relation between each numeric feature and the target variable. For that we will use the t-test for independence. We will do the following:
1. Run the t-test for independence (independence between each numeric feature and `diagnosis`).
2. Check whether there are some features that **passed** the independence test (we say that a numeric feature $X$ passed the independence test if we cannot reject the null hypothesis. The null hypothesis is: numeric feature $X$ and categorical variable `diagnosis` are independent).
3. Consider all the features that failed the independence test (i.e., the null hypothesis is rejected). Rank these features (the smaller the $p$-value, the higher the rank). Which features have the best predictive power (i.e., have the highest rank or the smallest $p$-value)?

# The independent samples t-test

In [None]:
from scipy.stats import ttest_ind

# Run an independent two-sample t-test for every numeric feature, comparing
# its values between the two diagnosis classes, and collect the results in
# `ttest_df` (one row per feature).
cont_features = df.drop(['id','diagnosis'],axis=1).columns

label = 'diagnosis'
dic = {'Categorical': [],
    'Numerical': [],
    'p-value': [],
    'p < 0.05': [],
    't-statistic': []}

# The two-sample test only makes sense for a binary label.
classes = df[label].unique()
assert classes.size == 2, 'Label must only contain two unique values!'

# The class values and row masks do not depend on the feature, so compute
# them once instead of on every loop iteration.
value_1, value_2 = classes
mask_1 = df[label] == value_1
mask_2 = df[label] == value_2

for feature in cont_features:
    a = df.loc[mask_1, feature].values
    b = df.loc[mask_2, feature].values

    statistic, pval = ttest_ind(a, b)

    dic['Categorical'].append(label)
    dic['Numerical'].append(feature)
    dic['p-value'].append(pval)
    dic['p < 0.05'].append(pval < 0.05)
    dic['t-statistic'].append(statistic)


ttest_df = pd.DataFrame(dic)
ttest_df

How many features passed the independence test? (Feature $X$ has passed the independence test if and only if $p \geq 0.05$.)

In [None]:
# Tally how many features have p < 0.05 (True) versus p >= 0.05 (False).
ttest_df['p < 0.05'].value_counts()

We see that only $5$ features passed the independence test (their $p$-value is at least $0.05$). Let's have a look at the features that passed the test.

In [None]:
# List the features whose p-value is not below 0.05.
not_significant = ~ttest_df['p < 0.05']
print("Following features have passed t-test:")
print(ttest_df.loc[not_significant, 'Numerical'].tolist())

The fact that we failed to reject the null hypothesis for these 5 features implies that they are not doing a good job of distinguishing benign from malignant cancer. Let's visualize the box plots for these features one more time.

In [None]:
# Features for which the t-test did NOT reject the null hypothesis.
low_features = ttest_df.loc[~ttest_df['p < 0.05'], 'Numerical'].values


# BOX
cont_features = low_features
cat_variable = 'diagnosis'
WIDTH = 27
LENGTH = 5

# One panel per feature in a single row. The panel count follows the data
# instead of being hard-coded, so the cell keeps working if the number of
# non-significant features changes. `squeeze=False` guarantees a 2-D array
# of axes even for a single panel, so `.flatten()` is always valid.
n_panels = max(len(cont_features), 1)
fig, ax = plt.subplots(1, n_panels, figsize=(WIDTH, LENGTH), squeeze=False)
ax = ax.flatten()
for i, feature in enumerate(cont_features):
    # Outliers are hidden so the boxes themselves are easier to compare.
    sns.boxplot(x=cat_variable, y=feature, data=df, ax=ax[i], showfliers=False)
    ax[i].set_title(f'Cond. dist. of feature `{feature}`')

And indeed, we can see that the conditional distributions for each numeric feature are very similar.

Now we move on to the features that failed the independence test (i.e., the $p$-value is smaller than $0.05$). We will rank each feature according to its $p$-value: the smaller the $p$-value, the higher the rank (e.g., the highest rank is $1$, the next highest rank is $2$, etc.).

In [None]:
# Sort features by ascending p-value and put a 1-based `Rank` column first.
# Any existing `Rank` column is dropped beforehand so that re-running the
# cell does not create a duplicate `Rank` column.
ttest_df = (
    ttest_df
    .drop(columns='Rank', errors='ignore')
    .sort_values(by='p-value', ascending=True)
    .reset_index(drop=True)
)
ttest_df.insert(0, 'Rank', np.arange(1, len(ttest_df) + 1))
ttest_df

Let's visualize the conditional distributions of the top 6 features.

In [None]:
# BOX: conditional distributions of the six best-ranked features.
cont_features = ttest_df.loc[ttest_df['Rank'] <= 6, 'Numerical'].values
cat_variable = 'diagnosis'
WIDTH = 20
LENGTH = 12

# Use the computed row count (previously calculated but ignored in favour of
# a hard-coded 2x3 grid). `squeeze=False` keeps the axes array 2-D so
# `.flatten()` works for any grid shape.
rows = math.ceil(len(cont_features) / 3)
fig, ax = plt.subplots(rows, 3, figsize=(WIDTH, LENGTH), squeeze=False)
ax = ax.flatten()
for i, feature in enumerate(cont_features):
    # Outliers are hidden so the boxes themselves are easier to compare.
    sns.boxplot(x=cat_variable, y=feature, data=df, ax=ax[i], showfliers=False)
    ax[i].set_title(f'Cond. dist. of feature `{feature}`')

We can compare the graphs of the top 6 features with the lowest $p$-values with those of the features that passed the independence test. The top 6 features have staggeringly different conditional distributions (signified by the fact that the yellow and blue boxes are very clearly separated), but for the features that passed the independence test, the conditional distributions are hardly distinguishable (i.e., the boxes are of roughly the same shape and are on roughly the same level).

Now let's estimate feature importance using Random Forest, and let's compare top 6 features selected by t-test and top 6 features selected by random forest.

# Feature importance estimation via random forest

In [None]:
# Fit a random forest on all numeric features and plot the features'
# importances in descending order.
X = df.drop(['id','diagnosis'],axis=1).copy()
y = df['diagnosis'].copy()


# Fix the seed so the importance ranking (and therefore the top-6 comparison
# with the t-test) is reproducible between runs; 11 matches the seed used for
# the train/test split elsewhere in this notebook.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=11)
forest_clf.fit(X, y)

importances = forest_clf.feature_importances_
# Column positions of X sorted from most to least important.
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(7,7))
plt.bar(range(len(indices)),importances[indices])
# Tick labels are the column positions in X, not the feature names.
plt.xticks(range(len(indices)), indices)
plt.title("Feature importance (Random Forest)")
plt.xlabel('Index of a feature')
plt.ylabel('Feature importance')
plt.show()

In [None]:
# Names of the six best features according to each method, stored as sets
# so they can be compared regardless of order.
best_by_ttest = ttest_df.loc[ttest_df['Rank'] <= 6, 'Numerical']
top6_tt = set(best_by_ttest)
top6_rf = set(X.columns[indices[:6]])


Top 6 features selected by t-test

In [None]:
# Display the six feature names ranked highest by the t-test.
top6_tt

Top 6 features selected by random forest

In [None]:
# Display the six feature names ranked highest by the random forest.
top6_rf

We see that the top 6 features selected by the random forest and by the t-test are identical, which suggests that these 6 features are indeed good predictors of our target variable.

Now we will try to classify. Since we are only dealing with the numeric features and a lot of those features are (roughly) normally distributed, we will scale the features using `StandardScaler`. Our training will have 2 stages:

Stage 1. We use **all** numeric features present in the dataset.

Stage 2. We only use 6 top features.

After training, we will compare the performance of our models based on the features used. Because our target label is imbalanced, we will use the macro F1 score to evaluate the performance of our models.

# Training:  All features

In [None]:
# Stage 1: train each classifier on ALL numeric features and compare their
# macro F1 scores on a held-out test split.
X = df.drop(['id','diagnosis'],axis=1).copy()
y = df['diagnosis'].copy()


X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=11)
# Fit the scaler on the training split only and reuse it for the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


def f1_macro(clf,X_train=X_train,
             y_train=y_train,
             X_test=X_test,
             y_test=y_test):
    """Fit `clf` on the training data and return its macro F1 on the test data.

    The data arguments default to the split created just above; the defaults
    are captured when this function is defined.
    """
    clf.fit(X_train,y_train)
    pred = clf.predict(X_test)
    return f1_score(y_pred=pred,y_true=y_test,average='macro')


models = {'GB':GaussianNB(),
          'Logistic': LogisticRegression(random_state=11,max_iter=4000),
          'SVM': SVC(),
          'KNN': KNeighborsClassifier()}

model_name = list(models)
# Keep the scores in their own name so the `f1_macro` function is not
# overwritten by a list.
f1_scores = [round(f1_macro(models[x]),2) for x in model_name]


# Order the models from best to worst score for plotting.
count = np.array(f1_scores)
to_sort = np.argsort(count)[::-1]
cat_features = np.array(model_name)[to_sort]
count = count[to_sort]

plt.figure(figsize=(11,6))
# Pass x/y by keyword: seaborn 0.12+ rejects positional data arguments.
graph = sns.barplot(x=cat_features, y=count)
for p in graph.patches:
    # Centre the label on the bar using its real width rather than a fixed
    # 0.4 offset.
    graph.annotate(p.get_height(), (p.get_x()+p.get_width()/2, p.get_height()),
                   ha='center', va='bottom',
                   color= 'black')


plt.title("Performance of the models (all features)")
plt.xticks(rotation=45)
plt.ylabel('f1 score (macro)')
plt.xlabel('Model')
plt.show()

# Training:  Top 6 features

In [None]:
# Stage 2: train the same classifiers using only the six top-ranked features
# and compare their macro F1 scores on a held-out test split.
top_6 = ['perimeter_worst', 'radius_worst', 'concave points_worst', 'area_worst',
       'concave points_mean', 'perimeter_mean']

X = df[top_6]
y = df['diagnosis'].copy()


X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=11)

# Fit the scaler on the training split only and reuse it for the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


def f1_macro(clf,X_train=X_train,
             y_train=y_train,
             X_test=X_test,
             y_test=y_test):
    """Fit `clf` on the training data and return its macro F1 on the test data.

    The data arguments default to the split created just above; the defaults
    are captured when this function is defined.
    """
    clf.fit(X_train,y_train)
    pred = clf.predict(X_test)
    return f1_score(y_pred=pred,y_true=y_test,average='macro')


models = {'GB':GaussianNB(),
          'Logistic': LogisticRegression(random_state=11,max_iter=400),
          'SVM': SVC(),
          'KNN': KNeighborsClassifier()}

model_name = list(models)
# Keep the scores in their own name so the `f1_macro` function is not
# overwritten by a list.
f1_scores = [round(f1_macro(models[x]),2) for x in model_name]


# Order the models from best to worst score for plotting.
count = np.array(f1_scores)
to_sort = np.argsort(count)[::-1]
cat_features = np.array(model_name)[to_sort]
count = count[to_sort]

plt.figure(figsize=(11,6))
# Pass x/y by keyword: seaborn 0.12+ rejects positional data arguments.
graph = sns.barplot(x=cat_features, y=count)
for p in graph.patches:
    # Centre the label on the bar using its real width rather than a fixed
    # 0.4 offset.
    graph.annotate(p.get_height(), (p.get_x()+p.get_width()/2, p.get_height()),
                   ha='center', va='bottom',
                   color= 'black')


plt.title("Performance of the models (top 6 features)")
plt.xticks(rotation=45)
plt.ylabel('f1 score (macro)')
plt.xlabel('Model')
plt.show()

We see that performance improves when we use only the 6 top features. We should also note that we got very decent results even while using models with **default** hyperparameters.