In [None]:
import numpy as np
import seaborn as sns
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from plotnine import *
from plotnine.data import *
import warnings

warnings.filterwarnings('ignore')

plt.rcParams['figure.figsize'] = (12, 8)

In [None]:
# Load the dataset from its CSV file into a DataFrame
DATA_PATH = '../dataset/breast-cancer-wisconsin.data'
data = pd.read_csv(DATA_PATH)

In [None]:
# Drop the 'Unnamed: 32' and 'id' columns.
# The result is re-assigned instead of mutated in place, and labels that are
# already gone are ignored, so re-running this cell does not raise KeyError.
# NOTE(review): 'Unnamed: 32' is presumably an empty artefact column produced
# by a trailing delimiter in the file - confirm against the raw data.
data = data.drop(columns=['Unnamed: 32', 'id'], errors='ignore')

In [None]:
# Encode the diagnosis column as integers: 'M' -> 1, 'B' -> 0.
# The dtype guard makes the cell safe to re-run: mapping an already-encoded
# (numeric) column through the same dict would replace every value with NaN.
if not pd.api.types.is_numeric_dtype(data['diagnosis']):
    data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})

In [None]:
# Separate the target (diagnosis) from the feature columns
y = data['diagnosis']
X = data.drop(columns='diagnosis')

In [None]:
# Correlation between each feature of X and the target y, sorted in descending
# order (most positively correlated feature first)
corr = X.corrwith(y).sort_values(ascending=False)
ax = sns.barplot(x=corr, y=corr.index)
# Label the figure so it can be read without the surrounding code
ax.set(
    title='Correlation of each feature with diagnosis',
    xlabel='Correlation coefficient',
    ylabel='Feature',
)
plt.show()

### Skewness
Skewness is a measure of symmetry, or more precisely, the lack of symmetry. A distribution, or data set, is symmetric if it looks the same to the left and right of the center point.

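Kurtosis, a related shape measure, describes whether the data are heavy-tailed or light-tailed relative to a normal distribution.

The formula below defines the skewness $\tilde{\mu}_3$, i.e. the standardized third central moment: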

$$\tilde{\mu}_3 = \operatorname{E}\left[\left(\frac{X-\mu}{\sigma}\right)^3 \right]
             = \frac{\mu_3}{\sigma^3}
             = \frac{\operatorname{E}\left[(X-\mu)^3\right]}{( \operatorname{E}\left[ (X-\mu)^2 \right] )^{3/2}}
             = \frac{\kappa_3}{\kappa_2^{3/2}}
$$

where $\mu$ is the mean, $\sigma$ is the standard deviation, $\operatorname{E}$ is the expectation operator, $\mu_3$ is the third central moment, and $\kappa_t$ is the $t$-th cumulant.

In [None]:
# Report the direction of the skew of every column.
# Skewness is computed once for the whole frame and reused for both the
# printed report and the bar chart.
# Zero skewness only means the distribution is symmetric; it does not imply
# normality (use a normality test such as Shapiro-Wilk for that), so the
# message for that case says "symmetric" rather than "normally distributed".
skewness = data.skew()
for column, value in skewness.items():
    if value > 0:
        print(column, 'is right skewed')
    elif value < 0:
        print(column, 'is left skewed')
    else:
        print(column, 'is symmetric (zero skewness)')

# Skewness plot for the dataframe
ax = skewness.plot(kind='bar', figsize=(10, 5))
ax.set(title='Skewness per column', ylabel='Skewness')
plt.show()

### Shapiro

$$ W = {\left(\sum_{i=1}^n a_i x_{(i)}\right)^2 \over \sum_{i=1}^n (x_i-\overline{x})^2} $$

where:

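* $x_{(i)}$ (with parentheses enclosing the subscript index $i$; not to be confused with $x_i$) is the $i$-th order statistic, i.e. the $i$-th smallest value in the sample
* $a_i$ are constants computed from the expected values and the covariance matrix of the order statistics of a standard normal sample of size $n$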
* $\overline{x} = \left( x_1 + \cdots + x_n \right) / n$

In [None]:
# Check normality of each feature (Gaussian distribution): plot its
# distribution and run the Shapiro-Wilk test on it.
from scipy.stats import shapiro

# One panel per feature. zip() stops at the shorter sequence, so with this
# 5x6 grid only the first 30 columns of X are plotted and tested.
fig, ax = plt.subplots(5, 6, figsize=(20, 20))
for variable, subplot in zip(X.columns, ax.flatten()):
    # histplot(kde=True, stat='density') replaces the deprecated sns.distplot
    # (histogram on a density scale with a KDE curve on top).
    sns.histplot(X[variable], kde=True, stat='density', ax=subplot)
    stat, p = shapiro(X[variable])
    print(variable, 'Statistics=%.3f, p=%.3f' % (stat, p))
    # p > 0.05: the test does not reject normality at the 5% level
    if p > 0.05:
        print('Probably Gaussian')
    else:
        print('Probably not Gaussian')
plt.show()

In [None]:
# Scatter plot of the first column of X against its sixth column.
# The axis labels are taken from the plotted columns themselves so they can
# never disagree with the data: the previous hard-coded labels claimed
# "Mean radius" vs "Mean texture (standardized)", but the code plots columns
# 0 and 5 of X, and X is not standardized anywhere before this cell.
# NOTE(review): if texture was the intended y-axis, change the column index
# rather than the label.
plt.title('Dataset')
plt.scatter(X.iloc[:, 0], X.iloc[:, 5], color='blue', s=10., label='Data points')
plt.axis('tight')
plt.xlabel(X.columns[0])
plt.ylabel(X.columns[5])
plt.legend()
plt.show()

In [None]:
# Chebyshev's inequality for each feature of X
# Chebyshev's inequality: P(|X - μ| ≥ kσ) ≤ 1/k^2 for k > 1
# P(|X - μ| ≥ kσ) is the probability that the random variable X is at least k standard deviations away from the mean μ
# k is a constant greater than 1
# Equivalently, at least a fraction 1 - 1/k^2 of the values lie within
# k standard deviations of the mean (75% for k = 2).
k = 2
coverage = 1 - 1 / k ** 2
for i in X.columns:
    # Compute the moments once per feature and reuse them below
    col_mean, col_std = X[i].mean(), X[i].std()
    print(i, 'mean:', col_mean, 'std:', col_std)
    # State the bound as a probability: the interval is not a hard limit
    print('Chebyshev\'s inequality:', 'P(', col_mean - k * col_std, '<=', i, '<=', col_mean + k * col_std, ') >=', coverage)

In [None]:
# Estimate a confidence interval for the mean of each feature of X
# Confidence interval: x̄ ± z_{α/2} · s / sqrt(n)
# x̄ is the sample mean (the estimate of the population mean μ)
# s is the sample standard deviation (the estimate of the population standard deviation σ)
# n is the number of observations
# z_{α/2} is the standard normal critical value for the desired confidence level
# α is the significance level, i.e. 1 - confidence level (α = 0.05 gives a 95% interval)

def interval_confidence(data, alpha):
    """Return a normal-approximation confidence interval for the mean of ``data``.

    Parameters
    ----------
    data : pandas.Series (or any object exposing ``mean()`` and ``std()``)
        The observations.
    alpha : float
        Significance level, strictly between 0 and 1. The interval has
        confidence level ``1 - alpha`` (``alpha=0.05`` gives 95%).

    Returns
    -------
    tuple
        ``(lower, upper)`` bounds, ``mean -/+ z * std / sqrt(n)``.

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly between 0 and 1.
    """
    if not 0 < alpha < 1:
        raise ValueError('alpha must be strictly between 0 and 1')
    z = stats.norm.ppf(1 - alpha / 2)
    mean = data.mean()
    std = data.std()
    # Count only non-missing observations: pandas' mean()/std() skip NaN, so
    # using len(data) would overstate n and make the interval too narrow.
    n = pd.notna(data).sum()
    margin = z * std / np.sqrt(n)
    return mean - margin, mean + margin

# Print the mean, the standard deviation and the alpha = 0.05 interval of every feature of X
for name, column in X.items():
    print(name, 'mean:', column.mean(), 'std:', column.std())
    print('Interval confidence:', interval_confidence(column, 0.05))
    print()

In [None]:
# Pairwise correlation matrix of the features of X, drawn as an annotated heatmap
corr = X.corr()
heatmap_fig = plt.figure(figsize=(20, 20))
sns.heatmap(corr, annot=True, fmt='.2f', ax=heatmap_fig.gca())
plt.show()

In [None]:
# Split the columns of X into three groups: mean, se and worst.
# A column joins a group when its name contains the group token as a substring.
mean_features, se_features, worst_features = (
    [name for name in X.columns if token in name]
    for token in ('mean', 'se', 'worst')
)

NameError: name 'X' is not defined

In [None]:
# Pairwise plots of the mean features, coloured by diagnosis, with KDE curves on the diagonal
pairplot_options = dict(vars=mean_features, hue='diagnosis', diag_kind='kde')
sns.pairplot(data, **pairplot_options)
plt.show()

In [None]:
# Pairwise plots of the se features, coloured by diagnosis, with KDE curves on the diagonal
pairplot_options = dict(vars=se_features, hue='diagnosis', diag_kind='kde')
sns.pairplot(data, **pairplot_options)
plt.show()

In [None]:
# Pairwise plots of the worst features, coloured by diagnosis, with KDE curves on the diagonal
pairplot_options = dict(vars=worst_features, hue='diagnosis', diag_kind='kde')
sns.pairplot(data, **pairplot_options)
plt.show()

In [None]:
# One clustered, annotated correlation map per feature group (mean, se, worst)
for group_features in (mean_features, se_features, worst_features):
    group_corr = data[group_features].corr()
    sns.clustermap(group_corr, annot=True, fmt='.2f')
plt.show()

In [None]:
# Violin plot of every mean feature, split by diagnosis.
# The grid is sized from the number of features: a fixed 3x3 grid indexed with
# axes[i // 3, i % 3] raises IndexError as soon as the group has more than 9 features.
n_cols = 3
n_rows = max(1, -(-len(mean_features) // n_cols))  # ceiling division
# Height scales with the row count (10 for 3 rows); squeeze=False keeps axes 2-D
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 10 * n_rows / 3), squeeze=False)
panels = axes.flatten()
for panel, feature in zip(panels, mean_features):
    sns.violinplot(x='diagnosis', y=feature, data=data, ax=panel)
# Hide the panels left over when the feature count is not a multiple of n_cols
for panel in panels[len(mean_features):]:
    panel.set_visible(False)
plt.show()

In [None]:
# Violin plot of every se feature, split by diagnosis.
# The grid is sized from the number of features: a fixed 3x3 grid indexed with
# axes[i // 3, i % 3] raises IndexError as soon as the group has more than 9 features.
n_cols = 3
n_rows = max(1, -(-len(se_features) // n_cols))  # ceiling division
# Height scales with the row count (10 for 3 rows); squeeze=False keeps axes 2-D
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 10 * n_rows / 3), squeeze=False)
panels = axes.flatten()
for panel, feature in zip(panels, se_features):
    sns.violinplot(x='diagnosis', y=feature, data=data, ax=panel)
# Hide the panels left over when the feature count is not a multiple of n_cols
for panel in panels[len(se_features):]:
    panel.set_visible(False)
plt.show()

In [None]:
# Violin plot of every worst feature, split by diagnosis.
# The grid is sized from the number of features: a fixed 3x3 grid indexed with
# axes[i // 3, i % 3] raises IndexError as soon as the group has more than 9 features.
n_cols = 3
n_rows = max(1, -(-len(worst_features) // n_cols))  # ceiling division
# Height scales with the row count (10 for 3 rows); squeeze=False keeps axes 2-D
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 10 * n_rows / 3), squeeze=False)
panels = axes.flatten()
for panel, feature in zip(panels, worst_features):
    sns.violinplot(x='diagnosis', y=feature, data=data, ax=panel)
# Hide the panels left over when the feature count is not a multiple of n_cols
for panel in panels[len(worst_features):]:
    panel.set_visible(False)
plt.show()