In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style()

In [None]:
# Load the dataset from "breast_cancer.csv" (path is relative to the working directory).
df = pd.read_csv("breast_cancer.csv")

In [None]:
# Show the first rows of the raw frame as a quick sanity check of the loaded columns.
df.head()

##  Feature analysis

In [None]:
# Separate the label column from the feature matrix.
# `features_drop` lists every column that must not end up in `data`:
# the row identifier, the label itself, and the "Unnamed: 32" column.
target = df["diagnosis"]
features_drop = ["id", "diagnosis", "Unnamed: 32"]
data = df.drop(columns=features_drop)

In [None]:
# Class balance: number of samples per diagnosis label.
# The column is named through `x=` together with `data=`; newer seaborn
# releases accept only `data` as a positional argument, so passing the
# Series positionally no longer plots the intended variable.
sns.countplot(x="diagnosis", data=df)
plt.title("Samples per diagnosis class")
plt.show()

### Correlation 

In [None]:
# Pairwise correlation between all feature columns, rendered as a heatmap.
corr_matrix = data.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, linewidths=0.5)
plt.show()

In [None]:
# Pairwise scatter plots (with per-feature distributions on the diagonal)
# for the first ten feature columns.
# scatter_matrix creates its own figure sized by `figsize`, so no plt.figure()
# call precedes it: that would only leave an additional empty figure behind.
pd.plotting.scatter_matrix(data.iloc[:, 0:10], figsize=(20, 20))
plt.show()

In [None]:
# Summary statistics for every feature column.
data.describe()

In [None]:
# Standardize each feature column: subtract its mean, then divide by its
# standard deviation, so all features share a comparable scale for plotting.
column_means = data.mean()
column_stds = data.std()
data_std = data.sub(column_means, axis="columns").div(column_stds, axis="columns")

In [None]:
# One violin per standardized feature column, all on a single wide canvas.
plt.figure(figsize = (50,20))
sns.violinplot(data=data_std)
plt.show()

In [None]:
# Split violin plot of standardized feature columns 0-9, separated by diagnosis.
# The wide frame is melted to long form (one row per sample/feature pair) so
# seaborn can group on "features" and split each violin on "diagnosis".
data_l = pd.concat([target, data_std.iloc[:, 0:10]], axis=1)
data_r = data_l.melt(id_vars="diagnosis", var_name="features", value_name="value")

plt.figure(figsize=(50, 20))
sns.violinplot(data=data_r, x="features", y="value", hue="diagnosis", split=True)
plt.show()

In [None]:

# Split violin plot of standardized feature columns 10-19, separated by diagnosis.
# The slice is built explicitly in this cell so that it does not silently
# reuse whatever `data_r` was left behind by the preceding cell (which would
# just draw that cell's figure a second time).
data_l = pd.concat([target, data_std.iloc[:, 10:20]], axis=1)
data_r = pd.melt(data_l, id_vars="diagnosis",
                 var_name="features",
                 value_name="value")

plt.figure(figsize=(50, 20))
sns.violinplot(x="features", y="value", hue="diagnosis", data=data_r, split=True)
plt.show()

In [None]:
# Split violin plot of standardized feature columns 20-29, separated by diagnosis.
# The slice starts at 20 (not 21): iloc's start index is inclusive, so
# beginning at 21 would skip column 20 entirely.
data_l = pd.concat([target, data_std.iloc[:, 20:30]], axis=1)
data_r = pd.melt(data_l, id_vars="diagnosis",
                 var_name="features",
                 value_name="value")

plt.figure(figsize=(50, 20))
sns.violinplot(x="features", y="value", hue="diagnosis", data=data_r, split=True)
plt.show()


### PCA-1

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project the features onto a single principal component and compare the
# distribution of that component between the two diagnosis classes.
# NOTE(review): PCA is fit on `data`, not on the standardized `data_std`, so
# features with large numeric ranges will dominate the component -- confirm
# this is intended.
pca = PCA(n_components=1)
proj_data = pca.fit_transform(data)
df_proj = pd.DataFrame(
    {
        "PCA_1_x1": proj_data[:, 0],
        "diagnosis": target.tolist(),
    }
)

# Long form: one row per (sample, component) pair, as expected by violinplot.
data_proj = df_proj.melt(id_vars="diagnosis", var_name="features", value_name="value")

plt.figure(figsize=(10, 7))
sns.violinplot(data=data_proj, x="features", y="value", hue="diagnosis", split=True)
plt.show()

### PCA-2

In [None]:
# Project the features onto the first two principal components.
# `df_proj` holds one column per component ("x1", "x2") plus the diagnosis
# label; the label column is set once, directly in the constructor.
pca = PCA(n_components=2)
proj_data = pca.fit_transform(data)
df_proj = pd.DataFrame(
    {
        "x1": proj_data[:, 0],
        "x2": proj_data[:, 1],
        "diagnosis": target.tolist(),
    }
)

In [None]:
# Distribution of each principal component ("x1", "x2"), split by diagnosis.
# Melting turns the component columns into a single "features"/"value" pair.
data_proj = df_proj.melt(id_vars="diagnosis", var_name="features", value_name="value")

plt.figure(figsize=(25, 10))
sns.violinplot(data=data_proj, x="features", y="value", hue="diagnosis", split=True)
plt.show()

In [None]:
# Joint distribution of the two principal components with a regression fit.
# x and y are given by column name together with `data=`; newer seaborn
# releases reject x/y passed as positional arguments.
# jointplot builds its own figure, so no plt.figure() call is needed.
sns.jointplot(x="x1", y="x2", data=df_proj, kind="reg")
plt.show()

In [None]:
# 2-D scatter of the two principal components, one colour per diagnosis class.
fig = plt.figure(figsize=(6, 6))
df_proj_M = df_proj[df_proj["diagnosis"] == "M"]
df_proj_B = df_proj[df_proj["diagnosis"] == "B"]

# Each class gets a label and the axes are named so the figure can be read
# on its own, without looking up which colour belongs to which class.
plt.scatter(df_proj_M["x1"], df_proj_M["x2"], c="green", label="M")
plt.scatter(df_proj_B["x1"], df_proj_B["x2"], c="red", label="B")
plt.xlabel("x1")
plt.ylabel("x2")
plt.legend(title="diagnosis")
plt.show()


### PCA-3


In [None]:
# Project the features onto the first three principal components.
# `df_proj` holds one column per component ("x1", "x2", "x3") plus the
# diagnosis label; the label column is set once, directly in the constructor.
pca = PCA(n_components=3)
proj_data = pca.fit_transform(data)
df_proj = pd.DataFrame(
    {
        "x1": proj_data[:, 0],
        "x2": proj_data[:, 1],
        "x3": proj_data[:, 2],
        "diagnosis": target.tolist(),
    }
)


In [None]:
# 3-D scatter of the three principal components, one colour per diagnosis class.
# Axes3D is not referenced by name; the import is kept because older
# matplotlib versions need it to register the "3d" projection.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')

df_proj_M = df_proj[df_proj["diagnosis"] == "M"]
df_proj_B = df_proj[df_proj["diagnosis"] == "B"]

# Each class gets a label and the axes are named so the figure can be read
# on its own, without looking up which colour belongs to which class.
ax.scatter(df_proj_M["x1"], df_proj_M["x2"], df_proj_M["x3"], c="green", label="M")
ax.scatter(df_proj_B["x1"], df_proj_B["x2"], df_proj_B["x3"], c="purple", label="B")
ax.set_xlabel("x1")
ax.set_ylabel("x2")
ax.set_zlabel("x3")
ax.legend(title="diagnosis")
plt.show()

# Topological Data Analysis

### Mapper (filter = PCA-1)

In [None]:
# Prepare data for mapper: a plain list with one list of feature values per row.
# `.values.tolist()` converts the whole frame at once instead of building a
# Series for every row through `iloc`.
data_mapper = data.values.tolist()

# `mapper` is a project-local module; the meaning of its parameters is defined
# there. This run uses the "PCA" lens with DBSCAN clustering and writes the
# result out as JSON.
import mapper as mp
out = mp.Mapper(
    lens="PCA",
    clusterer="DBSCAN",
    n_rcover=[100, 2],
    clusterer_params=(0.1, 5),
)
out.write_to_json(data_mapper)

![Mapper graph with PCA filter](Breast_Cancer_TDA/breast_cancer_kaggle_mapper_PCA_100_2.png "mapper_PCA_100_2")


### Mapper (filter = eccentricity)

In [None]:
# Same Mapper configuration as the PCA run, but with the "eccentricity" lens.
# Relies on `mp` and `data_mapper` from the preceding Mapper cell.
out = mp.Mapper(
    lens="eccentricity",
    clusterer="DBSCAN",
    n_rcover=[100, 2],
    clusterer_params=(0.1, 5),
)
out.write_to_json(data_mapper)

![Mapper graph with eccentricity filter](Breast_Cancer_TDA/breast_cancer_kaggle_eccentricity_100_2.png "mapper_eccentricity")


## k-nerve 

In [None]:
# prepare data for k-nerve

# Feature rows as a plain list of lists; `.values.tolist()` converts the whole
# frame at once instead of building a Series for every row through `iloc`.
data_kn = data.values.tolist()

# Binary labels for k-nerve: 1 for "M", 0 for every other diagnosis value.
labels = target.tolist()
labels_kn = [1 if label == "M" else 0 for label in labels]

### 1-nerve

In [None]:
# 1-nerve: a single component, covering size 50, overlap 1.5.
# `k_nerve` is a project-local module; parameter semantics are defined there.
import k_nerve as kn

KN = kn.k_Nerve(
    n_components=1,
    covering_size=50,
    overlap=1.5,
)
KN.draw(data_kn, labels_kn)

<img src="Breast_Cancer_TDA/breast_cancer_1_50_150.png" alt="1-nerve" title="1-nerve" width="100"> ![2-nerve at 10x10](Breast_Cancer_TDA/breast_cancer_2_100_06.png "2-nerve at resolution 10x10")


### 2-nerve at increasing resolutions

In [None]:
# 2-nerve with covering size 100 (the 10x10 resolution) and overlap 0.6.
import k_nerve as kn

KN = kn.k_Nerve(
    n_components=2,
    covering_size=100,
    overlap=0.6,
)
KN.draw(data_kn, labels_kn)

![2-nerve at 10x10](Breast_Cancer_TDA/breast_cancer_2_100_06.png "2-nerve at resolution 10x10") 


In [None]:
# 2-nerve with covering size 225 (the 15x15 resolution) and overlap 0.65.
import k_nerve as kn

KN = kn.k_Nerve(
    n_components=2,
    covering_size=225,
    overlap=0.65,
)
KN.draw(data_kn, labels_kn)

![2-nerve at 15x15](Breast_Cancer_TDA/breast_cancer_2_225_065.png "2-nerve at resolution 15x15")


In [None]:
# 2-nerve with covering size 400 (the 20x20 resolution) and overlap 0.75.
import k_nerve as kn

KN = kn.k_Nerve(
    n_components=2,
    covering_size=400,
    overlap=0.75,
)
KN.draw(data_kn, labels_kn)

![2-nerve at 20x20](Breast_Cancer_TDA/breast_cancer_2_400_075.png "2-nerve at resolution 20x20")


### 3-nerve

In [None]:
# 3-nerve: three components, covering size 100, overlap 0.25.
import k_nerve as kn

KN = kn.k_Nerve(
    n_components=3,
    covering_size=100,
    overlap=0.25,
)
KN.draw(data_kn, labels_kn)

![3-nerve](Breast_Cancer_TDA/breast_cancer_kaggle_3_100_025.png "3-nerve")