In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

def metricas(y_train, y_pred):
    """Compute accuracy, F1, precision and recall for a set of predictions.

    Args:
        y_train: Reference labels the predictions are compared against.
            Despite the name, the calls in this notebook pass the
            test-split labels.
        y_pred: Predicted labels, aligned with ``y_train``.

    Returns:
        dict: Scores keyed by 'acc', 'f1', 'prec' and 'rec', each computed
        by the corresponding scikit-learn scorer with its default settings.
    """
    scorers = (
        ('acc', accuracy_score),
        ('f1', f1_score),
        ('prec', precision_score),
        ('rec', recall_score),
    )
    return {key: scorer(y_train, y_pred) for key, scorer in scorers}

# Load the breast-cancer-wisconsin CSV from the Kaggle input directory.
csv_path = '/kaggle/input/breast-cancer-wisconsin-data/data.csv'
data = pd.read_csv(csv_path)

In [None]:
# Show the column labels of the loaded table.
data.columns

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Drop the 'Unnamed: 32' column. errors='ignore' keeps this cell re-runnable:
# once the column is gone, a second execution would otherwise raise KeyError.
data = data.drop(columns=['Unnamed: 32'], errors='ignore')

# The text 'diagnosis' column is still part of the frame at this point, so the
# standard deviation is restricted to numeric columns. Since pandas 2.0 the
# reduction no longer skips non-numeric columns by default and would raise.
data.std(numeric_only=True)

In [None]:
# Summary statistics, transposed so that each column of `data` becomes a row.
data.describe().T

In [None]:
# Set the target labels aside, then remove the label and 'id' columns from
# the table that is used as model input from here on.
diagnosis = data['diagnosis']
data = data.drop(columns=['diagnosis', 'id'])

# Box plot of every remaining column on a wide figure, with the x tick
# labels rotated to vertical.
boxplot_ax = plt.figure(figsize=(16, 4)).gca()
data.boxplot(ax=boxplot_ax)
plt.xticks(rotation=90)

In [None]:
# Replace every value with log(1 + value).
# NOTE(review): this rebinds `data`, so running the cell twice applies the
# transform twice.
data = data.apply(np.log1p)

# Same box plot as before, now on the log-transformed columns.
boxplot_ax = plt.figure(figsize=(16, 4)).gca()
data.boxplot(ax=boxplot_ax)
plt.xticks(rotation=90)


In [None]:
def _center_and_range_scale(column):
    """Subtract the column mean, then divide by the column range (max - min)."""
    spread = column.max() - column.min()
    return (column - column.mean()) / spread


# NOTE(review): the mean and range are computed over all rows, before the
# train/test split performed later in the notebook, so the held-out rows
# influence the scaling.
data = data.apply(_center_and_range_scale)

# Box plot of the rescaled columns.
boxplot_ax = plt.figure(figsize=(16, 4)).gca()
data.boxplot(ax=boxplot_ax)
plt.xticks(rotation=90)



In [None]:
# Encode the text labels as integers ('M' -> 0, 'B' -> 1).
# Series.map with an explicit integer cast replaces Series.replace, which
# relied on silently downcasting the object-dtype result (deprecated since
# pandas 2.2) and would otherwise leave the labels as an object column.
# Any label other than 'M' or 'B' maps to NaN and makes the cast raise,
# instead of slipping through as a string.
# The numeric-dtype guard keeps the cell safe to re-run once the labels have
# already been encoded.
if not pd.api.types.is_numeric_dtype(diagnosis):
    diagnosis = diagnosis.map({'M': 0, 'B': 1}).astype('int64')

print('Quantidade B: ', (diagnosis > 0).sum())
print('Quantidade M: ', (diagnosis < 1).sum())

In [None]:
# Project the features onto two dimensions with t-SNE. The fixed random_state
# makes the embedding (and therefore the figure) repeatable across runs.
X_Tsne = TSNE(n_components=2, random_state=42).fit_transform(data)

plt.figure(figsize=(6, 6))
colors = ['g', 'r']
target_ids = [0, 1]
class_names = ['M', 'B']

# One scatter call per class so each class gets its own colour and legend entry.
for i, c, label in zip(target_ids, colors, class_names):
    # Plain boolean array, so the NumPy indexing does not depend on the
    # pandas index of `diagnosis`.
    class_mask = (diagnosis == i).to_numpy()
    plt.scatter(X_Tsne[class_mask, 0], X_Tsne[class_mask, 1], c=c, label=label)

plt.legend()
plt.show()

In [None]:
# Hold out 20% of the rows for evaluation.
# stratify keeps the class proportions of `diagnosis` the same in both splits;
# random_state makes the split, and every metric computed from it, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    diagnosis,
    train_size=0.8,
    stratify=diagnosis,
    random_state=42,
)

In [None]:
# Random forest with default hyper-parameters. The forest is built with
# randomness, so the seed is fixed to make the reported metrics repeatable.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Score the predictions on the held-out split.
y_pred = clf.predict(X_test)
metricas(y_test, y_pred)

In [None]:
# Fit a support-vector classifier (gamma='auto') on the training split;
# fit() returns the estimator itself, so construction and fitting are chained.
clf = SVC(gamma='auto').fit(X_train, y_train)

# Score the predictions on the held-out split.
y_pred = clf.predict(X_test)
metricas(y_test, y_pred)

In [None]:
# Multi-layer perceptron trained with Adam for at most 300 iterations.
# Weight initialisation and batch shuffling are random, so the seed is fixed
# to make the reported metrics repeatable.
clf = MLPClassifier(max_iter=300, solver='adam', random_state=42)
clf.fit(X_train, y_train)

# Score the predictions on the held-out split.
y_pred = clf.predict(X_test)
metricas(y_test, y_pred)