In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the breast-cancer CSV from the relative input directory and show its (rows, columns).
csv_path = "../input/breast-cancer-wisconsin-data/data.csv"
data = pd.read_csv(csv_path)
data.shape

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
data.info()

In [None]:
# Preview the first 10 rows of the raw data.
data.head(10)

In [None]:
# List all column labels so the columns to drop can be identified.
data.columns

In [None]:
# Remove the two columns that are not used as features: 'Unnamed: 32' and 'id'.
# errors='ignore' keeps this cell idempotent: because `data` is rebound here,
# re-running the cell would otherwise raise KeyError for the already-dropped
# columns.
data = data.drop(columns=['Unnamed: 32', 'id'], errors='ignore')

In [None]:
# Preview the first 10 rows again to check the frame after the column drops above.
data.head(10)

In [None]:
# Show summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

In [None]:
# Separate the label from the features by column name rather than by position,
# so the result does not depend on 'diagnosis' happening to be the first column
# (which is only true once the 'id' column has been dropped).
target = data['diagnosis']
input_col = data.drop(columns=['diagnosis'])

In [None]:
# Histogram + KDE of every feature, one panel per column on a 9x4 grid.
# sns.distplot is deprecated (and removed in newer seaborn releases); histplot
# with kde=True and stat="density" draws the same density-normalised histogram
# with a KDE overlay.
fig = plt.figure(figsize=(12,18))
for i, feature_name in enumerate(input_col.columns):
    ax = fig.add_subplot(9, 4, i + 1)
    sns.histplot(input_col.iloc[:, i], kde=True, stat="density", ax=ax)
    ax.set_xlabel(feature_name)
plt.tight_layout()
plt.show()

In [None]:
# Box plot of every feature, one panel per column on a 9x4 grid.
fig = plt.figure(figsize=(12,18))
for i, feature_name in enumerate(input_col.columns):
    ax = fig.add_subplot(9, 4, i + 1)
    sns.boxplot(y=input_col.iloc[:, i], ax=ax)
    ax.set_xlabel(feature_name)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation heatmap of the numeric columns (lower triangle only).
plt.figure(figsize=(20,20))
# `data` still contains the string-valued 'diagnosis' column; pandas >= 2.0 no
# longer drops non-numeric columns silently in corr() and raises instead, so
# restrict the frame to numeric columns explicitly.
corr = data.select_dtypes(include="number").corr()
# Boolean mask hiding the upper triangle (including the diagonal), which only
# mirrors the lower triangle.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(data=corr, annot=True, cmap="YlGnBu", mask=mask);

In [None]:
# Features removed from the model inputs.
# NOTE(review): presumably chosen because they are highly correlated with other
# features in the heatmap above — confirm the selection criterion.
attributes_to_drop = ['perimeter_mean', 'area_mean', 'radius_worst', 'perimeter_worst', 'area_worst', 'concave points_mean', 'perimeter_se', 'radius_se']
# errors='ignore' keeps the cell idempotent: `input_col` is rebound here, so a
# second run would otherwise raise KeyError for the already-removed columns.
input_col = input_col.drop(columns=attributes_to_drop, errors='ignore')

In [None]:
# Show the feature columns that remain after the drop above.
input_col.columns

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_inputs = scaler.fit_transform(input_col)
scaled_inputs

In [None]:
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(scaled_inputs, target, test_size=0.25, random_state=42)

In [None]:
# Report the shape of each split: train/test inputs, then train/test targets.
for split in (train_input, test_input, train_target, test_target):
    print(split.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
# Train a logistic-regression classifier on the training split (fit returns the
# estimator itself) and predict labels for the test split.
lr = LogisticRegression(random_state=42).fit(train_input, train_target)
preds = lr.predict(test_input)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
# Fraction of test rows the logistic regression labelled correctly.
accuracy_score(y_true=test_target, y_pred=preds)

In [None]:
# Confusion matrix for the logistic regression, normalised over the true labels (each row sums to 1).
confusion_matrix(y_true=test_target, y_pred=preds, normalize='true')

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Train a decision tree on the training split (fit returns the estimator
# itself) and predict labels for the test split.
dc = DecisionTreeClassifier(random_state=42).fit(train_input, train_target)
dc_preds = dc.predict(test_input)

In [None]:
# Fraction of test rows the decision tree labelled correctly.
accuracy_score(y_true=test_target, y_pred=dc_preds)

In [None]:
# Confusion matrix for the decision tree, normalised over the true labels (each row sums to 1).
confusion_matrix(y_true=test_target, y_pred=dc_preds, normalize='true')

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Train a random forest on the training split (fit returns the estimator
# itself) and predict labels for the test split.
rc = RandomForestClassifier(random_state=42).fit(train_input, train_target)
rc_preds = rc.predict(test_input)

In [None]:
# Fraction of test rows the random forest labelled correctly.
accuracy_score(y_true=test_target, y_pred=rc_preds)

In [None]:
# Confusion matrix for the random forest, normalised over the true labels (each row sums to 1).
confusion_matrix(y_true=test_target, y_pred=rc_preds, normalize='true')

In [None]:
# Write the logistic-regression test predictions (`preds`) to submission.csv.
# 'Id' holds the index labels of the test rows, not the original 'id' column,
# which was dropped earlier.
submission_columns = {'Id': test_target.index, 'target': preds}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)