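For a step-by-step explanation, read the code comments. In short, this notebook loads the Breast Cancer Wisconsin CSV and predicts the `diagnosis` column with roughly 97% accuracy using a neural network (fastai tabular learner).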

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory, then print one per line
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install fastai
from fastai.tabular.all import *

In [None]:
# Open CSV
df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')

# Randomise dataset.
# A fixed random_state makes the shuffle identical on every "Restart & Run All",
# which in turn keeps the positional train/validation split further down stable.
# Change the value to draw a different shuffle.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Show the dataset
df.head()

In [None]:
# Split into TEST and TRAIN set.
# Every row is assigned to the training set with probability 0.9, so the split is
# approximately (not exactly) 90/10. A dedicated, seeded generator is used instead
# of the unseeded global np.random state so that the same rows end up in the
# hold-out set on every run; change the seed to draw a different split.
split_rng = np.random.default_rng(42)
msk = split_rng.random(len(df)) < 0.9
df_train = df[msk]
df_test = df[~msk]

print('test set', len(df_test))
print('train set', len(df_train))

In [None]:
# Split df_train into 80% for training and 20% for validation.
# The split is positional: every row position from `valid_start` onwards is
# handed to IndexSplitter as a validation index.
n_rows = len(df_train)
valid_start = int(n_rows * .8)
splits = IndexSplitter(list(range(valid_start, n_rows)))(range_of(df_train))

# Load training data into dataloader for fastai
dep_var = 'diagnosis' # this is what we will predict/optimise for
cat_names = [] # no category columns

# Everything in this dataset is continuous (float). The 30 column names are
# "<measurement>_<statistic>", so they are generated here instead of being split
# out of a tab-separated string: literal tab characters are invisible in the
# source and silently break the split if an editor converts them to spaces
# (and "concave points" contains a space, so whitespace splitting is not an option).
measurements = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry', 'fractal_dimension',
]
statistics = ['mean', 'se', 'worst']
# Statistic is the outer loop so the order is all *_mean, then *_se, then *_worst.
cont_names = [f'{measurement}_{statistic}' for statistic in statistics for measurement in measurements]
print('cont_names', cont_names)

# Fail early with a readable message if the CSV does not have the expected columns.
missing_columns = [col for col in [dep_var, *cont_names] if col not in df_train.columns]
assert not missing_columns, f'columns missing from df_train: {missing_columns}'

procs = [Categorify, FillMissing, Normalize]

to = TabularPandas(df_train, procs, cat_names, cont_names, y_names=dep_var, splits=splits)
dls = to.dataloaders(bs=64)

# Show data inside dataloader after transforms above
dls.show_batch()

In [None]:
import inspect

# Define learner
learn = tabular_learner(dls, layers=[200,100], metrics=accuracy)

# Find optimum learning rate (the steepest point).
# Newer fastai releases take a `suggest_funcs` argument and, by default, return a
# single "valley" suggestion, so unpacking a bare lr_find() into two names fails
# there. Ask for the two suggestions this notebook uses explicitly when the
# argument is supported; otherwise fall back to the older call, which already
# returns (lr_min, lr_steep).
lr_find_kwargs = {}
if 'suggest_funcs' in inspect.signature(learn.lr_find).parameters:
    from fastai.callback.schedule import minimum, steep
    lr_find_kwargs['suggest_funcs'] = (minimum, steep)

lr_min, lr_steep = learn.lr_find(**lr_find_kwargs)
print(f"Minimum/10: {lr_min:.2e}, steepest point: {lr_steep:.2e}")

In [None]:
# Sanity check: print the vocabulary the tabular dataloaders settled on for the predictions/labels
label_vocab = learn.dls.vocab
print('learn.dls.vocab', label_vocab)

In [None]:
# Train (fit) with the steepest-point learning rate, rounded to three significant figures first
steepest_lr = float(format(lr_steep, ".2e"))
learn.fit(1, steepest_lr)

### Test on Data the model has never seen

This uses `df_test`, the hold-out set we split off further up and never passed to the learner for training.

In [None]:
# How many are we testing on?
print('test set size', len(df_test))
print('training size', len(df_train))

# Define a dataloader that is for testing using df_test
# (with_labels=True so the targets are available for scoring)
test_dl = dls.test_dl(df_test, with_labels=True)

# Get the predictions for this test set
preds, tgt = learn.get_preds(dl=test_dl)

# What was the accuracy of these predictions?
# accuracy() returns a tensor holding a fraction; .item() turns it into a Python
# float, reported here as a percentage with two decimals. The original message
# stopped at "accurate on "; the sentence is now completed.
test_accuracy = accuracy(preds, tgt).item()
print(f"{test_accuracy * 100:.2f}% accurate on {len(df_test)} unseen test samples")