# Breast cancer analysis using fastai tabular application

In [2]:
from fastai.tabular.all import *;

In [3]:
# Load the dataset and gather its column names.
df = pd.read_csv('breast_cancer_dataset.csv')
headers = df.columns.tolist()
# Drop the trailing target column so that `headers` keeps only the feature names;
# the removed name is echoed as the cell output.
headers.pop()

'label'

In [4]:
# Build DataLoaders straight from the CSV file.
# `y_block=CategoryBlock` is passed explicitly because the target is stored as the
# numbers 0/1; this keeps the cell consistent with the `TabularPandas` construction
# further down, which declares the same target as categorical.
dls = TabularDataLoaders.from_csv('breast_cancer_dataset.csv',
                                  y_names="label",
                                  y_block=CategoryBlock,
                                  cont_names=headers,
                                  procs=[Categorify, FillMissing, Normalize])

The last part is the list of pre-processors we apply to our data:

- `Categorify` is going to take every categorical variable and make a map from integer to unique categories, then replace the values by the corresponding index.
- `FillMissing` will fill the missing values in the continuous variables by the median of existing values (you can choose a specific value if you prefer).
- `Normalize` will normalize the continuous variables (subtract the mean and divide by the std).



In [5]:
# Randomly hold out 30% of the row indices of `df` for validation.
splitter = RandomSplitter(valid_pct=0.3)
splits = splitter(range_of(df))

**Note**: Since the labels are encoded (as 0 and 1), we explicitly pass `y_block = CategoryBlock` in the constructor so that `fastai` does not presume we are doing regression.

In [6]:
# Pre-processors applied to the dataframe, in this order.
tabular_procs = [Categorify, FillMissing, Normalize]

# Every feature column is continuous; the target `label` is declared categorical.
to = TabularPandas(
    df,
    procs=tabular_procs,
    cont_names=headers,
    y_names='label',
    y_block=CategoryBlock,
    splits=splits,
)

Once we build our `TabularPandas` object, our data is completely preprocessed as seen below:

In [7]:
# Peek at the first two rows of the processed feature table.
to.xs.head(2)

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension
476,0.054317,0.283434,0.049671,-0.072386,-0.537555,0.112433,-0.489844,-0.473545,-1.164711,-0.416673,...,0.067179,0.225478,0.169559,-0.064072,-0.766973,0.523337,-0.135695,0.281628,-0.608579,-0.331126
162,1.630762,-0.273073,1.671527,1.698362,1.097688,1.171846,1.999865,2.092934,0.749274,-0.315651,...,2.25349,0.08577,2.073932,2.472656,0.459447,0.78086,1.856568,1.653756,1.107457,0.396869


Now we can build our `DataLoaders` again:

In [8]:
# Wrap the processed data in DataLoaders using mini-batches of 32 rows,
# then preview one batch.
batch_size = 32
dls = to.dataloaders(bs=batch_size)
dls.show_batch()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,radius_error,texture_error,perimeter_error,area_error,smoothness_error,compactness_error,concavity_error,concave_points_error,symmetry_error,fractal_dimension_error,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,label
0,13.59,17.84,86.239998,572.299987,0.07948,0.04052,0.01997,0.01238,0.1573,0.0552,0.258,1.166,1.683,22.219999,0.003741,0.005274,0.01065,0.005044,0.01344,0.001126,15.5,26.1,98.910004,739.099977,0.105,0.07622,0.106,0.05185,0.2335,0.06263,1
1,15.75,19.219999,107.099998,758.599978,0.1243,0.2364,0.2914,0.1242,0.2375,0.07603,0.5204,1.324,3.477,51.220002,0.009329,0.06559,0.09953,0.02283,0.05543,0.00733,17.360001,24.17,119.400001,915.299988,0.155,0.5046,0.6872,0.2135,0.4245,0.105,0
2,17.469999,24.68,116.099999,984.599958,0.1049,0.1603,0.2159,0.1043,0.1538,0.06365,1.088,1.41,7.337,122.300003,0.006174,0.03634,0.04644,0.01569,0.01145,0.00512,23.14,32.330002,155.300004,1660.000009,0.1376,0.383,0.489,0.1721,0.216,0.093,0
3,14.22,27.850001,92.550003,623.900024,0.08223,0.1039,0.1103,0.04408,0.1342,0.06129,0.3354,2.324,2.105,29.959999,0.006307,0.02845,0.0385,0.01011,0.01185,0.003589,15.75,40.540002,102.5,764.000001,0.1081,0.2426,0.3064,0.08219,0.189,0.07796,1
4,18.049999,16.15,120.199998,1006.000013,0.1065,0.2146,0.1684,0.108,0.2152,0.06673,0.9806,0.5505,6.311,134.800006,0.00794,0.05839,0.04658,0.0207,0.02591,0.007054,22.39,18.91,150.100006,1610.000018,0.1478,0.5634,0.3786,0.2102,0.3751,0.1108,0
5,11.8,16.58,78.989998,432.000008,0.1091,0.17,0.1659,0.07415,0.2678,0.07371,0.3197,1.426,2.281,24.719999,0.005427,0.03633,0.04649,0.01843,0.05628,0.004635,13.74,26.379999,91.93,591.700007,0.1385,0.4092,0.4504,0.1865,0.5774,0.103,0
6,13.0,21.82,87.5,519.79999,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,0.3063,1.002,2.406,24.32,0.005731,0.03502,0.03553,0.01226,0.02143,0.003749,15.49,30.729999,106.199997,739.299986,0.1703,0.5401,0.539,0.206,0.4378,0.1072,0
7,12.72,13.78,81.779999,492.100001,0.09667,0.08393,0.01288,0.01924,0.1638,0.061,0.1807,0.6931,1.34,13.379999,0.006064,0.0118,0.006564,0.007978,0.01374,0.001392,13.5,17.479999,88.540002,553.700025,0.1298,0.1472,0.05233,0.06343,0.2369,0.06922,1
8,12.45,15.7,82.57,477.100011,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,0.3345,0.8902,2.217,27.19,0.00751,0.03345,0.03672,0.01137,0.02165,0.005082,15.47,23.75,103.400001,741.599973,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244,0
9,17.08,27.15,111.199997,930.900018,0.09898,0.111,0.1007,0.06431,0.1793,0.06281,0.9291,1.152,6.051,115.199999,0.00874,0.02219,0.02721,0.01458,0.02045,0.004417,22.959999,34.490002,152.100008,1648.0,0.16,0.2444,0.2639,0.1555,0.301,0.0906,0


In [9]:
# Create a tabular learner on the DataLoaders built above; accuracy is reported
# as the metric during training.
learn = tabular_learner(dls, metrics=accuracy)

We can train that model with the `fit_one_cycle` method.

In [10]:
import time

# Measure the training duration with a monotonic, high-resolution clock so the
# result cannot be skewed by system wall-clock adjustments. `training_time` is
# kept in nanoseconds, as the summary cell at the end converts it to seconds.
start_time = time.perf_counter_ns()
learn.fit_one_cycle(8)
training_time = time.perf_counter_ns() - start_time

epoch,train_loss,valid_loss,accuracy,time
0,0.614847,0.570006,0.870588,00:00
1,0.442061,0.215389,0.923529,00:00
2,0.315561,0.112957,0.958824,00:00
3,0.237764,0.095577,0.964706,00:00
4,0.206201,0.106556,0.964706,00:00
5,0.171593,0.102113,0.964706,00:00
6,0.15073,0.09534,0.964706,00:00
7,0.135169,0.096581,0.964706,00:00


We can then have a look at some predictions:

In [11]:
# Show a sample of rows with the actual target next to the model's prediction
# (the `label` and `label_pred` columns of the rendered table).
learn.show_results()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,radius_error,texture_error,perimeter_error,area_error,smoothness_error,compactness_error,concavity_error,concave_points_error,symmetry_error,fractal_dimension_error,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,label,label_pred
0,0.004596,0.451788,-0.061305,-0.116982,-0.437379,-0.672634,-0.937735,-0.777794,-1.1243,-0.657466,-0.541784,0.452327,-0.617886,-0.404157,0.127113,-0.652206,-0.787139,-0.292031,-0.930494,-0.343724,-0.171019,0.710444,-0.249828,-0.268311,-0.19034,-0.65991,-1.011261,-0.538475,-1.08517,-0.459658,1.0,1.0
1,-0.197213,-0.808536,-0.201084,-0.364935,0.794998,0.923131,0.33442,0.555194,0.437011,0.819123,-0.436963,-0.393129,-0.687501,-0.621576,-0.08302,0.570753,0.208926,0.312825,0.072436,0.276873,-0.128483,-0.428101,-0.29865,-0.450139,0.881163,1.364455,0.951752,0.839244,0.927962,0.896598,1.0,1.0
2,0.174232,-0.848286,0.061107,0.06586,-1.483819,-1.32385,-0.939601,-0.782769,-0.701828,-1.460111,-0.320925,-0.574546,-0.466526,-0.241152,-0.705649,-1.087885,-0.847734,-0.885078,-0.664505,-1.09458,0.069305,-0.659343,-0.085167,-0.04184,-1.378031,-1.156614,-1.080096,-0.863373,-0.614769,-1.484717,1.0,1.0
3,0.247351,-0.556003,0.154716,0.092023,-0.5426,-0.648142,-0.703404,-0.520937,-0.918575,-0.840137,-0.371982,-0.996754,-0.646892,-0.360005,-0.449605,-0.494873,-0.563148,-0.572755,-0.598871,-0.623695,0.037404,-0.85365,-0.127621,-0.156251,-0.487263,-0.63706,-0.734579,-0.537266,-0.622505,-0.880988,1.0,1.0
4,-0.337601,-0.308147,-0.340863,-0.405665,0.185296,-0.178811,-0.635866,-0.668086,-0.385892,-0.43328,-0.516255,-0.255981,-0.559346,-0.445206,-0.61312,-0.497105,-0.376183,-0.589033,-0.372031,-0.372202,-0.402836,-0.166348,-0.443601,-0.437125,-0.181734,-0.272082,-0.497714,-0.544066,-0.229472,-0.299126,1.0,1.0
5,-0.088997,-0.149145,-0.106204,-0.213309,0.172323,0.040287,-0.449794,-0.283717,-0.400587,0.737474,-0.308935,0.011371,-0.292489,-0.299624,-0.102444,-0.167932,-0.371586,0.263351,-0.231552,-0.039713,-0.217807,-0.036275,-0.214652,-0.292892,0.403505,-0.009618,-0.403153,0.198515,-0.121156,0.294469,1.0,1.0
6,-1.098038,-0.640181,-1.075337,-0.957166,-0.576472,-0.459802,-0.577532,-0.635881,-0.565902,0.558955,-1.100698,-0.380977,-0.975611,-0.756037,1.144579,0.296815,-0.21006,-0.14361,0.237097,0.752862,-1.128062,-0.612773,-1.074349,-0.915552,0.558421,-0.207238,-0.47361,-0.482562,-0.365641,0.533401,1.0,1.0
7,-0.665174,-1.082112,-0.63228,-0.644401,-1.137168,-0.156977,-0.283001,-0.583253,-1.190426,0.619845,-0.799,-1.042933,-0.787704,-0.619667,-0.016625,0.761004,0.220266,0.544235,0.217522,1.097556,-0.783526,-1.325769,-0.76595,-0.705167,-1.111231,0.130568,-0.053186,-0.146028,-0.500263,0.746732,1.0,1.0
8,-0.390247,1.057399,-0.362041,-0.427369,1.112101,0.404816,0.283425,0.525345,0.301085,0.387355,0.102613,-0.111889,0.04662,-0.060725,0.265553,0.053562,-0.09972,0.164404,-0.502147,-0.09371,0.203292,1.222709,0.160461,0.044013,2.23668,0.913636,0.565166,0.851333,0.70514,0.976598,0.0,0.0


Or use the predict method on a row:

In [12]:
# Run the model on a single row of the original dataframe (positional index 10).
sample = df.iloc[10]
row, clas, probs = learn.predict(sample)

In [13]:
# Render the row object returned by `learn.predict` as a table.
row.show()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,radius_error,texture_error,perimeter_error,area_error,smoothness_error,compactness_error,concavity_error,concave_points_error,symmetry_error,fractal_dimension_error,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,label
0,16.02,23.24,102.7,797.800002,0.08206,0.06669,0.03299,0.03323,0.1528,0.05697,0.3795,1.187,2.466,40.51,0.004029,0.009269,0.01101,0.007591,0.0146,0.003042,19.19,33.88,123.8,1149.99999,0.1181,0.1551,0.1459,0.09975,0.2948,0.08452,0


In [14]:
# Echo the other two values returned by `learn.predict`
# (presumably the predicted class and the per-class probabilities).
clas, probs

(tensor(0), tensor([0.7504, 0.2496]))

To get predictions on a new dataframe, you can use the `test_dl` method of the `DataLoaders`. That dataframe does not need to have the dependent variable among its columns.

In [15]:
# Feature-only version of the data: `drop` returns a new frame, so `df` itself
# keeps its `label` column for the comparison later on.
test_df = df.drop(columns=['label'])
dl = learn.dls.test_dl(test_df)

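Use `Learner.get_preds` to get the predictions and then compare them with the actual labels. Note that `df` also contains the rows the model was trained on, so the resulting figure is not a held-out accuracy.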

In [16]:
# Per-class probabilities for every row in `dl` (first element returned by get_preds).
class_probs = learn.get_preds(dl=dl)[0]

# Most probable class per row, computed in one vectorized call.
# NOTE(review): assumes the class indices coincide with the 0/1 values stored in
# `label` — confirm against the learner's vocab.
predicted_labels = class_probs.argmax(dim=1).cpu().numpy()

# Row-wise comparison with the true labels. `df` includes the training rows,
# so this is accuracy over the whole dataset, not a held-out estimate.
is_correct = predicted_labels == df['label'].to_numpy()
Overall_accuracy = int(is_correct.sum()) * 100 / len(df)

# `training_time` is in nanoseconds; report it in seconds.
print("Overall accuracy: {}, Training time: {}\n".format(round(Overall_accuracy, 2), round(training_time / 1000000000, 2)))

Overall accuracy: 98.07, Training time: 2.64

