# We use Machine Learning to predict whether a patient presents a benign or a malignant tumor

![image of hospital care](https://media.consumeraffairs.com/files/news/Hospital-care.jpg)

In [16]:
#Importing the required libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing, neighbors, model_selection
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score


# Load the Breast Cancer Wisconsin data into a DataFrame.
# The path is relative, so the file must sit next to this notebook.
df = pd.read_csv(filepath_or_buffer='breast-cancer-wisconsin.data.txt')

# Preview the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [17]:
# Explore the dataset: df.info() lists every column with its non-null count and dtype.
# A column reported as `object` (rather than int64) likely holds non-numeric entries
# that must be cleaned before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
id                        699 non-null int64
clump_thickness           699 non-null int64
unif_cell_size            699 non-null int64
unif_cell_shape           699 non-null int64
marg_adhesion             699 non-null int64
single_epith_cell_size    699 non-null int64
bare_nuclei               699 non-null object
bland_chrom               699 non-null int64
norm_nucleoli             699 non-null int64
mitoses                   699 non-null int64
class                     699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [18]:
# Missing values in this dataset are marked with "?" (we learn this from external sources).
# We replace them with a large negative sentinel (-9999) so that the affected rows sit far
# away from every real measurement, which is scored on a scale from 1 to 10.
df.replace('?', -9999, inplace=True)

# The "?" markers caused pandas to read 'bare_nuclei' as text (dtype object). With the
# markers gone, cast the column to numbers so the model receives numeric features
# instead of a mix of strings and integers.
df['bare_nuclei'] = pd.to_numeric(df['bare_nuclei'])

In [19]:
# Drop 'id': it is only a record identifier and carries no information about the tumor,
# so leaving it in would distort the distance computations of the classifier.
# `axis` is passed by keyword because recent pandas releases no longer accept it positionally.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df.drop(['id'], axis=1, inplace=True, errors='ignore')


In [20]:
# Verify that 'id' was dropped: the preview should now start at 'clump_thickness'.
df.head()

Unnamed: 0,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


### NOTE: from external sources we learn that in this dataset the column "class", which is our response column, holds a "2" for benign tumors and a "4" for malignant ones
All other columns are features, each scored on a scale from 1 to 10

In [21]:
# Now we are ready to implement the predictive model.
# X holds the features (every column except the response 'class'); y holds the response.
# `axis` is passed by keyword because recent pandas releases no longer accept it positionally.
X = np.array(df.drop(['class'], axis=1))
y = np.array(df['class'])

In [22]:
#we implement train_test_split, holding out 20% of the rows for testing
#random_state pins the shuffle so that re-running the notebook reproduces
#the same split and therefore the same accuracy
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42
)

#instantiate the classifier (k-nearest neighbours with scikit-learn's default settings)
clf = neighbors.KNeighborsClassifier()

#and fit the data to the model
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [23]:
# Score the fitted classifier on the held-out test rows and report the result.
accuracy = clf.score(X=X_test, y=y_test)
print('Accuracy for this model is', accuracy)

Accuracy for this model is 0.95


## Now that we have our model set and ready, we proceed to make some predictions with new data from patients

In [24]:
# Latest measurements obtained from two new patients:
# one row per patient, one value per feature column (nine features each).
example_measures = np.array([
    [4, 2, 1, 1, 1, 2, 3, 2, 1],
    [4, 2, 1, 2, 2, 2, 3, 2, 1],
])

# IMPORTANT: the model takes a 2-D input with one row per patient.
# A single patient given as a flat list, e.g. [4,2,1,1,1,2,3,2,1], would be reshaped to (1, -1).
# For several patients we keep one row each and let -1 infer the number of features.
example_measures = example_measures.reshape(example_measures.shape[0], -1)

In [25]:
# Finally, ask the fitted classifier for the class of each new patient
# and print the predicted labels (one label per input row).
prediction = clf.predict(X=example_measures)
print('The prediction for these patients is', prediction)

The prediction for these patients is [2 2]


# Since we have seen above that a "2" means the tumor is benign, we conclude this analysis by predicting that neither of the two new patients has a malignant tumor. The model reached an accuracy of 95% on the held-out test set in the run shown above