<a href="https://colab.research.google.com/github/santilema/data-sc_course/blob/main/randomForest_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

## Random forest

In [2]:
df = pd.read_csv('breast_cancer.csv')

In [3]:
#In column diagnosis, I replace 'M' for 0 and 'B' for 1
df['diagnosis'] = df['diagnosis'].map({'B': 1, 'M': 0})

In [4]:
#Delete unnamed column with NaN values
df = df.drop('Unnamed: 32', axis = 1)

In [5]:
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,0,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,0,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,0,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,0,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    int64  
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [7]:
x = df.drop('diagnosis', axis = 1) #Drop target variable to train the model
y = df.diagnosis

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    stratify = y,
                                                    test_size = 0.30, 
                                                    random_state = 11)

In [9]:
tree = DecisionTreeClassifier(random_state = 11)
tree.fit(x_train, y_train)

DecisionTreeClassifier(random_state=11)

In [10]:
y_test_pred = tree.predict(x_test)

### Accuracy of TreeClassifier

In [11]:
test_accuracy = accuracy_score(y_test, y_test_pred)

print('% of right guesses over evaluation set:',test_accuracy)

% of right guesses over evaluation set: 0.9122807017543859


In [12]:
#Create the random forest!
model = RandomForestClassifier(random_state=11, n_estimators=200,
                               class_weight="balanced", max_features="log2")
model.fit(x_train, y_train)

RandomForestClassifier(class_weight='balanced', max_features='log2',
                       n_estimators=200, random_state=11)

In [13]:
y_test_pred = model.predict(x_test)

### Accuracy gets better using random forest:

In [14]:
test_accuracy = accuracy_score(y_test, y_test_pred)

print('% of right guesses over evaluation set:',test_accuracy)

% of right guesses over evaluation set: 0.935672514619883


## KNN

I isolate one row of the df to evaluate it with KNN algorithm as if it were a new entry

In [23]:
isolated = df.iloc[[98]]
new_entry = isolated.drop('diagnosis', axis = 1)

In [24]:
isolated

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
98,862485,1,11.6,12.84,74.34,412.6,0.08983,0.07525,0.04196,0.0335,...,13.06,17.16,82.96,512.5,0.1431,0.1851,0.1922,0.08449,0.2772,0.08756


Once isolated the entry with id = 98, I remove the target variable as it will be predicted by the KNN algorithm later

In [25]:
new_entry

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
98,862485,11.6,12.84,74.34,412.6,0.08983,0.07525,0.04196,0.0335,0.162,...,13.06,17.16,82.96,512.5,0.1431,0.1851,0.1922,0.08449,0.2772,0.08756


In [26]:
new_entry.columns

Index(['id', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

I also remove the row from the dataframe

In [27]:
df_wo98 = df.drop(98)

In [28]:
knn = KNeighborsClassifier(n_neighbors=3)
n = df_wo98[['id', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst']]
m = df_wo98['diagnosis']

knn.fit(n, m)

KNeighborsClassifier(n_neighbors=3)

In [29]:
prediction = knn.predict(new_entry)
print(prediction)

[1]


KNN predicts that the diagnosis of this tumor would be 1, which we know is true as...

In [30]:
isolated['diagnosis']

98    1
Name: diagnosis, dtype: int64