### Import datasets and basic libraries

In [2]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('./BreastCancer/dataset/breast-cancer-diagnostic.shuf.lrn.csv')
test_data = pd.read_csv('./BreastCancer/dataset/breast-cancer-diagnostic.shuf.tes.csv')
test_data_class = pd.read_csv('./BreastCancer/dataset/breast-cancer-diagnostic.shuf.sol.ex.csv')

## 1. KNN

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Splitting the dataset into attributes (X) and classes (Y)
X_dataset = dataset.drop('class', axis=1)
Y_classes = dataset['class'].astype(int)

# Defining variables for valiation
X_testing = test_data
Y_validation = test_data_class['class'].astype(int)


# Preprocessing the data by scaling it
scaler = StandardScaler()
X_train = scaler.fit_transform(X_dataset)
X_validation = scaler.transform(X_testing)

# Finding the best K value
# Creating a list of K values
k_values = list(range(1, 20))

# Creating a list of cross validation scores
cv_scores = []

for k in k_values:
    # Creating the KNN model
    knn_model = KNeighborsClassifier(n_neighbors=k, weights='uniform')
    
    # Doing cross validation
    scores = cross_val_score(knn_model, X_dataset, Y_classes, cv=50, scoring='accuracy')
    
    # Saving the mean of the scores
    cv_scores.append(np.mean(scores))

# Finding the optimal K value by finding the one with the highest accuracy
optimal_k = k_values[np.argmax(cv_scores)]
print(f"The optimal number of neighbors is: {optimal_k}")


# Creating the KNN model
"""
n_neighbors: Number of neighbors to use, in this case we are using the optimal_k
weights: Weight function used in prediction, in this case we are using uniform.
"""
knn_model = KNeighborsClassifier(n_neighbors=optimal_k, weights='uniform')
# Training the model
knn_model.fit(X_train, Y_classes)

# Validating the model
Y_predicted = knn_model.predict(X_testing)
# Calculating the accuracy of the model
print(f'Accuracy: {accuracy_score(Y_validation, Y_predicted)}')

# Calculating the F1 Score of the model (This is the one we are interested in for kaggle)
f1 = f1_score(Y_validation, Y_predicted, average='weighted')
print(f"F1 Score: {f1}")

print(X_train)
print(X_validation)


The optimal number of neighbors is: 1
Accuracy: 0.0
F1 Score: 0.0
[[-2.31981165e-01  3.89451197e-03 -5.44618879e-01 ...  3.61805382e-01
   3.13972024e-01 -2.17363790e-01]
 [ 5.46570356e-01 -7.25532969e-01  2.34998224e-01 ...  2.13385955e+00
   6.05102952e+00  4.61834945e+00]
 [-1.56176749e-01  2.13186996e+00 -1.00115142e+00 ...  1.23529087e+00
  -2.29200367e-01  1.49597645e-01]
 ...
 [-1.57133303e-01 -9.61017431e-01 -9.97923036e-02 ... -4.96039993e-01
   1.08273604e-01 -4.08866936e-01]
 [-1.57130990e-01 -1.02360672e-01  5.76812330e-01 ... -7.52331258e-01
  -6.85593736e-01 -7.00262264e-01]
 [-2.32064662e-01 -4.92920268e-01  4.30204391e-02 ... -7.19863971e-02
  -3.28835539e-01 -1.05049782e-01]]
[[-0.15617887  0.35712121 -1.4436368  ... -0.45885784 -0.61970596
  -0.83379689]
 [-0.23236802 -0.27179461  0.57212994 ...  1.3739862   2.41916679
   1.2126989 ]
 [-0.15803928 -0.19425705 -0.55164246 ... -0.2375355  -0.5891726
  -0.3902342 ]
 ...
 [-0.15713092 -0.30912752 -0.74127905 ... -0.812678



## 2. Neural Network

In [27]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

# Splitting the dataset into attributes (X) and classes (Y)
X_dataset = dataset.drop('class', axis=1)
Y_classes = dataset['class'].astype(int)

# Defining variables for valiation
X_testing = test_data
Y_validation = test_data_class['class'].astype(int)


# Preprocessing the data by scaling it
scaler = StandardScaler()
X_train = scaler.fit_transform(X_dataset)
X_validation = scaler.transform(X_testing)


# Initializing the MLP model
"""
- hidden_layer_sizes is the number of neurons in each layer, each number represents the number of neurons in a layer
- max_iter is the number of iterations the model will do
- activation is the activation function, in this case we are using the ReLU function (rectified linear unit)
- solver is the optimization algorithm, in this case we are using the Adam optimizer
- random_state is the seed for the random number generator, so the results are reproducible
"""
# While playing with the hidden layers I found that 5 neurons in the first layer and 3 in the second layer gave the best results for breast cancer dataset for this random seed 42
# But this is not always the case, if we change the seed we usually get a result of around 0.70
neural_network_model = MLPClassifier(hidden_layer_sizes=(5,3), max_iter=100, activation='relu', solver='adam', random_state=42)

# Training the model
neural_network_model.fit(X_train, Y_classes)

# Validating the model
Y_predicted = neural_network_model.predict(X_validation)


print(f"Accuracy: {accuracy_score(Y_validation, Y_predicted)}")
print(f"F1 Score: {f1_score(Y_validation, Y_predicted, average='weighted')}")




Accuracy: 1.0
F1 Score: 1.0




## 3. Random Forest

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

# Splitting the dataset into attributes (X) and classes (Y)
X_dataset = dataset.drop('class', axis=1)
Y_classes = dataset['class'].astype(int)

# Defining variables for valiation
X_testing = test_data
Y_validation = test_data_class['class'].astype(int)


# Preprocessing the data by scaling it
scaler = StandardScaler()
X_train = scaler.fit_transform(X_dataset)
X_validation = scaler.transform(X_testing)

# Initializing the Random Forest model
random_forest_model = RandomForestClassifier(
                        max_leaf_nodes=2,
                        max_features=5,
                        max_depth=6,
                        random_state=42
                    )

# Fit the model to the data
random_forest_model.fit(X_train, Y_classes)

# Predict the classes of the validation set
Y_predicted = random_forest_model.predict(X_validation)

print(f"Accuracy: {accuracy_score(Y_validation, Y_predicted)}")
print(f"F1 Score: {f1_score(Y_validation, Y_predicted, average='weighted')}")

Accuracy: 0.6690140845070423
F1 Score: 0.8016877637130801
