# Breast Cancer Diagnostic Classification Project

Coded by Luna McBride

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split #Split the data into train and test
from sklearn.ensemble import RandomForestClassifier #Forest for prediction and regression
from sklearn.metrics import mean_squared_error #Error testing
from sklearn.metrics import classification_report #Report of Classification

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        fullPath = os.path.join(folder, name)
        print(fullPath)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Load the breast cancer diagnostic CSV into a dataframe
cancer = pd.read_csv(
    "../input/breast-cancer-wisconsin-data/data.csv"
)
cancer.head() #Preview the first rows (shown as the cell output)

In [None]:
print(len(cancer)) #Print the number of rows (the length of a dataframe is its row count)

---

# Check for Null Values

In [None]:
hasNulls = cancer.isnull().any() #For each column: True if at least one value is null
print(hasNulls) #Show which columns contain nulls

In [None]:
#Show any rows where Unnamed: 32 actually holds a value.
#notnull() gives the non-null mask directly instead of comparing the isnull() mask against True
print(cancer.loc[cancer["Unnamed: 32"].notnull()])

No column contains null values except Unnamed: 32, which is entirely null, so I will drop it. I will also drop the id column, since an identifier is not something I want to test against.

In [None]:
#Drop the all-null column and the id column, listing the labels explicitly
unusedColumns = ["Unnamed: 32", "id"]
cancer = cancer.drop(columns = unusedColumns)
cancer.head() #Take a peek and make sure the columns are gone

---

# Data Exploration

In [None]:
diagnosis = cancer["diagnosis"].copy() #Put the diagnosis into its own variable
characteristics = cancer.drop(columns = ["diagnosis"]).copy() #Put the characteristics in a separate dataframe

#Select the malignant and benign rows once, instead of re-filtering the whole dataframe for every column
malignantRows = cancer.loc[cancer["diagnosis"] == "M"]
benignRows = cancer.loc[cancer["diagnosis"] == "B"]

#For each column, print the max and min values for malignant and benign tumors
for column in characteristics.columns:
    mal = malignantRows[column] #Get the malignant values for the column
    ben = benignRows[column] #Get the benign values for the column

    print("The max and min malignant values for {} are {}, {}".format(column, mal.max(), mal.min())) #Print malignant values
    print("The max and min beneign values for {} are {}, {}".format(column, ben.max(), ben.min())) #Print benign values

It seems malignant tumors tend to have higher min and max values overall, though their ranges overlap with the benign values. This holds for most variables, but for variables like concavity the malignant range appears to overlap the benign range completely. I believe characteristics like that will have low importance to the classification, but I will have to see.

---

# Build the Classifier (Full Data)

Here, I would like to see whether a classifier has an easier or harder time when it is given all of the worst/mean/standard-error characteristics for the same variable, versus just one of those types.

## Train-Test Split

In [None]:
#Replace the diagnosis labels with their dummy (indicator) encoding
diagnosis = pd.get_dummies(data = diagnosis)
print(diagnosis) #Show the encoded diagnosis

In [None]:
#Hold out 20% of the rows for testing; no random_state is given, so the split differs on every run
splits = train_test_split(characteristics, diagnosis, test_size = 0.2)
charaTrain, charaTest, diagTrain, diagTest = splits #Unpack in the order train/test characteristics, then train/test diagnosis
print(diagTrain) #Print one of the splits to have an idea about the structure

## Fit a Random Forest Classifier

In [None]:
#Create a 100-tree random forest and train it on the training split
forest = RandomForestClassifier(n_estimators = 100)
forest.fit(X = charaTrain, y = diagTrain)

In [None]:
#Predict the diagnosis for every row of the held-out test characteristics
predict = forest.predict(X = charaTest)

In [None]:
#Score the forest once and reuse the value, rather than evaluating the test set twice
accuracy = forest.score(charaTest, diagTest)
overallAccuracy = ("Overall", accuracy) #Keep the overall accuracy, labelled, for later reference

print("Forest Accuracy: ", accuracy) #Print the accuracy
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(diagTest, predict))) #Print the root mean square error
print("Classification Report:\n ", classification_report(diagTest, predict, target_names = ["B", "M"])) #Print a classification report

In [None]:
attributes = characteristics.columns #Get the tested attributes
attributes = list(zip(attributes, forest.feature_importances_)) #Pair each attribute with its feature importance
sortAtt = sorted(attributes, key = lambda x: x[1], reverse = True) #Sort the pairs from most to least important

print("According to the Random Forest, the most important factors for cancer status are: ") #Start printing the most important labels

#Print only the five highest-ranked labels; slicing replaces the manual counter that walked every attribute
for label, importance in sortAtt[:5]:
    print(label) #Print the label as an important factor

The forest trained on all of the characteristics was able to attain about 97% accuracy on the test set (the exact figure varies between runs, since the train-test split is random). The worst characteristics appear to be the ones the forest ranked most important, so it should be fair to assume that the worst columns, when used alone, will do best in a new classifier.

---

# Build a Forest for Each Worst/SE/Mean

In [None]:
#SplitData: splits the column names based on whether each is the mean, se, or worst column for the dataset
#Input: the list of characteristic columns (an iterable of column-name strings)
#Output: A list that contains the lists of each column type, in the order [se, mean, worst]
def splitData(charactColumns):
    se = [] #A list holder for all columns that end in _se
    mean = [] #A list holder for all columns that end in _mean
    worst = [] #A list holder for all remaining columns, which are taken to be the worst columns

    #For each characteristics column, put it in the correct se, mean, or worst list
    for column in charactColumns:
        #Match on the suffix rather than anywhere in the name, so a column that merely
        #contains the letters "se" (e.g. "base_mean") is not mistaken for an SE column
        if column.endswith("_se"):
            se.append(column) #Add it to the SE list
        elif column.endswith("_mean"):
            mean.append(column) #Add it to the mean list
        else: #If the column ends in neither, it is treated as a worst column
            worst.append(column) #Add it to the worst list

    return [se, mean, worst] #Return a list with all the previous lists inside

#RunForest: runs a forest for the specified characteristic type (colType), assuming the diagnosis is in its dummied form
#Input: the diagnosis, the characteristics, the column/characteristic type (used only to label the printed output)
#       test_size: the fraction of rows held out for testing (default 0.2, the previously hard-coded value)
#       n_estimators: the number of trees in the forest (default 100, the previously hard-coded value)
#       random_state: optional seed handed to both the split and the forest; the default None leaves every run random, as before
#Output: None (the accuracy, root mean square error, and classification report are printed)
def runForest(diag, chara, colType, test_size = 0.2, n_estimators = 100, random_state = None):
    #Split the data into train and test
    charaTrain, charaTest, diagTrain, diagTest = train_test_split(
        chara, diag, test_size = test_size, random_state = random_state
    )

    forest = RandomForestClassifier(n_estimators = n_estimators, random_state = random_state) #Build a forest for this data
    forest.fit(charaTrain, diagTrain) #Fit the forest

    predict = forest.predict(charaTest) #Make predictions for the test set

    print("Forest Accuracy for {}: {}".format(colType, forest.score(charaTest, diagTest))) #Print the accuracy
    print("Root Mean Square Error for {}: {}".format(colType, np.sqrt(mean_squared_error(diagTest, predict)))) #Print the root mean square error
    print("Classification Report for {}:\n {}".format(colType, classification_report(diagTest, predict, target_names = ["B", "M"]))) #Print a classification report

In [None]:
charact = characteristics.columns #Get the characteristics columns

columnList = splitData(charact) #Split the data into se, mean, and worst
colTypes = ["se", "mean", "worst"] #Labels for the column types, in the same order splitData returns its lists

#For each column type, run a forest with just that type.
#zip pairs every label with its column list, so no manual index counter is needed
for colType, colList in zip(colTypes, columnList):
    chara = cancer[colList] #Get the characteristics of just the columns of the specified type
    runForest(diagnosis, chara, colType) #Run a forest for this specific type

100 estimators appears to be best for accuracy. The worst columns appear to be the best predictor, with accuracy and recall both at 96+% (recall being the most important metric in this case, since we need to identify these cases correctly). The mean columns are also very close, at 95+% accuracy and recall (plus or minus depending on the run). These results are based on a dataset with only about 500 entries, so I bet they would change with more data. Despite this, I would say either the worst values or the mean values would do fine if there is no access to the other types.