In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

<h1>Attribute Information:</h1>
<p><br>1) ID number
<br>2) Diagnosis (M = malignant, B = benign)
<br>3) Real-valued features are computed for each cell nucleus:
    
<br>a) radius (mean of distances from center to points on the perimeter)
<br>b) texture (standard deviation of gray-scale values)
<br>c) perimeter
<br>d) area
<br>e) smoothness (local variation in radius lengths)
<br>f) compactness (perimeter^2 / area - 1.0)
<br>g) concavity (severity of concave portions of the contour)
<br>h) concave points (number of concave portions of the contour)
<br>i) symmetry
<br>j) fractal dimension ("coastline approximation" - 1)</p>


In [None]:
# Read the breast-cancer CSV from the input directory into the `cancer` DataFrame
cancer = pd.read_csv(filepath_or_buffer='../input/breast-cancer-wisconsin-data/data.csv')

In [None]:
# Preview the top of the frame; n=5 is spelled out rather than left to the default
cancer.head(n=5)

In [None]:
# Dimensions of the freshly loaded frame, displayed as a (rows, columns) tuple
cancer.shape

In [None]:
# Per-column tally of missing values (isna is the same check as isnull)
cancer.isna().sum()

In [None]:
# Remove every column that contains at least one missing value
# (dropna defaults to how='any'), then rebind `cancer` to the result
cancer = cancer.dropna(axis='columns')

In [None]:
# Re-check the (rows, columns) tuple after the dropna step to see how many columns were removed
cancer.shape

In [None]:
# Class balance: how many samples are Malignant (M) versus Benign (B)
cancer.loc[:, 'diagnosis'].value_counts()

In [None]:
# Bar chart of the number of samples in each diagnosis class.
# The column is named via `x=` and the frame via `data=`: recent seaborn releases
# accept only `data` positionally, so a bare positional Series is no longer
# interpreted as the x variable. The former `label` keyword had no visible
# effect on the chart and is omitted.
sns.countplot(x='diagnosis', data=cancer, palette='husl')

In [None]:
# Inspect each column's dtype; non-numeric (object) columns are the ones that need encoding
cancer.dtypes

In [None]:
#Encode the categorical diagnosis labels as integers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Assign by column name so the whole column is replaced by the integer array.
# Writing through `cancer.iloc[:, 1] = ...` sets values in place on pandas >= 2.0,
# which leaves the column with object dtype; the target array taken from it later
# would then be object-typed, which scikit-learn classifiers reject.
# LabelEncoder numbers classes in sorted order, so B -> 0 and M -> 1.
cancer['diagnosis'] = le.fit_transform(cancer['diagnosis'])

In [None]:
# Pairwise relationship grid over columns 1-4, coloured by the diagnosis column
sns.pairplot(data=cancer.iloc[:, 1:5], hue='diagnosis')

In [None]:
# Show the first 5 rows again, now that the diagnosis column has been encoded
cancer.head(n=5)

In [None]:
# Pearson correlation matrix for columns 1 through 11 (Pearson is corr()'s default method)
cancer.iloc[:, 1:12].corr(method='pearson')

In [None]:
# Render the same correlation matrix as an annotated heatmap,
# with each cell formatted as a whole-number percentage
plt.figure(figsize=(12, 9))
sns.heatmap(
    data=cancer.iloc[:, 1:12].corr(),
    annot=True,
    fmt='.0%',
)

In [None]:
#Split the data set into independent (X) and dependent (y) data sets
# Open-ended slice: every column after `diagnosis` becomes a feature. A fixed
# stop of 31 silently excludes the last column when the frame has 32 columns
# (id, diagnosis and 30 measurements).
# NOTE(review): assumes all columns from index 2 onward are numeric features;
# confirm against the cancer.shape / cancer.dtypes output above.
X = cancer.iloc[:, 2:].values
# Column 1 is the encoded diagnosis label
y = cancer.iloc[:, 1].values

In [None]:
# Hold out 25% of the rows for testing and train on the remaining 75%;
# random_state is fixed so the shuffle is repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
#Scale the data (Feature Scaling)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn the per-feature mean and standard deviation from the training set only...
X_train = sc.fit_transform(X_train)
# ...and reuse those same statistics on the test set. Calling fit_transform here
# would refit the scaler on the test data, so the two sets would be scaled
# differently and test-set information would leak into preprocessing.
X_test = sc.transform(X_test)

In [None]:
#Create a function for models
def models(X_train, y_train):
    """Fit three classifiers on the training data and print their training accuracy.

    Parameters
    ----------
    X_train : feature matrix handed to each estimator's ``fit`` and ``score``.
    y_train : target labels handed to each estimator's ``fit`` and ``score``.

    Returns
    -------
    tuple
        ``(log, tree, forest)``: the fitted LogisticRegression,
        DecisionTreeClassifier and RandomForestClassifier, in that order.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier

    # (report label, estimator) pairs; the order fixes both the print order
    # and the order of the returned tuple
    candidates = (
        ('Logistic Regression Training Accuracy:',
         LogisticRegression(random_state=0)),
        ('Decision Tree Classifier Training Accuracy:',
         DecisionTreeClassifier(criterion='entropy', random_state=0)),
        ('Random Forest Classifier Training Accuracy:',
         RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)),
    )

    # Fit every estimator first, then report, so all output appears after training
    for _, estimator in candidates:
        estimator.fit(X_train, y_train)

    # Accuracy is measured on the same data used for fitting (training accuracy)
    for report_label, estimator in candidates:
        print(report_label, estimator.score(X_train, y_train))

    return tuple(estimator for _, estimator in candidates)
    

In [None]:
# Train all three classifiers; `model` holds the returned (log, tree, forest) tuple
model = models(X_train=X_train, y_train=y_train)

In [None]:
#Test each model's accuracy on the test data via its confusion matrix
from sklearn.metrics import confusion_matrix
for i, clf in enumerate(model):
    print('Model ', i)
    # Rows are true labels and columns are predicted labels, in the order given
    # by `labels`. With 0 = benign (negative) and 1 = malignant (positive) the
    # matrix is laid out as [[TN, FP], [FN, TP]]; pinning labels=[0, 1] also
    # guarantees a 2x2 result.
    cm = confusion_matrix(y_test, clf.predict(X_test), labels=[0, 1])

    # Row-major unpacking of that layout: cm[0][0] is TN and cm[1][1] is TP
    TN, FP, FN, TP = cm.ravel()

    print(cm)
    # Accuracy = correctly classified samples / all samples
    print('Testing Accuracy = ', (TP + TN) / (TP + TN + FN + FP))
    print()