In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### This work is in progress.

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objects as go
import plotly.figure_factory as ff

from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot, download_plotlyjs


In [None]:
# Helper that summarises the metadata of a dataframe, one row per column.
def master_dataframe(dataframe):
    """Build a per-column metadata summary of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The frame to profile.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``dataframe``. Holds the columns
        'Datatype', 'Null Values', 'Null %' and 'No: Of Unique Values',
        followed by the transposed output of
        ``dataframe.describe(include='all')``.
    """
    # Count the nulls once and reuse the result for both the count and the percentage.
    null_counts = dataframe.isna().sum()
    null_share = (null_counts / len(dataframe) * 100).round(2)

    overview = pd.DataFrame({
        'Datatype': dataframe.dtypes,
        "Null Values": null_counts,
        "Null %": null_share,
        "No: Of Unique Values": dataframe.nunique(),
    })

    # describe() is column-oriented; transpose it so its index lines up with `overview`.
    statistics = dataframe.describe(include='all').T

    return overview.join(statistics)

In [None]:
# Import data: read the dataset CSV from the read-only input directory into the
# raw_data DataFrame that the following cells inspect and derive copies from.
raw_data = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')

In [None]:
# Check metadata by using the function master_dataframe(): dtype, null count,
# null percentage, unique count and describe() statistics for every column.
# Being the last expression of the cell, the summary is shown as the cell output.
master_dataframe(raw_data)

In [None]:
# List the distinct values stored in the 'Unnamed: 32' column to see whether it carries any information.
raw_data['Unnamed: 32'].unique()

In [None]:
# Drop the 'id' and 'Unnamed: 32' columns.
# drop() returns a new frame, so raw_data is left untouched and this cell can be
# re-run safely; `columns=` already selects the axis, so no `axis` argument is needed.
data_col_drop = raw_data.drop(columns = ['id', 'Unnamed: 32'])
data_col_drop.columns

In [None]:
# Check the unique values of our target variable diagnosis
# (these are the class labels that get encoded further down).
data_col_drop['diagnosis'].unique()

In [None]:
# Split the frame positionally into the dependent variable (first column)
# and the independent variables (every remaining column), both as arrays.
y = data_col_drop.iloc[:, 0].values
X = data_col_drop.iloc[:, 1:].values

In [None]:
# Print the feature matrix X (the values of every column after the first).
print(X)

In [None]:
# Print the target array y taken from the first column of data_col_drop.
print(y)

In [None]:
# Lets encode the dependent variable i.e. diagnosis using LabelEncoder as it is a nominal categorical data(unordered).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Fit on the original labels from the dataframe rather than on `y` itself, so that
# re-running this cell does not re-fit the encoder on already-encoded integers
# (which would make le.inverse_transform return integers instead of the original labels).
y = le.fit_transform(data_col_drop.iloc[:, 0].values)
print(y)

In [None]:
# Hold out 20% of the rows as a test set (fixed seed so the split is repeatable)
# before any model is created, then report the shape of each piece.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
print('Shape of X_train', X_train.shape)
print('Shape of X_test', X_test.shape)
print('Shape of y_train', y_train.shape)
print('Shape of y_test', y_test.shape)

In [None]:
# Standardise the independent variables, as they appear to be on different scales.
# The scaler learns its statistics from the training split only and the same
# transformation is then applied to both the training and the test split.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Print the training features to inspect the values produced by the scaling step.
print(X_train)

In [None]:
# Print the test features to inspect the values produced by the scaling step.
print(X_test)

# Logistic Regression

In [None]:
# Build the Logistic Regression model with scikit-learn (seeded for repeatability)
# and train it on the training split; the fitted estimator is the cell output.
from sklearn.linear_model import LogisticRegression
log_classifier = LogisticRegression(random_state=0)
log_classifier.fit(X=X_train, y=y_train)

In [None]:
# Lets make the prediction using the model: predict a class for every row of the
# test split with the fitted logistic regression and print the encoded predictions.
y_pred = log_classifier.predict(X_test)
print(y_pred)

In [None]:
# Lets measure the accuracy of the model using the confusion matrix and accuracy score.
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)

# Correct predictions lie on the main diagonal of the confusion matrix. Summing the
# whole diagonal (instead of only cells [0, 0] and [1, 1]) gives the same result for
# two classes and stays correct for any number of classes.
corr_pred = cm.trace()
total = cm.sum()
corr_pred_per = round(corr_pred/total*100, 2)
print('Percentage of correct predictions: ', corr_pred_per)

In [None]:
# Lets check the confusion matrix built from (y_test, y_pred):
# the diagonal cells count the correct predictions, the off-diagonal cells the mistakes.
print(cm)

In [None]:
# Cross-check the result with sklearn's accuracy_score, reported as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy of the model: ', round(accuracy * 100, 2))

In [None]:
# Map the encoded test labels and predictions back to their original form
# for better understanding.
y_test_d_enco = le.inverse_transform(y_test)
y_pred_d_enco = le.inverse_transform(y_pred)

# Put actual and predicted labels side by side in one dataframe for comparison.
comparison = pd.DataFrame({
    'Actual Values': y_test_d_enco,
    'Predicted Values': y_pred_d_enco,
})

In [None]:
# Lets check all the data in the dataframe we just created.
# Raise the pandas row-display limit to 200 so the frame is not truncated.
# NOTE: set_option changes the setting for the rest of the session, not just this cell.
pd.set_option('display.max_rows', 200)
comparison

# Good to see most of them match.

# K Nearest Neighbors Classifier

In [None]:
# Build a k-nearest-neighbours classifier (5 neighbours, Minkowski metric with p = 2)
# and train it in the same statement; fit() returns the estimator itself.
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(X_train, y_train)

# Predicted classes for the test split.
y_pred_knn = knn_classifier.predict(X_test)

In [None]:
# Lets measure the accuracy of the model using the confusion matrix.
cm = confusion_matrix(y_test, y_pred_knn)

# Correct predictions lie on the main diagonal of the confusion matrix. Summing the
# whole diagonal (instead of only cells [0, 0] and [1, 1]) gives the same result for
# two classes and stays correct for any number of classes.
corr_pred = cm.trace()
total = cm.sum()
corr_pred_per_knn = round(corr_pred/total*100, 2)
print('Percentage of correct predictions: ', corr_pred_per_knn)

In [None]:
# Cross-check the KNN result with sklearn's accuracy_score, reported as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
print('Accuracy of the model: ', round(accuracy * 100, 2))

# Support Vector Machine

In [None]:
# Build a support vector classifier with a linear kernel and train it in the
# same statement; fit() returns the estimator itself.
from sklearn.svm import SVC
svc_classifier = SVC(kernel='linear').fit(X_train, y_train)

# Predicted classes for the test split.
y_pred_svm = svc_classifier.predict(X_test)

In [None]:
# Lets measure the accuracy of the model using the confusion matrix.
cm = confusion_matrix(y_test, y_pred_svm)

# Correct predictions lie on the main diagonal of the confusion matrix. Summing the
# whole diagonal (instead of only cells [0, 0] and [1, 1]) gives the same result for
# two classes and stays correct for any number of classes.
corr_pred = cm.trace()
total = cm.sum()
corr_pred_per_svm = round(corr_pred/total*100, 2)
print('Percentage of correct predictions: ', corr_pred_per_svm)

In [None]:
# Cross-check the linear SVM result with sklearn's accuracy_score, reported as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print('Accuracy of the model: ', round(accuracy * 100, 2))

# Kernel SVM

In [None]:
# Build a support vector classifier with an RBF kernel and train it in the
# same statement; fit() returns the estimator itself.
k_svm_classifier = SVC(kernel='rbf').fit(X_train, y_train)

# Predicted classes for the test split.
y_pred_k_svm = k_svm_classifier.predict(X_test)

In [None]:
# Lets measure the accuracy of the model using the confusion matrix.
cm = confusion_matrix(y_test, y_pred_k_svm)

# Correct predictions lie on the main diagonal of the confusion matrix. Summing the
# whole diagonal (instead of only cells [0, 0] and [1, 1]) gives the same result for
# two classes and stays correct for any number of classes.
corr_pred = cm.trace()
total = cm.sum()
corr_pred_per_k_svm = round(corr_pred/total*100, 2)
print('Percentage of correct predictions: ', corr_pred_per_k_svm)

In [None]:
# Cross-check the kernel SVM result with sklearn's accuracy_score, reported as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_k_svm)
print('Accuracy of the model: ', round(accuracy * 100, 2))

# Naive Bayes

In [None]:
# Build a Gaussian Naive Bayes classifier and train it in the same statement;
# fit() returns the estimator itself.
from sklearn.naive_bayes import GaussianNB
nb_classifier = GaussianNB().fit(X_train, y_train)

# Predicted classes for the test split.
y_pred_nb = nb_classifier.predict(X_test)

In [None]:
# Lets measure the accuracy of the model using the confusion matrix.
cm = confusion_matrix(y_test, y_pred_nb)

# Correct predictions lie on the main diagonal of the confusion matrix. Summing the
# whole diagonal (instead of only cells [0, 0] and [1, 1]) gives the same result for
# two classes and stays correct for any number of classes.
corr_pred = cm.trace()
total = cm.sum()
corr_pred_per_nb = round(corr_pred/total*100, 2)
print('Percentage of correct predictions: ', corr_pred_per_nb)

In [None]:
# Cross-check the Naive Bayes result with sklearn's accuracy_score, reported as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_nb)
print('Accuracy of the model: ', round(accuracy * 100, 2))

# Decision Tree Classifier

In [None]:
# Build a decision tree classifier that splits on entropy and train it on the training split.
from sklearn.tree import DecisionTreeClassifier
# Fix the seed so the fitted tree (and therefore the reported accuracy) is the same on
# every run, in line with the random_state = 0 used for the train/test split and the
# logistic regression model.
dt_classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
dt_classifier.fit(X_train, y_train)

# Predicting the outcome.
y_pred_dt = dt_classifier.predict(X_test)

In [None]:
# Lets measure the accuracy of the model using the confusion matrix.
cm = confusion_matrix(y_test, y_pred_dt)

# Correct predictions lie on the main diagonal of the confusion matrix. Summing the
# whole diagonal (instead of only cells [0, 0] and [1, 1]) gives the same result for
# two classes and stays correct for any number of classes.
corr_pred = cm.trace()
total = cm.sum()
corr_pred_per_dt = round(corr_pred/total*100, 2)
print('Percentage of correct predictions: ', corr_pred_per_dt)

In [None]:
# Cross-check the decision tree result with sklearn's accuracy_score, reported as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
print('Accuracy of the model: ', round(accuracy * 100, 2))

# Random Forest Classifier

In [None]:
# Build a random forest of 100 entropy-based trees and train it on the training split.
from sklearn.ensemble import RandomForestClassifier
# Fix the seed so the randomly built forest (and therefore the reported accuracy) is the
# same on every run, in line with the random_state = 0 used for the train/test split and
# the logistic regression model.
rf_classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
rf_classifier.fit(X_train, y_train)

# Predicting the outcome.
y_pred_rf = rf_classifier.predict(X_test)

In [None]:
# Lets measure the accuracy of the model using the confusion matrix.
cm = confusion_matrix(y_test, y_pred_rf)

# Correct predictions lie on the main diagonal of the confusion matrix. Summing the
# whole diagonal (instead of only cells [0, 0] and [1, 1]) gives the same result for
# two classes and stays correct for any number of classes.
corr_pred = cm.trace()
total = cm.sum()
corr_pred_per_rf = round(corr_pred/total*100, 2)
print('Percentage of correct predictions: ', corr_pred_per_rf)

In [None]:
# Cross-check the random forest result with sklearn's accuracy_score, reported as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print('Accuracy of the model: ', round(accuracy * 100, 2))