
<h1 style="font-family: Trebuchet MS; padding: 12px; font-size: 48px; color: #D97271; text-align: center; line-height: 1.25;"><b>Breast Cancer<span style="color: #000000"> Diagnosis Prediction</span></b><br><span style="color: #D97271; font-size: 24px">with Various Machine Learning Models</span></h1>


> **Notes**
> 
> * **Colab Link**: [Click here](https://colab.research.google.com/drive/1ogt4xlQss13cZfJGgvUZsmu6jxl_9Yvv?usp=sharing)
> * **Proposal**: [Latex](https://www.overleaf.com/9823217632srwkshctbrmd)
> * **Report**: [Google doc](), [Latex](https://www.overleaf.com/4224522178csppcpwgcqst)
> * **Presentation**: [Slides](https://docs.google.com/presentation/d/1VW1I-qKfIku8DwwTyGibQOvgtBOpEakpLM2pziOAptI/edit?usp=sharing)


# <div style="font-family: Trebuchet MS; background-color: #D97271; color: #FFFFFF; padding: 12px; line-height: 1.5;"> Dataset 1: Breast Cancers</div>

<div class="warning" style='background-color:#f5dada; color: #000000; border-left: solid #d68181 4px; border-radius: 4px; padding:0.7em;'>
<span>
<p style='margin-top:1em'>
<b>Workflow:</b></p>
<p style='margin-left:1em;'>

1. Environment Setup
2. Data Download
3. Initial Data Explorations
4. EDA (wait for graph edit)
5. Feature Selections (wait for graph edit)
6. Models

</p>
<p style='margin-bottom:1em; margin-right:1em; text-align:right; font-family:Georgia'> <b></b> <i></i>
</p></span>
</div>


##  <div style="font-family: Trebuchet MS; background-color: #f1acab; color: #FFFFFF; padding: 12px; line-height: 1.5; border-left: solid #d68181 4px; border-radius: 5px">Environment Setup</div>

In [None]:
# Import required package - Usual packages
# Color test
class color:
    """ANSI escape sequences used to colorize console output in this notebook."""
    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'  # resets every attribute set before it

    # Foreground colors
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Bold + color combinations, derived from the primitives above
    BOLD_RED_COLOR = BOLD + RED
    BOLD_CYAN_COLOR = BOLD + CYAN
    BOLD_GREEN_COLOR = BOLD + GREEN

print(color.BOLD_RED_COLOR + '\nImporting all the required libraries...'+ color.END)

## Basics
import warnings 
import pandas as pd
import numpy as np
import re
import copy
import pip
import sys
import os.path
import platform
import collections

# Unusual packages: check and install
# A package is installed only when it cannot be found on the import path.
# (`sys.modules` only lists modules already imported in this session, so using
# it as the check reinstalls everything on every run; `pip.main` is no longer
# part of pip's public API, so pip is invoked as a subprocess of this kernel's
# interpreter instead.)
import importlib.util
import subprocess

packages = ['pywaffle', 'yellowbrick','missingno','ast','spacy','scispacy','httpx']
for p in packages:
    # Standard-library modules such as `ast` are always found and therefore skipped.
    if importlib.util.find_spec(p) is None:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', p])

## Graphs
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rcParams
import matplotlib.patches as patches
import seaborn as sns

from IPython.display import display
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
import plotly.offline as py
import plotly.express as px
from pywaffle import Waffle
import missingno as msno

# Models
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
## Logistic
from sklearn.linear_model import LogisticRegression
## KNN
from sklearn.neighbors import KNeighborsClassifier
## SVC
from sklearn.svm import SVC
## ANN
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
# Model Visulization
from yellowbrick.classifier import ConfusionMatrix, ROCAUC, PrecisionRecallCurve
from yellowbrick.model_selection import LearningCurve, FeatureImportances
from yellowbrick.contrib.wrapper import wrap
from yellowbrick.style import set_palette

# Web Scrape
import json
import httpx
import time
import urllib
from tqdm.auto import tqdm
import requests
import bs4

# NLP
import spacy
from scispacy.abbreviation import AbbreviationDetector
import ast
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

# Settings
warnings.filterwarnings('ignore')
colors = ['#303030' ,'#98CDBE','#D97271','#FDF8F8']

sns.set_style('white')
plt.rcParams['figure.dpi'] = 500
mpl.rcParams['axes.spines.left'] = False
mpl.rcParams['axes.spines.right'] = False
mpl.rcParams['axes.spines.top'] = False
mpl.rcParams['axes.spines.bottom'] = False
plt.rcParams["font.family"] = "serif"
pd.set_option('display.max_columns', None)

# END
print(color.BOLD_RED_COLOR + 'Finished\n'+ color.END);

In [None]:
sns.palplot(colors)
plt.suptitle('Color Palette', fontweight='heavy', ha='center', fontsize=5, color=colors[0]);

## <div style="font-family: Trebuchet MS; background-color: #f1acab; color: #FFFFFF; padding: 12px; line-height: 1.5; border-left: solid #d68181 4px; border-radius: 5px">Jupyter Notebook Data Download </div>

In [None]:
if not os.path.exists('data.csv'):
    mysystem = platform.system()
    file_download_link = 'https://drive.google.com/uc?id=1mLS7-mhwMhkcBdt0NdBOHK3iz-EXzBeS'

    if mysystem != 'Windows':
        !wget -O cancer_data.zip --no-check-certificate "$file_download_link"
        !unzip cancer_data.zip
    else:
        if not 'gdown' in sys.modules:
            pip.main(['install', 'gdown'])
            import gdown

        file_id = '116MM-XRAVwAORT8iJXKnfA6Q0X-qZCjvi'
        output = "data.csv"
        gdown.download(
            f"https://drive.google.com/uc?export=download&confirm=pbef&id={file_id}",
            output
        )
    
    print('Download Completed')
else:
    print('Found data files.')

## <div style="font-family: Trebuchet MS; background-color: #f1acab; color: #FFFFFF; padding: 12px; line-height: 1.5; border-left: solid #d68181 4px; border-radius: 5px">Initial Data Explorations </div>

### Check missing values

In [None]:
# Read data
df = pd.read_csv('data.csv')
df.head()

In [None]:
# Check null variables
msno.bar(df)
plt.title('Missing / Null Values Graph', fontweight='heavy', 
          ha='center', fontsize=25, color=colors[0]);

We can see that the column `Unnamed: 32` has 569 missing values. Therefore we remove this column.

In [None]:
# Set new dataframe
# Drop the empty column by name (not by position) so that re-running this cell
# is a no-op instead of silently deleting a real feature column.
df = df.drop(columns=['Unnamed: 32'], errors='ignore')

### Delete an unnecessary column: `id`

In [None]:
# Final cleaned dataset
# Drop `id` by name so the cell is idempotent; dropping "the first column"
# would remove `diagnosis` on a second run.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# Check columns
df.columns

### Data Info

In [None]:
# Info
df.info()

In [None]:
# Description of dataset
df.describe()

## <div style="font-family: Trebuchet MS; background-color: #f1acab; color: #FFFFFF; padding: 12px; line-height: 1.5; border-left: solid #d68181 4px; border-radius: 5px"> EDA </div>

In [None]:
# Plot the Waffle plot of diagnosis distribution of the dataset
target = df['diagnosis']
# Pin the class order to (B, M) so the percentages always line up with `labels`
# and with colors[1:3], independent of which class happens to be more frequent.
targetDist = target.value_counts(normalize = True).reindex(['B', 'M']).round(2)*100
labels = ['Benign', 'Malignant']

# Main Figure (values passed as a plain list, in the same B, M order)
fig = plt.figure(FigureClass=Waffle, rows=5, colors=colors[1:3],
                 values=targetDist.to_list(), figsize=(9, 5))
# Title and subtitle
plt.suptitle('Breast Cancer Diagnosis Distributions', fontweight='heavy', y=0.79, ha='center', 
             fontsize=13, color=colors[0]) 
plt.title('It appears that the dataset has more benign data.\n', 
          style='italic', fontsize=8, loc='center', y=0.98, ha='center',  color=colors[0])

# Legends
# Label-based lookups: integer keys on a label-indexed Series are positional
# fallbacks that recent pandas versions deprecate.
plt.text(1.4, -0.25, '{0:.2f}%'.format(targetDist.loc['B']), color=colors[1], 
         fontsize=16, ha='center', weight='bold', va='bottom')
plt.text(1.4, -0.28, labels[0], color=colors[0], fontsize=10, ha='center', va='top', weight='bold')
plt.text(3.43, -0.25, '{0:.2f}%'.format(targetDist.loc['M']), color=colors[2], 
         fontsize=16, ha='center', weight='bold', va='bottom')
plt.text(3.43, -0.28, labels[1], color=colors[0], fontsize=10, ha='center', va='top', weight='bold')
plt.show()

In [None]:
# Pie chart
# Fix the class order explicitly so labels, values and colors cannot drift apart,
# and use the same color per class as the other plots (benign = colors[1],
# malignant = colors[2]).
diagnosis_counts = df['diagnosis'].value_counts().reindex(['B', 'M'])
pie_trace = go.Pie(labels = ['Benign','Malignant'], values = diagnosis_counts.to_list(),
                   textfont=dict(size=18),
                   marker = dict(colors=[colors[1], colors[2]],
                                 line=dict(color='#000000', width=2)))
layout = dict(title = 'Breast Cancer Diagnosis Distributions')
fig = dict(data = [pie_trace], layout=layout)
py.iplot(fig)

In [None]:
def barplot1(start, end, n_cols=3):
    """Plot the distribution of every column in ``df.columns[start:end]``.

    Each column gets its own subplot (histogram with a KDE overlay) on a grid
    that is ``n_cols`` wide and as tall as needed.

    Parameters
    ----------
    start, end : int
        Positional slice bounds into ``df.columns`` (``end`` is exclusive).
    n_cols : int, default 3
        Number of subplot columns. With the default and 15 selected columns
        this gives a 5x3 grid.
    """
    selected = df.columns[start:end]
    # Ceiling division: enough rows to hold every selected column.
    n_rows = max(1, -(-len(selected) // n_cols))

    plt.figure(figsize = (20, 20))
    for position, column in enumerate(selected, start=1):
        ax = plt.subplot(n_rows, n_cols, position)
        # `sns.distplot` is deprecated; this is seaborn's documented replacement
        # for a density-normalized histogram with a KDE curve.
        sns.histplot(df[column], kde=True, stat='density',
                     kde_kws=dict(cut=3), ax=ax)
        ax.set(xticklabels=[], yticklabels=[], ylabel=None)
        xlabel = ' '.join(w.capitalize() for w in column.split('_'))
        ax.set_xlabel(xlabel, fontsize = 15, fontweight='heavy')

    plt.suptitle(f'Distribution plot for col {start} to col {end-1}', 
                 fontweight='heavy',color=colors[0],fontsize=20) 
    plt.tight_layout()
    plt.show()
    

In [None]:
barplot1(1,16)

In [None]:
barplot1(16,31)

Based on the description of the dataset and the distribution plots for each feature, we can see that the predictors are on very different scales. Therefore, before analyzing the individual features further, we standardize the dataset to make the variables more comparable.

In [None]:
# Standarization of dataset
df_S = df.copy()
df_S[df_S.columns[1:]] = preprocessing.scale(df[df.columns[1:]])
features = df_S['diagnosis']
data = df_S[df_S.columns[1:]]

In [None]:
# Boxplot
def cancer_boxplot(start,end,label=''):
    """Box plots of the standardized features ``data.iloc[:, start:end]`` by diagnosis.

    ``label`` is inserted into the figure title. The function reads the
    notebook-level ``features`` (the diagnosis column) and ``data`` frames.
    """
    # Long format: one row per (diagnosis, feature name, value)
    wide = pd.concat([features, data.iloc[:, start:end]], axis=1)
    long_form = pd.melt(wide, id_vars="diagnosis", var_name="features", value_name='value')

    # Plot
    fig, ax = plt.subplots(figsize=(18, 8))
    plot = sns.boxplot(data=long_form, x="features", y="value", hue="diagnosis",
                       palette=[colors[2], colors[1]], width=0.4)

    # Adjust Graph
    plt.xticks(rotation=45)
    plt.suptitle(f'Descriptive Box Plot by Diagnosis Groups ({label})',
                 fontweight='heavy', ha='center', y=0.93,
                 fontsize=18, color=colors[0])
    plt.grid(axis='y')
    plot.set(xlabel=None, ylabel=None)
    plt.legend(title='Diagnosis')
    plt.show()

### Features Distribution

In [None]:
cancer_boxplot(0,10,'plot 1')

In [None]:
cancer_boxplot(10,20,'plot 2')

In [None]:
cancer_boxplot(20,31,'plot 3')

In [None]:
# Violinplot
# One split violin per standardized feature, comparing the two diagnosis groups.
# A constant helper column gives every row the same x category, so each subplot
# holds a single violin that `split=True` divides by diagnosis.
# NOTE: this adds the column `all` to `df_S` in place.
df_S["all"] = ""
fig = plt.figure(figsize = (15, 25))
sub_plot = 0

# 6 x 5 grid: room for up to 30 feature columns of `data`
for col in data.columns:
    plt.subplot(6, 5, sub_plot+1)
    
    # NOTE(review): 'solidblack' does not look like a standard matplotlib color
    # name - confirm this argument is accepted by the installed seaborn version.
    plot=sns.violinplot(x='all', y=col, hue="diagnosis", data=df_S,palette=[colors[2],colors[1]], 
                        split=True, edgecolor = 'solidblack',linewidth = 2,inner="quart")
    # Hide the per-subplot legend and the tick labels
    plt.legend([],[], frameon=False)
    plot.set(xticklabels=[])
    plot.set(yticklabels=[])
    plot.set(ylabel=None)
    # 'radius_mean' -> 'Radius Mean'
    xlabel = ' '.join([w.capitalize() for w in col.split('_') ])
    plot.set_xlabel(xlabel, fontsize = 15, fontweight='heavy')
    sub_plot += 1

plt.tight_layout()

# Title
plt.suptitle('Breast Cancer Diagnosis Distributions', fontweight='heavy', 
             y=1.04,ha='center', fontsize=21, color=colors[0]) 
plt.show()

The distributions of malignant and benign tumors across all the cell image features show that, in most cases, malignant tumors are more likely to have larger and faster-growing cells than the original cells.

Since there are lots of features in the dataset, we need to delete some variables. 
The first step is to use a heatmap to inspect the correlations among the columns. 

In [None]:
# Interactive heatmap
# `diagnosis` is still a string column at this point, so restrict the
# correlation matrix to the numeric feature columns; recent pandas versions
# raise instead of silently skipping non-numeric columns.
df_corr_round = df.select_dtypes(include='number').corr().round(2)
fig = px.imshow(df_corr_round, text_auto=True,color_continuous_scale='Viridis')
fig.layout.height = 1000
fig.layout.width = 1000
fig.update_layout(dragmode=False)
fig.show()

Based on the heatmap, we can see that some pairs of variables are highly correlated, such as `radius_worst` and `perimeter_mean`, which means that there is multicollinearity in the dataset. To address this, we will apply a correlation filter in later steps. Since there are 30 features in our dataset, we will also use PCA and the correlation filter to reduce the dimensionality of the dataset and avoid overfitting. 

### Multivariate Analysis
#### Check the correlation between few features by pair

We first plot top 4 pairs where their correlations between features are positive and high. 

In [None]:
palette = {'M': colors[2], 'B': colors[1]}

def scatter_plot(lst,title):
    """Draw scatter plots of feature pairs on a 2 x 2 grid, colored by diagnosis.

    Parameters
    ----------
    lst : list of [x_column, y_column] pairs
        Column names of the notebook-level ``df``; the 2 x 2 grid has room for
        at most four pairs.
    title : str
        Figure-level title.

    Points are colored through the notebook-level ``palette`` dict, whose keys
    are 'M' and 'B'.
    """
    plt.figure(figsize = (10, 8))
    sub_plot = 0

    for cols in lst:
        ax = plt.subplot(2, 2, sub_plot+1)
        plot = sns.scatterplot(x = df[cols[0]], y = df[cols[1]], hue = 'diagnosis', 
                    data = df, palette = palette)
        # Thin black frame around the axes background
        plot.patch.set_edgecolor('black')  
        plot.patch.set_linewidth('1')
        # Hide the per-subplot legend; one shared legend is added after the loop
        plt.legend([],[], frameon=False)
        # 'radius_mean' -> 'Radius Mean'
        xlabel = ' '.join([w.capitalize() for w in cols[0].split('_') ])
        ylabel = ' '.join([w.capitalize() for w in cols[1].split('_') ])
        plot.set_xlabel(xlabel, fontweight='heavy')
        plot.set_ylabel(ylabel, fontweight='heavy')
        sub_plot += 1

    plt.tight_layout()

    # Add global legend
    # Handles come from the last subplot drawn; `loc` is an (x, y) position
    # given relative to that subplot's axes.
    handles, labels = ax.get_legend_handles_labels()
    plt.legend(handles, labels,loc=(1.05, 2),title='Diagnosis')

    # Add title
    plt.suptitle(title, fontweight='heavy',y=1.04,ha='center', fontsize=15, color=colors[0]) 
    plt.show()

In [None]:
positive_corr = [['perimeter_mean','radius_worst'],
['area_mean','radius_worst'],
['area_worst','radius_worst'],
['texture_mean','texture_worst']]

In [None]:
scatter_plot(positive_corr,'Positive Correlated Features')

We then plot top 4 pairs where their correlations between features are positive and low. 

In [None]:
positive_uncorr = [['smoothness_mean','texture_mean'],
['texture_worst','symmetry_mean'],
['symmetry_se','texture_worst'],
['radius_mean','fractal_dimension_worst']]

In [None]:
scatter_plot(positive_uncorr,'Positive Uncorrelated Features')

We then plot top 4 pairs where their correlations between features are negative and high. 

In [None]:
negative_corr = [['area_mean','fractal_dimension_mean'],
['smoothness_se','perimeter_mean'],
['radius_mean','fractal_dimension_mean'],
['area_mean','smoothness_se']]

In [None]:
scatter_plot(negative_corr,'Negative Correlated Features')

We can see that several cancer cell features are highly correlated, which means that we have multicollinear features. These features could affect the predictions in the later model-training process.

## <div style="font-family: Trebuchet MS; background-color: #f1acab; color: #FFFFFF; padding: 12px; line-height: 1.5; border-left: solid #d68181 4px; border-radius: 5px"> Feature Selections </div>


### Binarize labels

In [None]:
lb = preprocessing.LabelBinarizer()
df['diagnosis'] = lb.fit_transform(df['diagnosis'])
# Check the distribution
df['diagnosis'].value_counts(normalize=True)

### Original Standardized Dataset

In [None]:
# Split data 8:2
X_train, X_test, y_train, y_test = train_test_split(df.drop(['diagnosis'], axis=1), 
                                                    df.diagnosis, test_size=0.2, random_state=14)

# Standardization
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(color.BOLD_CYAN_COLOR + '\nTraining data shape:'+ color.END, X_train.shape)
print(color.BOLD_CYAN_COLOR + 'Testing data shape:'+ color.END, X_test.shape)

### PCA

In [None]:
# Screen plot of PCA
# Performing the principal component analysis
pca = PCA(n_components=10)
pca.fit_transform(X_train)
percent_var = np.round(pca.explained_variance_ratio_*100, decimals=1)   
labels = ['PC' + str(p) for p in range(1,len(percent_var)+1)]

In [None]:
PCA_df = pd.DataFrame(
    {'labels': labels,
     'percent_var': percent_var})

In [None]:
# Scree plot: bars and a line both show the percentage of variance explained
# by each principal component.
fig, ax1 = plt.subplots(figsize=(12,6))

plot1 = sns.barplot(data=PCA_df, x="labels", y="percent_var",color=colors[1],ci=None, ax=ax1)
plot1.patch.set_edgecolor('black')
plot1.patch.set_linewidth(1)
ax1.set_xlabel('Principal Components')
ax1.set_ylabel('Percentage of Variation explained')
# Derive the component count from the data so the title cannot disagree with
# the fitted PCA, and format the cumulative percentage to one decimal.
ax1.set_title(f'PCA Scree Plot: First {len(percent_var)} Components ({percent_var.sum():.1f}%)',
              fontweight='heavy')

# Overlay the same values as a line on a twin axis, with its own tick labels hidden
ax2 = ax1.twinx()
plot2 = sns.lineplot(data=PCA_df, x="labels", y="percent_var",marker='o',color=colors[0], ax=ax2)
plot2.set(yticklabels=[])
plot2.set(ylabel=None)
plt.show()

Let us draw the PCA plots with 2 and 3 components.

#### PCA Plot (n = 2)

In [None]:
pca = PCA(n_components=2)
# Standardize before projecting: PCA is scale-sensitive, and the raw features
# are on very different scales, so the largest-valued features would otherwise
# dominate the components. This matches the standardized data used for the
# scree plot and the models.
features_scaled = preprocessing.StandardScaler().fit_transform(df[df.columns[1:]])
components = pca.fit_transform(features_scaled)
fig = px.scatter(components, x=0, y=1, color=df['diagnosis'],
                labels={'0': 'PC 1', '1': 'PC 2'})
fig.update_coloraxes(showscale=False)
fig.update_traces(marker=dict(size=10,line=dict(width=1.5,color=colors[0])),
                  selector=dict(mode='markers'))
fig.show()

#### PCA Plot (n = 3)

In [None]:
pca = PCA(n_components=3)
# Standardize before projecting: PCA is scale-sensitive, and the raw features
# are on very different scales, so the largest-valued features would otherwise
# dominate the components.
features_scaled = preprocessing.StandardScaler().fit_transform(df[df.columns[1:]])
components = pca.fit_transform(features_scaled)

fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=df['diagnosis'],
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'})
fig.update_coloraxes(showscale=False)
fig.update_traces(marker=dict(size=5,line=dict(width=2,color=colors[0])),
                  selector=dict(mode='markers'))
fig.show()

After applying PCA and calculating the percentage of explained variance (PEV) of the first ten components, we find that the first 5 PCs can explain 85% of the variance. Therefore, we will focus on the first 5 PCs in this project.

In [None]:
# PCA
# Fit the projection on the training data only, then apply that same projection
# to the test data. Calling `fit_transform` on the test set would learn a
# different set of components from the test data, so train and test features
# would no longer live in the same space.
pca = PCA(n_components=5)
X_train_PCA = pca.fit_transform(X_train)
X_test_PCA = pca.transform(X_test)

print(color.BOLD_CYAN_COLOR + '\nTraining data shape:'+ color.END, X_train_PCA.shape)
print(color.BOLD_CYAN_COLOR + 'Testing data shape:'+ color.END, X_test_PCA.shape)

### Feature Selections BASED ON CORRELATIONS

Based on the heatmap above we can see that some columns are highly correlated to each others. 

We dropped a total of ten features: “area-mean, perimeter-mean, radius-worst, area-worst, perimeter-worst, texture-worst, concavity-mean, perimeter-se, area-se, compactness-worst.”

In [None]:
# Correlations
# List every pair of distinct features whose absolute correlation is >= 0.85.
# Only the strict upper triangle of the matrix is visited, so each unordered
# pair is reported exactly once and the diagonal is skipped. (De-duplicating
# the flattened matrix by *value* would also discard different pairs that
# happen to share the same correlation.)
corr = df[df.columns[1:]].corr().abs()
row_idx, col_idx = np.triu_indices(len(corr.columns), k=1)
for i, j in zip(row_idx, col_idx):
    value = corr.iat[i, j]
    if value >= 0.85:
        col1 = ' '.join(w.capitalize() for w in corr.index[i].split('_'))
        col2 = ' '.join(w.capitalize() for w in corr.columns[j].split('_'))
        print(color.BOLD_CYAN_COLOR + f'\n    {col1} and {col2}:'+ color.END,
             round(value,3))

**Based on the above list, we will drop:**
1. `perimeter_mean`
2. `area_mean`
3. `radius_worst`
4. `area_worst`
5. `perimeter_worst`
6. `texture_worst`
7. `concavity_mean`
8. `perimeter_se`
9. `area_se`
10. `compactness_worst`

In [None]:
# Drop list
drop_lst = ['perimeter_mean','area_mean','radius_worst','area_worst',
           'perimeter_worst','texture_worst','concavity_mean','perimeter_se',
           'area_se','compactness_worst']    

In [None]:
# Interactive heatmap
df1 = df.drop(drop_lst,axis = 1)
df_corr_round = df1.corr().round(2)
fig = px.imshow(df_corr_round, text_auto=True, color_continuous_scale='Viridis')
fig.layout.height = 1000
fig.layout.width = 1000
fig.update_layout(dragmode=False)
fig.update_xaxes(tickangle=90)
fig.show()

In [None]:
# corr datasets
# Map each dropped feature name to its column position in the feature matrices
# (the features are every column of `df` after the first one).
column_lst = df.columns[1:].tolist()
index_lst = [column_lst.index(name) for name in drop_lst]

# Remove those columns from the standardized train / test arrays
X_train_CORR = np.delete(X_train, index_lst, axis=1)
X_text_CORR = np.delete(X_test, index_lst, axis=1)

print(color.BOLD_CYAN_COLOR + '\nTraining data shape:'+ color.END, X_train_CORR.shape)
print(color.BOLD_CYAN_COLOR + 'Testing data shape:'+ color.END, X_text_CORR.shape)

## <div style="font-family: Trebuchet MS; background-color: #f1acab; color: #FFFFFF; padding: 12px; line-height: 1.5;border-left: solid #d68181 4px; border-radius: 5px">Models</div>

<div class="warning" style='padding:0.1em; background-color:#f5dada; color:#3c695bb; border-top: solid #d68181 5px; border-radius: 3px; padding:1em;' >
<span>
<p style='margin-top:1em; text-align:center'>
<p style='margin-left:1em;'>

**We will use three datasets to train models:**

1. Original: `X_train`, `X_test`
2. After PCA: `X_train_PCA`, `X_test_PCA`
3. After correlation filter: `X_train_CORR`, `X_text_CORR`
</p>
<p></p></span>
</div>


### Model Training

In [None]:
# Models dict
models = {}

In [None]:
# --- Performance Evaluation Function --- #
# Confusion matrix, ROC AUC

def dataset_type(data_type):
    """Return the ``(train, test)`` feature matrices for a dataset variant.

    Parameters
    ----------
    data_type : str
        ``'ORI'`` for the standardized original features, ``'PCA'`` for the
        PCA-projected features, ``'CORR'`` for the correlation-filtered features.

    Returns
    -------
    tuple
        ``(train_data, test_data)`` taken from the notebook-level variables.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the three supported names.
    """
    if data_type == 'PCA':
        return (X_train_PCA, X_test_PCA)
    if data_type == 'CORR':
        return (X_train_CORR, X_text_CORR)
    if data_type == 'ORI':
        return (X_train, X_test)
    # Fail with a clear message instead of an UnboundLocalError further down.
    raise ValueError(
        f"Unknown data_type {data_type!r}; expected one of 'ORI', 'PCA', 'CORR'."
    )


def evaluate_models(model_name, model_fun, data_type='ORI'):
    """Fit a classifier on one dataset variant, report it and plot diagnostics.

    Parameters
    ----------
    model_name : str
        Display name; also used (with ``data_type``) as the key in ``models``.
    model_fun : estimator
        An unfitted scikit-learn style classifier instance.
    data_type : str, default 'ORI'
        Dataset variant passed to ``dataset_type``.

    Side effects: prints the fitted model, the classification report and the
    accuracy; stores the test accuracy in the notebook-level ``models`` dict;
    draws a confusion matrix and a ROC AUC plot side by side.
    """
    classifier = model_fun
    train_data,test_data = dataset_type(data_type)
    
    print(color.BOLD_RED_COLOR + f'\n{model_name} - {data_type}: '+ color.END)
    
    # Fitting models
    print(color.BOLD_CYAN_COLOR + '\n.:. Model Fittings .:. '+ color.END)
    classifier.fit(train_data, y_train)
    print(classifier)
    y_pred = classifier.predict(test_data)
    
    # Classification Report
    print(color.BOLD_CYAN_COLOR + '\n.:. Classification Report .:. '+ color.END)
    print(classification_report(y_test, y_pred))
    
    print(color.BOLD_CYAN_COLOR + '\n.:. Models Evaluation Graphs .:. '+ color.END)
    
    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(color.BOLD_GREEN_COLOR + '      Accuracy: %f' % accuracy + color.END,'\n')
    
    # Add accuracy to the models dict
    models[model_name+'_'+data_type] = accuracy
    
    # Figure settings
    set_palette('sns_pastel')
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11,5))
    
    # Confusion Matrix
    conf_matrix = ConfusionMatrix(classifier, ax=ax1, cmap='bugn', classes=['B', 'M'],
                                  title=f'{model_name} Confusion Matrix')
    conf_matrix.fit(train_data, y_train)
    conf_matrix.score(test_data, y_test)
    conf_matrix.finalize()
    
    # ROC AUC
    def draw_roc(**extra):
        """Fit, score and draw the ROC AUC visualizer on the right-hand axes."""
        logrocauc = ROCAUC(classifier, classes=['B', 'M'], cmap='sns_pastel',
                           ax=ax2, title=f'{model_name} ROC AUC', **extra)
        logrocauc.fit(train_data, y_train)
        logrocauc.score(test_data, y_test)
        logrocauc.finalize()

    try:
        draw_roc()
    except Exception:
        # Retry in binary mode when the default per-class mode fails for this
        # model. Catching `Exception` (not a bare `except`) keeps
        # KeyboardInterrupt / SystemExit working while this cell runs.
        ax2.clear()
        draw_roc(binary=True)
    
    plt.tight_layout()

$\,$

In [None]:
# --- Logistic Models Function --- #
def log_reg():
    """Return a fresh, unfitted logistic regression classifier (default settings)."""
    return LogisticRegression()

In [None]:
# Logistic Modles - Original
evaluate_models('Logistic Regression',log_reg());

In [None]:
# Logistic Modles - PCA
evaluate_models('Logistic Regression',log_reg(),'PCA');

In [None]:
# Logistic Models - Correlation Filter
evaluate_models('Logistic Regression',log_reg(),'CORR');

$\,$

In [None]:
# --- KNN Models Function --- #
def KNN_model():
    """Return a fresh, unfitted k-nearest-neighbors classifier (default settings)."""
    return KNeighborsClassifier()

In [None]:
# Original
evaluate_models('KNN', KNN_model());

In [None]:
# PCA
evaluate_models('KNN', KNN_model(),'PCA');

In [None]:
# High Correlation Filter
evaluate_models('KNN', KNN_model(),'CORR');

$\,$

In [None]:
# --- SVM (linear) Models Function --- #
def SVM_model_linear():
    """Return a fresh, unfitted support vector classifier with a linear kernel."""
    return SVC(kernel='linear')

In [None]:
# Original
evaluate_models('SVM (Linear)',SVM_model_linear());

In [None]:
# PCA
evaluate_models('SVM (Linear)', SVM_model_linear(),'PCA');

In [None]:
# High Correlation Filter
evaluate_models('SVM (Linear)', SVM_model_linear(),'CORR');

$\,$

In [None]:
# --- SVM (rbf) Models Function --- #
def SVM_model_rbf():
    """Return a fresh, unfitted support vector classifier with an RBF kernel."""
    return SVC(kernel='rbf')

In [None]:
# Original
evaluate_models('SVM (rbf)',SVM_model_rbf());

In [None]:
# PCA
evaluate_models('SVM (rbf)',SVM_model_rbf(),'PCA');

In [None]:
# High Correlation Filter
evaluate_models('SVM (rbf)',SVM_model_rbf(),'CORR');

$\,$

In [None]:
# --- ANN Models Function --- #
def ANN_model(data_type='ORI'):
    """Train a small feed-forward network on one dataset variant and report it.

    ``data_type`` selects the feature matrices through ``dataset_type``. The
    function prints the classification report and accuracy on the test split
    and stores the accuracy in the notebook-level ``models`` dict under
    ``'ANN_<data_type>'``.
    """
    features_train, features_test = dataset_type(data_type)

    # Two hidden ReLU layers followed by a single sigmoid output unit
    network = Sequential([
        Dense(16, activation='relu', input_dim=features_train.shape[1]),
        Dense(15, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    network.fit(features_train, y_train, batch_size=10, epochs=100)
    probabilities = network.predict(features_test)

    # Threshold the predicted probabilities at 0.5 to obtain class labels
    y_pred = [1 if (row >= 0.5).all() else 0 for row in probabilities]

    # Model evaluation
    print(color.BOLD_CYAN_COLOR + '\n.:. Classification Report .:. '+ color.END)
    print(classification_report(y_test, y_pred))
    accuracy = accuracy_score(y_test, y_pred)
    print(color.BOLD_GREEN_COLOR + '      Accuracy: %f' % accuracy + color.END,'\n')
    models[f'ANN_{data_type}'] = accuracy

In [None]:
# Original
ANN_model('ORI')

In [None]:
# PCA
ANN_model('PCA')

In [None]:
# High Correlation Filter
ANN_model('CORR')

### Model Comparisons

In [None]:
# return to dataframe
models_df = pd.DataFrame(models.items())
models_df.columns = ['model','accuracy']
models_df['accuracy'] = round(models_df['accuracy']*100,3)
models_df

In [None]:
# !pip install matplotlib==3.4.1 if has error in bar_label
# One panel per model family, each comparing accuracy on the ORI / PCA / CORR
# datasets. The best bar(s) in a panel use that panel's highlight color and the
# others are grey.
most_colors = ['#ED5564','#4FC1E8','#A0D568','#FFCE54','#AC92EB']
# Row ranges of `models_df` for each model family.
# NOTE(review): assumes 15 rows ordered as 5 models x (ORI, PCA, CORR), i.e.
# every evaluation cell above was run once, in order - confirm before relying on it.
model_lst = [[0,3],[3,6],[6,9],[9,12],[12,15]]
plt.figure(figsize = (12, 5))
sub_plot = 0

for index,val in enumerate(model_lst):
    ax = plt.subplot(1, 5, sub_plot+1)
    df_model = models_df.iloc[val[0]:val[1],]
    values = df_model['accuracy'].to_list()
    # Grey for every bar below the panel maximum
    clrs = ['#D3D3D3' if (x < max(values)) else most_colors[index] for x in values ]
    plot = sns.barplot(data=df_model, x="model", y="accuracy",color=colors[1],palette=clrs)
    plot.set(xticklabels=['ORI','PCA','CORR'])
    plot.set(yticklabels=[])
    plot.patch.set_edgecolor('black')  
    plot.patch.set_linewidth('1')
    plot.set(ylabel=None)
    # Print the accuracy value on top of each bar
    plot.bar_label(ax.containers[0], fmt='%.2f%%')
    # Panel label = model name, i.e. the part of the key before the first '_'
    xlabel = ''.join(df_model['model'].to_list()[1].partition('_')[0:1])
    plot.set_xlabel(xlabel, fontsize = 12, fontweight='heavy')
    sub_plot += 1
    plt.suptitle(f'Models Accuracy', fontweight='heavy',color=colors[0],fontsize=16) 
plt.tight_layout()
plt.show()

# <div style="font-family: Trebuchet MS; background-color: #D97271; color: #FFFFFF; padding: 12px; line-height: 1.5;"> Dataset 2: Web Scrape of ScienceDirect </div>

<div class="warning" style='background-color:#f5dada; color: #000000; border-left: solid #d68181 4px; border-radius: 4px; padding:0.7em;'>
<span>
<p style='margin-top:1em'>
<b>Workflow:</b></p>
<p style='margin-left:1em;'>

1. Web scraping from Elsevier using an API key under an institutional VPN
2. Simple NLP implementation
3. Results and Discussion

</p>
<p style='margin-bottom:1em; margin-right:1em; text-align:right; font-family:Georgia'> <b></b> <i></i>
</p></span>
</div>

## <div style="font-family: Trebuchet MS; background-color: #f1acab; color: #FFFFFF; padding: 12px; line-height: 1.5; border-left: solid #d68181 4px; border-radius: 5px"> Web Scrape - Raw Data </div>

In [None]:
# API keys
# Credentials must not be stored in the notebook. Read them from environment
# variables instead; set ELSEVIER_API_KEY_1 / ELSEVIER_API_KEY_2 before
# starting the kernel. A missing variable yields '' here, which the `api_key`
# setter of the scraper rejects with a clear error message.
# NOTE: keys that were previously committed in this notebook should be revoked
# in the Elsevier Developer Portal.
api1 = os.environ.get('ELSEVIER_API_KEY_1', '')
api2 = os.environ.get('ELSEVIER_API_KEY_2', '')

In [None]:
# Functions
def lst_chunks(lst, n):
    """Yield consecutive slices of ``lst`` containing at most ``n`` items each."""
    starts = range(0, len(lst), n)
    yield from (lst[begin:begin + n] for begin in starts)

# Class
class ScienceDirectScrape():
    """Simple client that collects ScienceDirect search results via the Elsevier API.

    Typical use: create an instance with a query, assign ``api_key``, call
    ``start_search()`` to fill ``df1`` with the DOIs of the hits, then call
    ``extract_doi()`` to fill ``df2`` with per-article metadata.
    """
    SEARCH = "https://api.elsevier.com/content/search/"
    ARTICLE = "https://api.elsevier.com/content/article/doi/"
    # Pause (seconds) before each follow-up request
    REQUEST_DELAY = 1.25
    # Columns of `df2`
    ARTICLE_COLUMNS = ['title', 'author', 'publish_date', 'keywords', 'abstract', 'link']
    
    def __init__(self, query):
        # Local import so `quote` is available regardless of how `urllib`
        # was imported at the top of the notebook.
        from urllib.parse import quote

        self.query = query
        # Stored as given (plain text); must be set through the `api_key`
        # property before any request is made.
        self._api_key = ''
        # Number of hits reported by the API; None until `start_search` has run.
        self._total_result = None
        self.search_type = 'sciencedirect'
        self.search_url = self.SEARCH + self.search_type + '?query=' + quote(self.query)
        self.df1 = pd.DataFrame()
        self.df2 = pd.DataFrame()

    @property
    def api_key(self):
        """The API key sent with every request."""
        return self._api_key

    @api_key.setter
    def api_key(self, api_key):
        if api_key is None or len(api_key) != 32:
            raise ValueError('Invalid API Key. Please Check in Elsevier Developer Portal.')
        self._api_key = api_key
    
    @property
    def total_result(self):
        """Total number of hits reported by the last search (None before a search)."""
        return self._total_result

    def _new_client(self, connect_timeout):
        """Create an HTTP client carrying the API key and JSON accept headers."""
        headers = {"X-ELS-APIKey": self.api_key, "Accept": 'application/json'}
        timeout = httpx.Timeout(10.0, connect=connect_timeout)
        return httpx.Client(timeout=timeout, headers=headers)

    @staticmethod
    def _append_records(frame, records, columns):
        """Return ``frame`` extended by ``records`` (a list of dicts), built in one step."""
        new_rows = pd.DataFrame(records, columns=columns)
        if frame.empty:
            return new_rows
        if new_rows.empty:
            return frame
        return pd.concat([frame, new_rows], ignore_index=True)
    
    def start_search(self):
        """Run the search and append the DOI of every hit to ``self.df1``.

        Result pages are followed through the 'next' link of each response.
        Paging stops when there is no next page, when a page is empty, when
        the reported total has been reached, or when a follow-up request
        fails; DOIs collected up to that point are kept.

        Raises
        ------
        httpx.HTTPError
            If the very first request fails or returns an error status.
        """
        print(color.BOLD_GREEN_COLOR + '\n Start to scrape the results:'+ color.END)
        records = []
        try:
            with self._new_client(connect_timeout=30.0) as client:
                # First page: also tells us how many hits there are in total
                response = client.get(self.search_url)
                print(response)
                response.raise_for_status()
                results = json.loads(response.text)['search-results']
                self._total_result = int(results['opensearch:totalResults'])

                fetched = 0
                pbar = tqdm(total=self.total_result)
                try:
                    while True:
                        entries = results.get('entry', [])
                        # Entries without a DOI (e.g. an "empty result" marker) are skipped
                        records.extend({'doi': entry['prism:doi']}
                                       for entry in entries if 'prism:doi' in entry)
                        fetched += len(entries)
                        pbar.update(len(entries))

                        next_url = next((link['@href'] for link in results.get('link', [])
                                         if link.get('@ref') == 'next'), None)
                        if not entries or next_url is None or fetched >= self.total_result:
                            break

                        time.sleep(self.REQUEST_DELAY)
                        try:
                            response = client.get(next_url)
                            response.raise_for_status()
                            results = json.loads(response.text)['search-results']
                        except (httpx.HTTPError, KeyError, ValueError) as err:
                            # Stop instead of retrying the same page forever
                            print(f' Stopped paging early: {err!r}')
                            break
                finally:
                    pbar.close()
        finally:
            # Keep whatever was collected, even if the run is interrupted
            self.df1 = self._append_records(self.df1, records, ['doi'])

        print(color.BOLD_RED_COLOR + '\n Complete to scrape the results'+ color.END)

    def start_scrape_doi(self,doi_lst,index):
        """Fetch metadata for each DOI in ``doi_lst`` and append it to ``self.df2``.

        ``index`` is only used in the progress messages. Articles whose request
        fails or whose response lacks one of the expected fields are skipped.
        """
        print(color.BOLD_GREEN_COLOR + f'\n Start to Find keywords and Abstracts - {index}:'+ color.END)

        records = []
        try:
            with self._new_client(connect_timeout=120.0) as client:
                for doi in tqdm(doi_lst):
                    url = self.ARTICLE + doi
                    time.sleep(self.REQUEST_DELAY)
                    try:
                        response = client.get(url)
                        coredata = json.loads(response.text)['full-text-retrieval-response']['coredata']
                        records.append({
                            'title': coredata['dc:title'],
                            'author': [x['$'] for x in coredata['dc:creator']],
                            'publish_date': coredata['prism:coverDate'],
                            'keywords': [x['$'] for x in coredata['dcterms:subject']],
                            'abstract': coredata['dc:description'],
                            'link': coredata['link'][1]['@href'],
                        })
                    except (httpx.HTTPError, KeyError, IndexError, TypeError, ValueError):
                        # Best effort: skip articles that cannot be fetched or parsed
                        continue
        finally:
            # Keep whatever was collected, even if the run is interrupted
            self.df2 = self._append_records(self.df2, records, self.ARTICLE_COLUMNS)
        print(color.BOLD_RED_COLOR + f'Completed - {index}\n'+ color.END)
    
    def extract_doi(self):
        """Fetch article metadata for every DOI in ``self.df1``, 200 DOIs per batch."""
        if 'doi' not in self.df1.columns or self.df1.empty:
            print(' No DOI available. Run start_search() first.')
            return

        doi_lst = self.df1['doi'].to_list()
        doi_lst_chunks = list(lst_chunks(doi_lst,200))
        print(color.BOLD + f' Total: {len(doi_lst_chunks)} times'+ color.END)
        for index,lst in enumerate(doi_lst_chunks):
            self.start_scrape_doi(lst,index)
            print('     Time sleep for 5 seconds....')
            time.sleep(5)
        print(color.BOLD_RED_COLOR + f'Completed\n'+ color.END) 
        

In [None]:
# Web scrape
search = ScienceDirectScrape('wisconsin-breast-cancer-machine-learning-model')
search.api_key = api2
search.start_search()

In [None]:
search.api_key = api1
search.extract_doi()

In [None]:
# Convert to xlsx file
df = search.df2
df.to_excel("output.xlsx") 

## <div style="font-family: Trebuchet MS; background-color: #f1acab; color: #FFFFFF; padding: 12px; line-height: 1.5; border-left: solid #d68181 4px; border-radius: 5px"> Simple NLP Implementation</div>

In [None]:
# Set up
nltk.download('stopwords')
nltk.download('wordnet') 
nltk.download('omw-1.4')

stop_words = set(stopwords.words("english"))
!python -m spacy download en_core_web_lg

In [None]:
"""Code reference: 
 - https://www.kaggle.com/code/ananyaroy1011/automated-keyword-extraction-from-articles-nlp
"""

def corpus(df):
    """Clean every abstract in ``df`` into a lemmatised, stop-word-free string.

    For each abstract, every non-letter character is replaced by a space, the
    text is lower-cased and split on whitespace, words contained in the
    module-level ``stop_words`` set are removed and the remaining words are
    lemmatised with ``WordNetLemmatizer``.

    Args:
        df: DataFrame with an ``abstract`` column holding strings.

    Returns:
        list[str]: One cleaned, space-joined string per abstract, in row order.

    Raises:
        TypeError: If an abstract is not a string (raised by ``re.sub``).
    """
    # A single lemmatiser is enough; it does not need to be rebuilt per row.
    lem = WordNetLemmatizer()
    cleaned = []
    # Materialise the column once instead of once per row.
    for abstract in df['abstract'].to_list():
        # After this substitution only ASCII letters and spaces remain, so the
        # former tag-stripping and digit/punctuation regexes could not change
        # the resulting words; a plain whitespace split gives the same tokens.
        words = re.sub('[^a-zA-Z]', ' ', abstract).lower().split()
        kept = [lem.lemmatize(word) for word in words if word not in stop_words]
        cleaned.append(" ".join(kept))
    # Return the cleaned texts (the function used to return itself by mistake).
    return cleaned

# Frequently occurring words function
def freq_words(corpuslst, n=20, num_words = 1, figsize = (13,8)):
    """Display and plot the ``n`` most frequent terms of a cleaned corpus.

    Args:
        corpuslst: Iterable of text documents.
        n: Number of top terms to show.
        num_words: Length of the n-grams to count; 1 counts single words with
            an unrestricted vocabulary, any other value counts
            ``num_words``-grams capped at 2000 features.
        figsize: Figure size passed to seaborn's rc settings.

    The top terms are shown with ``display`` and drawn as a bar plot; nothing
    is returned.
    """
    if num_words == 1:
        vectorizer = CountVectorizer()
    else:
        vectorizer = CountVectorizer(ngram_range=(num_words, num_words), max_features=2000)
    vectorizer = vectorizer.fit(corpuslst)

    # Column sums of the document-term matrix: total count of each term.
    totals = vectorizer.transform(corpuslst).sum(axis=0)

    # Pair every vocabulary term with its total and rank by count, descending.
    ranked = [(term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    top_words = ranked[:n]
    display(top_words)
    print('\n')

    # Bar plot of the top terms with vertical tick labels.
    top_df = pd.DataFrame(top_words)
    top_df.columns = ["Word", "Freq"]
    sns.set(rc={'figure.figsize': figsize})
    axis = sns.barplot(x="Word", y="Freq", data=top_df)
    axis.set_xticklabels(axis.get_xticklabels(), rotation=90)


### Identification

In [None]:
# (rows, columns) of the scraped results before any screening.
df.shape

The raw data obtained by web scraping with the keyword `wisconsin breast cancer machine learning model` has the shape (rows, columns) shown in the output above.

### Screening
#### Abstracts

In [None]:
# Count each abstract's word counts
df['word_count'] = df['abstract'].apply(lambda x: len(str(x).split(" ")))
df[['abstract','word_count']].head()

In [None]:
# Drop rows whose abstract is not a string (papers with a wrongly formatted abstract).
# A boolean mask is used instead of dropping by enumerate() position: positions
# only equal the index labels while the index is the default 0..n-1 range, so
# the positional version removed the wrong rows (or failed) when re-run.
has_text_abstract = df['abstract'].map(lambda val: isinstance(val, str)).astype(bool)
df = df[has_text_abstract].copy()

To avoid bias, we apply strict rules when deciding whether a research paper is relevant to building models.

In [None]:
# Visualization of top n words for the abstracts
corpus_filter = corpus(df)
freq_words(corpus_filter, n=20, num_words=1)

In [None]:
# Top 20 two-word phrases (bigrams) of the cleaned abstracts.
freq_words(corpus_filter, n=20, num_words=2)

In [None]:
# Top 20 three-word phrases (trigrams) of the cleaned abstracts.
freq_words(corpus_filter, n=20, num_words=3)

In [None]:
def _abstract_mentions(text, terms):
    """Return True if `text` is a string containing any of `terms`, ignoring case."""
    # Non-string abstracts (e.g. missing values) never match; the previous
    # version skipped them through a bare `except`.
    if not isinstance(text, str):
        return False
    lowered = text.lower()
    return any(term in lowered for term in terms)


# first filter
# Keep papers whose abstract mentions machine or deep learning. The former
# 'machine learning algorithm' test is already covered by 'machine learning'.
first_filter_terms = ('deep learning', 'machine learning')
first_mask = df['abstract'].map(lambda text: _abstract_mentions(text, first_filter_terms))
# Boolean selection replaces row-by-row DataFrame.append, which no longer
# exists in pandas 2.x; .copy() lets later cells add columns without warnings.
new_df = df[first_mask.astype(bool)].copy()

# Second filter
# Keep papers that also talk about model evaluation or prediction. The old
# check used `val.lower` without calling it, which raised inside the `try`
# and silently dropped rows that only matched 'predict' or 'classifier'.
second_filter_terms = ('accuracy', 'performance', 'predict', 'classifier')
second_mask = new_df['abstract'].map(lambda text: _abstract_mentions(text, second_filter_terms))
new_df2 = new_df[second_mask.astype(bool)].copy()

In [None]:
# Visualization of top n words for the abstracts
corpus_filter = corpus(new_df2)
freq_words(corpus_filter, n=20, num_words=1)

In [None]:
# Top 20 bigrams of the filtered abstracts.
freq_words(corpus_filter, n=20, num_words=2)

In [None]:
# Top 20 trigrams of the filtered abstracts.
freq_words(corpus_filter, n=20, num_words=3)

In [None]:
# Numbers of published paper by year
new_df2['publish_date'] = pd.to_datetime(new_df2['publish_date'])
new_df2['publish_date'].dt.year.value_counts()

In [None]:
# Visualization


Some irrelevant papers were screened out using the abstracts. In the next step, we use the keywords for a second round of screening.

#### Keywords

In [None]:
# return keywords as a list
t = []
for i in new_df2['keywords'].to_list():
    t.extend(ast.literal_eval(i))

In [None]:
# keywords preprocessing
import copy
t1 = t.copy()

def check(s):
    """Return True if `s` contains a parenthesised run of word characters, e.g. "(SVM)"."""
    return re.search(r'\((\w+)\)', s) is not None

# Move each keyword that contains a parenthesised abbreviation to the front of
# `t1`. The loop walks the unchanged list `t` because `t1` is reordered inside
# it; `t1.index(i)` always picks the first occurrence of the keyword in `t1`.
for i in t:
    if check(i):
        t1.insert(0, t1.pop(t1.index(i)))

In [None]:
# abbre
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe('abbreviation_detector')

def replace_acronyms(text):
    """Return `text` with detected abbreviations replaced by their long forms.

    The text is run through the module-level `nlp` pipeline. For every entry in
    `doc._.abbreviations`, the token at the entry's start position is replaced
    by the entry's long form. The tokens are then joined with single spaces, so
    the original spacing of `text` is not preserved.
    """
    parsed = nlp(text)
    # Map token position -> long form; a later entry for the same position wins.
    long_forms = {abbr.start: str(abbr._.long_form) for abbr in parsed._.abbreviations}
    return " ".join(long_forms.get(position, token.text) for position, token in enumerate(parsed))

def convert_into_uppercase(a):
    """Regex replacement callback: keep group 1 and upper-case group 2 of match `a`."""
    lead, char = a.group(1, 2)
    return lead + char.upper()

In [None]:
# Upper-case the first character of every word in each keyword.
# (Raw string: "\s" in a normal string literal is an invalid escape sequence.)
t2 = [re.sub(r"(^|\s)(\S)", convert_into_uppercase, i) for i in t1]
# Join all keywords into one text so abbreviations are resolved across keywords,
# then split the result back into individual keywords.
t3 = ', '.join(t2)
t3 = t3.replace('+','')
t4 = replace_acronyms(t3).split(", ")

# Remove parenthesised parts (and anything after an unclosed parenthesis) plus
# surrounding whitespace. strip() replaces the former `head[:-1]`, which always
# cut the last character and so truncated keywords that had no trailing space
# (e.g. the final element of the split).
for index, val in enumerate(t4):
    result = re.sub(r'\([^)]*\)', '', val)
    head, sep, tail = result.partition('(')
    t4[index] = head.strip()

In [None]:
# Normalise every keyword: replace runs of digits/non-word characters by one
# space, lower-case, expand abbreviations, then lemmatise word by word.
# The lemmatiser is created once instead of once per word.
lemmatizer = nltk.WordNetLemmatizer()
lst = []
for keyword in t4:
    keyword = re.sub("(\\d|\\W)+", " ", keyword).lower()
    words = replace_acronyms(keyword).split(" ")
    lst.append(' '.join(lemmatizer.lemmatize(word) for word in words))

## <div style="font-family: Trebuchet MS; background-color: #f1acab; color: #FFFFFF; padding: 12px; line-height: 1.5; border-left: solid #d68181 4px; border-radius: 5px"> Results and Discussion </div>

# Conclusion

# References

**Code References:**
1. []()

**Literatures:**
1. []()