# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# sklearn
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# utils
import pickle
import random

In [2]:
# for reproducecibility
random_state = 60251014
random.seed(random_state)

# Data Loading
***
Load data Breast Cancer **Diagnostic** and **Prognosis** <br>

In [3]:
breast_cancer_diagnosis = pd.read_csv('../dataset/wdbc_data.csv')
breast_cancer_prognosis = pd.read_csv('../dataset/Prognosis Breast Cancer Dataset.csv', na_values='?')

# Data Cleaning

## Check number of missing values in each dataset

In [4]:
breast_cancer_prognosis.isnull().sum()

id                         0
outcome                    0
reccurence_time            0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
tumor_size                 0
lymph_node_sta

**Penjelasan** <br>
Pada dataset Breast Cancer Prognosis, ada 4 kasus missing values pada atribut *lymph_node_status*. Berdasarkan paper rujukan, maka baris yang memuat missing values dihapuskan

In [5]:
breast_cancer_diagnosis.isnull().sum()

radius_mean               0
perimeter_mean            0
area_mean                 0
compactness_mean          0
concavity_mean            0
concave_points_mean       0
radius_se                 0
perimeter_se              0
area_se                   0
radius_largest            0
perimeter_largest         0
area_largest              0
compactness_largest       0
concavity_largest         0
concave_points_largest    0
diagnosis                 0
dtype: int64

### Remove Missing Values (if any)

In [6]:
breast_cancer_prognosis.dropna(inplace=True)

In [7]:
breast_cancer_prognosis.isnull().sum().sum()

0

## Check number of rows distribution

In [8]:
breast_cancer_prognosis['outcome'].value_counts()

N    147
R     46
Name: outcome, dtype: int64

In [9]:
breast_cancer_diagnosis['diagnosis'].value_counts()

0    357
1    212
Name: diagnosis, dtype: int64

**Penjelasan** <br>
Pada Kedua dataset terjadi imbalanced class. Berdasarkan paper rujukan, imbalanced class dapat ditangani dengan upsampling atau downsampling. Namun sebelum disampling, dataset harus dibagi dulu ke dalam training set dan testing set. Hal ini mengacu dari paper rujukan yang menyebutkan menggunakan strategi *cost-sensitive learning*, salah satunya dengan menambahkan atau mengurangi jumlah instance dari data dengan label minoritas atau mayoritas

# Data Preprocessing
***
Split data into 80% training set and 20% test set

## Split data into training and testing set

In [10]:
X_diagnosis = breast_cancer_diagnosis.drop('diagnosis',axis=1)
y_diagnosis = breast_cancer_diagnosis['diagnosis'].copy()
X_prognosis = breast_cancer_prognosis.drop('outcome', axis=1)
y_prognosis = breast_cancer_prognosis['outcome'].copy()


# split into train and test
X_train_diagnosis, X_test_diagnosis, y_train_diagnosis, y_test_diagnosis = train_test_split(
    X_diagnosis, y_diagnosis, test_size=0.20, random_state=42)
X_train_prognosis, X_test_prognosis, y_train_prognosis, y_test_prognosis = train_test_split(
    X_prognosis, y_prognosis, test_size=0.20, random_state=42)

In [11]:
X_train_diagnosis.head()

Unnamed: 0,radius_mean,perimeter_mean,area_mean,compactness_mean,concavity_mean,concave_points_mean,radius_se,perimeter_se,area_se,radius_largest,perimeter_largest,area_largest,compactness_largest,concavity_largest,concave_points_largest
68,9.029,58.79,250.5,0.1413,0.313,0.04375,0.3274,1.885,17.67,10.31,65.5,324.7,0.4365,1.252,0.175
181,21.09,142.7,1311.0,0.2832,0.2487,0.1496,0.6298,4.414,81.46,26.68,176.5,2089.0,0.7584,0.678,0.2903
63,9.173,59.2,260.9,0.08751,0.05988,0.0218,0.4098,2.608,23.52,10.01,65.59,310.1,0.1678,0.1397,0.05087
248,10.65,68.01,347.0,0.07234,0.02379,0.01615,0.2497,1.497,16.64,12.25,77.98,455.7,0.1398,0.1125,0.06136
60,10.17,64.55,311.9,0.08061,0.01084,0.0129,0.5158,3.312,34.62,11.02,69.86,368.6,0.09866,0.02168,0.02579


In [12]:
X_train_prognosis.head()

Unnamed: 0,id,reccurence_time,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,tumor_size,lymph_node_status
84,877486,5,19.18,22.49,127.5,1148.0,0.08523,0.1428,0.1114,0.06772,...,166.4,1688.0,0.1322,0.5601,0.3865,0.1708,0.3193,0.09221,3.0,1.0
112,889719,37,17.19,22.07,111.6,928.3,0.09726,0.08995,0.09061,0.06527,...,140.5,1436.0,0.1558,0.2567,0.3889,0.1984,0.3216,0.0757,8.5,6.0
166,917627,39,24.63,21.6,165.5,1841.0,0.103,0.2106,0.231,0.1471,...,205.7,2642.0,0.1342,0.4188,0.4658,0.2475,0.3157,0.09671,1.0,1.0
37,857010,8,18.65,17.6,123.7,1076.0,0.1099,0.1686,0.1974,0.1009,...,150.6,1567.0,0.1679,0.509,0.7345,0.2378,0.3799,0.09185,1.8,0.0
139,90312,13,19.55,23.21,128.9,1174.0,0.101,0.1318,0.1856,0.1021,...,142.0,1313.0,0.1251,0.2414,0.3829,0.1825,0.2576,0.07602,4.0,13.0


In [13]:
y_train_diagnosis.head()

68     0
181    1
63     0
248    0
60     0
Name: diagnosis, dtype: int64

In [14]:
y_train_prognosis.head()

84     N
112    R
166    N
37     R
139    N
Name: outcome, dtype: object

Label pada prognosis masih dalam bentuk karakter, sehingga diperlukan Label Encoding untuk mengubah karakter menjadi numerikal

## Concat X and Y for sampling purposes

In [15]:
df_train_diagnosis = pd.concat([X_train_diagnosis, y_train_diagnosis], axis=1)
df_train_prognosis = pd.concat([X_train_prognosis, y_train_prognosis], axis=1)

## Downsampling the majority class

In [16]:
# print the number of rows of minority class
minor_sample_prognosis = df_train_prognosis[
    df_train_prognosis['outcome']=='R'
]['outcome'].count()
minor_sample_diagnosis = df_train_diagnosis[
    df_train_diagnosis['diagnosis']==1
]['diagnosis'].count()
print('Number of minority class of Prognosis = ', minor_sample_prognosis)
print('Number of minority class of Diagnosis = ', minor_sample_diagnosis)

Number of minority class of Prognosis =  35
Number of minority class of Diagnosis =  169


In [17]:
# random downsampling of majority class in each dataset
df_train_diagnosis_downsampling = pd.concat([
    df_train_diagnosis[
        (df_train_diagnosis['diagnosis'] == 0)
    ].sample(minor_sample_diagnosis, random_state=random_state),
    df_train_diagnosis[
        (df_train_diagnosis['diagnosis'] == 1)
    ]
],axis=0).sample(frac=1)
df_train_prognosis_downsampling = pd.concat([
    df_train_prognosis[
        (df_train_prognosis['outcome'] == 'N')
    ].sample(minor_sample_prognosis, random_state=random_state),
    df_train_prognosis[
        (df_train_prognosis['outcome'] == 'R')
    ]
],axis=0).sample(frac=1)

In [18]:
df_train_diagnosis_downsampling['diagnosis'].value_counts()

0    169
1    169
Name: diagnosis, dtype: int64

In [19]:
df_train_prognosis_downsampling['outcome'].value_counts()

R    35
N    35
Name: outcome, dtype: int64

## Upsampling the minority class
***
Minority class in **Prognosis**: R <br>
Minority class in **Diagnossi**: 1

In [20]:
# print the number of rows of majority class
major_sample_prognosis = df_train_prognosis[
    df_train_prognosis['outcome']=='N'
]['outcome'].count()
major_sample_diagnosis = df_train_diagnosis[
    df_train_diagnosis['diagnosis']==0
]['diagnosis'].count()
print('Number of majority class of Prognosis = ', major_sample_prognosis)
print('Number of majority class of Diagnosis = ', major_sample_diagnosis)

Number of majority class of Prognosis =  119
Number of majority class of Diagnosis =  286


In [21]:
# random upsampling of minority class in each dataset
df_train_diagnosis_upsampling = pd.concat([
    df_train_diagnosis[
        (df_train_diagnosis['diagnosis'] == 1)
    ].sample(major_sample_diagnosis, random_state=random_state, replace=True),
    df_train_diagnosis[
        (df_train_diagnosis['diagnosis'] == 0)
    ]
],axis=0).sample(frac=1)
df_train_prognosis_upsampling = pd.concat([
    df_train_prognosis[
        (df_train_prognosis['outcome'] == 'R')
    ].sample(major_sample_prognosis, random_state=random_state, replace=True),
    df_train_prognosis[
        (df_train_prognosis['outcome'] == 'N')
    ]
],axis=0).sample(frac=1)

In [22]:
df_train_diagnosis_upsampling['diagnosis'].value_counts()

1    286
0    286
Name: diagnosis, dtype: int64

In [23]:
df_train_prognosis_upsampling['outcome'].value_counts()

N    119
R    119
Name: outcome, dtype: int64

## Split X and Y after sampling processes

In [24]:
# split X and Y in diagnosis breast cancer dataset after downsampling
X_train_diagnosis_downsampling = df_train_diagnosis_downsampling.drop(['diagnosis'], axis=1)
y_train_diagnosis_downsampling = df_train_diagnosis_downsampling['diagnosis'].copy()

# split X and Y in diagnosis breast cancer dataset after upsampling
X_train_diagnosis_upsampling = df_train_diagnosis_upsampling.drop(['diagnosis'], axis=1)
y_train_diagnosis_upsampling = df_train_diagnosis_upsampling['diagnosis'].copy()

# split X and Y in diagnosis breast cancer dataset after downsampling
X_train_prognosis_downsampling = df_train_prognosis_downsampling.drop(['outcome'], axis=1)
y_train_prognosis_downsampling = df_train_prognosis_downsampling['outcome'].copy()

# split X and Y in prognosis breast cancer dataset after upsampling
X_train_prognosis_upsampling = df_train_prognosis_upsampling.drop(['outcome'], axis=1)
y_train_prognosis_upsampling = df_train_prognosis_upsampling['outcome'].copy()

## Label Encoding for target in Prognosis Dataset
***
Convert categorical into numerical attributes in Prognosis Label dataset

In [25]:
label_encoder_prognosis = LabelEncoder()
label_encoder_prognosis.fit(y_train_prognosis)

y_train_prognosis = label_encoder_prognosis.transform(y_train_prognosis)
y_train_prognosis_downsampling = label_encoder_prognosis.transform(y_train_prognosis_downsampling)
y_train_prognosis_upsampling = label_encoder_prognosis.transform(y_train_prognosis_upsampling)
y_test_prognosis = label_encoder_prognosis.transform(y_test_prognosis)

In [26]:
y_train_prognosis_downsampling

array([1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1])

## Standardization for feature scaling
***
Di paper rujukan disebutkan: "Data standardization was done to ensure that the data was consistent" <br>

In [27]:
scaling_diagnosis = MinMaxScaler()
scaling_prognosis = MinMaxScaler()
scaling_diagnosis.fit(X_train_diagnosis)
scaling_prognosis.fit(X_train_prognosis)


# transform 
X_train_diagnosis = scaling_diagnosis.transform(X_train_diagnosis)
X_test_diagnosis = scaling_diagnosis.transform(X_test_diagnosis)
X_train_diagnosis_upsampling = scaling_diagnosis.transform(X_train_diagnosis_upsampling)
X_train_diagnosis_downsampling = scaling_diagnosis.transform(X_train_diagnosis_downsampling)

X_train_prognosis = scaling_prognosis.transform(X_train_prognosis)
X_test_prognosis = scaling_prognosis.transform(X_test_prognosis)
X_train_prognosis_upsampling = scaling_prognosis.transform(X_train_prognosis_upsampling)
X_train_prognosis_downsampling = scaling_prognosis.transform(X_train_prognosis_downsampling)

## Convert label from dataframe format into numpy array format

In [28]:
# before
y_train_diagnosis.head()

68     0
181    1
63     0
248    0
60     0
Name: diagnosis, dtype: int64

In [29]:
# convert into numpy array format for uniformity purposes
y_train_diagnosis = y_train_diagnosis.to_numpy()
y_train_diagnosis_downsampling = y_train_diagnosis_downsampling.to_numpy()
y_train_diagnosis_upsampling = y_train_diagnosis_upsampling.to_numpy()
y_test_diagnosis = y_test_diagnosis.to_numpy()

# Save preprocessed data

In [31]:
preprocessed_breast_cancer_data = {
    'diagnosis': {
        'original': {
            'X_train': X_train_diagnosis,
            'y_train': y_train_diagnosis,
            'X_test': X_test_diagnosis,
            'y_test': y_test_diagnosis,
        },
        'downsampling': {
            'X_train': X_train_diagnosis_downsampling,
            'y_train': y_train_diagnosis_downsampling,
        },
        'upsampling': {
            'X_train': X_train_diagnosis_upsampling,
            'y_train': y_train_diagnosis_upsampling,
        }
    },
    'prognosis': {
        'original': {
            'X_train': X_train_prognosis,
            'y_train': y_train_prognosis,
            'X_test': X_test_prognosis,
            'y_test': y_test_prognosis,
        },
        'downsampling': {
            'X_train': X_train_prognosis_downsampling,
            'y_train': y_train_prognosis_downsampling,
        },
        'upsampling': {
            'X_train': X_train_prognosis_upsampling,
            'y_train': y_train_prognosis_upsampling,
        }
    }
}

In [32]:
# save preprocessed data into pickle format
pickle.dump(preprocessed_breast_cancer_data, open('../dataset/preprocessed_breast_cancer_data.pkl', 'wb'))