In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, recall_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


In [2]:
data= pd.read_csv('cancer_data.csv')
data

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

Drop: 
* Column **id** berisi data unique, jadi disimpulkan aka di drop karena tidak dapat digunakan untuk prediksi
* Column **Unnamed:32** tidak memiliki data, jadi disimpulkan akan di drop

In [4]:
data.drop('Unnamed: 32', axis= 1, inplace= True)
data.drop('id', axis= 1, inplace= True)

In [5]:
data.isna().sum()

diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [6]:
data.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [7]:
data['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

M: Malignant Cancer (Kanker Ganas)

B: Benign Cancer (Kanker Jinak)

FP: Prediksi Ganas, kenyataannya Jinak

FN: Prediksi Jinak, kenyataannya Ganas

artinya, metode yang digunakan yaitu **Recall**, karena dapat berakibat fatal terhadap pasien

## EDA

In [8]:
q1= data.quantile(0.25)
q3= data.quantile(0.75)
iqr= q3 - q1

In [9]:
below= q1 - 1.5 * iqr
above= q3 + 1.5 * iqr

In [10]:
((data > above ) | (data < below)).sum()/len(data)*100

area_mean                   4.393673
area_se                    11.423550
area_worst                  6.151142
compactness_mean            2.811951
compactness_se              4.920914
compactness_worst           2.811951
concave points_mean         1.757469
concave points_se           3.339192
concave points_worst        0.000000
concavity_mean              3.163445
concavity_se                3.866432
concavity_worst             2.108963
diagnosis                   0.000000
fractal_dimension_mean      2.636204
fractal_dimension_se        4.920914
fractal_dimension_worst     4.217926
perimeter_mean              2.284710
perimeter_se                6.678383
perimeter_worst             2.636204
radius_mean                 2.460457
radius_se                   6.678383
radius_worst                2.987698
smoothness_mean             1.054482
smoothness_se               5.272408
smoothness_worst            1.230228
symmetry_mean               2.636204
symmetry_se                 4.745167
s

dari data diatas, terlihat column **area_se** memiliki data outlier sebesar lebih dari 10%, jadi dapat disimpulkan kita akan menggunakan *Robust scaler* untuk scalling

## Data Splitting

In [11]:
X= data.drop('diagnosis', axis= 1)
y= np.where(data['diagnosis'] == 'M', 1,0)

* 1: Malignant Cancer
* 0: Benign Cancer

In [12]:
X_train_val,X_test,y_train_val,y_test= train_test_split(X,y,test_size= 0.2,random_state=2021)

## Transformer

In [13]:
transformer= ColumnTransformer([
    ('robust', RobustScaler(),X.columns)
], remainder='passthrough')

## Model Selection: Benchmark

In [14]:
logreg= LogisticRegression()
tree= DecisionTreeClassifier()
knn= KNeighborsClassifier()

In [15]:
tree_pipe= Pipeline([
    ('tree', tree)
])

logreg_pipe= Pipeline([
    ('transformer',transformer),
    ('logreg',logreg)
])

knn_pipe= Pipeline([
    ('transformer',transformer),
    ('knn',knn)
])

## Cross Validation; Metric Accuracy

In [16]:
def model_evaluation(pipeline):
    skfold= StratifiedKFold(n_splits=5)
    df= pd.DataFrame(columns= ['Name','1','2','3','4','5','mean','std'])
    for i in pipeline:
        arr = cross_val_score(i, X_train_val, y_train_val, scoring='recall',cv=skfold)
        df = df.append({'Name':str(i[-1]), '1':arr[0], '2':arr[1], '3':arr[2], '4':arr[3], '5':arr[4], 'mean':arr.mean(),'std':arr.std()}, ignore_index=True)
    return df

In [17]:
pipe = [tree_pipe,logreg_pipe,knn_pipe]
model_evaluation(pipe)

Unnamed: 0,Name,1,2,3,4,5,mean,std
0,DecisionTreeClassifier(),0.911765,1.0,0.911765,0.735294,0.823529,0.876471,0.089983
1,LogisticRegression(),0.911765,0.941176,0.941176,0.970588,0.941176,0.941176,0.018602
2,KNeighborsClassifier(),0.882353,0.911765,0.852941,0.941176,0.970588,0.911765,0.041595


dari sini dapat disimpulkan bahwa nilai mean dan std **Logreg** adalah yang terbaik, jadi kita menggunakan **Logreg** sebagai model

## Model Performance with Test Set

In [18]:
logreg = LogisticRegression()
logreg_pipe = Pipeline([
    ('transformer',transformer),
    ('logreg',logreg)
])

In [19]:
logreg_pipe.fit(X_train_val,y_train_val)
y_pred = logreg_pipe.predict(X_test)

In [20]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99        72
           1       1.00      0.98      0.99        42

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114



In [21]:
recall_score(y_test,y_pred,pos_label=1)

0.9761904761904762

### Hyperparameter Tunning

In [22]:
hyperparam_space= {
    'logreg__C': [1000, 500, 100, 50, 10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001],
    'logreg__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

In [23]:
logreg= LogisticRegression()
logreg_pipe= Pipeline([
    ('transformer', transformer),
    ('logreg',logreg)
])

skfold= StratifiedKFold(n_splits=5)
grid_search= GridSearchCV(
    logreg_pipe,
    param_grid= hyperparam_space,
    cv= skfold,
    scoring= 'recall',
    n_jobs= -1
)

In [24]:
grid_search.fit(X_train_val, y_train_val)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('robust',
                                                                         RobustScaler(),
                                                                         Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_me...
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object'))])),
                                       ('logreg', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'logreg__C': [1000, 500,

In [25]:
print('best score', grid_search.best_score_)
print('best param', grid_search.best_params_)

best score 0.9823529411764707
best param {'logreg__C': 0.001, 'logreg__solver': 'liblinear'}


In [26]:
y_pred=grid_search.predict(X_test)

In [27]:
recall_score(y_test,y_pred,pos_label=1)

1.0

## Final Results comparison

In [28]:
model= LogisticRegression()
estimator= Pipeline([
    ('preprocessing', transformer),
    ('model',logreg)
])
estimator.fit(X_train_val,y_train_val)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('robust', RobustScaler(),
                                                  Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object'))])),
                ('model', LogisticRegression())])

### Before Tunning

In [29]:
y_pred=estimator.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99        72
           1       1.00      0.98      0.99        42

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114



In [30]:
accuracy_score(y_test,y_pred)

0.9912280701754386

In [31]:
recall_score(y_test,y_pred)

0.9761904761904762

### After Tunning

In [32]:
grid_search.best_estimator_.fit(X_train_val,y_train_val)

Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('robust', RobustScaler(),
                                                  Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smooth...
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object'))])),
                ('logreg', LogisticRegression(C=0.001, solver='liblinear'))])

In [33]:
y_pred=grid_search.best_estimator_.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.86      0.93        72
           1       0.81      1.00      0.89        42

    accuracy                           0.91       114
   macro avg       0.90      0.93      0.91       114
weighted avg       0.93      0.91      0.91       114



In [34]:
accuracy_score(y_test,y_pred)

0.9122807017543859

In [35]:
recall_score(y_test,y_pred)

1.0