# Eksperimen pemilihan feature dan model terbaik

Menggunakan data breast cancer 

In [137]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

### Menggunakan Semua Feature + Random Forest

In [138]:
bc = load_breast_cancer()
df = pd.DataFrame(bc.data, columns=bc.feature_names)
df['target'] = bc.target
X = df.iloc[:,:-1]
y = df['target']

In [139]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [140]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=21,stratify=y)
clf = RandomForestClassifier(n_estimators=10)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [141]:
name = bc.feature_names
score = clf.feature_importances_
    
sc = pd.DataFrame(score, index=name, columns=['Scores'])
sc.sort_values(by='Scores', ascending=False)

Unnamed: 0,Scores
worst perimeter,0.230952
mean concave points,0.169765
worst radius,0.151362
worst concave points,0.120079
worst area,0.113035
worst concavity,0.055243
mean texture,0.01597
area error,0.015222
worst compactness,0.013809
worst fractal dimension,0.013742


In [142]:
clf.score(X_test, y_test)

0.951048951048951

In [143]:
from sklearn.metrics import accuracy_score
ac = accuracy_score(y_test, y_pred)
ac

0.951048951048951

In [210]:
#hasilnya sangat bagus

### Pilih 4 Feature Importance Terbaik + Random Forest

In [144]:
bc = load_breast_cancer()
df = pd.DataFrame(bc.data, columns=bc.feature_names)
df['target'] = bc.target
X_2 = df[['worst radius', 'worst area', 'mean concave points', 'worst perimeter']]
y_2 = df.target

In [145]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_2, y_2, stratify=y_2, random_state=21)
clf = RandomForestClassifier(n_estimators=5)
clf.fit(X_train2, y_train2)
y_predict = clf.predict(X_test2)

In [146]:
accuracy_score(y_test2, y_predict)

0.916083916083916

In [147]:
#performa model justru semakin menurun

### Pakai 4 Feature + MLPClassifier

In [150]:
model = MLPClassifier(hidden_layer_sizes=(50))
model.fit(X_train2, y_train2)
predict3 = model.predict(X_test2)

In [152]:
accuracy_score(y_test2, predict3)

0.8741258741258742

In [153]:
#hmmm sepertinya justru overfitting
# kembali dengan semua feature

### Menggunakan Semua Feature + MPLClassifier

In [154]:
bc = load_breast_cancer()
df = pd.DataFrame(bc.data, columns=bc.feature_names)
df['target'] = bc.target
X3 = df.iloc[:,:-1]
y3 = df['target']

In [155]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, random_state=21,stratify=y)
model2 = MLPClassifier(hidden_layer_sizes=(50,50))
model2.fit(X_train3,y_train3)
y_pred3 = model2.predict(X_test3)

In [157]:
accuracy_score(y_test3, y_pred3)

0.9020979020979021

Menimbang-nimbang dari segi kemudahan input data + akurasi validasi, maka model yang dipilih adalah model dengan input 4 feature dengan algoritma random forest 

### Tuning Hyperparameter Model 4 Feature + Random Forest

In [169]:
bc = load_breast_cancer()
df = pd.DataFrame(bc.data, columns=bc.feature_names)
df['target'] = bc.target
X_2 = df[['worst radius', 'worst area', 'mean concave points', 'worst perimeter']]
y_2 = df.target

In [170]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_2, y_2, stratify=y_2, random_state=21)

In [172]:
clf = RandomForestClassifier()
param = {'n_estimators':[10,20,50,100],
         'max_depth':[3,4,5]}

In [173]:
gscv = GridSearchCV(estimator=clf, param_grid=param, scoring='accuracy', cv=5)
gscv.fit(X_train2, y_train2)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [174]:
gscv.best_score_

0.9296032831737346

In [175]:
gscv.best_params_

{'max_depth': 3, 'n_estimators': 20}

In [179]:
clf_tuning = RandomForestClassifier(max_depth=3, n_estimators=20)
clf_tuning.fit(X_train2, y_train2)
y_predict_tuning = clf_tuning.predict(X_test2)

In [180]:
accuracy_score(y_test2,y_predict_tuning)

0.9300699300699301

In [181]:
#horeee meningkat, model masih dapat diatur hyperparameternya supaya lebih akurat

In [194]:
X_test2.shape

(143, 4)

In [203]:
#oke kita coba prediksi dengan data yang baru
import numpy as np
test =np.array([25,1800,0.2,185]).reshape(1,-1)
clf_tuning.predict(test)

array([0])

In [204]:
#nol berarti malignant