# Tugas Kecil 1
## Eksplorasi library Decision Tree Learning pada Jupyter Notebook

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Modul six untuk Id3Estimator
import six, sys
sys.modules['sklearn.externals.six'] = six

# Modul-modul pengolahan data
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline

# Modul model algoritma machine learning
from sklearn.tree import DecisionTreeClassifier, export_text, export_graphviz
from id3 import Id3Estimator
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Modul pengecekan akurasi
from sklearn.metrics import accuracy_score, f1_score

### Load dataset

In [2]:
# Dataset breast cancer
breast_cancer = load_breast_cancer()
df_breast_cancer = pd.DataFrame(breast_cancer.data, 
                                columns=breast_cancer.feature_names)
df_breast_cancer['target'] = breast_cancer.target

df_breast_cancer.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [3]:
# Dataset play tennis
df_play_tennis = pd.read_csv('PlayTennis.csv')

# Melakukan encoding pada data-data kategorial
le = LabelEncoder()

df_play_tennis['Outlook'] = le.fit_transform(df_play_tennis['Outlook'])
df_play_tennis['Temperature'] = le.fit_transform(df_play_tennis['Temperature'])
df_play_tennis['Humidity'] = le.fit_transform(df_play_tennis['Humidity'])
df_play_tennis['Wind'] = le.fit_transform(df_play_tennis['Wind'])
df_play_tennis['Play Tennis'] = le.fit_transform(df_play_tennis['Play Tennis'])

df_play_tennis.head()

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,2,1,0,1,0
1,2,1,0,0,0
2,0,1,0,1,1
3,1,2,0,1,1
4,1,0,1,1,1


In [4]:
X_breast_cancer = df_breast_cancer[breast_cancer.feature_names]
y_breast_cancer = df_breast_cancer['target']

In [5]:
X_bc_train, X_bc_test, y_bc_train, y_bc_test = train_test_split(
    X_breast_cancer, y_breast_cancer, test_size=0.2, random_state=42)

In [6]:
X_play_tennis = df_play_tennis[['Outlook', 'Temperature', 'Humidity', 'Wind']]
y_play_tennis = df_play_tennis['Play Tennis']

In [7]:
X_pt_train, X_pt_test, y_pt_train, y_pt_test = train_test_split(
    X_play_tennis, y_play_tennis, test_size=0.2, random_state=42)

### Decision Tree Classifier

#### 1. Dataset Breast Cancer

In [8]:
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree = decision_tree.fit(X_bc_train, y_bc_train)

res = export_text(decision_tree, feature_names=breast_cancer['feature_names'].tolist())
print(res)

|--- mean concave points <= 0.05
|   |--- worst radius <= 16.83
|   |   |--- area error <= 48.70
|   |   |   |--- worst smoothness <= 0.18
|   |   |   |   |--- smoothness error <= 0.00
|   |   |   |   |   |--- worst texture <= 27.76
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- worst texture >  27.76
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- smoothness error >  0.00
|   |   |   |   |   |--- worst texture <= 33.35
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- worst texture >  33.35
|   |   |   |   |   |   |--- worst texture <= 33.56
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- worst texture >  33.56
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |--- worst smoothness >  0.18
|   |   |   |   |--- class: 0
|   |   |--- area error >  48.70
|   |   |   |--- concavity error <= 0.02
|   |   |   |   |--- class: 0
|   |   |   |--- concavity error >  0.02
|   |   |   |   |--- class: 1
|   |--- worst radius >  16.

In [9]:
#visualisasi tree
#import graphviz

#dot_data = export_graphviz(decision_tree, out_file=None) 
#graph = graphviz.Source(dot_data) 


In [10]:
# Memprediksi hasil dari model
y_bc_pred = decision_tree.predict(X_bc_test)

# Mengevaluasi hasil prediksi
# Menggunakan metric Accuracy
print("Evaluasi hasil prediksi menggunakan metric Accuracy: {}".format(accuracy_score(y_bc_test, y_bc_pred)))

# Menggunakan metric F1
print("Evaluasi hasil prediksi menggunakan metric F1: {}".format(f1_score(y_bc_test, y_bc_pred)))

Evaluasi hasil prediksi menggunakan metric Accuracy: 0.9473684210526315
Evaluasi hasil prediksi menggunakan metric F1: 0.9577464788732394


#### 2. Dataset Play Tennis

In [11]:
tennis_decision_tree = DecisionTreeClassifier(random_state=42)
tennis_decision_tree.fit(X_pt_train, y_pt_train)

res = export_text(tennis_decision_tree, feature_names=['Outlook', 'Temperature', 'Humidity', 'Wind'])
print(res)

|--- Outlook <= 0.50
|   |--- class: 1
|--- Outlook >  0.50
|   |--- Humidity <= 0.50
|   |   |--- Wind <= 0.50
|   |   |   |--- class: 0
|   |   |--- Wind >  0.50
|   |   |   |--- Outlook <= 1.50
|   |   |   |   |--- class: 1
|   |   |   |--- Outlook >  1.50
|   |   |   |   |--- class: 0
|   |--- Humidity >  0.50
|   |   |--- Wind <= 0.50
|   |   |   |--- Outlook <= 1.50
|   |   |   |   |--- class: 0
|   |   |   |--- Outlook >  1.50
|   |   |   |   |--- class: 1
|   |   |--- Wind >  0.50
|   |   |   |--- class: 1



In [12]:
# Memprediksi hasil dari model
y_pt_pred = tennis_decision_tree.predict(X_pt_test)

# Mengevaluasi hasil prediksi
# Menggunakan metric Accuracy
print("Evaluasi hasil prediksi menggunakan metric Accuracy: {}".format(accuracy_score(y_pt_test, y_pt_pred)))

# Menggunakan metric F1
print("Evaluasi hasil prediksi menggunakan metric F1: {}".format(f1_score(y_pt_test, y_pt_pred)))

Evaluasi hasil prediksi menggunakan metric Accuracy: 1.0
Evaluasi hasil prediksi menggunakan metric F1: 1.0


### Id3Estimator

In [13]:
from id3 import export_text

#### 1. Dataset Breast Cancer

In [14]:
estimator = Id3Estimator()
estimator = estimator.fit(X_bc_train, y_bc_train)

res = export_text(estimator.tree_, feature_names=breast_cancer['feature_names'].tolist())
print(res)


mean concave points <=0.05
|   worst radius <=16.83
|   |   radius error <=0.63
|   |   |   worst texture <=30.15: 1 (225) 
|   |   |   worst texture >30.15
|   |   |   |   worst area <=641.60: 1 (25) 
|   |   |   |   worst area >641.60
|   |   |   |   |   mean radius <=13.45
|   |   |   |   |   |   mean texture <=28.79: 0 (3) 
|   |   |   |   |   |   mean texture >28.79: 1 (1) 
|   |   |   |   |   mean radius >13.45: 1 (6) 
|   |   radius error >0.63
|   |   |   mean smoothness <=0.09: 1 (1) 
|   |   |   mean smoothness >0.09: 0 (2) 
|   worst radius >16.83
|   |   mean texture <=16.19: 1 (6) 
|   |   mean texture >16.19
|   |   |   concave points error <=0.01: 0 (11) 
|   |   |   concave points error >0.01: 1 (2) 
mean concave points >0.05
|   worst perimeter <=114.45
|   |   worst texture <=25.65
|   |   |   worst concave points <=0.17: 1 (19) 
|   |   |   worst concave points >0.17: 0 (2) 
|   |   worst texture >25.65
|   |   |   perimeter error <=1.56
|   |   |   |   mean radius 

In [15]:
# Memprediksi hasil dari model
y_bc_pred = estimator.predict(X_bc_test)

# Mengevaluasi hasil prediksi
# Menggunakan metric Accuracy
print("Evaluasi hasil prediksi menggunakan metric Accuracy: {}".format(accuracy_score(y_bc_test, y_bc_pred)))

# Menggunakan metric F1
print("Evaluasi hasil prediksi menggunakan metric F1: {}".format(f1_score(y_bc_test, y_bc_pred)))

Evaluasi hasil prediksi menggunakan metric Accuracy: 0.9385964912280702
Evaluasi hasil prediksi menggunakan metric F1: 0.9517241379310345


#### 2. Dataset Play Tennis

In [16]:
tennis_id3_estimator = Id3Estimator()
tennis_id3_estimator.fit(X_pt_train, y_pt_train)

res = export_text(tennis_id3_estimator.tree_, feature_names=['Outlook', 'Temperature', 'Humidity', 'Wind'])
print(res)


Outlook <=0.50: 1 (3) 
Outlook >0.50
|   Humidity <=0.50
|   |   Wind <=0.50: 0 (2) 
|   |   Wind >0.50: 0 (1/1) 
|   Humidity >0.50
|   |   Wind <=0.50
|   |   |   Temperature <=1.00: 0 (1) 
|   |   |   Temperature >1.00: 1 (1) 
|   |   Wind >0.50: 1 (2) 



In [17]:
# Memprediksi hasil dari model
y_pt_pred = tennis_id3_estimator.predict(X_pt_test)

# Mengevaluasi hasil prediksi
# Menggunakan metric Accuracy
print("Evaluasi hasil prediksi menggunakan metric Accuracy: {}".format(accuracy_score(y_pt_test, y_pt_pred)))

# Menggunakan metric F1
print("Evaluasi hasil prediksi menggunakan metric F1: {}".format(f1_score(y_pt_test, y_pt_pred)))

Evaluasi hasil prediksi menggunakan metric Accuracy: 1.0
Evaluasi hasil prediksi menggunakan metric F1: 1.0


### K Means

#### 1. Dataset Breast Cancer

In [18]:
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_bc_train)

KMeans(n_clusters=2, random_state=42)

In [19]:
# import seaborn as sns
# X_copy = X_bc_train
# X_copy['Labels'] = kmeans.labels_

# # membuat plot KMeans dengan 5 klaster
# plt.figure(figsize=(8,4))
# sns.scatterplot(X_copy[''], X[''], hue=X_copy['Labels'],
#                 palette=sns.color_palette('hls', 5))
# plt.title('KMeans dengan 2 Cluster')
# plt.show()

In [20]:
# Memprediksi hasil dari model
y_bc_pred = kmeans.predict(X_bc_test)
y_bc_pred

# Mengevaluasi hasil prediksi
# Menggunakan metric Accuracy
print("Evaluasi hasil prediksi menggunakan metric Accuracy: {}".format(accuracy_score(y_bc_test, y_bc_pred)))

# Menggunakan metric F1
print("Evaluasi hasil prediksi menggunakan metric F1: {}".format(f1_score(y_bc_test, y_bc_pred)))

Evaluasi hasil prediksi menggunakan metric Accuracy: 0.12280701754385964
Evaluasi hasil prediksi menggunakan metric F1: 0.0


#### 2. Dataset Play Tennis

In [21]:
tennis_k_means = KMeans(random_state=42)
tennis_k_means.fit(X_pt_train)

# res = export_text(tennis_k_means, feature_names=['Outlook', 'Temperature', 'Humidity', 'Wind'])
# print(res)

KMeans(random_state=42)

In [22]:
# # Memprediksi hasil dari model
# y_pt_pred = tennis_k_means.predict(X_pt_test)

# # Mengevaluasi hasil prediksi
# # Menggunakan metric Accuracy
# print("Evaluasi hasil prediksi menggunakan metric Accuracy: {}".format(accuracy_score(y_pt_test, y_pt_pred)))

# # Menggunakan metric F1
# print("Evaluasi hasil prediksi menggunakan metric F1: {}".format(f1_score(y_pt_test, y_pt_pred)))

### LogisticRegression

#### 1. Dataset Breast Cancer

In [23]:
bc_log_regression = LogisticRegression(random_state=42, max_iter=10000)
bc_log_regression.fit(X_bc_train, y_bc_train)

LogisticRegression(max_iter=10000, random_state=42)

In [24]:
# Memprediksi hasil dari model
y_bc_pred = bc_log_regression.predict(X_bc_test)

# Mengevaluasi hasil prediksi
# Menggunakan metric Accuracy
print("Evaluasi hasil prediksi menggunakan metric Accuracy: {}".format(accuracy_score(y_bc_test, y_bc_pred)))

# Menggunakan metric F1
print("Evaluasi hasil prediksi menggunakan metric F1: {}".format(f1_score(y_bc_test, y_bc_pred)))

Evaluasi hasil prediksi menggunakan metric Accuracy: 0.956140350877193
Evaluasi hasil prediksi menggunakan metric F1: 0.9655172413793103


#### 2. Dataset Play Tennis

In [25]:
clf = LogisticRegression(random_state=42, max_iter=10000)
clf.fit(X_pt_train, y_pt_train)

LogisticRegression(max_iter=10000, random_state=42)

In [26]:
# Memprediksi hasil dari model
y_pt_pred = clf.predict(X_pt_test)

# Mengevaluasi hasil prediksi
# Menggunakan metric Accuracy
print("Evaluasi hasil prediksi menggunakan metric Accuracy: {}".format(accuracy_score(y_pt_test, y_pt_pred)))

# Menggunakan metric F1
print("Evaluasi hasil prediksi menggunakan metric F1: {}".format(f1_score(y_pt_test, y_pt_pred)))

Evaluasi hasil prediksi menggunakan metric Accuracy: 0.6666666666666666
Evaluasi hasil prediksi menggunakan metric F1: 0.8


### Neural Network

#### 1. Dataset Breast Cancer

In [27]:
bc_neural_network = MLPClassifier(random_state=42)
bc_neural_network.fit(X_bc_train, y_bc_train)

MLPClassifier(random_state=42)

In [28]:
y_bc_pred = bc_neural_network.predict(X_bc_test)

# Mengevaluasi hasil prediksi
# Menggunakan metric Accuracy
print("Evaluasi hasil prediksi menggunakan metric Accuracy: {}".format(accuracy_score(y_bc_test, y_bc_pred)))

# Menggunakan metric F1
print("Evaluasi hasil prediksi menggunakan metric F1: {}".format(f1_score(y_bc_test, y_bc_pred)))

Evaluasi hasil prediksi menggunakan metric Accuracy: 0.9385964912280702
Evaluasi hasil prediksi menggunakan metric F1: 0.953020134228188


#### 2. Dataset Play Tennis

In [29]:
clf = MLPClassifier(random_state=42, max_iter=1000)
clf.fit(X_pt_train, y_pt_train)

MLPClassifier(max_iter=1000, random_state=42)

In [30]:
# Memprediksi hasil dari model
y_pt_pred = clf.predict(X_pt_test)

# Mengevaluasi hasil prediksi
# Menggunakan metric Accuracy
print("Evaluasi hasil prediksi menggunakan metric Accuracy: {}".format(accuracy_score(y_pt_test, y_pt_pred)))

# Menggunakan metric F1
print("Evaluasi hasil prediksi menggunakan metric F1: {}".format(f1_score(y_pt_test, y_pt_pred)))

Evaluasi hasil prediksi menggunakan metric Accuracy: 1.0
Evaluasi hasil prediksi menggunakan metric F1: 1.0


### SVM

#### 1. Dataset Breast Cancer

In [31]:
bc_clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
bc_clf.fit(X_bc_train, y_bc_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [32]:
y_bc_pred = bc_clf.predict(X_bc_test)

# Mengevaluasi hasil prediksi
# Menggunakan metric Accuracy
print("Evaluasi hasil prediksi menggunakan metric Accuracy: {}".format(accuracy_score(y_bc_test, y_bc_pred)))

# Menggunakan metric F1
print("Evaluasi hasil prediksi menggunakan metric F1: {}".format(f1_score(y_bc_test, y_bc_pred)))

Evaluasi hasil prediksi menggunakan metric Accuracy: 0.9824561403508771
Evaluasi hasil prediksi menggunakan metric F1: 0.9861111111111112


#### 2. Dataset Play Tennis

In [33]:
svc = SVC(random_state=42)
svc.fit(X_pt_train, y_pt_train)

SVC(random_state=42)

In [34]:
# Memprediksi hasil dari model
y_pt_pred = svc.predict(X_pt_test)

# Mengevaluasi hasil prediksi
# Menggunakan metric Accuracy
print("Evaluasi hasil prediksi menggunakan metric Accuracy: {}".format(accuracy_score(y_pt_test, y_pt_pred)))

# Menggunakan metric F1
print("Evaluasi hasil prediksi menggunakan metric F1: {}".format(f1_score(y_pt_test, y_pt_pred)))

Evaluasi hasil prediksi menggunakan metric Accuracy: 0.6666666666666666
Evaluasi hasil prediksi menggunakan metric F1: 0.8


Made by: 
- 13519107 Daffa Ananda 
(4,5,6 breast cancer dataset & 1,2,3 play tennis dataset)
- 13519113 Raihan Astrada Fathurrahman (1,2,3 breast cancer dataset  & 4,5,6 play tennis dataset)