In [None]:
import pandas as pd
# Load the three example datasets used throughout the notebook.
df_iris, df_house, df_wine = map(
    pd.read_csv,
    ('./data/iris.csv', './data/house_prices.csv', './data/wine_data.csv'),
)

---
## 머신러닝

---
### 지도 학습
| 지도학습 알고리즘      | 분류기                      | 회귀기                                     |
|-----------------------|----------------------------|--------------------------------------------|
| KNN                   | KNeighborsClassifier       | KNeighborsRegressor                        |
| 회귀 분석              | LogisticRegression         | LinearRegression, Ridge, Lasso, ElasticNet |
| 나이브 베이즈          | GaussianNB                 |                                            |
| 의사 결정 나무         | DecisionTreeClassifier     | DecisionTreeRegressor                      |
| 서포트 벡터 머신       | SVC                        | SVR                                        |
| Voting                | VotingClassifier           | VotingRegressor                            |
| RandomForest          | RandomForestClassifier     | RandomForestRegressor                      |
| Bagging               | BaggingClassifier          | BaggingRegressor                           |
| AdaBoost              | AdaBoostClassifier         | AdaBoostRegressor                          |
| GradientBoosting      | GradientBoostingClassifier | GradientBoostingRegressor                  |
| Stacking              | StackingClassifier         | StackingRegressor                          |

In [None]:
def preprocessing(df, test_size=None, random_state=0):
    """Split a DataFrame into train/test sets and standardize the features.

    Every column except the last is treated as a feature and the last column
    is treated as the target. The scaler is fitted on the training features
    only and then applied to both splits.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column is the target.
    test_size : float, int or None, default None
        Forwarded to ``train_test_split``; ``None`` keeps its default split.
    random_state : int or None, default 0
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_tn_std, X_te_std, y_tn, y_te)``: the standardized train and test
        features followed by the train and test targets.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    X_tn, X_te, y_tn, y_te = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit on the training split only, then reuse the same statistics for test.
    scaler = StandardScaler()
    X_tn_std = scaler.fit_transform(X_tn)
    X_te_std = scaler.transform(X_te)

    return X_tn_std, X_te_std, y_tn, y_te

def print_result_classifier(y, pred):
    """Print classification metrics for true labels ``y`` and predictions ``pred``.

    Prints accuracy and the macro-averaged precision, recall and F1 score,
    each prefixed with its name, followed by the confusion matrix and the
    per-class classification report.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    from sklearn.metrics import confusion_matrix, classification_report

    # Label every score so the output is readable without consulting the source.
    print(f"accuracy: {accuracy_score(y, pred):.6f}")
    print(f"precision (macro): {precision_score(y, pred, average='macro'):.6f}")
    print(f"recall (macro): {recall_score(y, pred, average='macro'):.6f}")
    print(f"f1 (macro): {f1_score(y, pred, average='macro'):.6f}")
    print(confusion_matrix(y, pred))
    print(classification_report(y, pred))
    
def print_result(y, pred):
    """Print the mean squared error and R^2 score of ``pred`` against ``y``."""
    from sklearn.metrics import mean_squared_error, r2_score

    mse = mean_squared_error(y, pred)
    r2 = r2_score(y, pred)
    print(f'mse: {mse:.6f}, r2_score: {r2:.6f}')

---
KNN

In [None]:
# KNN classification on the iris data (2 neighbors)
from sklearn.neighbors import KNeighborsClassifier

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_iris)
clf = KNeighborsClassifier(n_neighbors=2).fit(X_tn_std, y_tn)
print_result_classifier(y_te, clf.predict(X_te_std))

In [None]:
# KNN regression on the house-price data (2 neighbors)
from sklearn.neighbors import KNeighborsRegressor

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_house)
clf = KNeighborsRegressor(n_neighbors=2).fit(X_tn_std, y_tn)
print_result(y_te, clf.predict(X_te_std))

---
로지스틱 회귀

In [None]:
# Logistic regression classification on the wine data (L2 penalty)
from sklearn.linear_model import LogisticRegression

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_wine)
clf = LogisticRegression(penalty='l2').fit(X_tn_std, y_tn)
print_result_classifier(y_te, clf.predict(X_te_std))

---
선형 회귀

In [None]:
# Linear regression on the house-price data: plain, Ridge, Lasso and ElasticNet
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_house)

clf_linear = LinearRegression().fit(X_tn_std, y_tn)
clf_ridge = Ridge(alpha=1).fit(X_tn_std, y_tn)
clf_lasso = Lasso(alpha=0.01).fit(X_tn_std, y_tn)
clf_elastic = ElasticNet(alpha=0.01, l1_ratio=0.01).fit(X_tn_std, y_tn)

# Report test-set metrics in the order: linear, ridge, lasso, elastic net.
for _model in (clf_linear, clf_ridge, clf_lasso, clf_elastic):
    print_result(y_te, _model.predict(X_te_std))

---
나이브 베이즈

In [None]:
# Gaussian naive Bayes classification on the wine data
from sklearn.naive_bayes import GaussianNB

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_wine)
clf = GaussianNB().fit(X_tn_std, y_tn)
print_result_classifier(y_te, clf.predict(X_te_std))

---
의사 결정 나무

In [None]:
# Decision tree classification on the wine data (seeded)
from sklearn.tree import DecisionTreeClassifier

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_wine)
clf = DecisionTreeClassifier(random_state=0).fit(X_tn_std, y_tn)
print_result_classifier(y_te, clf.predict(X_te_std))

In [None]:
# Decision tree regression on the house-price data (seeded)
from sklearn.tree import DecisionTreeRegressor

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_house)
clf = DecisionTreeRegressor(random_state=0).fit(X_tn_std, y_tn)

# Keep the test-set predictions in `pred` before reporting the metrics.
pred = clf.predict(X_te_std)
print_result(y_te, pred)

---
서포트 벡터 머신

In [None]:
# Support vector machine classification on the wine data (linear kernel)
from sklearn.svm import SVC

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_wine)
clf = SVC(kernel='linear').fit(X_tn_std, y_tn)
print_result_classifier(y_te, clf.predict(X_te_std))

In [None]:
# Support vector machine regression on the house-price data (linear kernel)
from sklearn.svm import SVR

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_house)
clf = SVR(kernel='linear').fit(X_tn_std, y_tn)
print_result(y_te, clf.predict(X_te_std))

---
보팅

In [None]:
# Voting classification on the wine data: hard vote over three equally weighted models
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_wine)

clf = VotingClassifier(
    estimators=[
        ('logistic_regression', LogisticRegression()),
        ('svc', SVC(kernel='linear')),
        ('gaussianNB', GaussianNB()),
    ],
    voting='hard',
    weights=[1, 1, 1],
).fit(X_tn_std, y_tn)
print_result_classifier(y_te, clf.predict(X_te_std))

In [None]:
# Voting regression on the house-price data: three equally weighted regressors
X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_house)

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor

clf = VotingRegressor(
    estimators=[
        ('linear_regression', LinearRegression()),
        ('svm', SVR(kernel='linear')),
        # Seeded so that re-running the cell reproduces the same tree and metrics.
        ('tree', DecisionTreeRegressor(random_state=0))
    ],
    weights=[1, 1, 1]
)
clf.fit(X_tn_std, y_tn)
print_result(y_te, clf.predict(X_te_std))

---
랜덤 포레스트

In [None]:
# Random forest classification on the wine data (depth-limited, seeded)
from sklearn.ensemble import RandomForestClassifier

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_wine)
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_tn_std, y_tn)
print_result_classifier(y_te, clf.predict(X_te_std))

In [None]:
# Random forest regression on the house-price data (depth-limited, seeded)
from sklearn.ensemble import RandomForestRegressor

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_house)
clf = RandomForestRegressor(max_depth=2, random_state=0).fit(X_tn_std, y_tn)
print_result(y_te, clf.predict(X_te_std))

---
배깅

In [None]:
# Bagging classification on the wine data: 10 Gaussian naive Bayes estimators
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_wine)

clf = BaggingClassifier(
    estimator=GaussianNB(),
    n_estimators=10,
    random_state=0,
).fit(X_tn_std, y_tn)
print_result_classifier(y_te, clf.predict(X_te_std))

In [None]:
# Bagging regression on the house-price data: 10 SVR estimators
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_house)

clf = BaggingRegressor(
    estimator=SVR(),
    n_estimators=10,
    random_state=0,
).fit(X_tn_std, y_tn)
print_result(y_te, clf.predict(X_te_std))

---
AdaBoost

In [None]:
# AdaBoost classification on the wine data (seeded)
from sklearn.ensemble import AdaBoostClassifier

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_wine)
clf = AdaBoostClassifier(random_state=0).fit(X_tn_std, y_tn)
print_result_classifier(y_te, clf.predict(X_te_std))

In [None]:
# AdaBoost regression on the house-price data (seeded)
from sklearn.ensemble import AdaBoostRegressor

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_house)
clf = AdaBoostRegressor(random_state=0).fit(X_tn_std, y_tn)
print_result(y_te, clf.predict(X_te_std))

---
GradientBoosting

In [None]:
# Gradient boosting classification on the wine data
from sklearn.ensemble import GradientBoostingClassifier

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_wine)
clf = GradientBoostingClassifier(
    max_depth=2,
    learning_rate=0.1,
    random_state=0,
).fit(X_tn_std, y_tn)
print_result_classifier(y_te, clf.predict(X_te_std))

In [None]:
# Gradient boosting regression on the house-price data
from sklearn.ensemble import GradientBoostingRegressor

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_house)
clf = GradientBoostingRegressor(
    max_depth=2,
    learning_rate=0.1,
    random_state=0,
).fit(X_tn_std, y_tn)
print_result(y_te, clf.predict(X_te_std))

---
Stacking

In [None]:
# Stacking classification on the wine data: SVC and naive Bayes base models,
# combined by a logistic regression final estimator
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_wine)

clf1 = SVC(kernel='linear', random_state=1)
clf2 = GaussianNB()
clf = StackingClassifier(
    estimators=[('svm', clf1), ('gnb', clf2)],
    final_estimator=LogisticRegression(),
).fit(X_tn_std, y_tn)
print_result_classifier(y_te, clf.predict(X_te_std))

In [None]:
# Stacking regression on the house-price data: SVR and decision tree base
# models, combined by a linear regression final estimator
X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_house)

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
clf1 = SVR(kernel='linear')
# Seeded so that re-running the cell reproduces the same tree and metrics.
clf2 = DecisionTreeRegressor(random_state=0)
clf = StackingRegressor(
    estimators=[
        ('svm', clf1),
        # Named after the estimator it holds (a decision tree, not naive Bayes).
        ('tree', clf2)
    ],
    final_estimator=LinearRegression())
clf.fit(X_tn_std, y_tn)
print_result(y_te, clf.predict(X_te_std))

---
### 비지도 학습
| 비지도학습 알고리즘    | Class                      |
|-----------------------|----------------------------|
| k-means               | KMeans                     |
| DBSCAN                | DBSCAN                     |
| GMM                   | GaussianMixture            |
| One Class SVM         | OneClassSVM                |
| Local Outlier Factor  | LocalOutlierFactor         |
| Isolation Forest      | IsolationForest            |

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs, make_moons
from sklearn.metrics import silhouette_score

---
k-means

In [None]:
# k-means clustering (distance-based) on five synthetic blobs
from sklearn.cluster import KMeans

X, y = make_blobs(n_samples=100, n_features=2, centers=5, random_state=10)
# Raw points in gray; the per-cluster scatters below are added before plt.show().
plt.scatter(X[:, 0], X[:, 1], c='gray', edgecolors='black', marker='o')

clf = KMeans(n_clusters=5, init='random', n_init=10, max_iter=100, random_state=0).fit(X)
df = pd.DataFrame(X).assign(target=y, label=clf.labels_)

# One marker per cluster index 0..4.
markers = ['o', 'x', '^', 's', '*']
for i, mark in enumerate(markers):
    df_i = df.loc[df['label'] == i]
    plt.scatter(df_i[0], df_i[1], marker=mark, label=i)
plt.legend()
plt.show()

print(silhouette_score(X, clf.labels_))

---
DBSCAN

In [None]:
# DBSCAN clustering (density-based) on the two-moons data
from sklearn.cluster import DBSCAN

X, y = make_moons(n_samples=300, noise=0.05, random_state=10)
# Raw points in gray; the per-cluster scatters below are added before plt.show().
plt.scatter(X[:, 0], X[:, 1], c='gray', edgecolors='black', marker='o')

clf = DBSCAN(eps=0.2).fit(X)
df = pd.DataFrame(X).assign(target=y, label=clf.labels_)

# Only labels 0 and 1 are drawn by this loop; any other label value
# (including DBSCAN's -1 noise label) is not plotted.
markers = ['o', 'x']
for i, mark in enumerate(markers):
    df_i = df.loc[df['label'] == i]
    plt.scatter(df_i[0], df_i[1], marker=mark, label=i)
plt.legend()
plt.show()

print(silhouette_score(X, clf.labels_))

---
Gaussian Mixture Model

In [None]:
# Gaussian mixture model clustering on two synthetic blobs
X, y = make_blobs(n_samples=200, n_features=2, centers=2, random_state=10)
plt.scatter(X[:, 0], X[:, 1], c='gray', edgecolors='black', marker='o')

from sklearn.mixture import GaussianMixture
clf = GaussianMixture(n_components=2, random_state=0)
clf.fit(X)
pred = clf.predict(X)
df = pd.DataFrame(X)
df['target'] = y
df['label'] = pred
# One marker per mixture component (n_components=2); extra markers would
# only add empty entries to the legend.
markers = ['o', 'x']
for i, mark in enumerate(markers):
    df_i = df[df['label']==i]
    plt.scatter(df_i[0], df_i[1], marker=mark, label=i)
plt.legend()
plt.show()

print(silhouette_score(X, pred))

---
One Class SVM (nu: 이상치 비율)

In [None]:
# One-class SVM on two wine columns; the silhouette score is computed on
# the labels returned by predict().
from sklearn.svm import OneClassSVM

X = df_wine[['Alcohol', 'Magesium']]
clf = OneClassSVM(gamma='auto', nu=0.5).fit(X)
print(silhouette_score(X, clf.predict(X)))

---
Local Outlier Factor (fit_predict)

In [None]:
# Local outlier factor on two wine columns; labels come from fit_predict().
from sklearn.neighbors import LocalOutlierFactor

X = df_wine[['Alcohol', 'Magesium']]
clf = LocalOutlierFactor(n_neighbors=3)
print(silhouette_score(X, clf.fit_predict(X)))

---
Isolation Forest

In [None]:
# Isolation forest on two wine columns (seeded); the silhouette score is
# the cell's final expression.
from sklearn.ensemble import IsolationForest

X = df_wine[['Alcohol', 'Magesium']]
clf = IsolationForest(random_state=0).fit(X)
silhouette_score(X, clf.predict(X))

---
## Cross validation

In [None]:
# Grid search over the SVC kernel and C using stratified 5-fold cross-validation
X_tn_std, X_te_std, y_tn, y_te = preprocessing(df_wine)

from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV
param_grid = {'kernel': ('linear', 'rbf'),
              'C': [0.5, 1, 10, 100]}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
svc = SVC(random_state=0)
grid_cv = GridSearchCV(svc, param_grid, cv=kfold, scoring='accuracy')
grid_cv.fit(X_tn_std, y_tn)

# Only the last expression of a cell is displayed, so the best score and
# parameters are printed explicitly instead of being silently discarded.
print(f'best score: {grid_cv.best_score_:.6f}')
print(f'best params: {grid_cv.best_params_}')
# Keep the best estimator in `clf`; the next cell reuses it together with `kfold`.
clf = grid_cv.best_estimator_
pd.DataFrame(grid_cv.cv_results_).T

In [None]:
# Cross-validate the selected estimator on the training split, then score it
# on the held-out test split. Uses `clf` and `kfold` from the grid-search cell.
from sklearn.model_selection import cross_validate, cross_val_score
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_score = cross_validate(clf, X_tn_std, y_tn, cv=kfold, scoring=metrics)
cv_score2 = cross_val_score(clf, X_tn_std, y_tn, cv=kfold, scoring='accuracy')
# Show the per-fold results; previously they were computed but never displayed.
print(pd.DataFrame(cv_score))
print(cv_score2)
print_result_classifier(y_te, clf.predict(X_te_std))

---
## PyCaret

In [None]:
# PyCaret classification workflow on the wine data.
# NOTE(review): this star import and the later `from pycaret.regression import *`
# both feed the same bare names (setup, compare_models, tune_model, ...), so the
# import that ran most recently decides which functions those names refer to.
# Consider explicit imports - confirm no other cell relies on the star import.
from pycaret.classification import *

# Create the experiment with 'class' as the target column and a fixed session_id.
exp1 = setup(data=df_wine, target='class', session_id=123)
# Keep the model returned by compare_models(), then tune it.
best_model = compare_models()
tuned_model = tune_model(best_model)
# Inspect the tuned model and run predict_model on it (no explicit data passed).
evaluate_model(tuned_model)
predict_model(tuned_model)
final_model = finalize_model(tuned_model)
# Disabled example: save/load the finalized model and predict on rows 20-39
# of df_wine with the last column dropped.
# save_model(final_model, 'wine_model')
# loaded_model = load_model('wine_model')
# new_data = df_wine.iloc[20:40, :-1]
# predictions = predict_model(loaded_model, data=new_data)

In [None]:
# PyCaret regression workflow on the house-price data.
# NOTE(review): this star import follows `from pycaret.classification import *`
# in the classification cell, and both cells call setup, compare_models, etc. by
# the same bare names, so after this cell those names presumably refer to the
# regression versions - confirm. This cell also reassigns best_model,
# tuned_model and final_model, replacing the classification results.
from pycaret.regression import *

# Create the experiment with 'MEDV' as the target column and a fixed session_id.
exp2 = setup(data=df_house, target='MEDV', session_id=456)
# Keep the model returned by compare_models(), then tune it.
best_model = compare_models()
tuned_model = tune_model(best_model)
# Inspect the tuned model and run predict_model on it (no explicit data passed).
evaluate_model(tuned_model)
predict_model(tuned_model)
final_model = finalize_model(tuned_model)
# Disabled example: save/load the finalized model and predict on rows 20-39
# of df_house with the last column dropped.
# save_model(final_model, 'boston_house_model')
# loaded_model = load_model('boston_house_model')
# new_data = df_house.iloc[20:40, :-1]
# predictions = predict_model(loaded_model, data=new_data)