<a href="https://colab.research.google.com/github/sgr1118/sgr1118/blob/main/%5BExp_01%5D_Sklearn_Classifiers_with_toy_datasets(digits%2C_wine%2C_breast_cancer).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1-11. 프로젝트 (1) load_digits : 손글씨를 분류해 봅시다

In [None]:
# (1) 필요한 모듈 import
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
# Use fully-qualified option keys: pandas resolves option names by regex, and
# the bare 'max_rows' / 'max_columns' also match the 'styler.render.*' options
# in recent pandas releases, which raises OptionError.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 30)

# (2) Prepare the data
# Load the digits dataset and keep features and labels together in one frame,
# with the label stored in a trailing 'target' column.
dataset = load_digits()
feature, labels = dataset.data, dataset.target
digits_df = pd.DataFrame(feature, columns=dataset.feature_names).assign(target=labels)

# (3) Understand the data
# Inspect the target names / class balance:
#dataset.target_names  -> array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
#print(digits_df['target'].value_counts())

# Print the dataset description:
#print(dataset.DESCR)

# Separate the feature columns from the label column ('target' is last).
X = digits_df.drop(columns='target')
Y = digits_df['target']

# Standardise every feature column.
scalerX = StandardScaler().fit(X).transform(X)
#print(scalerX.mean(), scalerX.std())  # observed: 1.606332701182753e-18 0.9762812094883317
# 함수 작성
def get_scores(model, xtrain, xtest, ytrain, ytest):
    """Summarise a fitted classifier as ``'<train_acc> <test_acc> <test_auc>'``.

    Args:
        model: Fitted classifier exposing ``score``, ``predict_proba`` and
            ``classes_``.
        xtrain, ytrain: Training features and labels.
        xtest, ytest: Held-out features and labels.

    Returns:
        A string with three space-separated numbers formatted to four
        decimals: training accuracy, test accuracy and test ROC AUC.
    """
    train_acc = model.score(xtrain, ytrain)
    test_acc = model.score(xtest, ytest)

    # Use the full probability matrix. Thresholding a single column and
    # mapping it onto two hard-coded labels is meaningless when there are
    # more than two classes.
    proba = model.predict_proba(xtest)
    if proba.shape[1] == 2:
        # Binary case: roc_auc_score expects the positive-class column only.
        auc = roc_auc_score(ytest, proba[:, 1])
    else:
        # Multiclass case: one-vs-rest AUC; `labels` ties each probability
        # column to the class it belongs to.
        auc = roc_auc_score(ytest, proba, multi_class='ovr',
                            labels=model.classes_)
    return '{:.4f} {:.4f} {:.4f}'.format(train_acc, test_acc, auc)

def make_models(xtrain, xtest, ytrain, ytest):
    """Fit every candidate classifier and print one score line per model.

    Each printed line is ``<label> [max_depth] <get_scores output>``. The
    tree-based models are reported once unconstrained and then once per
    ``max_depth`` in 3..7 to show the effect of limiting depth.

    Args:
        xtrain, ytrain: Training features and labels.
        xtest, ytest: Held-out features and labels.

    Returns:
        None. Results are written to stdout.
    """
    def report(estimator, *label):
        # Fit on the training split and print the label followed by scores.
        fitted = estimator.fit(xtrain, ytrain)
        print(*label, get_scores(fitted, xtrain, xtest, ytrain, ytest))

    report(LogisticRegression(max_iter=5000), 'model1')

    report(DecisionTreeClassifier(random_state=10), 'model2')
    # Address overfitting: cap the tree depth.
    for d in range(3, 8):
        report(DecisionTreeClassifier(max_depth=d, random_state=10),
               'model2', d)

    report(RandomForestClassifier(random_state=0), 'model3')
    # Address overfitting: cap the depth of a 500-tree forest.
    for d in range(3, 8):
        report(RandomForestClassifier(n_estimators=500, max_depth=d,
                                      random_state=10),
               'model3', d)

    # probability=True is required so that SVC exposes predict_proba.
    report(SVC(kernel='linear', C=1.0, random_state=0, probability=True),
           'model4')

    # modified_huber is one of the SGD losses that supports predict_proba.
    # The estimator is seeded like the others so its line is reproducible.
    report(SGDClassifier(loss='modified_huber', max_iter=500, random_state=0),
           'model5')

# (4) Split into train and test sets
# 80/20 split with random_state=10. The raw features are split first and the
# scaler is fitted on the training rows only, so that test-set statistics do
# not leak into the preprocessing step.
xtrain_raw, xtest_raw, ytrain1, ytest1 = train_test_split(
    X, Y, test_size=0.2, random_state=10)
split_scaler = StandardScaler().fit(xtrain_raw)
xtrain1 = split_scaler.transform(xtrain_raw)
xtest1 = split_scaler.transform(xtest_raw)

# Search for the best max_depth (disabled; fit on the training split only so
# the test-set score stays an honest estimate).
#params = {'max_depth': range(3, 10)}
#model = RandomForestClassifier(500, random_state=10)
#gs = GridSearchCV(model, params, cv=5)
#gs.fit(xtrain1, ytrain1)
#result = pd.DataFrame(gs.cv_results_)
#result
#model = gs.best_estimator_
#print(model.score(xtest1, ytest1), gs.score(xtest1, ytest1), gs.best_params_)

# (5) Train the models and report their scores
make_models(xtrain1, xtest1, ytrain1, ytest1)

# (6) Model evaluation
# NOTE: y_pred is not defined in this cell; obtain it from a fitted model's
# predict(xtest1) before enabling the line below.
#print(classification_report(ytest1, y_pred))

model1 0.9986 0.9694 0.0056
model2 1.0000 0.8500 0.0139
model2 3 0.4857 0.4278 0.0944
model2 4 0.5832 0.5194 0.0944
model2 5 0.6882 0.6389 0.0750
model2 6 0.8079 0.7500 0.0694
model2 7 0.8942 0.8167 0.0306
model3 1.0000 0.9694 0.0111
model3 3 0.8970 0.8778 0.0944
model3 4 0.9388 0.9139 0.0944
model3 5 0.9749 0.9333 0.0722
model3 6 0.9868 0.9444 0.0500
model3 7 0.9979 0.9472 0.0250
model4 1.0000 0.9750 0.0056
model5 0.9875 0.9472 0.0194


# 1-12. 프로젝트 (2) load_wine : 와인을 분류해 봅시다

In [None]:
# (1) 필요한 모듈 import
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
# Use fully-qualified option keys: pandas resolves option names by regex, and
# the bare 'max_rows' / 'max_columns' also match the 'styler.render.*' options
# in recent pandas releases, which raises OptionError.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 30)

# (2) Prepare the data
# Load the wine dataset and keep features and labels together in one frame,
# with the label stored in a trailing 'target' column.
dataset = load_wine()
feature, labels = dataset.data, dataset.target

#dataset.target_names  -> array(['class_0', 'class_1', 'class_2'], dtype='<U7')
wine_df = pd.DataFrame(feature, columns=dataset.feature_names).assign(target=labels)

# (3) Understand the data
# Inspect the target names / class balance:
#print(dataset.target_names)  # class_0 -> 0, class_1 -> 1, class_2 -> 2
#print(wine_df['target'].value_counts())  # observed: 1: 71, 0: 59, 2: 48

# Print the dataset description:
#print(dataset.DESCR)

# Separate the feature columns from the label column ('target' is last).
X = wine_df.drop(columns='target')
Y = wine_df['target']

# Standardise every feature column.
scalerX = StandardScaler().fit(X).transform(X)
#print(scalerX.mean(), scalerX.std())  # observed: 4.66735072755122e-16 1.0

# 함수 작성
def get_scores(model, xtrain, xtest, ytrain, ytest):
    """Summarise a fitted classifier as ``'<train_acc> <test_acc> <test_auc>'``.

    Args:
        model: Fitted classifier exposing ``score``, ``predict_proba`` and
            ``classes_``.
        xtrain, ytrain: Training features and labels.
        xtest, ytest: Held-out features and labels.

    Returns:
        A string with three space-separated numbers formatted to four
        decimals: training accuracy, test accuracy and test ROC AUC.
    """
    train_acc = model.score(xtrain, ytrain)
    test_acc = model.score(xtest, ytest)

    # Use the full probability matrix. Thresholding a single column and
    # mapping it onto two hard-coded labels is meaningless when there are
    # more than two classes.
    proba = model.predict_proba(xtest)
    if proba.shape[1] == 2:
        # Binary case: roc_auc_score expects the positive-class column only.
        auc = roc_auc_score(ytest, proba[:, 1])
    else:
        # Multiclass case: one-vs-rest AUC; `labels` ties each probability
        # column to the class it belongs to.
        auc = roc_auc_score(ytest, proba, multi_class='ovr',
                            labels=model.classes_)
    return '{:.4f} {:.4f} {:.4f}'.format(train_acc, test_acc, auc)

def make_models(xtrain, xtest, ytrain, ytest):
    """Fit every candidate classifier and print one score line per model.

    Each printed line is ``<label> [max_depth] <get_scores output>``. The
    tree-based models are reported once unconstrained and then once per
    ``max_depth`` in 3..7 to show the effect of limiting depth.

    Args:
        xtrain, ytrain: Training features and labels.
        xtest, ytest: Held-out features and labels.

    Returns:
        None. Results are written to stdout.
    """
    def report(estimator, *label):
        # Fit on the training split and print the label followed by scores.
        fitted = estimator.fit(xtrain, ytrain)
        print(*label, get_scores(fitted, xtrain, xtest, ytrain, ytest))

    report(LogisticRegression(max_iter=5000), 'model1')

    report(DecisionTreeClassifier(random_state=10), 'model2')
    # Address overfitting: cap the tree depth.
    for d in range(3, 8):
        report(DecisionTreeClassifier(max_depth=d, random_state=10),
               'model2', d)

    report(RandomForestClassifier(random_state=0), 'model3')
    # Address overfitting: cap the depth of a 500-tree forest.
    for d in range(3, 8):
        report(RandomForestClassifier(n_estimators=500, max_depth=d,
                                      random_state=10),
               'model3', d)

    # probability=True is required so that SVC exposes predict_proba.
    report(SVC(kernel='linear', C=1.0, random_state=0, probability=True),
           'model4')

    # modified_huber is one of the SGD losses that supports predict_proba.
    # The estimator is seeded like the others so its line is reproducible.
    report(SGDClassifier(loss='modified_huber', max_iter=100, random_state=0),
           'model5')

# (4) Split into train and test sets
# 80/20 split with random_state=10. The raw features are split first and the
# scaler is fitted on the training rows only, so that test-set statistics do
# not leak into the preprocessing step.
xtrain_raw, xtest_raw, ytrain1, ytest1 = train_test_split(
    X, Y, test_size=0.2, random_state=10)
split_scaler = StandardScaler().fit(xtrain_raw)
xtrain1 = split_scaler.transform(xtrain_raw)
xtest1 = split_scaler.transform(xtest_raw)

# Search for the best max_depth (disabled; fit on the training split only so
# the test-set score stays an honest estimate).
#params = {'max_depth': range(3, 10)}
#model = RandomForestClassifier(500, random_state=10)
#gs = GridSearchCV(model, params, cv=5)
#gs.fit(xtrain1, ytrain1)
#result = pd.DataFrame(gs.cv_results_)
#result
#model = gs.best_estimator_
#print(model.score(xtest1, ytest1), gs.score(xtest1, ytest1), gs.best_params_)

# (5) Train the models and report their scores
make_models(xtrain1, xtest1, ytrain1, ytest1)

# (6) Model evaluation
# NOTE: y_pred is not defined in this cell; obtain it from a fitted model's
# predict(xtest1) before enabling the line below.
#print(classification_report(ytest1, y_pred))

4.66735072755122e-16 1.0
model1 1.0000 0.9167 0.0833
model2 1.0000 0.9444 0.0556
model2 3 0.9859 0.9444 0.0556
model2 4 0.9930 0.9444 0.0556
model2 5 1.0000 0.9444 0.0556
model2 6 1.0000 0.9444 0.0556
model2 7 1.0000 0.9444 0.0556
model3 1.0000 0.9444 0.0833
model3 3 1.0000 0.9167 0.1111
model3 4 1.0000 0.9444 0.1111
model3 5 1.0000 0.9444 0.0833
model3 6 1.0000 0.9444 0.0833
model3 7 1.0000 0.9444 0.0833
model4 1.0000 0.9167 0.0833
model5 0.9930 0.9167 0.0833


# 1-13. 프로젝트 (3) load_breast_cancer : 유방암 여부를 진단해 봅시다

In [None]:
# (1) 필요한 모듈 import
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
# Use fully-qualified option keys: pandas resolves option names by regex, and
# the bare 'max_rows' / 'max_columns' also match the 'styler.render.*' options
# in recent pandas releases, which raises OptionError.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 30)

# (2) Prepare the data
# Load the breast-cancer dataset and keep features and labels together in one
# frame, with the label stored in a trailing 'target' column.
dataset = load_breast_cancer()
feature, labels = dataset.data, dataset.target
#dataset.target_names  -> array(['malignant', 'benign'], dtype='<U9')
cancer_df = pd.DataFrame(feature, columns=dataset.feature_names).assign(target=labels)
#cancer_df.head()

# (3) Understand the data
#print(dataset.target_names)  # malignant -> 0, benign -> 1
#print(cancer_df['target'].value_counts())  # observed: 1: 357, 0: 212

# Separate the feature columns from the label column ('target' is last).
X = cancer_df.drop(columns='target')
Y = cancer_df['target']

# Standardise every feature column.
scalerX = StandardScaler().fit(X).transform(X)
#print(scalerX.mean(), scalerX.std())  # observed: -6.118909323768877e-16 1.0

# 함수 작성
def get_scores(model, xtrain, xtest, ytrain, ytest):
    """Summarise a fitted binary classifier as three space-separated numbers.

    The numbers are, in order: ``model.score`` on the training split,
    ``model.score`` on the test split, and the test ROC AUC computed from the
    predicted probability of the second class (column index 1). Each is
    formatted to four decimals.
    """
    metrics = [model.score(xtrain, ytrain), model.score(xtest, ytest)]
    positive_proba = model.predict_proba(xtest)[:, 1]
    metrics.append(roc_auc_score(ytest, positive_proba))
    return ' '.join(format(value, '.4f') for value in metrics)

def make_models(xtrain, xtest, ytrain, ytest):
    """Fit every candidate classifier and print one score line per model.

    Each printed line is ``<label> [max_depth] <get_scores output>``. The
    tree-based models are reported once unconstrained and then once per
    ``max_depth`` in 3..7 to show the effect of limiting depth.

    Args:
        xtrain, ytrain: Training features and labels.
        xtest, ytest: Held-out features and labels.

    Returns:
        None. Results are written to stdout.
    """
    def report(estimator, *label):
        # Fit on the training split and print the label followed by scores.
        fitted = estimator.fit(xtrain, ytrain)
        print(*label, get_scores(fitted, xtrain, xtest, ytrain, ytest))

    report(LogisticRegression(max_iter=5000), 'model1')

    report(DecisionTreeClassifier(random_state=10), 'model2')
    # Address overfitting: cap the tree depth.
    for d in range(3, 8):
        report(DecisionTreeClassifier(max_depth=d, random_state=10),
               'model2', d)

    report(RandomForestClassifier(random_state=0), 'model3')
    # Address overfitting: cap the depth of a 500-tree forest.
    for d in range(3, 8):
        report(RandomForestClassifier(n_estimators=500, max_depth=d,
                                      random_state=10),
               'model3', d)

    # probability=True is required so that SVC exposes predict_proba.
    report(SVC(kernel='linear', C=1.0, random_state=0, probability=True),
           'model4')

    # modified_huber is one of the SGD losses that supports predict_proba.
    # The estimator is seeded like the others so its line is reproducible.
    report(SGDClassifier(loss='modified_huber', max_iter=100, random_state=0),
           'model5')

# (4) Split into train and test sets
# 80/20 split with random_state=10. The raw features are split first and the
# scaler is fitted on the training rows only, so that test-set statistics do
# not leak into the preprocessing step.
xtrain_raw, xtest_raw, ytrain1, ytest1 = train_test_split(
    X, Y, test_size=0.2, random_state=10)
split_scaler = StandardScaler().fit(xtrain_raw)
xtrain1 = split_scaler.transform(xtrain_raw)
xtest1 = split_scaler.transform(xtest_raw)

# Search for the best max_depth (disabled; fit on the training split only so
# the test-set score stays an honest estimate).
#params = {'max_depth': range(3, 10)}
#model = RandomForestClassifier(500, random_state=10)
#gs = GridSearchCV(model, params, cv=5)
#gs.fit(xtrain1, ytrain1)
#result = pd.DataFrame(gs.cv_results_)
#result
#model = gs.best_estimator_
#print(model.score(xtest1, ytest1), gs.score(xtest1, ytest1), gs.best_params_)

# (5) Train the models and report their scores
make_models(xtrain1, xtest1, ytrain1, ytest1)

# (6) Model evaluation
# NOTE: y_pred is not defined in this cell; obtain it from a fitted model's
# predict(xtest1) before enabling the line below.
#print(classification_report(ytest1, y_pred))

NameError: ignored