In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.datasets import load_iris

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold    # train, test 데이터 분할
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression, SGDClassifier # 로지스틱 회귀 분석 / 선형 분류
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay, roc_auc_score

In [3]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
data = pd.read_csv(url, encoding="cp949", header=None)

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


#### 전복(Abalone) 데이터
- column 1~7 이용, column 0 (성별) 예측 분류기

In [10]:
enc = LabelEncoder()

y = data.iloc[:, 0]

ey = enc.fit(y.unique())
y = ey.transform(y)

# print(y)

X = data.iloc[:,1:8]

scaler = MinMaxScaler()
X = scaler.fit(X).transform(X)

print(X)

[[0.51351351 0.5210084  0.0840708  ... 0.15030262 0.1323239  0.14798206]
 [0.37162162 0.35294118 0.07964602 ... 0.06624075 0.06319947 0.06826109]
 [0.61486486 0.61344538 0.11946903 ... 0.17182246 0.18564845 0.2077728 ]
 ...
 [0.70945946 0.70588235 0.18141593 ... 0.3527236  0.37788018 0.30543099]
 [0.74324324 0.72268908 0.13274336 ... 0.35642233 0.34298881 0.29347285]
 [0.85810811 0.84033613 0.17256637 ... 0.63517149 0.49506254 0.49177877]]


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=10)

models = [DecisionTreeClassifier(max_depth=3),  KNeighborsClassifier(n_neighbors=3), LogisticRegression(solver='liblinear'), GaussianNB()]
model_names = ['DecisionTreeClassifier', 'KNeighborsClassifier', 'LogisticRegression', 'GaussianNB']

for model, model_name in zip(models,model_names) :
  m = model
  m.fit(X_train, y_train)
  
  print(model_name, '----------------------------------')
  print("Train score", m.score(X_train, y_train))
  print("Test score", m.score(X_test, y_test))
  print('test pred : ', m.predict(X_test))
  print('test target', y_test)
  
print('------------------------------------------------')

DecisionTreeClassifier ----------------------------------
Train score 0.539272030651341
Test score 0.5205741626794258
KNeighborsClassifier ----------------------------------
Train score 0.7340357598978289
Test score 0.508133971291866
LogisticRegression ----------------------------------
Train score 0.5491698595146871
Test score 0.539712918660287
GaussianNB ----------------------------------
Train score 0.5223499361430396
Test score 0.5052631578947369
------------------------------------------------
