In [64]:
# Load the 'penguins' example dataset through seaborn and preview the first rows.
import seaborn as sns
df = sns.load_dataset('penguins')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [65]:
# Dimensions of the frame as (rows, columns).
df.shape

(344, 7)

In [66]:
# Per-column non-null counts and dtypes; a non-null count below the row count means missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [67]:
# Column dtypes of the raw frame.
df.dtypes

species               object
island                object
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
dtype: object

In [68]:
# Missing values: count them per column (inspection only; nothing is removed or filled in this cell).
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [69]:
# Distinct values of 'sex'; NaN shows up here when the column has missing entries.
df.sex.unique()

array(['Male', 'Female', nan], dtype=object)

In [70]:
# Most frequent 'sex' value: value_counts() orders by descending count, so index[0] is the top category.
df.sex.value_counts().index[0]

'Male'

In [71]:
# Column labels of the frame.
df.columns

Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex'],
      dtype='object')

In [72]:
# Numeric measurement columns that contain missing values and are imputed with their median.
missing = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']

In [73]:
# Numeric columns: fill gaps with each column's own median (all listed columns in one call).
df[missing] = df[missing].fillna(df[missing].median())

# 'sex': fill gaps with its most frequent value (value_counts sorts by descending count).
most_common_sex = df['sex'].value_counts().index[0]
df['sex'] = df['sex'].fillna(most_common_sex)

In [74]:
# Preview the first rows after imputation.
df.head(3)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female


In [75]:
# Number of rows per island.
df.island.value_counts()

Biscoe       168
Dream        124
Torgersen     52
Name: island, dtype: int64

In [76]:
# Number of rows per species (the classification target); the classes are imbalanced.
df.species.value_counts()

Adelie       152
Gentoo       124
Chinstrap     68
Name: species, dtype: int64

In [77]:
# label encoding
#
# Convert every object-dtype column (species, island and sex at this point) to
# integer codes. One LabelEncoder is fitted per column and stored in `encoders`,
# so every code -> label mapping stays available afterwards, e.g.
# encoders['species'].inverse_transform(codes) turns predicted codes back into names.
# A single shared encoder re-fitted per column would only remember the last column.

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
label = df.select_dtypes('object').columns
encoders = {}
for col in label:
    # `le` is rebound on every pass, so after the loop it is the encoder of the
    # last encoded column (the same final state a shared, re-fitted encoder has).
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [78]:
# Data conversion / dummy encoding
# dtypes after label encoding: the former object columns now hold integer codes.

import pandas as pd
df.dtypes

species                int32
island                 int32
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                    int32
dtype: object

In [79]:
# Mark the integer-coded 'island' and 'sex' columns as categorical so that the
# later pd.get_dummies call expands them; 'species' (the target) stays numeric.
category = ['island', 'sex']
df[category] = df[category].astype('category')
df.dtypes

species                 int32
island               category
bill_length_mm        float64
bill_depth_mm         float64
flipper_length_mm     float64
body_mass_g           float64
sex                  category
dtype: object

In [80]:
# One-hot encode: without a `columns` argument get_dummies expands the categorical
# columns (island, sex) into indicator columns and leaves the numeric columns as they are.
df = pd.get_dummies(df)
df.dtypes

species                int32
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
island_0               uint8
island_1               uint8
island_2               uint8
sex_0                  uint8
sex_1                  uint8
dtype: object

In [81]:
# Create derived variables
# Preview of the frame before the derived column is added.

df.head(3)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_0,island_1,island_2,sex_0,sex_1
0,0,39.1,18.7,181.0,3750.0,0,0,1,0,1
1,0,39.5,17.4,186.0,3800.0,0,0,1,1,0
2,0,40.3,18.0,195.0,3250.0,0,0,1,1,0


In [82]:
# Derived feature: body-mass quantile bin. q=5 asks for five equal-frequency bins;
# labels=False returns the integer bin index instead of the interval itself.
mass_bin = pd.qcut(df['body_mass_g'], q=5, labels=False)
df['body_mass_g_qcut'] = mass_bin
df.head(3)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_0,island_1,island_2,sex_0,sex_1,body_mass_g_qcut
0,0,39.1,18.7,181.0,3750.0,0,0,1,0,1,1
1,0,39.5,17.4,186.0,3800.0,0,0,1,1,0,1
2,0,40.3,18.0,195.0,3250.0,0,0,1,1,0,0


In [83]:
# Integer bin index per row (the same call that produced body_mass_g_qcut), displayed for inspection.
pd.qcut(df['body_mass_g'], 5, labels = False)

0      1
1      1
2      0
3      2
4      0
      ..
339    2
340    3
341    4
342    4
343    4
Name: body_mass_g, Length: 344, dtype: int64

In [84]:
# Same binning without labels=False: shows the interval boundaries of each of the five bins.
pd.qcut(df['body_mass_g'], 5)

0        (3475.0, 3800.0]
1        (3475.0, 3800.0]
2      (2699.999, 3475.0]
3        (3800.0, 4300.0]
4      (2699.999, 3475.0]
              ...        
339      (3800.0, 4300.0]
340      (4300.0, 4950.0]
341      (4950.0, 6300.0]
342      (4950.0, 6300.0]
343      (4950.0, 6300.0]
Name: body_mass_g, Length: 344, dtype: category
Categories (5, interval[float64, right]): [(2699.999, 3475.0] < (3475.0, 3800.0] < (3800.0, 4300.0] < (4300.0, 4950.0] < (4950.0, 6300.0]]

In [85]:
# dtypes after one-hot encoding and the derived bin column.
df.dtypes

species                int32
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
island_0               uint8
island_1               uint8
island_2               uint8
sex_0                  uint8
sex_1                  uint8
body_mass_g_qcut       int64
dtype: object

In [86]:
# Scaling
#
# Min-max scale the four continuous measurement columns in place.
# NOTE(review): the scaler is fitted on the full frame before the train/validation
# split, so validation rows influence the scaling parameters. Consider fitting on
# the training split only and applying transform() to the validation split.

from sklearn.preprocessing import MinMaxScaler
scaler = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
mm = MinMaxScaler()
df[scaler] = mm.fit_transform(df[scaler])

In [87]:
df.head(3)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_0,island_1,island_2,sex_0,sex_1,body_mass_g_qcut
0,0,0.254545,0.666667,0.152542,0.291667,0,0,1,0,1,1
1,0,0.269091,0.511905,0.237288,0.305556,0,0,1,1,0,1
2,0,0.298182,0.583333,0.389831,0.152778,0,0,1,1,0,0


In [89]:
# Train/validation split
from sklearn.model_selection import train_test_split

# Features are selected by label (everything except 'species') instead of by
# position, so the split no longer depends on the target being the first column.
# The split is stratified on the target to keep class proportions in both parts,
# and seeded for reproducibility.
x_tr, x_val, y_tr, y_val = train_test_split(
    df.drop(columns='species'),
    df['species'],
    test_size=0.2,
    stratify=df['species'],
    random_state=1,
)

In [93]:
# Sanity check: shapes of the train/validation features and targets.
x_tr.shape, x_val.shape, y_tr.shape, y_val.shape

((275, 10), (69, 10), (275,), (69,))

In [94]:
# 7. Model training: random forest
#
# random_state is fixed so that repeated runs give the same forest and therefore
# the same predictions and scores; without it every run can differ.

from sklearn.ensemble import RandomForestClassifier
model1 = RandomForestClassifier(random_state=1)
model1.fit(x_tr, y_tr)
pred1 = model1.predict(x_val)

In [95]:
# Model training: AdaBoost
# random_state is fixed so that repeated runs produce the same ensemble and predictions.
from sklearn.ensemble import AdaBoostClassifier
model2 = AdaBoostClassifier(random_state=1)
model2.fit(x_tr, y_tr)
pred2 = model2.predict(x_val)

In [96]:
# 8. Ensemble: hard (majority) voting over the random forest and AdaBoost models.
# NOTE(review): with only two voters, any disagreement is a tie; scikit-learn
# documents ties as being resolved by ascending class label. Confirm that this is
# intended, or add a third estimator / use soft voting.
from sklearn.ensemble import VotingClassifier
voters = [('rf', model1), ('ad', model2)]
clf = VotingClassifier(estimators=voters, voting='hard')
pred3 = clf.fit(x_tr, y_tr).predict(x_val)

In [98]:
# Validation-set predictions of the random forest (model1).
pred1

array([0, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2,
       0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2,
       2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0,
       2, 0, 1])

In [100]:
# Validation-set predictions of AdaBoost (model2).
pred2

array([0, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2,
       0, 1, 1, 0, 0, 0, 1, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2,
       2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0,
       2, 0, 1])

In [99]:
# Validation-set predictions of the voting ensemble (clf).
pred3

array([0, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2,
       0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2,
       2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0,
       2, 0, 1])

In [101]:
# 9. Model evaluation: validation accuracy of each model.

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# (printed label, predictions) pairs, reported in the order RF, AdaBoost, voting.
scored = (
    ('랜덤포레스트 정확도: ', pred1),
    ('에이다 정확도: ', pred2),
    ('보팅 정확도: ', pred3),
)
for title, preds in scored:
    print(title, accuracy_score(y_val, preds))

랜덤포레스트 정확도:  1.0
에이다 정확도:  0.9855072463768116
보팅 정확도:  1.0


In [104]:
# Per-class precision/recall/F1 for the random forest. classification_report returns
# a multi-line string, so it is printed; as a bare expression the notebook shows
# the raw repr with literal '\n' sequences.
print(classification_report(y_val, pred1))

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00        30\n           1       1.00      1.00      1.00        14\n           2       1.00      1.00      1.00        25\n\n    accuracy                           1.00        69\n   macro avg       1.00      1.00      1.00        69\nweighted avg       1.00      1.00      1.00        69\n'

In [105]:
# 10. Hyperparameter tuning: 3-fold grid search over forest size and depth.
# NOTE(review): in the cells visible here the tuned estimator is not used for the
# saved predictions; confirm whether gscv.best_estimator_ should replace the
# untuned models.

from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators': [50, 100], 'max_depth': [4, 6]}
# Seed the forest so that the cross-validation scores, and hence best_params_,
# are reproducible between runs.
model4 = RandomForestClassifier(random_state=1)
gscv = GridSearchCV(estimator=model4, param_grid=parameters, cv=3)
gscv.fit(x_tr, y_tr)
gscv.best_params_

{'max_depth': 4, 'n_estimators': 100}

In [106]:
# 11. Save predictions
# Voting-ensemble predictions (integer species codes) for the validation rows.
pred3

array([0, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2,
       0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2,
       2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0,
       2, 0, 1])

In [111]:
# Write one row per validation sample: 'id' is the row's original index label in df,
# 'pred' is the voting ensemble's predicted species code.
submission = pd.DataFrame({'id': y_val.index, 'pred': pred3})
submission.to_csv('0000.csv', index=False)

In [112]:
# Read the file back to verify that it was written with the expected columns.
check = pd.read_csv('0000.csv')
check.head()

Unnamed: 0,id,pred
0,57,0
1,173,1
2,213,1
3,50,0
4,25,0
