### 4. 

### 1. 데이터 읽어오기

In [None]:
import pandas as pd

# Load the training set, the test set and the sample submission, in that order.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Preview the first three training rows.
train.head(3)

### 2. 데이터 확인

In [None]:
# Silence Python FutureWarning messages
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Plotting libraries
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Register a font for Korean text: the entry is inserted at the front of
# matplotlib's font list and then selected as the default family by name.
fe = fm.FontEntry(fname = 'NotoSansKR-Regular.otf', name = 'NotoSansKR')
fm.fontManager.ttflist.insert(0, fe)
plt.rc('font', family='NotoSansKR')

In [None]:
# Number of training rows per 'quality' value, ordered by the quality value.
train['quality'].value_counts().sort_index()

In [None]:
# Tally the rows per quality value once and reuse the result for both axes.
quality_counts = train['quality'].value_counts().sort_index()
x = quality_counts.index
y = quality_counts.values

# Bar chart of how many rows fall into each quality value.
plt.figure(figsize=(4,3), dpi=150)
plt.bar(x,y)

plt.title('와인 품질 분포')
plt.xlabel('와인 품질')
plt.ylabel('갯수')

plt.show()

### 3. 데이터 전처리

In [None]:
# Encode the wine type as an integer flag: 'white' -> 0, anything else -> 1.
train['type'] = train['type'].map(lambda wine_type: int(wine_type != 'white'))
train

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column from the third through the second-to-last.
# NOTE(review): the scaler is fitted on all of `train`, before any
# train/validation split, so held-out rows also influence the scaling.
scaled_columns = train.columns[2:-1]
scaler = MinMaxScaler().fit(train[scaled_columns])
train[scaled_columns] = scaler.transform(train[scaled_columns])
train

In [None]:
# Give the test set the same treatment: encode 'type' ('white' -> 0, else 1),
# then scale with the scaler that was fitted on the training data.
# NOTE(review): assumes test.columns[1:-1] are the same columns, in the same
# order, as the ones the scaler was fitted on -- confirm against the CSVs.
test['type'] = test['type'].map(lambda wine_type: int(wine_type != 'white'))
test_scaled_columns = test.columns[1:-1]
test[test_scaled_columns] = scaler.transform(test[test_scaled_columns])
test

In [None]:
# Model inputs are every column from the third onward; the target is 'quality'.
features = train.columns[2:]
X, y = train[features], train['quality']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. The split is seeded so that the
# validation score is reproducible from run to run (the classifier used in
# this notebook is seeded with the same value).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=42
)

# Check the shape of each split
print(f"X_train.shape : {X_train.shape}")
print(f"y_train.shape : {y_train.shape}")
print(f"X_valid.shape : {X_valid.shape}")
print(f"y_valid.shape : {y_valid.shape}")

### 4. 데이터 학습

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded 10-tree random forest on the training split, then predict the
# validation split.
model = RandomForestClassifier(n_estimators=10, random_state=42).fit(X_train, y_train)
predict = model.predict(X_valid)

# Show the first ten predictions as a quick sanity check.
print(predict[:10])

In [None]:
import numpy as np

def ACC(y_true, pred):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are converted to NumPy arrays before comparing, so the
    comparison is element-wise and positional for lists, tuples, arrays and
    pandas Series alike. Without the conversion, two plain lists compare as a
    single bool (yielding only 0.0 or 1.0).

    Args:
        y_true: Sequence of ground-truth labels.
        pred: Sequence of predicted labels, same shape as ``y_true``.

    Returns:
        The mean of the element-wise equality, i.e. the accuracy in [0, 1].

    Raises:
        ValueError: If ``y_true`` and ``pred`` do not have the same shape.
    """
    y_true = np.asarray(y_true)
    pred = np.asarray(pred)

    # Guard against silent broadcasting (e.g. (n,) against (n, 1)).
    if y_true.shape != pred.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs pred {pred.shape}"
        )

    score = np.mean(y_true == pred)
    return score

# Score the validation predictions.
acc = ACC(y_valid, predict)

# Report the accuracy as a percentage with two decimal places.
print(f"모델의 정확도는 {acc*100:.2f}%입니다.")

In [None]:
import numpy as np

##### Validation plot #####
def make_plot(y_true, pred):
    """Plot true vs. predicted label counts side by side, titled with the accuracy.

    Args:
        y_true: Ground-truth labels.
        pred: Predicted labels, one per entry of ``y_true``.

    Labels are used directly as bar positions, so they must be numeric.
    Draws and shows a matplotlib figure; returns nothing.
    """
    acc = ACC(y_true, pred)
    df_validation = pd.DataFrame({'y_true': y_true, 'y_pred': pred})

    # Per-label frequencies. Each Series is named explicitly because, since
    # pandas 2.0, value_counts() names its result 'count' instead of the
    # source column, which would break the column lookups below.
    true_counts = df_validation['y_true'].value_counts().rename('y_true')
    pred_counts = df_validation['y_pred'].value_counts().rename('y_pred')

    # Align both counts on the union of labels; a label that is missing on one
    # side counts as 0. Sort so the bars appear in label order.
    df_val_pred_count = (
        pd.concat([true_counts, pred_counts], axis=1).fillna(0).sort_index()
    )

    ############################################################
    # Draw the chart
    ############################################################

    # Bar positions come from the combined index so that they always match
    # the number of bar heights, even for a label that was only predicted.
    x = df_val_pred_count.index
    y_true_count = df_val_pred_count['y_true']
    y_pred_count = df_val_pred_count['y_pred']

    width = 0.35
    plt.figure(figsize=(5,3),dpi=150)

    plt.title('ACC : ' + str(acc)[:6])
    plt.xlabel('quality')
    plt.ylabel('count')

    # Two bars per label: true count shifted left, predicted count shifted right.
    plt.bar([idx-width/2 for idx in x], y_true_count, width, label='real')
    plt.bar([idx+width/2 for idx in x], y_pred_count,  width, label='pred')

    plt.legend()
    plt.show()
    
# Plot true vs. predicted label counts for the validation split.
make_plot(y_valid, predict)

In [None]:
# Retrain on the full training set (no hold-out) and predict the test set.
features = train.columns[2:]
X, y = train[features], train['quality']

model = RandomForestClassifier(n_estimators=10, random_state=42).fit(X, y)
predict = model.predict(test[features])

### 5. CSV파일로 저장

In [None]:
# Store the predictions in the submission frame's 'quality' column and preview it.
submission['quality'] = predict
submission.head()

In [None]:
# Write the submission to 'submission.csv' without the index column.
submission.to_csv('submission.csv', index=False)