### 5. 

### 1. 데이터 읽어오기

In [None]:
import pandas as pd

# Load the training set, the test set and the submission template, in that order.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

### 2. 데이터 확인

In [None]:
# Silence FutureWarning messages; the filter is installed before the plotting
# libraries below are imported.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Plotting libraries
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Register the bundled NotoSansKR font file so Korean text can be rendered in
# figures: the entry is placed first in the font manager's list and then
# selected as the default font family.
fe = fm.FontEntry(fname = 'NotoSansKR-Regular.otf', name = 'NotoSansKR')
fm.fontManager.ttflist.insert(0, fe)
plt.rc('font', family='NotoSansKR')

In [None]:
import seaborn as sns

# Columns whose pairwise correlation (together with the 'quality' target)
# is drawn as a heatmap.
features = ['fixed acidity', 'volatile acidity', 'citric acid',
            'residual sugar', 'chlorides', 'free sulfur dioxide',
            'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
            'type']

# 'type' holds strings at this point (it is compared with 'white' / 'red'
# elsewhere in this file). Recent pandas versions raise when corr() meets a
# non-numeric column instead of silently skipping it, so keep only the
# numeric columns before computing the correlation matrix.
corr_matrix = train[features + ['quality']].select_dtypes(include='number').corr()

plt.figure(figsize=(10, 5))
ax = sns.heatmap(corr_matrix, annot=True)
plt.show()

In [None]:
# Split the training rows by wine type.
white = train[train['type'] == 'white']
red = train[train['type'] == 'red']

# Report how many rows each wine type contributes.
# (The red-wine label previously repeated the word "와인".)
print('화이트 와인 데이터 개수 : ', white.shape[0])
print('레드 와인 데이터 개수 : ', red.shape[0])

In [None]:
# Draw with the ggplot theme.
plt.style.use('ggplot')

# Row counts per wine type, with one bar colour per quality grade.
sns.countplot(x='type', hue='quality', data=train)
plt.title('와인 type에 따른 품질등급별 데이터 개수')
plt.show()

In [None]:
plt.style.use('ggplot')

plt.figure(figsize=(6, 4))
plt.suptitle('white / red', fontsize=20)

# Left panel: white wines, right panel: red wines. Each panel shows the
# number of rows per quality grade.
for panel, wine_rows in enumerate((white, red), start=1):
    plt.subplot(1, 2, panel)
    quality_counts = wine_rows['quality'].value_counts()
    sns.barplot(x=quality_counts.index, y=quality_counts)

plt.show()

In [None]:
# Compare total and free sulfur dioxide row by row.
total_so2 = train['total sulfur dioxide']
free_so2 = train['free sulfur dioxide']

# Strict comparisons so that the three counts match the printed labels and
# do not overlap. The equality check compares the two *different* columns;
# comparing a column with itself would count every non-null row.
total_count = (total_so2 > free_so2).sum()
same_count = (total_so2 == free_so2).sum()
sulfur_count = (total_so2 < free_so2).sum()

print('total > free에 해당하는 개수 :', total_count)
print('두 변수가 같은 경우의 개수 :', same_count)
print('total < free에 해당하는 개수 :', sulfur_count)

In [None]:
# Derived feature: total sulfur dioxide minus free sulfur dioxide, added to
# both frames. The column name is kept exactly as spelled because the
# scaling step's feature list refers to it by this name.
train['free et sulfur dioxid'] = train['total sulfur dioxide'].sub(train['free sulfur dioxide'])
test['free et sulfur dioxid'] = test['total sulfur dioxide'].sub(test['free sulfur dioxide'])

In [None]:
# Remove the 'total sulfur dioxide' column from both frames.
train = train.drop(columns=['total sulfur dioxide'])
test = test.drop(columns=['total sulfur dioxide'])

### 3. 데이터 전처리

In [None]:
# Encode wine type as an integer: 'white' -> 0, any other value -> 1.
train['type'] = (train['type'] != 'white').astype('int64')
test['type'] = (test['type'] != 'white').astype('int64')

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale; 'type' is not in this list and is left as is.
features = [
    'fixed acidity', 'volatile acidity', 'citric acid',
    'residual sugar', 'chlorides', 'free sulfur dioxide',
    'free et sulfur dioxid', 'density', 'pH', 'sulphates', 'alcohol',
]

# Fit the scaler on the training frame only, then apply the same
# transformation to both train and test.
scaler = MinMaxScaler().fit(train[features])
train[features] = scaler.transform(train[features])
test[features] = scaler.transform(test[features])

In [None]:
# Model inputs: every column from position 2 onward.
# NOTE(review): this presumes the first two columns of train.csv are a row id
# and the 'quality' target — confirm against the file.
# (DataFrame has no `.column` attribute; the accessor is `.columns`.)
features = train.columns[2:]

X = train[features]
# The target is the 'quality' label, not the feature matrix itself.
y = train['quality']

In [None]:
import numpy as np

##### Metric: ACCURACY #####
def ACC(y_true, pred):
    """Return the mean of the element-wise comparison ``y_true == pred``.

    For two equal-length arrays this is the fraction of positions where the
    prediction equals the true label.
    """
    is_match = y_true == pred
    return np.mean(is_match)

##### Validation plot: true vs. predicted label counts #####
def make_plot(y_true, pred):
    """Show grouped bars comparing true and predicted label frequencies.

    The figure title displays the score returned by ``ACC(y_true, pred)``.

    Args:
        y_true: True labels of the validation rows.
        pred: Predicted labels, one per entry of ``y_true``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    acc = ACC(y_true, pred)
    df_validation = pd.DataFrame({'y_true': y_true, 'y_pred': pred})

    # Name each count series explicitly. Since pandas 2.0 value_counts()
    # returns a Series named 'count', so relying on the inherited name would
    # make the column lookups below fail.
    true_counts = df_validation['y_true'].value_counts().rename('y_true')
    pred_counts = df_validation['y_pred'].value_counts().rename('y_pred')

    # Align both counts on the label; a label missing on one side counts as 0.
    df_val_pred_count = (
        pd.concat([true_counts, pred_counts], axis=1).fillna(0).sort_index()
    )

    # Take x positions from the combined index so they always have the same
    # length as both count columns, even when a label appears in only one of
    # y_true / pred.
    x = df_val_pred_count.index
    y_true_count = df_val_pred_count['y_true']
    y_pred_count = df_val_pred_count['y_pred']

    width = 0.35
    plt.figure(figsize=(5, 3), dpi=150)

    plt.title('ACC : ' + str(acc)[:6])
    plt.xlabel('quality')
    plt.ylabel('count')

    # Two bars per label, shifted half a bar width to the left and right.
    plt.bar([idx - width / 2 for idx in x], y_true_count, width, label='real')
    plt.bar([idx + width / 2 for idx in x], y_pred_count, width, label='pred')

    plt.legend()
    plt.show()

### 4. 데이터 학습

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# Stratified 5-fold split with a fixed seed so the folds are reproducible.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One fitted model per fold, kept for the test-set predictions made later.
models = []

for train_idx, valid_idx in kfold.split(X, y):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    models.append(model)

    predict = model.predict(X_valid)
    # Print the model that was just fitted; no separate index counter is
    # needed to fetch it back out of `models`.
    print(model)

    # Plot true vs. predicted label counts for this fold's validation rows.
    make_plot(y_valid, predict)

In [None]:
# Predict the test set with the first stored model and preview ten labels.
first_model = models[0]
predict = first_model.predict(test[features])
predict[:10]

In [None]:
# Copy the labels held in `predict` into the submission's 'quality' column.
submission['quality'] = predict

In [None]:
# Class-label predictions on the test set from each of the five stored models.
pred0, pred1, pred2, pred3, pred4 = (
    models[fold].predict(test[features]) for fold in range(5)
)

In [None]:
# One column per model, one row per test sample.
pred = pd.DataFrame(
    dict(pred0=pred0, pred1=pred1, pred2=pred2, pred3=pred3, pred4=pred4)
)
pred

In [None]:
# Row-wise mode across the prediction columns (the most frequent label per
# row); the result is only displayed as cell output, not stored.
pred.mode(axis=1) 

In [None]:
# Class-probability predictions on the test set from each of the five models.
pred0, pred1, pred2, pred3, pred4 = (
    models[fold].predict_proba(test[features]) for fold in range(5)
)

# Preview the probability table of the first model.
pd.DataFrame(pred0)

In [None]:
# Average the five probability tables element-wise.
pred = pd.DataFrame(sum((pred0, pred1, pred2, pred3, pred4)) / 5)

In [None]:
# Column k of a predict_proba table belongs to the label classes_[k], so map
# the winning column index back through classes_ rather than assuming the
# labels start at 3 and are contiguous.
# NOTE(review): assumes every fold model saw the same set of labels, which the
# element-wise averaging of their probability tables already requires.
class_labels = models[0].classes_
pred = pd.DataFrame(class_labels[np.array(pred).argmax(axis=1)])
pred

### 5. CSV파일로 저장

In [None]:
# The target column of this task is 'quality'. Writing to any other column
# name would add a stray column to the saved file and leave 'quality' as it was.
# NOTE(review): `predict` holds the first model's labels; the averaged-probability
# result stored in `pred` is not used here — confirm which one should be submitted.
submission['quality'] = predict
submission.head()

In [None]:
# Write the submission table to disk without the index column.
submission.to_csv(
    'submission.csv',
    index=False,
    quoting=2,  # numeric value of csv.QUOTE_NONNUMERIC
    encoding='utf-8-sig',
    na_rep=0,
    float_format='%.6f',
)