In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
import matplotlib.pyplot as plt
import seaborn as sns
import missingno 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# I am still a beginner, but please upvote this notebook if you find it helpful.

## Introduction
Let's take a look at the wine rating data as a whole.\
If the quality is 5 points or less, we will label the wine 'bad'; if it is 6 points or more, we will label it 'good'. We will then build a binary classification model to predict this label.

와인 등급데이터를 전체적으로 살펴보겠습니다.\
quality가  5점이하이면 'bad', 6점이상이면 'good'으로 평가하여 이진분류 예측모형을 만들어 보겠습니다.



## Import data(데이터 불러오기)

In [None]:
# Load the red-wine quality CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')

In [None]:
# Peek at five randomly chosen rows of the loaded frame.
data.sample(5)

## Create a 'Taste_review' column ('bad' / 'good') based on quality
## (quality에 따른 ['bad', 'good'] 열 만들기)

In [None]:
def review(x):
    """Convert a numeric quality score into a taste label.

    Args:
        x: Quality score of a single wine.

    Returns:
        'bad' when the score is 5 or lower, otherwise 'good'.
    """
    return 'bad' if x <= 5 else 'good'
# Label every wine by passing its quality score through review().
data['Taste_review'] = data['quality'].apply(review)

In [None]:
# Show three random rows to check the newly assigned Taste_review labels.
data.sample(3)

## Data summary(데이터 요약)

In [None]:
# Dimensions of the frame as (rows, columns).
data.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# Data type of each column.
data.dtypes

In [None]:
# Print per-column non-null counts and dtypes, plus the frame's memory usage.
data.info()

In [None]:
# Draw missingno's nullity matrix to check visually for missing values in any column.
missingno.matrix(data, figsize = (12,6))

In [None]:
# List the column names of the frame.
data.columns

There seems to be nothing unusual about the data, except that the features are all floating-point values \
데이터가 전부 실수형으로 구성되어 있는 것 빼고는 데이터의 특이사항은 없는 것으로 보인다

## EDA

In [None]:
# Plot the distribution of each numeric column on a 3x4 grid of subplots.
# A single loop replaces twelve copy-pasted subplot stanzas, and sns.histplot
# (with a KDE overlay and density normalisation) replaces the deprecated
# sns.distplot while drawing the same histogram + density curve.
distribution_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol', 'quality',
]

fig, axes = plt.subplots(3, 4, figsize=(20, 20))
for ax, column in zip(axes.ravel(), distribution_columns):
    sns.histplot(data[column], kde=True, stat='density', ax=ax)
    ax.set_title(column, fontsize=25)

# Keep the large titles from overlapping neighbouring panels.
fig.tight_layout()
plt.show()

In [None]:
# Heatmap of pairwise correlations between the numeric columns.
plt.figure(figsize=(10, 10))
# 'Taste_review' holds strings; DataFrame.corr() raises on non-numeric columns
# in pandas >= 2.0, so restrict the frame to numeric columns first. On older
# pandas this yields the same matrix the bare data.corr() call produced.
numeric_corr = data.select_dtypes(include='number').corr()
sns.heatmap(data=numeric_corr, annot=True,
            fmt='.2f', linewidths=.5, cmap='Blues')

In [None]:
# Apply seaborn's default theme, then draw pairwise scatter plots of all
# numeric columns coloured by taste label, with histograms on the diagonal.
sns.set()
sns.pairplot(data=data, hue='Taste_review', diag_kind='hist')
plt.show()

## Create Predictive Model (예측 모형 생성)

#### Separating training data and test data(훈련 데이터, 테스트 데이터 분리하기)

In [None]:
# Select features and target by column name rather than by position, so that
# 'quality' (which directly determines the label) can never slip into the
# feature matrix if the column order changes or another column is appended.
X = data.drop(columns=['quality', 'Taste_review'])
y = data['Taste_review']
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19)

Training data, test data distribution(훈련 데이터, 테스트 데이터 분포)

In [None]:
def _good_to_bad_ratio(labels):
    """Return the ratio of 'good' to 'bad' labels, rounded to two decimals.

    The counts are looked up by label. Indexing value_counts() with 0/1 relies
    on a deprecated positional fallback for string-labelled Series and on the
    counts' sort order, so it could compare different classes in each split.
    """
    counts = labels.value_counts()
    return round(counts['good'] / counts['bad'], 2)


print('train distribution')
print(_good_to_bad_ratio(y_train))
print('###################')
print('test distribution')
print(_good_to_bad_ratio(y_test))

## DecisionTree

In [None]:
# Fit a decision tree with default hyperparameters as a first model.
# random_state is fixed so that re-running the notebook gives the same tree
# and therefore the same report.
tr_clf = DecisionTreeClassifier(random_state=19)
tr_clf.fit(X_train, y_train)
# Performance on the test data
pred = tr_clf.predict(X_test)
print(classification_report(y_test, pred))

## Logistic Regression

Logistic regression parameters(로지스틱 회귀 파라미터)

In [None]:
# Search grid for logistic regression: regularisation type and strength C.
params = dict(
    penalty=['l1', 'l2'],
    C=[0.01, 0.1, 1, 5, 10],
)

Logistic regression scaling, cross-validation and result \
(!!The predictive performance of logistic regression depends heavily on whether the features are scaled!!)\
로지스틱 회귀 스케일링, 교차검증, 결과 \
(!!로지스틱 회귀는 스케일링 여부에 따라 예측성능 영향이 크다!!)

In [None]:
# Standardise the training features; the fitted scaler is reused later for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Grid-search the logistic-regression settings in `params` with 2-fold
# cross-validation, scored by accuracy.
lr_clf = LogisticRegression(solver='liblinear', max_iter=1000)
lr_grid = GridSearchCV(lr_clf, param_grid=params, scoring='accuracy', cv=2)
lr_grid.fit(X_train_scaled, y_train)

# Show the cross-validation results transposed, so each column is one candidate.
results = pd.DataFrame(lr_grid.cv_results_)
display(results.T)
print(f'bestparams: {lr_grid.best_params_}')

View test data performance based on trained models(훈련된 모델을 바탕으로 테스트 데이터 성능 보기)

In [None]:
# Scale the test features with the scaler fitted on the training split, then
# report how the grid search's best logistic regression performs on them.
X_test_scaled = scaler.transform(X_test)
lr_preds = lr_grid.predict(X_test_scaled)
print(classification_report(y_true=y_test, y_pred=lr_preds))

## RandomForest

RandomForest parameters(랜덤 포레스트 파라미터)

In [None]:
# Search grid for the random forest: tree depth and leaf/split size limits.
params = dict(
    max_depth=[10, 12, 14],
    min_samples_leaf=[3, 5, 7],
    min_samples_split=[2, 3, 5],
)

RandomForest Cross-validation(랜덤 포레스트 교차검증)

In [None]:
# Grid-search a 200-tree random forest over `params`, scored by cross-validated
# accuracy and using all available cores.
# NOTE: `params` must be the random-forest grid from the preceding cell; the
# same name was used earlier for the logistic-regression grid.
# random_state is fixed so the forest, and therefore the selected parameters
# and the test report, are reproducible across runs.
rf_clf = RandomForestClassifier(n_estimators=200, random_state=19)
rf_grid = GridSearchCV(rf_clf, param_grid=params, scoring='accuracy', n_jobs=-1)
rf_grid.fit(X_train, y_train)
print('best_params:{}'.format(rf_grid.best_params_))

In [None]:
# Evaluate the grid search's best random forest on the held-out test split.
rf_pred = rf_grid.predict(X_test)
rf_report = classification_report(y_true=y_test, y_pred=rf_pred)
print(rf_report)