In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the red-wine quality CSV into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
)

In [None]:
# Show the freshly loaded DataFrame as this cell's output.
df

In [None]:
# Preview the first ten rows of every column except the last one.
df.iloc[:, :-1].head(10)

In [None]:
# Pairwise correlation matrix of the DataFrame's columns, shown as the cell output.
df.corr()

In [None]:
# Replace the column labels with Korean names (used as tick labels in the
# correlation plots below). The English glosses are translations of the Korean
# strings; presumably they follow the CSV's original column order -- confirm.
df.columns = [
    '고정 산도',    # fixed acidity
    '휘발성 산도',  # volatile acidity
    '구연산',       # citric acid
    '잔류 설탕',    # residual sugar
    '염화물',       # chlorides
    '자유 황산',    # free sulfur (dioxide)
    '총 이산화황',  # total sulfur dioxide
    '밀도',         # density
    'pH',
    '황산염',       # sulphates
    '알코올',       # alcohol
    '퀄리티',       # quality
]

In [None]:
# Boolean mask with the same shape as the correlation matrix. The upper
# triangle (diagonal included) is set to True so the heatmap can hide the
# redundant half of the symmetric matrix.
# The builtin `bool` replaces the `np.bool` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24.
mask = np.zeros_like(df.corr(), dtype=bool)
mask[np.triu_indices_from(mask)] = True

In [None]:
import matplotlib.font_manager as fm

# Collect the names of all TrueType fonts registered with matplotlib's font
# manager and display them as the cell output.
font_list = [entry.name for entry in fm.fontManager.ttflist]
font_list

In [None]:
# Correlation heatmap with the cells selected by `mask` left blank.
fig, ax = plt.subplots(figsize=(12, 10))
# NOTE(review): 'Malgun Gothic' must be installed for the Korean labels to
# render -- confirm it appears in `font_list`.
plt.rcParams['font.family'] = 'Malgun Gothic'
sns.heatmap(
    data=df.corr(),
    mask=mask,
    annot=True,        # write the numeric value inside each cell
    cmap='RdYlBu_r',   # red / yellow / blue color scheme
    vmin=-1, vmax=1,   # pin the color scale to the -1 .. 1 range
    linewidths=0.5,
    ax=ax,             # draw on the axes created above
)

In [None]:
# Full (unmasked) correlation heatmap.
fig, ax = plt.subplots(figsize=(12, 10))
# NOTE(review): 'Malgun Gothic' must be installed for the Korean labels to
# render -- confirm it appears in `font_list`.
plt.rcParams['font.family'] = 'Malgun Gothic'
sns.heatmap(
    data=df.corr(),
    annot=True,        # write the numeric value inside each cell
    cmap='RdYlBu_r',   # red / yellow / blue color scheme
    vmin=-1, vmax=1,   # pin the color scale to the -1 .. 1 range
    linewidths=0.5,
    ax=ax,             # draw on the axes created above
)

In [None]:
# Clustered heatmap of the correlation matrix.
sns.clustermap(
    data=df.corr(),
    annot=True,        # write the numeric value inside each cell
    cmap='RdYlBu_r',   # red / yellow / blue color scheme
    vmin=-1, vmax=1,   # pin the color scale to the -1 .. 1 range
)

In [None]:
# Re-read the CSV so `df` carries the file's own column names again (an earlier
# cell overwrote them with Korean labels; the plots below use the English names).
df = pd.read_csv(
    filepath_or_buffer="../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
)

In [None]:
# Bar plot of 'fixed acidity' grouped by 'quality'.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='fixed acidity')

In [None]:
# Bar plot of 'alcohol' grouped by 'quality'.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='alcohol')

In [None]:
# Bar plot of 'volatile acidity' grouped by 'quality'.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='volatile acidity')

In [None]:
# Reload the data, then separate the predictors from the target:
# X is every column but the last, y is the last column.
df = pd.read_csv(
    filepath_or_buffer="../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Feature scaling: shift each feature to mean 0 and variance 1 so that all
# features share the same scale.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Learn the scaling statistics from the training data only ...
sc.fit(X_train)

# ... then apply that same transformation to both the train and test sets.
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## 랜덤포레스트

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 trees with Gini splits; the out-of-bag score is
# computed during fitting and the seed is fixed.
rfc = RandomForestClassifier(
    n_estimators=200,
    criterion='gini',
    oob_score=True,
    random_state=42,
)
rfc.fit(X_train, y_train)

In [None]:
# `rfc` is already fitted in the cell that constructs it, so the second fit of
# the 200-tree forest is dropped here.
# For a classifier, score() returns mean accuracy, so the output is labelled
# as accuracy rather than R-squared.
print("Train accuracy: %.2f" % rfc.score(X_train, y_train))
print("Test accuracy: %.2f" % rfc.score(X_test, y_test))

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict on the held-out rows and tabulate true vs. predicted classes.
y_predict = rfc.predict(X_test)
rf_conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_predict)

In [None]:
# Show the random-forest confusion matrix computed in the previous cell.
rf_conf_matrix

In [None]:
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `ConfusionMatrixDisplay.from_estimator` is its replacement (needs >= 1.0).
from sklearn.metrics import ConfusionMatrixDisplay

# Predict on the test set with the fitted forest and draw the confusion matrix.
ConfusionMatrixDisplay.from_estimator(rfc, X_test, y_test, cmap=plt.cm.Blues)

In [None]:
from sklearn.metrics import classification_report

# Text report of the per-class metrics for the latest predictions.
print(classification_report(y_true=y_test, y_pred=y_predict))

In [None]:
# Pair each importance of the fitted forest with its feature name.
feature_imp = rfc.feature_importances_
ft_importances = pd.Series(data=feature_imp, index=X.columns)

# Horizontal bar chart: one bar per feature.
plt.figure(figsize=(10, 10))
plt.title("feature_importances")
sns.barplot(
    x=ft_importances,
    y=X.columns,
)
plt.show()

## GBC

In [None]:
# Gradient boosting classifier with a fixed seed; it is constructed here and
# fitted in the next cell.
from sklearn.ensemble import GradientBoostingClassifier

gbc1 = GradientBoostingClassifier(
    random_state=42,
)
gbc1

In [None]:
# Fit the gradient-boosting model on the training split.
gbc1.fit(X_train, y_train)

# For a classifier, score() returns mean accuracy, so the output is labelled
# as accuracy rather than R-squared.
print("Train accuracy: %.2f" % gbc1.score(X_train, y_train))
print("Test accuracy: %.2f" % gbc1.score(X_test, y_test))

In [None]:
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `ConfusionMatrixDisplay.from_estimator` is its replacement (needs >= 1.0).
from sklearn.metrics import ConfusionMatrixDisplay

# Predict on the test set with the fitted booster and draw the confusion matrix.
ConfusionMatrixDisplay.from_estimator(gbc1, X_test, y_test, cmap=plt.cm.Blues)

In [None]:
from sklearn.metrics import classification_report

# Replace `y_predict` with the gradient-boosting predictions, then report
# the per-class metrics.
y_predict = gbc1.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_predict))

In [None]:
import seaborn as sns

# Pair each importance of the fitted booster with its feature name.
ft_importances = pd.Series(
    data=gbc1.feature_importances_,
    index=X.columns,
)

# Horizontal bar chart: one bar per feature.
plt.figure(figsize=(10, 10))
plt.title("feature_importances")
sns.barplot(
    x=ft_importances,
    y=X.columns,
)
plt.show()

In [None]:
# Collapse the 'quality' score into three coarse classes stored as strings:
#   3-4 -> '1',  5-6 -> '2',  7-8 -> '3'
# A value outside these ranges used to be skipped silently, which left
# `reviews` shorter than `df` and only failed on the column assignment with an
# opaque length-mismatch error. It now fails immediately and names the
# offending value (still a ValueError).
reviews = []
for i in df['quality']:
    if 3 <= i <= 4:
        reviews.append('1')
    elif 5 <= i <= 6:
        reviews.append('2')
    elif 7 <= i <= 8:
        reviews.append('3')
    else:
        raise ValueError(
            "quality value %r is outside the ranges 3-4, 5-6 and 7-8 "
            "handled by the Reviews binning" % (i,)
        )
df['Reviews'] = reviews

In [None]:
# Show the DataFrame again, now including the 'Reviews' column added above.
df

In [None]:
# Predictors: every column except the last two. Target: the final column,
# i.e. the 'Reviews' label appended in an earlier cell (presumably the column
# before it is 'quality' -- confirm).
X = df.iloc[:, :-2]
y = df.iloc[:, -1]

In [None]:
# Number of rows for each distinct 'quality' score.
df["quality"].value_counts()

In [None]:
# Number of rows for each class of the target `y`.
y.value_counts()

In [None]:
# Count of rows in each 'Reviews' class.
fig = plt.figure(figsize=(10, 6))
sns.countplot(data=df, x="Reviews")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

## RFC

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 trees with Gini splits; the out-of-bag score is
# computed during fitting and the seed is fixed.
rfc = RandomForestClassifier(
    n_estimators=200,
    criterion='gini',
    oob_score=True,
    random_state=42,
)
rfc.fit(X_train, y_train)

In [None]:
# `rfc` is already fitted in the cell that constructs it, so the second fit of
# the 200-tree forest is dropped here.
# For a classifier, score() returns mean accuracy, so the output is labelled
# as accuracy rather than R-squared.
print("Train accuracy: %.2f" % rfc.score(X_train, y_train))
print("Test accuracy: %.2f" % rfc.score(X_test, y_test))

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict on the held-out rows and tabulate true vs. predicted classes.
y_predict = rfc.predict(X_test)
rf_conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_predict)

In [None]:
# Show the random-forest confusion matrix computed in the previous cell.
rf_conf_matrix

In [None]:
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `ConfusionMatrixDisplay.from_estimator` is its replacement (needs >= 1.0).
from sklearn.metrics import ConfusionMatrixDisplay

# Predict on the test set with the fitted forest and draw the confusion matrix.
ConfusionMatrixDisplay.from_estimator(rfc, X_test, y_test, cmap=plt.cm.Blues)

In [None]:
from sklearn.metrics import classification_report

# Text report of the per-class metrics for the latest predictions.
print(classification_report(y_true=y_test, y_pred=y_predict))

In [None]:
# Pair each importance of the fitted forest with its feature name.
feature_imp = rfc.feature_importances_
ft_importances = pd.Series(data=feature_imp, index=X.columns)

# Horizontal bar chart: one bar per feature.
plt.figure(figsize=(10, 10))
plt.title("feature_importances")
sns.barplot(
    x=ft_importances,
    y=X.columns,
)
plt.show()

## GBC

In [None]:
# Gradient boosting classifier with a fixed seed; it is constructed here and
# fitted in the next cell.
from sklearn.ensemble import GradientBoostingClassifier

gbc1 = GradientBoostingClassifier(
    random_state=42,
)
gbc1

In [None]:
# Fit the gradient-boosting model on the training split.
gbc1.fit(X_train, y_train)

# For a classifier, score() returns mean accuracy, so the output is labelled
# as accuracy rather than R-squared.
print("Train accuracy: %.2f" % gbc1.score(X_train, y_train))
print("Test accuracy: %.2f" % gbc1.score(X_test, y_test))

In [None]:
# Predict with the gradient-boosting model and tabulate true vs. predicted
# classes (`confusion_matrix` is imported in an earlier cell).
y_predict = gbc1.predict(X_test)
gb_conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_predict)

In [None]:
# Show the gradient-boosting confusion matrix computed in the previous cell.
gb_conf_matrix

In [None]:
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `ConfusionMatrixDisplay.from_estimator` is its replacement (needs >= 1.0).
from sklearn.metrics import ConfusionMatrixDisplay

# Predict on the test set with the fitted booster and draw the confusion matrix.
ConfusionMatrixDisplay.from_estimator(gbc1, X_test, y_test, cmap=plt.cm.Blues)

In [None]:
from sklearn.metrics import classification_report

# Recompute the gradient-boosting predictions, then report the per-class metrics.
y_predict = gbc1.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_predict))

In [None]:
import seaborn as sns

# Pair each importance of the fitted booster with its feature name.
ft_importances = pd.Series(
    data=gbc1.feature_importances_,
    index=X.columns,
)

# Horizontal bar chart: one bar per feature.
plt.figure(figsize=(10, 10))
plt.title("feature_importances")
sns.barplot(
    x=ft_importances,
    y=X.columns,
)
plt.show()