**import packages**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings as wr
wr.filterwarnings('ignore')

**load dataset**

In [None]:
# Path of the red-wine CSV, relative to the notebook's working directory.
csv_path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'

# Parse the CSV, then expose the parsed table under the short name `df`
# that the rest of the notebook works with.
red_wine_quality_file = pd.read_csv(filepath_or_buffer=csv_path)
df = pd.DataFrame(data=red_wine_quality_file)

**Obtain basic information about the dataset**

In [None]:
# Print a heading, then let DataFrame.info() write its per-column summary
# of the frame to stdout.
print("info of dataset : \n ")
df.info()

In [None]:
# Print a heading, then display the summary-statistics table.
# `describe` must be *called*: the bare attribute only renders the
# bound-method repr, not the statistics.
print("desribe of dataset : \n ")
df.describe()

In [None]:
# Print a heading; the cell's last expression then displays the
# (rows, columns) tuple of the frame.
print("shape of dataset : \n ")
df.shape

**Show some plots to gain insight into the dataset**

In [None]:
# Histogram grid for the frame's columns: 10 bins per panel on a 15 x 12 figure.
df.hist(figsize=(15, 12), bins=10)
plt.show()

In [None]:
# Alcohol content for each quality score, one point per row of `df`.
plt.figure(figsize=(9, 5))
sns.stripplot(data=df, x="quality", y="alcohol")

In [None]:
# Alcohol content plotted against density, one point per row of `df`.
# NOTE(review): stripplot puts x on a categorical axis, so every distinct
# density value gets its own position/tick -- a scatter plot may be what is
# wanted here; confirm intent.
plt.figure(figsize=(12, 6))
sns.stripplot(data=df, y="alcohol", x="density")

In [None]:
# One blue boxplot per column of `df`, laid out on a 2 x 6 grid of panels.
fig, ax = plt.subplots(nrows=2, ncols=6, figsize=(20, 10))
ax = ax.flatten()  # 1-D array of axes so panels can be addressed by position

for index, col in enumerate(df.columns):
    sns.boxplot(data=df, y=col, color='b', ax=ax[index])

plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)

**Cap outlier data (values outside the 1.5 × IQR fences are clipped to the fence; no rows are removed)**

In [None]:
def mod_outlier(df):
    """Return a copy of ``df`` whose numeric columns have their outliers capped.

    For every numeric column the 25th and 75th percentiles (Q1, Q3) and the
    inter-quartile range ``IQR = Q3 - Q1`` are computed. Values below
    ``Q1 - 1.5 * IQR`` are raised to that fence and values above
    ``Q3 + 1.5 * IQR`` are lowered to it. Rows are never dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified, and any index is accepted.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and column order as ``df``.
        Non-numeric columns are copied through unchanged. Integer columns
        stay integer: their fences are tightened to the nearest whole numbers
        lying inside the fractional fences before clipping.
    """
    capped = df.copy()
    numeric = df.select_dtypes(include="number")
    if numeric.empty:
        # No numeric columns (or no rows): there is nothing to cap.
        return capped

    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)

    for col in numeric.columns:
        low, high = lower_bound[col], upper_bound[col]
        if (
            pd.api.types.is_integer_dtype(numeric[col])
            and pd.notna(low)
            and pd.notna(high)
        ):
            # Writing e.g. 3.5 into an integer column would silently turn it
            # into floats; snap the fences inward to whole numbers instead.
            int_low, int_high = int(np.ceil(low)), int(np.floor(high))
            if int_low <= int_high:
                low, high = int_low, int_high
        # Vectorised, position-independent replacement for the element-wise
        # chained assignment (which is unreliable and index-label dependent).
        capped[col] = numeric[col].clip(lower=low, upper=high)

    return capped

# Cap outliers in the feature columns only. 'quality' is the class label used
# for modelling further down, so it is held out of the capping step and put
# back unchanged -- capping it would rewrite the labels themselves.
# NOTE(review): this rebinds `df`; reload the data before re-running the cell,
# otherwise the fences are recomputed from already-capped values.
column_order = list(df.columns)
features_capped = mod_outlier(df.drop(columns=['quality']))
df = features_capped.assign(quality=df['quality'])[column_order]

In [None]:
# Same 2 x 6 boxplot grid as before the capping step, drawn in green so the
# two figures are easy to tell apart.
fig, ax = plt.subplots(figsize=(20, 10), nrows=2, ncols=6)
ax = ax.flatten()  # 1-D array of axes, addressed by position below

for index, col in enumerate(df.columns):
    sns.boxplot(ax=ax[index], data=df, y=col, color='g')

plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)

In [None]:
# Annotated heat map of the pairwise correlations between the columns of `df`.
plt.figure(figsize=(15, 10))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, linewidths=2, fmt='.2f', annot=True)

**Import packages for model development**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

In [None]:
# Separate the target column from the feature columns.
y = df['quality']
X = df.drop(columns='quality')

In [None]:
from imblearn.over_sampling import SMOTE

# Rebalance the classes with SMOTE, building each synthetic sample from 4
# nearest neighbours. The seed is fixed so that a fresh kernel run produces
# the same synthetic rows -- and therefore the same scores -- every time.
# NOTE(review): the resampling is done before any train/test split, so
# synthetic rows derived from samples that later land in a test fold can end
# up in the training data; the scores reported below are likely optimistic.
oversample = SMOTE(k_neighbors=4, random_state=42)
X, y = oversample.fit_resample(X, y)

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
def classify(model, X, y):
    """Fit ``model`` on a 75/25 hold-out split and print two scores.

    First prints the model's score on the held-out 25 % (split seeded with
    ``random_state=42``), then the mean of a 5-fold cross-validation score
    computed over the full ``X`` / ``y``. Both are printed as percentages.
    Nothing is returned.
    """
    split = train_test_split(X, y, test_size=0.25, random_state=42)
    x_train, x_test, y_train, y_test = split

    # hold-out evaluation
    model.fit(x_train, y_train)
    holdout_score = model.score(x_test, y_test)
    print("Accuracy:", holdout_score * 100)

    # 5-fold cross-validation over the complete data
    fold_scores = cross_val_score(model, X, y, cv=5)
    print("CV Score:", np.mean(fold_scores)*100)

In [None]:
# Single decision tree. The estimator is seeded so the printed scores are
# reproducible across kernel restarts.
model = DecisionTreeClassifier(random_state=42)
classify(model, X, y)

In [None]:
# Random forest. The estimator is seeded so the bootstrap samples and feature
# draws -- and hence the printed scores -- are reproducible.
model = RandomForestClassifier(random_state=42)
classify(model, X, y)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble. The estimator is seeded so its randomised splits --
# and hence the printed scores -- are reproducible.
model = ExtraTreesClassifier(random_state=42)
classify(model, X, y)