In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's "whitegrid" style globally so every figure below shares it.
sns.set(style="whitegrid")

In [None]:
# Read the red-wine quality CSV from the Kaggle input directory, then preview
# the first rows as the cell output.
wine_csv_path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
wine_data = pd.read_csv(
    wine_csv_path,
    parse_dates=True,
    encoding="cp1252",
)
wine_data.head()

In [None]:
# Bar chart of how many rows carry each value of the 'quality' column.
# The axes are created explicitly and handed to seaborn instead of relying on
# pyplot's "current axes".
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='quality', data=wine_data, ax=ax)

In [None]:
# Names of the eleven predictor columns used by every model below.
feature_columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

# Feature matrix (predictors only) and the target column.
X = wine_data[feature_columns]
y = wine_data['quality']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for testing (seeded for reproducibility).
# stratify=y keeps the proportion of every quality class the same in both
# parts; without it a thinly populated class can be missing from one side,
# which matters here because the rest of the notebook is about class imbalance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=0, stratify=y
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the standardisation parameters from the training rows only, then apply
# that same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scl = scaler.transform(X_train)
X_test_scl = scaler.transform(X_test)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, balanced_accuracy_score
from sklearn.model_selection import KFold

import warnings
# Silence every warning for the rest of the session. This is a blanket filter:
# it also hides any diagnostics the libraries used below would otherwise emit.
warnings.filterwarnings("ignore")

In [None]:
from sklearn.pipeline import make_pipeline

# Baseline comparison: 10-fold cross-validated accuracy of each classifier on
# the original (un-resampled) training split.

# Seed for the estimators that draw random numbers, so reruns print the same
# scores (same value the train/test split uses).
SEED = 0

models = [("Logistic Regression", LogisticRegression()),
          ("Stochastic Gradient Descent", SGDClassifier(random_state=SEED)),
          ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
          ("Decision Tree", DecisionTreeClassifier(random_state=SEED)),
          ("Random Forest", RandomForestClassifier(random_state=SEED)),
          ("Extra Trees", ExtraTreesClassifier(random_state=SEED)),
          ("Gradient Boosting", GradientBoostingClassifier(random_state=SEED)),
          ("KNeighbors", KNeighborsClassifier()),
          ("SVM", SVC()),
          ("Naive Bayes", GaussianNB()),
          ("Ada Boost", AdaBoostClassifier(random_state=SEED))]

# The fold layout does not depend on the model, so build the splitter once.
kfold = KFold(n_splits=10)

for name, model in models:
    # Standardise inside the pipeline rather than passing the raw features:
    # the scaler is re-fitted on the training part of every fold, so the
    # scale-sensitive models (logistic regression, SGD, k-NN, SVM) see
    # comparable feature ranges and no statistics leak from the validation part.
    scaled_model = make_pipeline(StandardScaler(), model)
    results = cross_val_score(scaled_model, X_train, y_train, cv=kfold, scoring='accuracy')
    print(f"\x1b[96m{name}\x1b[0m: \x1b[95m{results.mean():.4f}\x1b[0m ± {results.std():.4f}")

# Synthetic Minority Oversampling Technique

In [None]:
import imblearn
print(imblearn.__version__)

from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTEN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [None]:
# Tally how many rows carry each quality label in the full (un-resampled)
# target column, and print the tally.
counter = Counter(y)
print(counter)

In [None]:
# Random over-sampling of the full dataset with a fixed seed.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Per-class row counts after resampling, listed in ascending label order.
resampled_counts = Counter(y_resampled)
print(sorted(resampled_counts.items()))

In [None]:
# SMOTE: synthesise new rows until each listed quality class has 5000 rows.
# random_state pins the synthetic rows so a rerun produces the same data,
# in line with the seeded RandomOverSampler and train_test_split calls.
#
# NOTE: X_smote / y_smote are built from the FULL dataset. Splitting them
# afterwards places synthetic neighbours of the same real rows on both sides
# of the split, so scores measured on such a split are optimistic.
TARGET_PER_CLASS = 5000
oversample = SMOTE(
    sampling_strategy={quality: TARGET_PER_CLASS for quality in (5, 6, 7, 4, 8, 3)},
    random_state=0,
)
X_smote, y_smote = oversample.fit_resample(X, y)

# Class counts after oversampling.
counter = Counter(y_smote)
print(counter)

In [None]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold

# Extra-Trees on SMOTE-balanced data, evaluated WITHOUT leakage.
#
# Oversampling the whole dataset and splitting afterwards puts synthetic
# neighbours of the same real rows in both the training and the test part, so
# every score is inflated. Here the real rows are split first, only the
# training part is oversampled, and cross-validation re-runs SMOTE inside each
# fold via an imblearn Pipeline (samplers are applied during fit only).
SEED = 0
N_FOLDS = 10
TARGET_PER_CLASS = 5000

# 1) Hold out real rows. X_test_smote / y_test_smote keep their names for
#    compatibility but contain only original, un-resampled rows.
X_train_real, X_test_smote, y_train_real, y_test_smote = train_test_split(
    X, y, test_size=0.25, random_state=SEED, stratify=y
)

# 2) SMOTE needs k same-class neighbours for every sample, so k must stay
#    below the size of the rarest class in every CV training fold. A stratified
#    fold removes at most ceil(rarest / N_FOLDS) rows of that class.
rarest = min(Counter(y_train_real).values())
largest_fold_share = -(-rarest // N_FOLDS)  # ceiling division
k_neighbors = max(1, min(5, rarest - largest_fold_share - 1))

smote_params = dict(
    sampling_strategy={quality: TARGET_PER_CLASS for quality in (5, 6, 7, 4, 8, 3)},
    k_neighbors=k_neighbors,
    random_state=SEED,
)

# 3) Oversample the training part only and fit the final model on it.
X_train_smote, y_train_smote = SMOTE(**smote_params).fit_resample(X_train_real, y_train_real)

et = ExtraTreesClassifier(random_state=SEED)
et.fit(X_train_smote, y_train_smote)
et_predict = et.predict(X_test_smote)

print(f"Train: {et.score(X_train_smote, y_train_smote)*100} - Test: {et.score(X_test_smote, y_test_smote)*100}")

# 4) Cross-validate on the real training rows; each fold oversamples its own
#    training portion and validates on untouched rows.
cv_model = Pipeline(steps=[
    ("smote", SMOTE(**smote_params)),
    ("et", ExtraTreesClassifier(random_state=SEED)),
])
rfc_eval = cross_val_score(
    estimator=cv_model,
    X=X_train_real,
    y=y_train_real,
    cv=StratifiedKFold(n_splits=N_FOLDS),
)
print("cross_val_score: ", rfc_eval.mean()*100)

print("accuracy_score: ", accuracy_score(y_test_smote, et_predict)*100)

print("balanced_accuracy_score: ", balanced_accuracy_score(y_test_smote, et_predict)*100)

In [None]:
# ADASYN restricted to the "minority" strategy. Seeded so the synthetic rows
# are the same on every run, like the other seeded steps in this notebook.
oversample = ADASYN(sampling_strategy="minority", random_state=0)
X_smote, y_smote = oversample.fit_resample(X, y)

# Class counts after resampling.
counter = Counter(y_smote)
print(counter)

In [None]:
# SMOTEN with default settings, seeded so reruns give the same rows.
# NOTE(review): SMOTEN is the SMOTE variant for nominal (categorical)
# features, while the columns in X look like continuous measurements —
# confirm this sampler is really intended for this data.
oversample = SMOTEN(random_state=0)
X_smote, y_smote = oversample.fit_resample(X, y)

# Class counts after resampling.
counter = Counter(y_smote)
print(counter)

In [None]:
# Borderline-SMOTE with default settings, seeded so the generated rows are
# reproducible across runs.
oversample = BorderlineSMOTE(random_state=0)
X_smote, y_smote = oversample.fit_resample(X, y)

# Class counts after resampling.
counter = Counter(y_smote)
print(counter)

In [None]:
# Combined resampling: SMOTE over-sampling followed by random under-sampling,
# chained in an imblearn Pipeline. Both samplers are seeded so the resampled
# dataset is identical on every run.
over = SMOTE(random_state=0)
under = RandomUnderSampler(random_state=0)

steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

X_rsmote, y_rsmote = pipeline.fit_resample(X, y)

# Class counts after both resampling steps.
counter = Counter(y_rsmote)
print(counter)

In [None]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold

# Model comparison with SMOTE + random under-sampling, evaluated WITHOUT
# leakage.
#
# Cross-validating on data that was resampled before splitting lets synthetic
# neighbours of a validation row sit in the training folds, which inflates
# every score. Here the real rows are split first and the resampling runs
# inside each fold through an imblearn Pipeline (samplers act during fit only).
SEED = 0
N_FOLDS = 10

# Hold out real rows. X_test_rsmote / y_test_rsmote keep their names for
# compatibility but contain only original, un-resampled rows.
X_train_real, X_test_rsmote, y_train_real, y_test_rsmote = train_test_split(
    X, y, test_size=0.25, random_state=SEED, stratify=y
)

# SMOTE needs k same-class neighbours per sample, so k must stay below the
# size of the rarest class in every CV training fold. A stratified fold
# removes at most ceil(rarest / N_FOLDS) rows of that class.
rarest = min(Counter(y_train_real).values())
largest_fold_share = -(-rarest // N_FOLDS)  # ceiling division
k_neighbors = max(1, min(5, rarest - largest_fold_share - 1))

# Resampled copy of the training part only (never of the held-out rows).
resampler = Pipeline(steps=[
    ('o', SMOTE(k_neighbors=k_neighbors, random_state=SEED)),
    ('u', RandomUnderSampler(random_state=SEED)),
])
X_train_rsmote, y_train_rsmote = resampler.fit_resample(X_train_real, y_train_real)

# Estimators that draw random numbers are seeded so reruns print the same
# scores.
models = [("Logistic Regression", LogisticRegression()),
          ("Stochastic Gradient Descent", SGDClassifier(random_state=SEED)),
          ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
          ("Decision Tree", DecisionTreeClassifier(random_state=SEED)),
          ("Random Forest", RandomForestClassifier(random_state=SEED)),
          ("Extra Trees", ExtraTreesClassifier(random_state=SEED)),
          ("Gradient Boosting", GradientBoostingClassifier(random_state=SEED)),
          ("KNeighbors", KNeighborsClassifier()),
          ("SVM", SVC()),
          ("Naive Bayes", GaussianNB()),
          ("Ada Boost", AdaBoostClassifier(random_state=SEED))]

# Stratified folds keep enough rows of the rarest class in every training
# fold for SMOTE to find its neighbours; the splitter is built once because
# it does not depend on the model.
cv_folds = StratifiedKFold(n_splits=N_FOLDS)

for name, model in models:
    # Per fold: balance the training portion, standardise it, fit the model,
    # then score on the untouched validation portion.
    fold_pipeline = Pipeline(steps=[
        ('o', SMOTE(k_neighbors=k_neighbors, random_state=SEED)),
        ('u', RandomUnderSampler(random_state=SEED)),
        ('scale', StandardScaler()),
        ('model', model),
    ])
    results = cross_val_score(fold_pipeline, X_train_real, y_train_real, cv=cv_folds, scoring='accuracy')
    print(f"\x1b[96m{name}\x1b[0m: \x1b[95m{results.mean():.4f}\x1b[0m ± {results.std():.4f}")