# Hello, world!
### In this notebook we will distinguish good wine from bad wine using machine learning algorithms:
1. RandomForestClassifier
2. CatBoostClassifier
3. GaussianNB

### At the end, we'll find out which algorithm did better.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the red-wine quality CSV (Cortez et al., 2009) into a DataFrame
wine_csv_path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
df = pd.read_csv(wine_csv_path)
# Preview the first rows as the cell output
df.head()

In [None]:
# Show summary statistics for the columns of df as the cell output
df.describe()

In [None]:
# Split the wines into two classes by quality score: bad == '0', good == '1'.
# pd.cut uses right-closed intervals by default, so scores in (2, 6.5] are
# labelled '0' and scores in (6.5, 8] are labelled '1'.
bins = (2, 6.5, 8)
labels = ['0', '1']
# This cell overwrites 'quality' in place. Only bin while the column still holds
# the raw numeric scores, so that re-running the cell is a no-op instead of
# failing on the already-binned categorical column.
if pd.api.types.is_numeric_dtype(df['quality']):
    df['quality'] = pd.cut(df['quality'], bins=bins, labels=labels)

In [None]:
from sklearn.model_selection import train_test_split

# Target is the binned 'quality' label; features are every other column
y = df['quality']
X = df.drop(columns='quality')
# Hold out 30% of the rows for testing, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Learn the per-feature mean and standard deviation from the training split only,
# then apply those same statistics to the test split. Fitting a second scaler on
# X_test would standardise the two splits differently and leak test-set
# information into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Bar chart of how many wines fall into each quality class before any resampling
sns.countplot(data=df, x=y)

### *You may notice that the classes are heavily imbalanced, so we balance the training set by oversampling with SMOTE*

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the training split only, with a fixed seed.
smote = SMOTE(random_state=2)
# `fit_sample` was deprecated and later removed from imbalanced-learn;
# `fit_resample` is the supported method. The labels are passed as a plain
# NumPy array (`Series.ravel()` is deprecated in recent pandas).
X_train_s, y_train_s = smote.fit_resample(X_train, y_train.to_numpy())

In [None]:
# Class counts of the training labels after resampling (balanced data).
# y_train_s is passed on its own as a vector: it is not a column of df and has a
# different length, so combining it with `data=df` is misleading and is rejected
# by seaborn versions that check vector length against `data`.
sns.countplot(x=y_train_s)

# *RandomForestClassifier*

   Training on unbalanced data

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Fixed seed: random forests are stochastic, so without it the selected
# hyper-parameters and the reported scores change from run to run.
rf = RandomForestClassifier(random_state=42)
# Hyper-parameter grid explored with 5-fold cross-validation
parametrs = {'n_estimators': [10, 20, 30], 'max_depth': [2, 5, 7, 10]}
grid_search_cv_clf = GridSearchCV(rf, parametrs, cv=5)
# Search on the original (non-resampled) training split
grid_search_cv_clf.fit(X_train, y_train)

# Predict the held-out test set with the fitted grid search
rf_pred = grid_search_cv_clf.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Score the random-forest predictions (rf_pred) against the test labels:
# compute all three metrics first, then print them in a fixed order.
result = confusion_matrix(y_test, rf_pred)
result1 = classification_report(y_test, rf_pred)
result2 = accuracy_score(y_test, rf_pred)

print("Confusion Matrix:")
print(result)
print("Classification Report:")
print(result1)
print("Accuracy:", result2)

> *low f1-score*

 Training on balanced data

In [None]:
# Repeat the same grid search, this time on the resampled (balanced) training set
grid_search_cv_clf_s = GridSearchCV(rf, parametrs, cv=5)
grid_search_cv_clf_s.fit(X_train_s, y_train_s)
# A bare expression in the middle of a cell is evaluated and discarded, so the
# winning hyper-parameters are printed explicitly.
print("Best parameters:", grid_search_cv_clf_s.best_params_)

# Predict the held-out test set with the fitted grid search
rf_pred_s = grid_search_cv_clf_s.predict(X_test)

In [None]:
# Score the predictions of the random forest trained on balanced data (rf_pred_s):
# compute all three metrics first, then print them in a fixed order.
result = confusion_matrix(y_test, rf_pred_s)
result1 = classification_report(y_test, rf_pred_s)
result2 = accuracy_score(y_test, rf_pred_s)

print("Confusion Matrix:")
print(result)
print("Classification Report:")
print(result1)
print("Accuracy:", result2)

> *much better on balanced data*

# *CatBoostClassifier*

In [None]:
from catboost import CatBoostClassifier

# Configure the CatBoost classifier: 10 iterations, learning rate 1, depth 5
catboost_params = {'iterations': 10, 'learning_rate': 1, 'depth': 5}
model = CatBoostClassifier(**catboost_params)
# Train on the resampled (balanced) training set
model.fit(X_train_s, y_train_s)
# Predict the held-out test set
pred_cb = model.predict(X_test)

In [None]:
# Score the CatBoost predictions (pred_cb) against the test labels:
# compute all three metrics first, then print them in a fixed order.
result = confusion_matrix(y_test, pred_cb)
result1 = classification_report(y_test, pred_cb)
result2 = accuracy_score(y_test, pred_cb)

print("Confusion Matrix:")
print(result)
print("Classification Report:")
print(result1)
print("Accuracy:", result2)

 # *GaussianNB*

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the original (non-resampled) training split;
# fit() returns the estimator itself, so construction and fitting are chained.
clf = GaussianNB().fit(X_train, y_train)
# Predict the held-out test set
pred_gs = clf.predict(X_test)

In [None]:
# Score the Gaussian Naive Bayes predictions (pred_gs) against the test labels:
# compute all three metrics first, then print them in a fixed order.
result = confusion_matrix(y_test, pred_gs)
result1 = classification_report(y_test, pred_gs)
result2 = accuracy_score(y_test, pred_gs)

print("Confusion Matrix:")
print(result)
print("Classification Report:")
print(result1)
print("Accuracy:", result2)

## Based on the evaluation results above, we can conclude that RandomForestClassifier made the best predictions
