In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input mount,
# in the order os.walk yields them, and print one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import warnings
# Suppress every warning for the rest of the session. No category is given,
# so this also hides library deprecation and convergence warnings.
warnings.filterwarnings(action="ignore")

# **Importing Dataset**

In [None]:
# Path of the red-wine quality CSV inside the Kaggle input mount.
WINE_CSV_PATH = '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
dataset = pd.read_csv(WINE_CSV_PATH)

In [None]:
# Display the freshly loaded frame as the cell output.
dataset

In [None]:
# Print the frame summary: column names, non-null counts and dtypes.
dataset.info()

Checking if there are any NULL values

In [None]:
# Boolean mask of missing cells, summed per column to give the number of
# missing values in each column.
missing_mask = dataset.isnull()
missing_mask.sum()

Quality > 6.5 → 'good' (encoded as 1)

Quality ≤ 6.5 → 'bad' (encoded as 0)

In [None]:
# Binarise the target: scores above 6.5 become 1, everything else becomes 0.
# NOTE: the column is overwritten in place, so running this cell a second
# time on the already-binarised values maps every row to 0.
is_good = dataset['quality'] > 6.5
dataset['quality'] = is_good.astype('int64')

In [None]:
# Display the frame again; 'quality' now holds the 0/1 labels.
dataset

In [None]:
# Number of rows in each class of the binarised target (0 = bad, 1 = good).
dataset['quality'].value_counts()

In [None]:
# Every column except the last is a feature; the last column is the target.
# Both are handed on as plain NumPy arrays.
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]
x = feature_frame.values
y = target_series.values

In [None]:
# Feature matrix: all columns of the frame except the last one.
x

In [None]:
# Target vector: the last column of the frame.
y

# **Splitting dataset into Train and Test set**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20 % of the rows for testing; the fixed seed keeps the split
# identical between runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 0
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# **Feature Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both splits so the test set never influences the scaling.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

# **Importing different models**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score

# **Model Selection**

In [None]:
# Seed shared by every estimator that draws random numbers while fitting
# (decision trees and random forests). Without it each Restart & Run All
# prints different scores, which makes the comparison unrepeatable.
MODEL_SEED = 0

# Candidate classifiers, each paired with the label printed in the report.
models = [
    ('Logistic Regression 1', LogisticRegression(C=0.1)),
    ('Logistic Regression 2', LogisticRegression(C=0.5)),
    ('Logistic Regression 3', LogisticRegression(C=1.0)),
    ('KNeighbours 1', KNeighborsClassifier(n_neighbors=3, metric='minkowski', p=2)),
    ('KNeighbours 2', KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)),
    ('SVM 1', SVC(kernel='linear')),
    ('SVM 2', SVC(kernel='rbf')),
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree 1', DecisionTreeClassifier(criterion='gini', random_state=MODEL_SEED)),
    ('Decision Tree 2', DecisionTreeClassifier(criterion='entropy', random_state=MODEL_SEED)),
    ('Random Forest 1', RandomForestClassifier(n_estimators=50, criterion='gini', random_state=MODEL_SEED)),
    ('Random Forest 2', RandomForestClassifier(n_estimators=100, criterion='gini', random_state=MODEL_SEED)),
    ('Random Forest 3', RandomForestClassifier(n_estimators=200, criterion='gini', random_state=MODEL_SEED)),
    ('Random Forest 4', RandomForestClassifier(n_estimators=50, criterion='entropy', random_state=MODEL_SEED)),
    ('Random Forest 5', RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=MODEL_SEED)),
    ('Random Forest 6', RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=MODEL_SEED)),
]

# For every candidate: fit on the training split, score once on the held-out
# test split, and estimate the spread with 10-fold cross-validation on the
# training split (cross_val_score refits its own clones of the estimator).
for name, model in models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    cm = confusion_matrix(y_test, y_pred)
    accuracies = cross_val_score(estimator=model, X=x_train, y=y_train, cv=10)
    print(name)
    print(cm)
    print('Accuracy Score',accuracy_score(y_test, y_pred))
    print("Mean Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
    print('-----------------------------------')

Random Forest achieves higher accuracy than the other models, so a grid search is now applied to it to tune its hyperparameters.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space: every combination of forest size and split criterion
# (7 sizes x 2 criteria = 14 candidates).
parameters = {
    'n_estimators': [50, 100, 200, 300, 400, 500, 1000],
    'criterion': ['gini', 'entropy'],
}

# Tune an explicitly constructed random forest instead of whatever estimator
# the name `model` happens to hold from an earlier cell. The fixed seed makes
# the search give the same result on every run.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=parameters,
    scoring='accuracy',
    cv=10,       # 10-fold cross-validation for every candidate
    n_jobs=-1,   # use all available CPU cores
)
grid_search.fit(x_train, y_train)

# Mean CV accuracy of the winning candidate, its parameters, and the standard
# deviation of its fold scores (looked up via the winner's index).
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
best_std = grid_search.cv_results_['std_test_score'][grid_search.best_index_]
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print('Best Standard Deviation: {:.2f} %'.format(best_std*100))
print("Best Parameters:", best_parameters)

The output above shows the best hyperparameters found for the Random Forest, i.e. the combination with the highest mean cross-validated accuracy. Using them should make the model more accurate.