In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import (
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the red-wine quality dataset from the Kaggle input directory.
WINE_CSV_PATH = "/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
data = pd.read_csv(WINE_CSV_PATH)

In [None]:
# Show the loaded DataFrame as the cell output.
data

In [None]:
# Print the per-column dtype and non-null count.
# NOTE(review): the author states no column is object/string typed, so no type
# conversion is needed - confirm against the printed output.
data.info()

In [None]:
# Show summary statistics for the columns of the dataset.
data.describe()

## Data Visualization

In [None]:
# Pie chart: share of samples that fall into each quality score.
quality_counts = data['quality'].value_counts()
plt.figure(num=1, figsize=(10, 10))
quality_counts.plot.pie(autopct="%1.1f%%")

In [None]:
# Bar charts of four features against the quality score.
sns.set()

# Feature -> trend the author reads from its chart.
quality_trends = {
    'alcohol': 'the higher the alcohol, the higher the quality',
    # Inverse relationship: quality drops as volatile acidity rises.
    'volatile acidity': 'the higher the volatile acidity, the lower the quality',
    'citric acid': 'the higher the citric acid, the higher the quality',
    'sulphates': 'the higher the sulphates, the higher the quality',
}

plt.figure(figsize=(15, 15))

# A 2x2 grid lets the four charts fill the figure; a 4x4 grid would leave
# twelve of its sixteen slots empty and shrink every chart.
for position, feature in enumerate(quality_trends, start=1):
    ax = plt.subplot(2, 2, position)
    sns.barplot(x='quality', y=feature, data=data, ax=ax)
    ax.set_title(f"{feature} by quality")

In [None]:
# Total number of missing cells in the whole frame; 0 means there is no NaN value.
data.isnull().sum().sum()

In [None]:
# Number of samples per quality score (the author notes six distinct scores).
data["quality"].value_counts()

In [None]:
# Replace the numeric score with two classes: scores below 6 become 'low',
# scores of 6 and above become 'high'.
def _to_quality_band(score):
    """Return 'low' for a score under 6, otherwise 'high'."""
    if score < 6:
        return 'low'
    return 'high'


data["quality"] = data["quality"].apply(_to_quality_band)

In [None]:
# The quality column now holds strings; create a LabelEncoder to turn them into integer codes.
label_quality = LabelEncoder()

In [None]:
# Learn the class labels, then replace them with integer codes.
# Per the author: 'high' is encoded as 0 and 'low' as 1.
encoded_quality = label_quality.fit(data["quality"]).transform(data["quality"])
data["quality"] = encoded_quality

In [None]:
# checking that transformation is done correctly or not. Our 0 column should have 638+199+18 values and it is okay.
data.quality.value_counts()

In [None]:
# Target (y) is the quality column; every remaining column is a feature (x).
y = data["quality"]
x = data.drop(columns=["quality"])

In [None]:
# Hold out 20% of the samples as a test set so the models are evaluated on
# data they were not trained on (scikit-learn's train_test_split).
# `shuffle` is a boolean parameter: pass True rather than the int 1, which
# recent scikit-learn parameter validation deprecates/rejects.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=43, shuffle=True
)

In [None]:
# Create the StandardScaler used to scale the feature matrices.
sc = StandardScaler()

In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to the test features. Re-fitting on x_test would scale the
# two sets with different statistics and leak test information.
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
"""
Instead of writing same comments for each implementation. We will explain them in here.
1) We will create our model. We used scikit-learn's libraries for models.
2) We will fit our models using x_train and y_train. Fitting means the algorithm learns the relationship between
the training features (x_train) and their labels (y_train).
3) Our model will make predictions by taking our test samples(x_test)
4) Printing the confusion matrix to check the accuracy of our model. It also shows the wrong predictions as
false positives and false negatives.
5) Printing accuracy score of our model.
6) Setting up the parameter grid and running a random search for Gradient Boosting and Random Forest. We ran a
grid search for Support Vector Machines, because we wanted to try both approaches. More iterations could give us
better results, but since we don't have much computation power we may not have obtained the best parameters.
7) Taking best parameters of random search or grid search and retraining our model with best parameters. 
8) Taking predictions from retrained models.
9) Calculating accuracy scores of the new models. Except for support vector machines, all searches increased our
accuracy by around 1%.
10) Applying cross validation and taking its mean. 
We used 3 classification algorithms, which are support vector machines, gradient boosting and random forest,
in order to classify the wine quality.
In the end, random forest seems to be the best algorithm with an accuracy score of 80%.
Gradient boosting seems to be the second best algorithm with an accuracy score of 79%.
Finally, support vector machines seem to be the least accurate algorithm, at around 76%.
"""

## Gradient Boosting

In [None]:
# Baseline gradient-boosting classifier with default hyper-parameters.
GBModel = GradientBoostingClassifier()
GBModel.fit(X=x_train, y=y_train)

In [None]:
# Predict the quality class of the held-out test samples.
GBPredictions = GBModel.predict(X=x_test)

In [None]:
# Confusion matrix of the baseline gradient-boosting predictions.
metrics.confusion_matrix(y_true=y_test, y_pred=GBPredictions)

In [None]:
# Test-set accuracy of the baseline gradient-boosting model.
metrics.accuracy_score(y_true=y_test, y_pred=GBPredictions)

In [None]:
# Randomised hyper-parameter search for gradient boosting.
# The search tunes a GradientBoostingClassifier scored by accuracy, i.e. the
# same kind of estimator and metric the notebook uses afterwards. Tuning a
# regressor would rank candidates by R^2 on the 0/1 labels instead.
num_estimators = [250, 500]
learn_rates = [0.02, 0.08]
max_depths = [2, 5]
min_samples_leaf = [5, 10]
min_samples_split = [5, 10]

param_grid = {'n_estimators': num_estimators,
              'learning_rate': learn_rates,
              'max_depth': max_depths,
              'min_samples_leaf': min_samples_leaf,
              'min_samples_split': min_samples_split}

# 20 of the 32 possible combinations are sampled, each scored with 5-fold CV.
random_search = RandomizedSearchCV(
    GradientBoostingClassifier(),
    param_grid,
    scoring='accuracy',
    random_state=1,
    n_iter=20,
    cv=5,
    verbose=0,
    n_jobs=-1,
)

random_search.fit(x_train, y_train)

In [None]:
# Show the best parameter combination found by the search above.
random_search.best_params_

In [None]:
# Retrain gradient boosting with the parameters the search actually selected,
# rather than values copied by hand from an earlier run.
# Relies on `random_search` still being the gradient-boosting search from the
# preceding cells (the name is reused later in the notebook).
randomGBModel = GradientBoostingClassifier(**random_search.best_params_)
randomGBModel.fit(x_train, y_train)

In [None]:
# Predictions of the tuned gradient-boosting model on the test set.
newGBpredictions = randomGBModel.predict(X=x_test)

In [None]:
# Test-set accuracy of the tuned gradient-boosting model.
metrics.accuracy_score(y_true=y_test, y_pred=newGBpredictions)

In [None]:
# 3-fold cross-validation scores of the tuned model on the training data.
GBcross = cross_val_score(randomGBModel, x_train, y_train, cv=3)

In [None]:
# Mean of the cross-validation scores.
np.mean(GBcross)

## Support Vector Machines

In [None]:
# Baseline support vector classifier with default hyper-parameters.
SVModel = SVC()
SVModel.fit(X=x_train, y=y_train)

In [None]:
# Predict the quality class of the held-out test samples.
SVPredictions = SVModel.predict(X=x_test)

In [None]:
# Confusion matrix of the baseline SVM predictions.
metrics.confusion_matrix(y_true=y_test, y_pred=SVPredictions)

In [None]:
# Test-set accuracy of the baseline SVM.
metrics.accuracy_score(y_true=y_test, y_pred=SVPredictions)

In [None]:
# Exhaustive grid search for the SVM, scored by accuracy with 10-fold CV.
c_values = [0.1, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4]
gamma_values = [0.1, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4]

# One sub-grid per kernel: the linear kernel ignores `gamma`, so crossing it
# with every gamma value would only re-fit identical models.
param = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': gamma_values},
]
grid_svc = GridSearchCV(SVModel, param_grid=param, scoring='accuracy', cv=10)
grid_svc.fit(x_train, y_train)

In [None]:
# Show the best parameter combination found by the grid search.
grid_svc.best_params_

In [None]:
# Retrain the SVM with the parameters the grid search actually selected,
# rather than values copied by hand from an earlier run.
gridSVModel = SVC(**grid_svc.best_params_)
gridSVModel.fit(x_train, y_train)

In [None]:
# Predictions of the tuned SVM on the test set.
gridSVpredictions = gridSVModel.predict(X=x_test)

In [None]:
# Test-set accuracy of the tuned SVM.
metrics.accuracy_score(y_true=y_test, y_pred=gridSVpredictions)

In [None]:
# 3-fold cross-validation scores of the tuned SVM on the training data.
SVMcross = cross_val_score(gridSVModel, x_train, y_train, cv=3)

In [None]:
# Mean of the cross-validation scores.
np.mean(SVMcross)

## Random Forest

In [None]:
# Baseline random forest with default hyper-parameters.
# A random forest is stochastic (bootstrap samples, random feature subsets),
# so the seed is fixed to make the reported scores reproducible; 43 is the
# seed value the train/test split uses.
RFModel = RandomForestClassifier(random_state=43)
RFModel.fit(x_train, y_train)

In [None]:
# Predict the quality class of the held-out test samples.
RFPredictions = RFModel.predict(X=x_test)

In [None]:
# Confusion matrix of the baseline random-forest predictions.
metrics.confusion_matrix(y_true=y_test, y_pred=RFPredictions)

In [None]:
# Test-set accuracy of the baseline random forest.
metrics.accuracy_score(y_true=y_test, y_pred=RFPredictions)

In [None]:
# Randomised hyper-parameter search for the random forest
# (5 sampled combinations, 3-fold CV each).
num_estimators = [500, 5000]
max_depths = [10, 50]
min_samples_leaf = [1, 15]
min_samples_split = [2, 20]
# "auto" is intentionally not listed: for classifiers it was only an alias of
# "sqrt", and recent scikit-learn releases reject it as an invalid value.
max_features = ["sqrt", "log2"]

param_grid = {'n_estimators': num_estimators,
              'max_depth': max_depths,
              'min_samples_leaf': min_samples_leaf,
              'min_samples_split': min_samples_split,
              "max_features": max_features}

# RFModel is only used as a template here; the search fits clones of it.
random_search = RandomizedSearchCV(
    RFModel,
    param_grid,
    random_state=1,
    n_iter=5,
    cv=3,
    verbose=0,
    n_jobs=-1,
)

random_search.fit(x_train, y_train)

In [None]:
# Show the best parameter combination found by the random-forest search.
random_search.best_params_

In [None]:
# Retrain the random forest with the parameters the search actually selected,
# rather than values copied by hand from an earlier run. The seed is fixed
# (43, the value the train/test split uses) so the score is reproducible.
# Relies on `random_search` being the random-forest search from the cell above.
randomRFModel = RandomForestClassifier(**random_search.best_params_, random_state=43)
randomRFModel.fit(x_train, y_train)

In [None]:
# Predictions of the tuned random forest on the test set.
randomRFPredictions = randomRFModel.predict(X=x_test)

In [None]:
# Test-set accuracy of the tuned random forest.
metrics.accuracy_score(y_true=y_test, y_pred=randomRFPredictions)

In [None]:
# 10-fold cross-validation scores of the tuned random forest on the training data.
RFcross = cross_val_score(randomRFModel, x_train, y_train, cv=10)

In [None]:
# Mean of the cross-validation scores.
np.mean(RFcross)