In [None]:
# data analysis and wrangling
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Visualize
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the available datasets are visible at the top of the notebook.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


# Create Model to Predict Wine Ratings


## Import data

First we will acquire the data and take a look at what columns exist. Using the `head` and `tail` functions, we can get a general idea of the data as well as its size. As seen, the data has 12 columns and 1,599 rows (the last row index shown by `tail` is 1598).

In [None]:
# Location of the red-wine CSV inside the Kaggle input directory.
csv_path = "../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
data = pd.read_csv(csv_path)

# Preview the first rows; swap in data.tail() to inspect the end of the table
# (only the last expression of a cell is displayed).
data.head()

## Check data

We should also make sure there isn't any missing data. Also, by checking the data types we can see whether any categorical data needs to be encoded.
Based on checking the data, none of the columns have missing values and all of them are numeric. Hence, no cleaning or encoding is needed.

In [None]:
# Print each column's dtype and non-null count.
data.info()
# Count the missing values per column; as the last expression, this is the
# value the cell displays.
data.isnull().sum()

## Separate Dependent and Independent Data

We are trying to predict quality given the other predictors, such as fixed acidity, volatile acidity, etc. So, set quality (*dependent data*) to `y` and the remaining predictors (*independent data*) to `X`.

In [None]:
# Select the target by name instead of by position: a positional slice would
# silently pick the wrong column if the CSV's column order ever changed,
# whereas a missing name fails loudly with a KeyError.
TARGET_COLUMN = "quality"

# y: the wine quality ratings (dependent variable) as a NumPy array.
y = data[TARGET_COLUMN].values
# X: every remaining column (independent variables), in their original order.
X = data.drop(columns=TARGET_COLUMN).values

# Preprocess Data


### Split Training/Test set
To build a model and then check how accurate it is, we need to divide the data set into a training set and a test set. The training set will be 70% of the original data and the remaining 30% will be the test set.

### Scale Data
When viewing the data, we can notice that the columns have very different magnitudes. For example, **alcohol** has values around 10 whereas **chlorides** values are around $10^{-2}$. This makes some columns more influential than others purely because of their scale. Therefore we will standardize the data so that every column is on a comparable scale.

In [None]:
# Split the data into training and test sets with a ratio of 7:3. 
# Set a random seed in order to get fixed results for the purpose of exploring.

from sklearn.model_selection import train_test_split,RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both partitions so no test information leaks in.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Linear Regression

Using linear regression for the wine rating dataset, the predicted model's accuracy is 53.125%. 



In [None]:
# Using linear regression to predict model

from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Fit an ordinary least-squares model on the scaled training features.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# The regressor outputs continuous scores; round them to the nearest integer
# so they can be compared with the integer quality labels.
y_pred = regressor.predict(X_test).round(0).astype(int)

# Rows of the confusion matrix are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# accuracy_score takes (y_true, y_pred). It is evaluated once, as the last
# expression, so the notebook displays it (the earlier duplicate call whose
# result was discarded has been removed).
accuracy_score(y_test, y_pred)


# K-Neighbor Classification
When using k-nearest neighbor classification, we have to choose how many neighbors are used to classify the quality. We use a for loop to search for the optimal number of neighbors. After trying every value from 1 to 299, as the graph below shows, 23 neighbors seems ideal, as it gives 69.583% accuracy.

In [None]:
# Use K-Neighbor classifier to predict model.

from sklearn.neighbors import KNeighborsClassifier as knn
# Candidate neighbor counts, defined once so the loop and the plot agree.
k_values = range(1, 300)

# ks[j] holds the test-set accuracy obtained with k_values[j] neighbors.
# NOTE(review): k is chosen using the test set, so the best accuracy reported
# here is an optimistic estimate; a validation split or cross-validation would
# give an unbiased one.
ks = []
for k in k_values:
    knn_classifier = knn(n_neighbors=k, weights='distance')
    knn_classifier.fit(X_train, y_train)
    # A classifier predicts labels taken from y_train, so no rounding is needed.
    y_pred = knn_classifier.predict(X_test)
    ks.append(accuracy_score(y_test, y_pred))

# Plot against the real k values; plotting the list alone would start the
# x-axis at 0 and shift every point one neighbor to the left.
plt.plot(k_values, ks)
plt.xlabel('Number of neighbors (k)')
plt.ylabel('Test accuracy')

# Best accuracy and the k that produced it (list positions are 0-based, k
# starts at 1).
max_percent = max(ks)
index = ks.index(max_percent) + 1
print(max_percent, index)


In [None]:
# Refit k-nearest neighbors with the neighbor count that scored best in the
# search cell. The value is written out explicitly because later cells reuse
# the `index` variable for other searches.
best_k = 23

classifier = knn(n_neighbors=best_k, weights='distance')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Rows of the confusion matrix are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# accuracy_score takes (y_true, y_pred); the arguments now follow that order,
# consistent with the other model cells.
accuracy_score(y_test, y_pred)

# Random Forest Classifier

Using a random forest classifier, we vary the number of trees to find the optimal number to use in our classifier. After trying 1 to 99 trees, we notice the accuracy tends to converge between 66% and 68%, with some noise. Hence, we can choose the number of trees within this range and do not have to check numbers greater than 100. The best result is obtained when `n` is 17. With this value the accuracy for predicting the wine quality is 67.5%.

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Candidate forest sizes, defined once so the loop and the plot agree.
tree_counts = range(1, 100)

# rfc[j] holds the test-set accuracy of a forest with tree_counts[j] trees.
# NOTE(review): the forest size is chosen using the test set, so the best
# accuracy reported here is an optimistic estimate.
rfc = []
for n_trees in tree_counts:
    # The fixed random_state makes every forest reproducible between runs.
    classifier = RandomForestClassifier(n_estimators=n_trees, criterion='entropy', random_state=123)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    rfc.append(accuracy_score(y_test, y_pred))

# Best accuracy and the tree count that produced it (list positions are
# 0-based, tree counts start at 1).
max_percent = max(rfc)
index = rfc.index(max_percent) + 1
print(max_percent, index)

# Plot against the real tree counts; plotting the list alone would start the
# x-axis at 0 and shift every point one tree to the left.
plt.plot(tree_counts, rfc)
plt.xlabel('Number of trees (n_estimators)')
plt.ylabel('Test accuracy');  # trailing ';' hides the matplotlib object repr

In [None]:
# Settings of the forest that scored best in the search cell.
rf_params = dict(n_estimators=17, criterion='entropy', random_state=123)

# fit returns the estimator itself, so construction and training are chained.
classifier = RandomForestClassifier(**rf_params).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Rows of the confusion matrix are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Last expression: the notebook displays the test-set accuracy.
accuracy_score(y_test, y_pred)

# Support Vector Machine
Applying a support vector machine to our data, the prediction accuracy is 60.83%.

In [None]:
from sklearn.svm import SVC
# RBF-kernel support vector classifier; fit returns the estimator itself, so
# construction and training are chained.
classifier = SVC(kernel='rbf', random_state=123).fit(X_train, y_train)
# Predicted quality labels for the held-out rows, evaluated in the next cell.
y_pred = classifier.predict(X_test)

In [None]:
# Rows of the confusion matrix are true labels, columns are predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Last expression: the notebook displays the test-set accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

# Conclusion

We have used 4 different methods to classify red wine based on fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, and alcohol.
* Linear Regression
* K-Neighbor Classification
* Random Forest Classification
* Support Vector Machine

For this dataset, k-neighbor classification had the best results with a 69.583% accuracy. Linear regression, on the other hand, had the lowest with a little over 50%. However, this does not mean k-neighbor is always the best method to use; the best choice depends on many factors.