In [None]:
import numpy as np 
import pandas as pd 
import plotly.express as px
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from plotly.offline import init_notebook_mode, iplot
# Initialise plotly's offline notebook mode before any figure is drawn.
init_notebook_mode(connected=True)

In [None]:
# Read the red-wine quality CSV from the notebook's input directory.
wine_csv_path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
df = pd.read_csv(wine_csv_path)

In [None]:
# Summary statistics for the columns of the raw data.
df.describe()

In [None]:
# Peek at the first rows of the dataset.
df.head()

In [None]:
# Peek at the last rows of the dataset.
df.tail()

# Making a simple EDA
Here we look at different parts of the data to try to spot trends in it.
The graphs we are going to look at:
* Scatter plots between pairs of features, mainly alcohol against the other features, with quality as the color scale  
* Coefficients of correlation between the features

In [None]:
# Distribution of the target: number of wines per quality score.
px.histogram(data_frame=df, x='quality')

Below we create a new feature: the amount of free sulfur dioxide relative to the total amount of sulfur dioxide.

In [None]:
# Derived feature: ratio of free to total sulphur dioxide (a fraction, not a percentage).
free_so2 = df['free sulfur dioxide']
total_so2 = df['total sulfur dioxide']
df['relative sulphur'] = free_so2 / total_so2

In [None]:
# List the available columns, draw the pairwise correlation heatmap, and show a
# first scatter of alcohol against fixed acidity coloured by quality.
print(list(df.columns))
correlations = df.corr()
fig = px.imshow(correlations)
fig.show()
px.scatter(
    data_frame=df,
    x='alcohol',
    y='fixed acidity',
    color='quality',
)

In [None]:
# Alcohol vs. pH, coloured by quality.
px.scatter(
    data_frame=df,
    x='alcohol',
    y='pH',
    color='quality',
)

In [None]:
# Alcohol vs. volatile acidity, coloured by quality.
px.scatter(
    data_frame=df,
    x='alcohol',
    y='volatile acidity',
    color='quality',
)

In [None]:
# Alcohol vs. citric acid, coloured by quality.
px.scatter(
    data_frame=df,
    x='alcohol',
    y='citric acid',
    color='quality',
)

In [None]:
# Alcohol vs. fixed acidity, coloured by quality.
px.scatter(
    data_frame=df,
    x='alcohol',
    y='fixed acidity',
    color='quality',
)

In [None]:
# Alcohol vs. the derived free/total sulphur dioxide ratio, coloured by quality.
px.scatter(
    data_frame=df,
    x='alcohol',
    y='relative sulphur',
    color='quality',
)

# Cleaning the data and re-plotting it
* Remove the outliers using the z-score.
* Re-plot the data.

Observations:


**BEWARE OF THE CHANGE IN THE COLOR SCALE**


**IN THIS CASE THE COLOR SCALE IS ONLY USEFUL FOR SEEING TRENDS IN THE DATA**

In [None]:
import scipy.stats as stats

# Keep only the rows whose absolute z-score is below 3 in every column.
# NOTE(review): the z-score is computed over all columns of `df`, which at this
# point still includes the 'quality' target - confirm that is intended.
abs_z_scores = np.abs(stats.zscore(df))
rows_within_3_sigma = (abs_z_scores < 3).all(axis=1)
df = df[rows_within_3_sigma]

In [None]:
# Quality distribution again, now on the outlier-filtered data.
px.histogram(data_frame=df, x='quality')

In [None]:
# Correlation heatmap recomputed on the outlier-filtered data.
correlations = df.corr()
fig = px.imshow(correlations)
fig.show()

In [None]:
# Alcohol vs. fixed acidity on the filtered data, coloured by quality.
px.scatter(
    data_frame=df,
    x='alcohol',
    y='fixed acidity',
    color='quality',
)

In [None]:
# Alcohol vs. pH on the filtered data, coloured by quality.
px.scatter(
    data_frame=df,
    x='alcohol',
    y='pH',
    color='quality',
)

In [None]:
# Alcohol vs. volatile acidity on the filtered data, coloured by quality.
px.scatter(
    data_frame=df,
    x='alcohol',
    y='volatile acidity',
    color='quality',
)

In [None]:
# Alcohol vs. citric acid on the filtered data, coloured by quality.
px.scatter(
    data_frame=df,
    x='alcohol',
    y='citric acid',
    color='quality',
)

In [None]:
# Alcohol vs. fixed acidity on the filtered data, coloured by quality.
px.scatter(
    data_frame=df,
    x='alcohol',
    y='fixed acidity',
    color='quality',
)

In [None]:
# Alcohol vs. the free/total sulphur dioxide ratio on the filtered data, coloured by quality.
px.scatter(
    data_frame=df,
    x='alcohol',
    y='relative sulphur',
    color='quality',
)

In [None]:
# Separate the target from the features.
# The target is selected (not popped) and the features are built with `drop`,
# so `df` keeps its 'quality' column and this cell can be re-run without a
# KeyError.
y = df['quality']

# Number of distinct quality classes; used below as k for the KNN model.
n_neigh = len(np.unique(y))

# Feature matrix: every column except the target, as a NumPy array.
x = df.drop(columns='quality').values

# Scaling the data and building the models.

In [None]:
from sklearn.preprocessing import StandardScaler

# Hold out a test split first, then standardise the features.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test data are used for fitting.
SE = StandardScaler()
x_train = SE.fit_transform(x_train)
x_test = SE.transform(x_test)

In [None]:
# K-nearest-neighbours classifier; k is n_neigh (the number of quality classes).
model = KNeighborsClassifier(n_neighbors=n_neigh)
model.fit(x_train, y_train)

# `score` on a classifier returns the mean accuracy on the given data, so the
# printed label says accuracy rather than precision.
print(f'K neighbors mean accuracy {model.score(x_test, y_test)}')

In [None]:
# Random forest classifier with a fixed seed so the fit is reproducible.
model2 = RandomForestClassifier(max_depth=30, n_estimators=200, random_state=42)
model2.fit(x_train, y_train)

# `score` on a classifier returns the mean accuracy on the given data, so the
# printed label says accuracy rather than precision.
print(f'Random Forest mean accuracy {model2.score(x_test, y_test)}')

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
prediction = model.predict(x_test)
print(classification_report(y_test,prediction))
print(confusion_matrix(y_test,prediction))
plot_confusion_matrix(model, x_test, y_test) 

In [None]:
prediction2 = model2.predict(x_test)
print(classification_report(y_test,prediction2))
plot_confusion_matrix(model2, x_test, y_test) 
plt.show()

# Conclusion
What did we do in this notebook?
* First we looked at the data
* After that we removed the outliers
* Then we re-plotted the data with the outliers removed
* After that we scaled the data so the models could make better predictions
* Then we built two multi-class classification models
* Then we plotted the confusion matrix for each model's predictions
* We also showed the precision, recall and F1 score for every class.

 As we can see, the results were very pleasing, with a mean accuracy of 69% for the random forest and a mean accuracy of 54% for the k-neighbors model.
 
 Thank you for your time, and if you liked the notebook please give it an upvote. If you have any comments on how to improve the notebook, please leave them below.
