# Machine learning regression on the red wine quality dataset

In [None]:
# Load the red-wine quality CSV into a DataFrame.
import pandas as pd

WINE_CSV_PATH = "../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
wine = pd.read_csv(WINE_CSV_PATH)

In [None]:
# Preview the first rows to confirm the CSV was parsed into sensible columns.
wine.head()

In [None]:
# Print each column's dtype and non-null count to spot missing or mis-typed data.
wine.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
wine.describe()

In [None]:
#quick plot the data
%matplotlib inline
import matplotlib.pyplot as plt
wine.hist(bins=50,figsize=(20,15))
plt.show()

# Preprocessing and data analysis

In [None]:
import numpy as np

# Fix NumPy's global random seed so NumPy-based randomness is reproducible.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(wine, random_state=42, test_size=0.2)

# Rebind `wine` to a copy of the training rows so exploration cannot touch train_set.
# NOTE: `wine` is both the input and the output of this cell, so re-running it
# would split the already-reduced frame; re-run the load cell first.
wine = train_set.copy()

In [None]:
import seaborn as sns

# Pairwise scatter plots of the columns in `wine` to eyeball relationships.
sns.pairplot(data=wine)

In [None]:
# Heatmap of the pairwise correlation matrix of `wine`.
corr_matrix = wine.corr()
sns.heatmap(corr_matrix)

In [None]:
# Scatter plot of density against fixed acidity.
sns.scatterplot(data=wine, x='fixed acidity', y='density')

# Algorithm

In [None]:
# Split the training rows into features (every column except density)
# and the regression target (density).
wine_x = train_set.drop(columns="density")
wine_y = train_set["density"].copy()

In [None]:
# Fit a linear regression of density on the training features.
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression().fit(wine_x, wine_y)
lin_reg  # fit() returns the estimator itself, so this shows the same repr as before

In [None]:
# Predict density for the same rows the model was fitted on (wine_x is derived
# from train_set), so these are in-sample predictions, not held-out ones.
predictions = lin_reg.predict(wine_x)

# Evaluation

In [None]:
from sklearn import metrics

In [None]:
def print_regression_errors(y_true, y_pred, prefix=''):
    """Print MAE, MSE and RMSE for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    prefix : str, optional
        Text prepended to each metric label (e.g. ``'Test '``).
    """
    mae = metrics.mean_absolute_error(y_true, y_pred)
    # Compute the MSE once and reuse it for the RMSE.
    mse = metrics.mean_squared_error(y_true, y_pred)
    print(prefix + 'MAE:', mae)
    print(prefix + 'MSE:', mse)
    print(prefix + 'RMSE:', np.sqrt(mse))


# In-sample errors: `predictions` were made on the training features, so these
# three lines (unchanged from before) describe fit quality on the training rows.
print_regression_errors(wine_y, predictions)

# Held-out errors: score the fitted model on the 20% test split, which was
# otherwise never used, to estimate how well it generalises.
test_x = test_set.drop("density", axis=1)
test_y = test_set["density"]
print_regression_errors(test_y, lin_reg.predict(test_x), prefix='Test ')

In [None]:
# Actual vs. predicted density. Both `wine_y` and `predictions` come from the
# training rows, so the figure is labelled as a training-set comparison
# (it was previously labelled as the test set).
plt.title('Comparison of Y values in training set and the Predicted values')
plt.ylabel('Training Set')
plt.xlabel('Predicted values')
# Low alpha keeps dense regions readable where many points overlap.
plt.scatter(predictions, wine_y, color='black', alpha=0.1)
plt.show()