# Quality of Red Wine

In [None]:
#Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

#Split Data Train and Test
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV

#Modelling
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, plot_roc_curve

# Data Selection

In [None]:
# Load the red-wine quality CSV into a DataFrame.
# The path is relative to the notebook's working directory.
WINE_CSV_PATH = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
train_data = pd.read_csv(WINE_CSV_PATH)

# Exploratory Data Analysis

In [None]:
# Preview the first ten rows of the data set.
train_data.iloc[:10]

In [None]:
# Histogram of the quality scores (10 bins) on a single set of axes.
fig, ax = plt.subplots(nrows=1, ncols=1)
quality_scores = train_data['quality']
ax.hist(quality_scores, bins=10)
plt.show()

In [None]:
#Complete Data exploration
# Plot every feature against the quality score on one wide figure.
f = plt.figure()
f.set_figwidth(20)
f.set_figheight(10)
x = train_data['quality']

# (column name, colour, legend label) for each plotted series.
# Every entry uses a real colour: the original passed 'v' for alcohol, which
# matplotlib interprets as a triangle *marker* code rather than a colour.
feature_styles = [
    ('fixed acidity', 'r', 'Fixed acidity'),
    ('free sulfur dioxide', 'pink', 'free sulfur dioxide'),
    ('residual sugar', 'maroon', 'residual sugar'),
    ('total sulfur dioxide', 'lightseagreen', 'total sulfur dioxide'),
    ('volatile acidity', 'b', 'Volatile acidity'),
    ('citric acid', 'g', 'citric acid'),
    ('pH', 'y', 'pH'),
    ('alcohol', 'orange', 'alcohol'),
    ('chlorides', 'c', 'chlorides'),
    ('sulphates', 'm', 'sulphates'),
    ('density', 'k', 'density'),
]

for column, colour, series_label in feature_styles:
    # Markers only: the rows are not ordered by quality, so joining consecutive
    # rows with line segments would just draw zig-zags between quality levels.
    plt.plot(x, train_data[column], linestyle='none', marker='o',
             color=colour, alpha=0.5, label=series_label)

plt.xlabel('quality')
plt.ylabel('feature value')
plt.title('Feature values by wine quality')
plt.legend(loc=0)
# No extra plt.figure() here: it would open a second, empty figure before show().
plt.show()

# Data Preprocessing

In [None]:
# Remove duplicate rows (rebinding the name instead of mutating in place).
train_data = train_data.drop_duplicates()

In [None]:
# Count missing values per column.
missing_mask = train_data.isnull()
missing_mask.sum()

In [None]:
#Finding correlation between data feature attributes
# Collect the name of every column whose correlation with some *other* column
# exceeds CORR_THRESHOLD. A column is appended once per strongly correlated
# partner, so names can repeat. Only positive correlations are considered.
CORR_THRESHOLD = 0.8
Corr = train_data.corr()
Corr_res = []
# Iterate over the correlation matrix itself so the loop bounds always match
# its shape, even if the frame holds columns that corr() leaves out.
n_rows, n_cols = Corr.shape
for i in range(n_rows):
    for j in range(n_cols):
        # Skip the diagonal by position instead of testing `value != 1`:
        # float equality is fragile and would also drop a genuinely perfect
        # correlation between two different columns.
        if i == j:
            continue
        value = Corr.iloc[i, j]  # scalar lookup rather than a 1x1 array
        if value > CORR_THRESHOLD:
            Corr_res.append(Corr.columns[i])

In [None]:
# Standardize every column except the last one (the dependent 'quality' value).
# NOTE(review): the scaler is fit on the whole frame, before the train/test
# split performed further down, so test-set statistics influence the scaling.
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
scaled_features = std.fit_transform(train_data.iloc[:, :-1])
train_data.iloc[:, :-1] = scaled_features

# Splitting the data

In [None]:
# Convert the frame to NumPy arrays: all columns but the last are the features,
# the last column is the target.
X = train_data.iloc[:, :-1].to_numpy()
Y = train_data.iloc[:, -1].to_numpy()

In [None]:
# Hold out 20% of the rows as the test split (80:20), with a fixed seed.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=42,
)

# Model Selection

In [None]:
#K-Nearest Neighbors
# Estimate the error rate of each candidate K with 5-fold cross-validation on
# the training split only. The original loop scored every K on x_test, which
# tunes the hyper-parameter on the very data later used to report accuracy.
k_values = range(1, 40)
error_rate = []
for i in k_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    fold_accuracy = cross_val_score(knn, x_train, y_train, cv=5, scoring='accuracy')
    # error rate = 1 - mean accuracy across the folds
    error_rate.append(1 - fold_accuracy.mean())

plt.figure(figsize=(10, 6))
plt.plot(k_values, error_rate, color='blue',
         linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)

plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.show()


In [None]:
#K=21 was read off the error-rate plot as the K with the lowest error rate
#Model Fit
# The contradictory `p = 2` argument was dropped: `p` is the Minkowski power
# parameter and has no effect once metric='manhattan' is given, so the fitted
# model is unchanged.
# NOTE(review): the K sweep above uses the default metric while this model
# uses Manhattan distance - confirm that K=21 is still the preferred value.
classifier2 = KNeighborsClassifier(n_neighbors=21, metric='manhattan', weights='uniform')
classifier2.fit(x_train, y_train)

In [None]:
# Predict labels for the training inputs (x_train) and the test inputs (x_test)
# with the fitted classifier.
y_pred1, y_pred2 = classifier2.predict(x_train), classifier2.predict(x_test)

In [None]:
# Accuracy of the fitted classifier on the training and the test split.
from sklearn.metrics import accuracy_score
train_accuracy = accuracy_score(y_train, y_pred1)
test_accuracy = accuracy_score(y_test, y_pred2)
print("Accuracy score of train data set:", train_accuracy)
print("Accuracy score of test data set:", test_accuracy)

In [None]:
# Overlay the actual test labels (blue dots) and the predicted labels (red line),
# both plotted against their position in the test split.
plt.figure()
plt.plot(y_test, marker='o', linestyle='None', color='blue', label='Actual Values')
plt.plot(y_pred2, color='red', label='Predicted values')
plt.legend()

# Attempt through Binary Classification

In [None]:
# How many wines fall into each quality score.
quality_column = train_data['quality']
quality_column.value_counts()

In [None]:
# Turn quality into a binary label:
#   quality <= 6 -> class 0
#   quality >  6 -> class 1
# NOTE: this overwrites the 'quality' column, so running the cell a second
# time would map every row to class 0.
is_high_quality = train_data['quality'] > 6
train_data['quality'] = is_high_quality.astype(int)
train_data['quality'].value_counts()

In [None]:
# Features: every column except 'quality'; target: the binary 'quality' label.
feature_frame = train_data.drop(columns='quality')
X = feature_frame.to_numpy()
y = train_data['quality'].to_numpy()

In [None]:
# Stratified 70:30 train/test split (test_size = 0.3) with a fixed seed.
split_options = dict(test_size=0.3, stratify=y, random_state=1111)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Choose the number of neighbours for a MinMax-scaled KNN pipeline.
# K is selected by stratified 5-fold cross-validation on the training split.
# The original picked the K with the highest *test* accuracy, which makes the
# reported score optimistic. Train/test accuracy per K is still recorded for
# diagnostics, but it no longer drives the choice.
k = range(1,50,2)
testing_accuracy = []
training_accuracy = []
cv_accuracy = []
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=1111)
score = 0
best_k = None  # defined up front so the final print can never hit an unbound name
#Fitting the model
for i in k:
    knn = KNeighborsClassifier(n_neighbors = i)
    pipe_knn = Pipeline([('scale', MinMaxScaler()), ('knn', knn)])

    # cross_val_score clones the pipeline, so the scaler is re-fit inside each fold
    cv_score = cross_val_score(pipe_knn, X_train, y_train,
                               cv=cv_splitter, scoring='accuracy').mean()
    cv_accuracy.append(cv_score)

    pipe_knn.fit(X_train, y_train)

    y_pred_train = pipe_knn.predict(X_train)
    training_accuracy.append(accuracy_score(y_train, y_pred_train))

    y_pred_test = pipe_knn.predict(X_test)
    acc_score = accuracy_score(y_test,y_pred_test)
    testing_accuracy.append(acc_score)

    # strict '<' keeps the smallest K among ties
    if score < cv_score:
        score = cv_score
        best_k = i

print('Best Accuracy Score', score, 'Best K-Score', best_k)
if best_k is not None:
    # held-out accuracy of the K chosen by cross-validation
    print('Test Accuracy at Best K', testing_accuracy[k.index(best_k)])