In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the red wine quality CSV (Cortez et al., 2009) into a DataFrame.
wine_csv_path = '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
df = pd.read_csv(wine_csv_path)

In [None]:
# Preview the first rows to see the available columns and typical values.
df.head()

Let us do some EDA on the dataset

In [None]:
# Summarise column dtypes and non-null counts to check for missing data.
df.info()

The dataset has no null values in any of the features or in the label.<br>
Therefore, imputation is not necessary.<br>
Let us now look at the correlations between the features and the label.<br>

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Heatmap of the pairwise correlations between all columns, on a 15x10 inch figure.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, ax=ax)

Since there are many input features, let us print each one's correlation with the label.

In [None]:
# Correlation of every column with the target, ordered from most positive to most negative.
quality_corr = df.corr()['quality']
quality_corr.sort_values(ascending=False)

The correlations above show which features are positively associated with quality (correlation > 0) and<br>
which features are negatively associated with it (correlation < 0).<br>
For residual sugar the correlation is almost zero, so we can drop it.

Before plotting the distribution plots, let us find out the nature of the label(quality)

In [None]:
df.quality.unique() # distinct quality labels present in the data (the values themselves, not their count)

In [None]:
# Bar chart of how many wines fall into each quality score, to see how the labels are distributed.
# The column is passed by keyword: recent seaborn releases only accept `data` positionally,
# so a positional Series is no longer interpreted as the x variable.
sns.countplot(x='quality', data=df)

The above plot shows that the output labels are not uniformly distributed.<br>
Hence we have to resample the data, which we will get to after we see how the features<br>
are distributed.

In [None]:
# Histogram (50 bins) of each column to inspect the shape of its distribution.
df.hist(figsize=(20,15),bins=50)

Let us apply a log transformation to the features whose values are concentrated on the left (right-skewed) to make their distributions more symmetric.<br>
Alternatively, standard scaling can also be used.

In [None]:
import numpy as np
# Replace each of the selected columns with its natural logarithm.
# np.log is applied to the whole selection at once rather than value by value.
# NOTE: the columns are overwritten in place, so running this cell a second time
# takes the log of already-transformed values.
log_columns = ['chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'sulphates']
df[log_columns] = np.log(df[log_columns])

In [None]:
# Re-draw the histograms to check the effect of the log transform on the distributions.
df.hist(bins=50,figsize=(20,15))

Oversample the minority quality levels (3, 4, 7, 8) along with the majority classes (5, 6) to a sufficiently large size (5000 samples each).

In [None]:
# Number of rows per quality label, i.e. the class sizes before any resampling.
df.quality.value_counts()

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample every quality class to 5000 rows with SMOTE.
# 'residual sugar' is excluded from the features and 'quality' is the target.
# random_state is fixed so the synthetic samples, and every score computed from them,
# are the same on each run of the notebook.
# NOTE(review): resampling is done before the train/test split below, so synthetic rows
# interpolated from points that end up in the test set can land in the training set.
# The reported scores are therefore likely optimistic; consider splitting first and
# resampling only the training portion.
target_counts = {label: 5000 for label in (3, 4, 5, 6, 7, 8)}
sm = SMOTE(sampling_strategy=target_counts, random_state=42)
X_test_os, y_test_os = sm.fit_resample(
    df.drop(['quality', 'residual sugar'], axis=1),
    df['quality'],
)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the resampled rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_test_os,
    y_test_os,
    test_size=0.3,
    random_state=42,
)

Now let's do some machine learning with kNN.<br>
The kNN parameters used here were found by hyperparameter tuning for the best score; scroll down to see the tuning.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Single-nearest-neighbour classifier with Manhattan distance, fitted on the training split.
knn_settings = {'n_neighbors': 1, 'leaf_size': 10, 'p': 1, 'metric': 'manhattan'}
knn = KNeighborsClassifier(**knn_settings)
knn.fit(X_train, y_train)

In [None]:
# Predict a quality label for every row of the held-out split.
pred = knn.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
# scikit-learn metrics expect the ground truth first and the predictions second.
# With the arguments swapped, the confusion matrix is transposed and the report's
# precision and recall columns trade places.
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for the k-nearest-neighbours search.
params = [{
    'weights': ['uniform', 'distance'],
    'leaf_size': [10, 20, 30],
    'n_neighbors': [1, 10, 20, 30],
    'p': [1, 2, 3],
}]
knn_ = KNeighborsClassifier()
# scoring='f1' is the binary F1 scorer; on this multiclass target it raises for every
# candidate, so all scores come back as NaN and the "best" parameters are meaningless.
# 'f1_macro' averages the per-class F1 scores instead.
# n_jobs=-1 uses all available cores rather than requesting a hard-coded 100 workers.
grid_search = GridSearchCV(knn_, params, cv=3, n_jobs=-1, scoring='f1_macro')
grid_search.fit(X_train, y_train)

In [None]:
# Hyperparameter combination that achieved the best cross-validated score in the search.
grid_search.best_params_

In [None]:
# The estimator selected by the grid search.
grid_search.best_estimator_

In [None]:
# Distance metric actually used by the fitted `knn` model.
knn.effective_metric_