In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import plotly.express as px

In [None]:
# Location of the red-wine quality CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Show the column index followed by the (rows, columns) shape, one per line.
print(df.columns, df.shape, sep="\n")

In [None]:
# Print the per-column dtype and non-null count summary.
df.info()

In [None]:
# Summary statistics for every numeric column.
df.describe()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Class distribution of the target: print the per-quality row counts, then
# draw them as a bar chart.
sns.set(style='whitegrid')
print(df['quality'].value_counts())
fig = plt.figure(figsize=(10, 6))
# The column must be passed as x=...: seaborn 0.12+ accepts only `data`
# positionally, so countplot('quality', data=df) raises a TypeError there.
sns.countplot(x='quality', data=df, palette='pastel')

In [None]:
# Box plot of every column against wine quality, laid out on a 3x4 grid.
sns.set(style='whitegrid')
fig, ax1 = plt.subplots(3, 4, figsize=(24, 30))
columns = list(df.columns)
# ax1.flat walks the grid row by row, matching the original ax1[i][j] order.
# zip stops at the shorter sequence, so a frame with fewer than 12 columns
# leaves trailing panels empty instead of raising IndexError.
for ax, column in zip(ax1.flat, columns):
    # x/y are passed by keyword: seaborn 0.12+ rejects positional data vectors.
    sns.boxplot(x=df['quality'], y=df[column], ax=ax, palette='pastel')
plt.show()

In [None]:
# Pairwise correlation of all columns, rendered as an annotated heatmap.
corr_matrix = df.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, cmap='PuBuGn')

In [None]:
# Distribution (histogram + KDE) of every column before any transformation,
# one panel per column on a 3x4 grid.
color = sns.color_palette('pastel')  # not used in this cell
columns = list(df.columns)
fig, ax1 = plt.subplots(3, 4, figsize=(24, 30))
# ax1.flat walks the grid row by row; zip stops at the shorter sequence, so
# fewer than 12 columns leaves panels empty instead of raising IndexError.
for ax, column in zip(ax1.flat, columns):
    # sns.distplot is deprecated; histplot with a density-scaled KDE overlay
    # is the replacement given in the seaborn migration notes.
    sns.histplot(df[column], kde=True, stat='density', kde_kws=dict(cut=3),
                 color='red', ax=ax)
plt.show()

In [None]:
def log_transform(col):
    """Return the natural log of the first element of ``col``.

    Parameters
    ----------
    col : pandas.Series
        A row (or column) whose first value, by position, is transformed.

    Returns
    -------
    float
        ``np.log`` of the first value; ``-inf`` for 0 and ``nan`` for
        negative input, as numpy defines.
    """
    # .iloc is explicit positional access. col[0] would be treated as a label
    # lookup on an integer index, and the positional fallback on other
    # indexes is deprecated in pandas.
    return np.log(col.iloc[0])


# Replace the columns below by their natural log. np.log is applied to whole
# columns at once instead of one Python call per row, giving the same values.
# NOTE: the columns of df are overwritten, so running this cell a second time
# takes the log of already-logged values.
log_columns = [
    'residual sugar',
    'sulphates',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'chlorides',
]
df[log_columns] = np.log(df[log_columns])

In [None]:
# Distribution (histogram + KDE) of every column of df as it stands now,
# one panel per column on a 3x4 grid.
color = sns.color_palette('pastel')  # not used in this cell
columns = list(df.columns)
fig, ax1 = plt.subplots(3, 4, figsize=(24, 30))
# ax1.flat walks the grid row by row; zip stops at the shorter sequence, so
# fewer than 12 columns leaves panels empty instead of raising IndexError.
for ax, column in zip(ax1.flat, columns):
    # sns.distplot is deprecated; histplot with a density-scaled KDE overlay
    # is the replacement given in the seaborn migration notes.
    sns.histplot(df[column], kde=True, stat='density', kde_kws=dict(cut=3),
                 color='green', ax=ax)
plt.show()

In [None]:
# Correlation of each column with quality, strongest positive first.
quality_corr = df.corr()['quality']
quality_corr.sort_values(ascending=False)

In [None]:
# One sub-frame per quality score from 3 through 8.
df_3, df_4, df_5, df_6, df_7, df_8 = (
    df[df['quality'] == score] for score in range(3, 9)
)

In [None]:
from sklearn.utils import resample

# Rows kept per quality class, and the seed shared by every sampler so the
# balanced frame (and every score computed from it) is the same on each run.
N_PER_CLASS = 600
RESAMPLE_SEED = 12

# Classes 3, 4, 7 and 8: draw N_PER_CLASS rows with replacement.
df_3_upsampled = resample(df_3, replace=True, n_samples=N_PER_CLASS, random_state=RESAMPLE_SEED)
df_4_upsampled = resample(df_4, replace=True, n_samples=N_PER_CLASS, random_state=RESAMPLE_SEED)
df_7_upsampled = resample(df_7, replace=True, n_samples=N_PER_CLASS, random_state=RESAMPLE_SEED)
df_8_upsampled = resample(df_8, replace=True, n_samples=N_PER_CLASS, random_state=RESAMPLE_SEED)

# Classes 5 and 6: draw N_PER_CLASS rows without replacement. These calls were
# previously unseeded, which made the downsampled rows differ on every run.
df_5_downsampled = df_5.sample(n=N_PER_CLASS, random_state=RESAMPLE_SEED).reset_index(drop=True)
df_6_downsampled = df_6.sample(n=N_PER_CLASS, random_state=RESAMPLE_SEED).reset_index(drop=True)

In [None]:
# Stack the six resampled class frames into one frame with a fresh 0..n-1 index.
Balanced_df = pd.concat(
    [
        df_3_upsampled,
        df_4_upsampled,
        df_5_downsampled,
        df_6_downsampled,
        df_7_upsampled,
        df_8_upsampled,
    ],
    ignore_index=True,
)

In [None]:
# Row count per quality class in the balanced frame.
Balanced_df['quality'].value_counts()

In [None]:
# Bar chart of the class counts in the balanced frame, with the bars in
# ascending quality order.
plt.figure(figsize=(10, 6))
sns.countplot(x='quality', data=Balanced_df, order=[3, 4, 5, 6, 7, 8], palette='pastel')

In [None]:
# Alcohol per quality score as a bar plot. This reads from df, not from
# Balanced_df.
plt.figure(figsize=(12, 6))
sns.barplot(x='quality', y='alcohol', data=df, palette='coolwarm')

In [None]:
# Absolute correlation of every column with quality in the balanced frame.
abs_quality_corr = Balanced_df.corr()['quality'].abs()
# Sort descending and skip the first entry, then keep the next ten.
top_features = abs_quality_corr.sort_values(ascending=False).iloc[1:11]

plt.figure(figsize=(12, 6))
# Reverse the order so the strongest correlation ends up at the top of the chart.
top_features[::-1].plot(kind='barh', color='green')
plt.title('Top 10 highly correlated features', size=20, pad=26)
plt.xlabel('correlation coefficient')
plt.ylabel('Features')

In [None]:
# Feature columns fed to the classifier.
selected_features = [
    'volatile acidity',
    'alcohol',
    'sulphates',
    'citric acid',
    'chlorides',
    'density',
    'pH',
    'free sulfur dioxide',
    'fixed acidity',
    'total sulfur dioxide',
]

In [None]:
# Target vector and feature matrix, both taken from the balanced frame.
y = Balanced_df['quality']
x = Balanced_df.loc[:, selected_features]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the balanced rows for testing; the seed fixes the partition.
# NOTE(review): Balanced_df was built by resampling with replacement before
# this split, so copies of the same row can land in both train and test and
# inflate the test scores. Consider splitting first and resampling only the
# training part.
split_parts = train_test_split(x, y, test_size=0.33, random_state=12)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbourhood sizes; each is scored on the held-out split.
n_neighbors = [5, 10, 15, 20]
for neighbors in n_neighbors:
    # fit() returns the estimator itself, so construction and fitting chain.
    knn = KNeighborsClassifier(n_neighbors=neighbors).fit(X_train, y_train)
    score = knn.score(X_test, y_test)
    print("Score For neighbors ", neighbors, "is given as ", score)

In [None]:
# Same sweep over n_neighbors, with votes weighted by distance.
for neighbor in n_neighbors:
    model = KNeighborsClassifier(n_neighbors=neighbor, weights='distance').fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print("Score For neighbors ", neighbor, "is given as ", score)
    

In [None]:
# Final classifier: 5 distance-weighted neighbours, fitted on the training split.
knn_model = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn_model.fit(X_train, y_train)
# Predictions for both splits, used for the train and test reports.
train_preds, test_preds = (
    knn_model.predict(features) for features in (X_train, X_test)
)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix and per-class precision/recall/F1 for the training split.
print("Train data confusion matrix: \n")
print(confusion_matrix(y_train, train_preds))

print("Train data classification report: \n")
print(classification_report(y_train, train_preds))


# The same two reports for the held-out test split.
print("Test data confusion matrix: \n")
print(confusion_matrix(y_test, test_preds))

# This heading previously said "Train data" although the report below it is
# computed from y_test / test_preds.
print("Test data classification report: \n")
print(classification_report(y_test, test_preds))
