## Importing the required dependencies

In [None]:
import pandas as pd 
import numpy as np 
import  matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

In [None]:
# Relative path to the red-wine quality CSV.
DATA_PATH = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'

# Read the dataset into a DataFrame and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

## Dataset information

In [None]:
# Print the frame's column names, non-null counts and dtypes.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Distinct quality scores that occur in the data.
df['quality'].unique()

In [None]:
# Names of all columns in the frame.
df.columns

In [None]:
# (number of rows, number of columns) of the frame.
df.shape

## Visualization

In [None]:
# Bar chart of how many wines carry each quality score.
fig = plt.figure(figsize=(6, 4))
# The style is set after the figure exists but before its axes are created below.
sns.set_style('darkgrid')
sns.countplot(data=df, x='quality', ax=fig.gca())

In [None]:
# One box plot per input feature (the first 11 columns), grouped by quality score.
# Every plot gets its own explicitly created figure and axes, so no empty
# figures are left behind and each plot really has the requested size.
for col in df.columns[:11]:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='quality', y=col, data=df, ax=ax)
    ax.set_title(f'{col} by quality')
    ax.grid(True)  # grid is applied to the axes that actually hold the plot
    plt.show()

In [None]:
# Mean of every other column within each quality score (one row per score).
qua = df.groupby(['quality']).mean()

In [None]:
# One pie chart per quality score, built from that score's per-column means.
# Iterating over the rows of `qua` (instead of a fixed count) keeps the loop
# in step with however many quality scores the data actually contains.
for quality_score, column_means in qua.iterrows():
    plt.figure(figsize=(6, 6))
    plt.pie(column_means)
    plt.legend(labels=qua.columns, loc=0, fontsize=10)
    plt.title(f'Quality {quality_score}')
    plt.show()

In [None]:
# Bucket the raw quality score into a three-level review label:
#   1 -> bad       (quality up to 4)
#   2 -> good      (quality 5 or 6)
#   3 -> very good (quality 7 and above)
# pd.cut assigns the labels in one vectorised step, and scores below 3 are
# explicitly placed in the "bad" bucket instead of ending up as "very good".
review_bins = [-np.inf, 4, 6, np.inf]  # right-inclusive intervals: (-inf, 4], (4, 6], (6, inf)
reviews = pd.cut(df['quality'], bins=review_bins, labels=[1, 2, 3]).astype('int64')

df['good'] = reviews

In [None]:
# Preview the frame again, now including the derived 'good' column.
df.head()

In [None]:
# dtype of the derived 'good' label column.
df.good.dtype

# Machine Learning models

### 1) KMeans
### 2) Decision Tree
### 3) Random Forest
### 4) K-Nearest Neighbors

In [None]:
# Target: the three-level review label.
y = df['good']
# Features: every column except the raw score and the label derived from it.
X = df.drop(columns=['quality', 'good'])

## 1) KMeans

In [None]:
# Scaler: standardise the features, keeping the original column names.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
scaler_X = pd.DataFrame(scaled_values, columns=X.columns)

In [None]:
# PCA: project the scaled features onto two components for plotting/clustering.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
components_2d = pca.fit_transform(scaler_X)
reduced_X = pd.DataFrame(components_2d, columns=['PCA1', 'PCA2'])

In [None]:
# KMeans on the 2-D PCA projection.
# random_state makes the cluster assignment reproducible from run to run;
# n_init is given explicitly because its default differs between
# scikit-learn releases.
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=6, n_init=10, random_state=75)
kmeans.fit(reduced_X)

In [None]:
# Scatter of the 2-D PCA projection coloured by cluster label, with the
# cluster centres drawn on top as large red stars.
plt.figure(figsize=(15, 10))
centers = kmeans.cluster_centers_
sns.scatterplot(data=reduced_X, x='PCA1', y='PCA2', c=kmeans.labels_, cmap='winter')
sns.scatterplot(x=centers[:, 0], y=centers[:, 1], marker='*', s=500, color='red')

#### In the graph above, the red ' * ' markers represent the centers of the clusters

### Finding the best n_components value for PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Rebuild the target and the feature matrix from the full frame.
y = df['good']
X = df.drop(columns=['quality', 'good'])

# Scaler: standardised copy of the features (NumPy array, not a DataFrame).
scaler = StandardScaler()
X_scaler = scaler.fit_transform(X)

# PCA with no n_components given, so explained_variance_ratio_ covers every
# component and can be inspected afterwards.
pca = PCA()
X_pca = pca.fit_transform(X_scaler)

In [None]:
# Graph to find the best number of principal components:
# cumulative explained variance ratio vs. number of components kept.
# The x values start at 1 so every point lines up with its component count
# (without explicit x values the first point would be drawn at x = 0).
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_variance, color='blue', linestyle='--', marker="o", markerfacecolor='red')
plt.xticks(component_counts)
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance ratio')
plt.show()

In [None]:
# Re-fit PCA keeping 8 components and project the scaled features onto them.
# NOTE(review): 8 is presumably read off the cumulative-variance curve — confirm.
pca_updated = PCA(n_components=8)
X_pca_updated = pca_updated.fit_transform(X_scaler)

In [None]:
# Train/test split.
# The raw features are split first; the scaler and PCA are then fitted on the
# training rows only and the fitted transform is applied to the test rows.
# Fitting the preprocessing on the full dataset would let test-set statistics
# leak into the features the models are trained on.
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=75)

# Same recipe as for X_pca_updated (standardise, then PCA with the same
# number of components), but learned from the training rows alone.
preprocess = make_pipeline(StandardScaler(), PCA(n_components=pca_updated.n_components))
X_train = preprocess.fit_transform(X_train_raw)
X_test = preprocess.transform(X_test_raw)
print(X_train.shape)
print(X_test.shape)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

## 2)Decision Tree

In [None]:
# Decision Tree: fit on the training split and report metrics on the test split.
from sklearn.tree import DecisionTreeClassifier
# random_state pins the tree's internal randomness so the reported metrics
# are reproducible from run to run.
dtree = DecisionTreeClassifier(random_state=75)
dtree.fit(X_train, y_train)
prediction = dtree.predict(X_test)
print(classification_report(y_test,prediction))
print('\n')
print(confusion_matrix(y_test,prediction))
print('\n')
print(f'Accuracy of the model is: {accuracy_score(y_test,prediction)*100} %')

## 3)Random Forest

In [None]:
# Random Forest: fit on the training split and report metrics on the test split.
from sklearn.ensemble import RandomForestClassifier
# random_state pins the bootstrap sampling and feature selection so the
# reported metrics are reproducible from run to run.
rfc = RandomForestClassifier(n_estimators=200, random_state=75)
rfc.fit(X_train, y_train)
prediction = rfc.predict(X_test)
print(classification_report(y_test,prediction))
print('\n')
print(confusion_matrix(y_test,prediction))
print('\n')
print(f'Accuracy of the model is: {accuracy_score(y_test,prediction)*100} %')

## 4)K-Nearest Neighbors

#### Finding the best value for k

In [None]:
# knn
# Estimate the error rate for k = 1..39 with 5-fold cross-validation on the
# training set. Choosing k by its error on X_test would tune the model on the
# very data that is later used to report its accuracy.
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
err_lst = []
for i in range(1,40):
    knn = KNeighborsClassifier(n_neighbors=i)
    cv_accuracy = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
    # error rate = 1 - mean accuracy over the folds
    err_lst.append(1 - cv_accuracy.mean())

In [None]:
# Error rate for every candidate k (1..39), drawn as a dashed line with dot markers.
k_values = range(1, 40)
plt.figure(figsize=(10, 6))
plt.plot(k_values, err_lst, linestyle='--', color='blue', marker='.', markersize=10, markerfacecolor='red')

#### From the graph above we pick the k with the lowest error rate, here k = 16

In [None]:
# Final k-NN model with the chosen k: fit on the training split and report
# metrics on the test split.
knn = KNeighborsClassifier(n_neighbors=16)
knn.fit(X_train,y_train)
prediction = knn.predict(X_test)
print(classification_report(y_test,prediction))
print('\n')
print(confusion_matrix(y_test,prediction))
print('\n')
print(f'Accuracy of the model is: {accuracy_score(y_test,prediction)*100} %')

###                                                         Thank You