In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
%matplotlib inline
warnings.filterwarnings('ignore')

In [None]:
df=pd.read_csv('../input/wine-quality/winequalityN.csv')

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.dtypes

# **HANDLING MISSING VALUES**

In [None]:
df.isnull().sum()

In [None]:

# Mean-impute every numeric column in one vectorised step. Selecting by dtype
# (instead of skipping the hard-coded name 'type') keeps the cell working if
# another non-numeric column is ever present, because .mean() is only computed
# on columns where it is defined.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())


In [None]:
df.isnull().sum()

# **EDA**

In [None]:
 df.groupby('type')[['quality']].mean()

In [None]:
 df.groupby('alcohol')[['quality']].mean()

In [None]:
# Bucket alcohol into the bins (8, 10], (10, 12] and (12, 14], then show the
# mean quality for every (wine type, alcohol bucket) pair.
# NOTE(review): pd.cut bins are right-closed by default, so rows with
# alcohol <= 8 or alcohol > 14 receive no bucket and presumably drop out of
# the table - confirm that is intended.
alco=pd.cut(df['alcohol'],[8,10,12,14])
df.pivot_table('quality',['type',alco])

In [None]:
# Box plot of every column except `type`, one per panel of a 2 x 6 grid.
fig, ax = plt.subplots(nrows=2, ncols=6, figsize=(20, 10))
ax = ax.flatten()

plot_cols = [name for name in df.columns if name != 'type']
for index, col in enumerate(plot_cols):
    sns.boxplot(y=col, data=df, ax=ax[index])
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)

In [None]:
# Histogram + KDE of every column except `type`, one per panel of a 2 x 6 grid.
fig, ax = plt.subplots(ncols=6, nrows=2, figsize=(20,10))
ax = ax.flatten()

plot_cols = [col for col in df.columns if col != 'type']
for index, col in enumerate(plot_cols):
    # sns.distplot is deprecated; histplot with kde=True and stat='density'
    # is the supported way to draw a density-scaled histogram with a KDE curve.
    sns.histplot(df[col], kde=True, stat='density', ax=ax[index])
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)

Free sulfur dioxide and density are not very normally distributed, so we transform them towards a normal distribution using the Box-Cox transformation.

In [None]:
from scipy import stats
# stats.boxcox returns a tuple; element [0] is the transformed data, which
# overwrites the original column (the fitted lambda is discarded).
# NOTE(review): the column is replaced in place, so re-running this cell
# transforms already-transformed values - run it once per fresh kernel.
df['free sulfur dioxide']=stats.boxcox(df['free sulfur dioxide'])[0]


In [None]:
# Same Box-Cox step for density: keep element [0] (the transformed values)
# and overwrite the original column with it.
df['density']=stats.boxcox(df['density'])[0]

In [None]:
# sns.distplot is deprecated; histplot with kde=True and stat='density' draws
# the density-scaled histogram with a KDE curve through the supported API.
sns.histplot(df['free sulfur dioxide'], kde=True, stat='density')

In [None]:
# sns.distplot is deprecated; histplot with kde=True and stat='density' draws
# the density-scaled histogram with a KDE curve through the supported API.
sns.histplot(df['density'], kde=True, stat='density')

In [None]:
# Name the variable explicitly: a bare positional Series is interpreted as
# `data` by newer seaborn releases rather than as the x variable.
sns.countplot(x='quality', data=df)

# **CORRELATION MATRIX**

In [None]:
# `df` still contains the string column `type` at this point; correlate the
# numeric columns only so that pandas 2.x does not raise while trying to
# convert the wine type to float.
corr=df.select_dtypes(include='number').corr()
plt.figure(figsize=(20,15))
sns.heatmap(corr, annot=True, cmap='coolwarm')

Only free sulfur dioxide and total sulfur dioxide are a little bit correlated.

# **IMBALANCED DATASET**

In [None]:
x=df.drop(columns=['type','quality'])

In [None]:
x.shape
x.dtypes

In [None]:
y=df['quality']
y1=df['quality']

In [None]:
y.value_counts()

In [None]:
y.dtypes

Oversampling to handle the imbalanced dataset.

In [None]:
from imblearn.over_sampling import SMOTE
# random_state makes the synthetic samples (and therefore every score computed
# from them) reproducible across runs.
# NOTE(review): the resampling happens before train_test_split, so synthetic
# points interpolated from rows that later land in the test split can end up
# in the training split; consider oversampling the training split only.
oversample = SMOTE(k_neighbors=4, random_state=0)
x, y = oversample.fit_resample(x, y)

In [None]:
y.value_counts()

In [None]:
print(x.shape)
print(y.shape)

# **MODEL TRAINING**

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x, y, test_size=0.2, random_state=0)

In [None]:
##SCALING THE DATA SET
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
# Learn the mean / standard deviation from the training split only ...
x_train =sc.fit_transform(x_train)
# ... and reuse those statistics for the test split. Calling fit_transform here
# would re-fit the scaler on the test data, so the two splits would be scaled
# with different parameters and test-set statistics would leak into evaluation.
x_test=sc.transform(x_test)


In [None]:
def models(x_train, y_train,x_test,y_test):
    """Fit seven baseline classifiers and print each one's test-set score.

    Parameters
    ----------
    x_train, y_train : training features and labels passed to every ``fit``.
    x_test, y_test : held-out features and labels passed to every ``score``.

    Returns
    -------
    tuple
        The fitted estimators in the order
        ``(knn, svc_lin, svc_rbf, gb, tree, forest, extra)``, i.e. indices 0-6
        match the ``[n]`` prefixes of the printed lines.
    """
    # Imports are kept function-local so the helper is self-contained.
    from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    # (printed label, estimator) pairs; the list order defines the return order.
    candidates = [
        ('[0]KNN  ACCURACY: ',
         KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)),
        ('[1]SVC(LINEAR) ACCURACY: ',
         SVC(kernel='linear', random_state=0)),
        ('[2] SVC_RBF ACCURACY: ',
         SVC(kernel='rbf', random_state=0)),
        ('[3] GAUSSIAN NB ACCURACY: ',
         GaussianNB()),
        ('[4]DECCISION TREE ACCURACY: ',
         DecisionTreeClassifier(criterion='entropy', random_state=0)),
        ('[5]RANDOM FOREST ACCURACY: ',
         RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)),
        # Seeded like the other estimators so repeated runs report the same score.
        ('[6] EXTRA TREE CLASSIFIER:',
         ExtraTreesClassifier(random_state=0)),
    ]

    # Fit everything first, then report, so the scores print as one block.
    for _, estimator in candidates:
        estimator.fit(x_train, y_train)

    for label, estimator in candidates:
        print(label, estimator.score(x_test, y_test))

    return tuple(estimator for _, estimator in candidates)


In [None]:
model=models(x_train,y_train,x_test,y_test)

In [None]:
# errors='ignore' keeps the cell idempotent: re-running it after `type` has
# already been removed no longer raises KeyError.
df=df.drop(columns=['type'], errors='ignore')

In [None]:
df.columns

# **FEATURE  IMPORTANCE**

In [None]:
# Index 5 of the tuple returned by models() is the random forest.
forest=model[5]
# Take the labels from the feature frame `x` itself. Slicing the first 11
# columns of `df` only lines up with the importances if the cell that drops
# `type` has already been run, which silently mislabels features otherwise.
feature_names=x.columns
importances=pd.DataFrame({'feature':feature_names, 'importance' : np.round(forest.feature_importances_,3)} )
importances=importances.sort_values('importance',ascending=False).set_index('feature')
importances

# **PERMUTATION IMPORTANCE**

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(model[5], random_state=1).fit(x_test, y_test)
eli5.show_weights(perm, feature_names = x.columns.tolist())

**As we can see, the permutation importance and feature importance of sulphates and residual sugar are relatively low in comparison with the other features, so we can remove them to try to increase the accuracy of the model.**

# CROSS CHECKING BY TRAINING MODELS

In [None]:
# Reduced feature matrix: every remaining column of `df` except the target and
# 'residual sugar'. (The former `x1=x_train` line was dead code - it was
# overwritten immediately by this assignment.)
x1=df.drop(columns=['quality','residual sugar'])
x1.shape

In [None]:
y1=df['quality']

In [None]:
y1.value_counts()

Over sampling to handle imbalance data.

In [None]:
from imblearn.over_sampling import SMOTE
# random_state makes the synthetic samples (and therefore every score computed
# from them) reproducible across runs.
# NOTE(review): the resampling happens before train_test_split, so synthetic
# points interpolated from rows that later land in the test split can end up
# in the training split; consider oversampling the training split only.
oversample = SMOTE(k_neighbors=4, random_state=0)
x1, y1 = oversample.fit_resample(x1, y1)

In [None]:
y1.value_counts()

In [None]:
x_train1,x_test1,y_train1,y_test1=train_test_split(x1, y1, test_size=0.2, random_state=0)

In [None]:
def models(x_train, y_train,x_test,y_test):
    """Fit seven baseline classifiers and print each one's test-set score.

    Note: this cell re-declares the helper that is already defined in the
    MODEL TRAINING section; the two definitions are meant to stay equivalent.

    Parameters
    ----------
    x_train, y_train : training features and labels passed to every ``fit``.
    x_test, y_test : held-out features and labels passed to every ``score``.

    Returns
    -------
    tuple
        The fitted estimators in the order
        ``(knn, svc_lin, svc_rbf, gb, tree, forest, extra)``, i.e. indices 0-6
        match the ``[n]`` prefixes of the printed lines.
    """
    # Imports are kept function-local so the helper is self-contained.
    from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    # (printed label, estimator) pairs; the list order defines the return order.
    candidates = [
        ('[0]KNN  ACCURACY: ',
         KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)),
        ('[1]SVC(LINEAR) ACCURACY: ',
         SVC(kernel='linear', random_state=0)),
        ('[2] SVC_RBF ACCURACY: ',
         SVC(kernel='rbf', random_state=0)),
        ('[3] GAUSSIAN NB ACCURACY: ',
         GaussianNB()),
        ('[4]DECCISION TREE ACCURACY: ',
         DecisionTreeClassifier(criterion='entropy', random_state=0)),
        ('[5]RANDOM FOREST ACCURACY: ',
         RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)),
        # Seeded like the other estimators so repeated runs report the same score.
        ('[6] EXTRA TREE CLASSIFIER:',
         ExtraTreesClassifier(random_state=0)),
    ]

    # Fit everything first, then report, so the scores print as one block.
    for _, estimator in candidates:
        estimator.fit(x_train, y_train)

    for label, estimator in candidates:
        print(label, estimator.score(x_test, y_test))

    return tuple(estimator for _, estimator in candidates)


In [None]:
# Cross-check on the reduced feature set. The original call re-used
# x_train / x_test (all features), so the split built without
# 'residual sugar' was never evaluated. The reduced split is also scaled the
# same way as the full one: fit on train, transform test.
from sklearn.preprocessing import StandardScaler
sc1=StandardScaler()
x_train1_scaled=sc1.fit_transform(x_train1)
x_test1_scaled=sc1.transform(x_test1)
model=models(x_train1_scaled,y_train1,x_test1_scaled,y_test1)

In [None]:
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# **ACCURACY USING CROSS VALIDATION**

In [None]:
# 5-fold cross-validation scores of model[5] (the random forest, per the
# return order of models) on the training split; one score per fold.
forest_eval = cross_val_score(estimator = model[5], X = x_train, y = y_train, cv = 5)
forest_eval

In [None]:
# Despite the variable name, model[6] is the ExtraTreesClassifier (last item
# returned by models), not the decision tree. Uses 4 folds, whereas the
# random-forest check uses 5; the displayed value is the mean fold score.
tree_eval = cross_val_score(estimator = model[6], X = x_train, y = y_train, cv = 4)
tree_eval.mean()

# **HYPERPARAMETER TUNING**

In [None]:
# Number of trees in the ensemble: 10 evenly spaced values from 10 to 150
n_estimators = [int(n) for n in np.linspace(start = 10, stop = 150, num = 10)]
# Number of features to consider at every split.
# 'auto' was only an alias of 'sqrt' for classifiers (so the old grid fitted
# every combination twice) and is rejected by scikit-learn >= 1.3; 'log2' is
# searched instead as a genuinely different option.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [2,300]
# Minimum number of samples required to split a node
min_samples_split = [2,5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 3]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [None]:
# Search space for the grid search: each keyword is an estimator parameter
# name and each value is the candidate list defined in the previous cell.
param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)


In [None]:
# Despite the rf_ prefix this tunes an ExtraTreesClassifier. It is seeded so
# that the best score is reproducible and differences against the untuned
# baseline are not just run-to-run noise.
rf_Model = ExtraTreesClassifier(random_state=0)
# Exhaustive search over param_grid with 4-fold cross-validation on the
# training split, using 4 parallel workers.
rf_Grid = GridSearchCV(estimator = rf_Model, param_grid = param_grid, cv = 4, verbose=2, n_jobs = 4)
rf_Grid.fit(x_train, y_train)
rf_Grid.best_params_


In [None]:
rf_Grid.best_score_

ACCURACY INCREASED FROM 88.01 TO 88.16

# ANY SUGGESTIONS WILL BE APPRECIATED. PLEASE DO UPVOTE IF YOU LIKED IT.