In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns',None)

In [None]:
# Read the red-wine quality CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
)

In [None]:
# Preview the first rows only; rendering the whole frame adds noise without adding information.
df.head()

First, we check whether there are any NaN values present in the dataset.

In [None]:
# Count the missing values in each column.
df.isna().sum()

There are no NaN values present in the dataset.

In [None]:
# Frequency of each quality score.
df['quality'].value_counts()

As there are many labels, we will group them into 3 labels.

In [None]:
# Collapse quality scores of 4 and below into label 0; higher scores are left as they are.
df['quality'] = df['quality'].mask(df['quality'] <= 4, 0)

In [None]:
# Non-zero scores up to and including 6 become label 1; rows already labelled 0 are left alone.
is_mid = (df['quality'] != 0) & (df['quality'] <= 6)
df['quality'] = df['quality'].mask(is_mid, 1)

In [None]:
# Scores of 7 and above become label 2.
df['quality'] = df['quality'].mask(df['quality'] >= 7, 2)

We have converted the quality variable into three labels: 0 - poor, 1 - good, 2 - best.

In [None]:
# Class frequencies after grouping the scores into three labels.
df['quality'].value_counts()

As we can see here, the dataset is heavily imbalanced.

So we need to fix it; otherwise the model will be biased towards a single label.

In [None]:
from imblearn.combine import SMOTETomek

In [None]:
# Combined resampler from imbalanced-learn (imported above as SMOTETomek);
# the seed is fixed so that the resampling is repeatable.
smk = SMOTETomek(random_state=0)

In [None]:
# Resample the predictors and the label so the three classes are balanced.
# `fit_resample` is the supported API; the old `fit_sample` alias was deprecated
# and later removed from imbalanced-learn, so calling it fails on current versions.
# NOTE(review): resampling happens before the train/test split further down, so
# synthetic rows derived from test samples can end up in training data — the
# reported scores are likely optimistic.
X, y = smk.fit_resample(df.drop('quality', axis=1), df['quality'])

In [None]:
# Stitch the resampled predictors and label back together side by side.
df = pd.concat(objs=[X, y], axis='columns')

In [None]:
# Class frequencies after resampling.
df['quality'].value_counts()

Now it is a perfectly balanced dataset.

In [None]:
# Peek at the first five rows of the resampled frame.
df.head(n=5)

Here, all the variables (features) are of numerical type.

We will analyse them one by one.

In [None]:
# Every column except the target is a predictor.
features = df.columns[df.columns != 'quality'].tolist()

In [None]:
# One box plot per predictor to inspect its spread and outliers.
for column in features:
    ax = sns.boxplot(data=df, x=column)
    ax.set_xlabel(column)
    plt.show()

As we can see, there are a number of outliers present in each feature.
So here we will use top-coding and bottom-coding (capping values at an upper and a lower bound) to fix this.

In [None]:
# Cap outliers feature by feature: any value outside the fences
# (Q1 - 1.5*IQR, Q3 + 1.5*IQR) is pulled back onto the nearest fence.
for feature in features:
    # Both quartiles come from a single quantile call.
    q1, q3 = df[feature].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # clip() applies both caps in one pass and leaves in-range values untouched.
    df[feature] = df[feature].clip(lower=lower_bound, upper=upper_bound)

In [None]:
# Redraw the per-feature box plots after the capping step.
for column in features:
    ax = sns.boxplot(data=df, x=column)
    ax.set_xlabel(column)
    plt.show()

Now we will move to feature selection part.

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# Univariate selector: score features with the chi-squared statistic and keep 7 of them.
selectk = SelectKBest(chi2, k=7)

In [None]:
# Fit the selector on the predictors against the quality label.
predictors = df.drop(columns='quality')
Best = selectk.fit(predictors, df['quality'])

In [None]:
# Display the per-feature scores computed by the fitted selector.
Best.scores_

These are the scores of each feature with respect to the output variable (quality).

In [None]:
# Display the list of predictor column names built earlier.
features

In [None]:
# Wrap the feature names and their scores in single-column frames so they can be joined.
dffeatures = pd.DataFrame(data=features)
dfscores = pd.DataFrame(data=Best.scores_)

We map each score to its corresponding feature.

In [None]:
# Place names and scores side by side, aligned on their shared positional index.
features_scores = pd.concat(objs=[dffeatures, dfscores], axis='columns')

In [None]:
# Give the two columns readable names.
column_names = ['feature', 'scores']
features_scores.columns = column_names

In [None]:
# Rank the features from highest to lowest score.
features_scores.sort_values('scores', ascending=False)

we will take top 7 features

In [None]:
# Keep the k highest-scoring features, with k read from the selector itself,
# instead of a hand-tuned score cut-off that only yields k features for one
# particular run of the (resampled) data.
Best_features = features_scores.nlargest(selectk.k, 'scores')['feature']

Feature Selection with the help of correlation

In [None]:
# Annotated heatmap of the pairwise correlations between all columns, target included.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df.corr(), annot=True, ax=ax)

From the heatmap above we can see that volatile acidity, citric acid, alcohol and sulphates are each correlated by more than fifty percent with the target variable (quality).

Now we split our dataset into train and test dataset.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[Best_features],
    df['quality'],
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,classification_report

Note, however, that outliers do not have much impact on tree-based models.

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Seed the tree so the fitted model, and every score computed from it,
# is the same on each run (the data split above is seeded the same way).
model = DecisionTreeClassifier(random_state=0)

In [None]:
# Train the decision tree on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Predicted class labels for the held-out test split.
y_predict = model.predict(X_test)

In [None]:
# Per-class probabilities on the training split (used for the train ROC AUC below).
y_predict_proba_train = model.predict_proba(X_train)

In [None]:
# Per-class probabilities on the test split (used for the test ROC AUC below).
y_predict_proba_test = model.predict_proba(X_test)

In [None]:
# One-vs-one multiclass ROC AUC on the training split.
roc_auc_score(y_true=y_train, y_score=y_predict_proba_train, multi_class='ovo')

In [None]:
# One-vs-one multiclass ROC AUC on the test split.
roc_auc_score(y_true=y_test, y_score=y_predict_proba_test, multi_class='ovo')

As we know, a decision tree has low bias and high variance, which means it gives high accuracy on the training dataset but lower accuracy on the testing dataset.

This problem can be easily solved with the help of ensemble techniques. 

e.g - RandomForest,XGBoost.

In [None]:
# Confusion matrix of true versus predicted labels on the test split.
confusion_matrix(y_true=y_test, y_pred=y_predict)

In [None]:
# Fraction of test rows classified correctly.
accuracy_score(y_true=y_test, y_pred=y_predict)

In [None]:
# Per-class precision, recall and F1 on the test split.
report = classification_report(y_true=y_test, y_pred=y_predict)
print(report)

From the report above we can see that precision and recall for class 1 are falling behind.

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Mean cross-validated accuracy of the model over the selected features.
cross_val_score(
    estimator=model,
    X=df[Best_features],
    y=df['quality'],
    scoring='accuracy',
    n_jobs=-1,
).mean()

With cross_val_score, the Decision Tree gives 82% accuracy.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seed the forest: bootstrap sampling and feature sub-sampling are random, so
# without a fixed seed every run produces different scores.
model = RandomForestClassifier(random_state=0)

In [None]:
# Train the random forest on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Predicted class labels for the held-out test split.
y_predict = model.predict(X_test)

In [None]:
# Per-class probabilities on the training split (used for the train ROC AUC below).
y_predict_proba_train = model.predict_proba(X_train)

In [None]:
# Per-class probabilities on the test split (used for the test ROC AUC below).
y_predict_proba_test = model.predict_proba(X_test)

In [None]:
# One-vs-one multiclass ROC AUC on the training split.
roc_auc_score(y_true=y_train, y_score=y_predict_proba_train, multi_class='ovo')

In [None]:
# One-vs-one multiclass ROC AUC on the test split.
roc_auc_score(y_true=y_test, y_score=y_predict_proba_test, multi_class='ovo')

As we can see, RandomForest fixed the problem: it moved us from low bias and high variance to low bias and low variance.

In [None]:
# Confusion matrix of true versus predicted labels on the test split.
confusion_matrix(y_true=y_test, y_pred=y_predict)

In [None]:
# Fraction of test rows classified correctly.
accuracy_score(y_true=y_test, y_pred=y_predict)

In [None]:
# Per-class precision, recall and F1 on the test split.
report = classification_report(y_true=y_test, y_pred=y_predict)
print(report)

Precision and recall are improved with Random Forest.

In [None]:
# Mean cross-validated accuracy of the model over the selected features.
cross_val_score(
    estimator=model,
    X=df[Best_features],
    y=df['quality'],
    scoring='accuracy',
    n_jobs=-1,
).mean()

we can see Random  Forest Classifier is giving 88% with cross_val_score

Now we will check with XGBClassifier.

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted classifier with its default hyperparameters; tuned further down.
model = XGBClassifier()

In [None]:
# Train the XGBoost classifier on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Predicted class labels for the held-out test split.
y_predict = model.predict(X_test)

In [None]:
# Per-class probabilities on the training split (used for the train ROC AUC below).
y_predict_proba_train = model.predict_proba(X_train)

In [None]:
# Per-class probabilities on the test split (used for the test ROC AUC below).
y_predict_proba_test = model.predict_proba(X_test)

In [None]:
# One-vs-one multiclass ROC AUC on the training split.
roc_auc_score(y_true=y_train, y_score=y_predict_proba_train, multi_class='ovo')

In [None]:
# One-vs-one multiclass ROC AUC on the test split.
roc_auc_score(y_true=y_test, y_score=y_predict_proba_test, multi_class='ovo')

In [None]:
# Confusion matrix of true versus predicted labels on the test split.
confusion_matrix(y_true=y_test, y_pred=y_predict)

In [None]:
# Fraction of test rows classified correctly.
accuracy_score(y_true=y_test, y_pred=y_predict)

In [None]:
# Per-class precision, recall and F1 on the test split.
report = classification_report(y_true=y_test, y_pred=y_predict)
print(report)

Precision and Recall is further improved with XGBoost

In [None]:
# Mean cross-validated accuracy of the model over the selected features.
cross_val_score(
    estimator=model,
    X=df[Best_features],
    y=df['quality'],
    scoring='accuracy',
    n_jobs=-1,
).mean()

XGBClassifier is giving 90% accuracy with cross_val_score

we will improve model accuracy by using Hyperparameter Optimization.

Here we are using RandomizedSearchCV.

In [None]:
# Search space for the randomized hyperparameter search.
# Plain Python ints/floats are used instead of np.arange output: a float-step
# arange yields inexact values (e.g. 0.15000000000000002), and numpy scalar
# types would otherwise leak into the tuned estimator's parameters.
params = {
    'n_estimators': list(range(5, 101)),
    'max_depth': list(range(3, 16)),
    'min_child_weight': [1, 3, 4, 5, 6, 7, 8],
    'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7],
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Randomized search over `params` with 5-fold cross-validation, scored by accuracy.
# The seed makes the sampled candidates, and therefore the selected estimator,
# identical on every run.
random_search = RandomizedSearchCV(
    model,
    param_distributions=params,
    n_jobs=-1,
    scoring='accuracy',
    verbose=3,
    cv=5,
    random_state=0,
)

In [None]:
# Run the search on the selected features.
random_search.fit(X=df[Best_features], y=df['quality'])

In [None]:
# Display the estimator the search selected.
random_search.best_estimator_

In [None]:
# Reuse the estimator the search actually selected instead of pasting a copy of
# its printed repr: the pasted copy goes stale whenever the search is re-run and
# pins constructor arguments that are specific to one xgboost version.
model = random_search.best_estimator_

In [None]:
# Mean cross-validated accuracy of the tuned model over the selected features.
cross_val_score(
    estimator=model,
    X=df[Best_features],
    y=df['quality'],
    scoring='accuracy',
    n_jobs=-1,
).mean()

As we can see, hyperparameter optimization improved the accuracy by 1%.

#### I hope you enjoyed a lot.

#### Thank You