# Wine quality

In [None]:
#Import dependencies
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the wine-quality CSV into a DataFrame.
DATA_PATH = '../input/wine-quality/winequalityN.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to check that the file parsed as expected.
data.head()

## Exploratory Data Analysis

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
data.describe()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Count missing values per column once (the original computed the same
# sorted Series twice), with the most-affected columns first.
missing_counts = data.isna().sum().sort_values(ascending=False)
vals = missing_counts.values
cols = list(missing_counts.index)

# Show the counts as a single-row table, one column per feature.
pd.DataFrame(data=vals, index=cols, columns=['values']).T

## Target variable

In [None]:
# Share of each quality score, as a percentage of all rows.
data['quality'].value_counts().div(len(data)).mul(100)

In [None]:
# Bar chart of the number of rows per quality score.
plt.figure(figsize=(16, 3))
sns.countplot(x='quality', data=data)

## The Distribution of Independent Variables 

### Categorical variables

In [None]:
# Number of rows per category of the 'type' column.
data['type'].value_counts()

###  Continuous Variables 
#### Univariate Analysis & Bivariate Analysis

In [None]:
# Names of the float-typed columns; these are plotted one by one below.
float_columns = data.select_dtypes(include=[float])
num_vars = float_columns.columns

In [None]:
def distplot(col, data):
    """Draw a three-panel overview figure for one numeric column.

    Panels, left to right:
      1. density histogram with a KDE curve of ``data[col]`` and a vertical
         black line at the column mean;
      2. box plot of ``col``;
      3. violin plot of ``col`` for each ``quality`` value, split by ``type``.

    Parameters
    ----------
    col : str
        Name of the column to plot.
    data : pandas.DataFrame
        Frame containing ``col`` as well as the ``quality`` and ``type``
        columns used by the violin plot.
    """
    plt.figure(figsize=(16, 2))

    # Panel 1: distribution of the column with its mean marked.
    plt.subplot(1, 3, 1)
    # sns.distplot is deprecated; histplot with kde=True and stat='density'
    # (kde cut=3) is the replacement given in the seaborn migration notes.
    sns.histplot(data[col], bins=88, kde=True, stat='density',
                 kde_kws=dict(cut=3))
    plt.axvline(data[col].mean(), 0, 1, color='black')

    # Panel 2: spread and outliers.
    plt.subplot(1, 3, 2)
    sns.boxplot(x=col, data=data)

    # Panel 3: distribution per quality score, one half-violin per type.
    plt.subplot(1, 3, 3)
    sns.violinplot(x='quality', y=col, data=data, hue='type', split=True)


In [None]:
# One overview figure per float-typed feature.
for feature in num_vars:
    distplot(col=feature, data=data)

## Data Preprocessing

### Handling NaN:

In [None]:
# Discard every row that has at least one missing value.
data = data.dropna(axis=0, how='any')

In [None]:
#data.isna().sum()

### Encoding categorical variables:

In [None]:
# One-hot encode the categorical column(s): each category becomes its own
# indicator column.
cat_vars = ['type']
enc_cat_vars = pd.get_dummies(data.loc[:, cat_vars])

In [None]:
# The raw 'type' column is no longer needed once it has been encoded.
data = data.drop(columns=['type'])

In [None]:
# Put the indicator columns in front of the remaining columns.
data = pd.concat(objs=[enc_cat_vars, data], axis='columns')
data.head()

### Correlation Matrix

In [None]:
# Annotated heatmap of the pairwise correlations between all columns,
# with the colour scale fixed to the full [-1, 1] range.
plt.figure(figsize=(16, 6))
corr_matrix = data.corr()
heatmap = sns.heatmap(corr_matrix, annot=True, cmap='BrBG', vmin=-1, vmax=1)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 18}, pad=12);


## Split the data

In [None]:
# Separate the target column from the predictor columns.
y = data['quality']
X = data.drop(columns='quality')

## Handling Imbalanced Data

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the under-represented quality classes with SMOTE.
# k_neighbors=4 is lower than the library default — presumably because the
# rarest class has too few rows for more neighbours; verify against the data.
# random_state pins the synthetic samples so reruns produce the same data.
# NOTE(review): the resampling happens before the train/test split further
# down, so synthetic rows interpolated from wines that land in the test set
# can end up in the training set and inflate the scores. Consider splitting
# first and resampling only the training portion.
over_sample = SMOTE(k_neighbors=4, random_state=42)
X, y = over_sample.fit_resample(X, y)

In [None]:
from collections import Counter

# Print how many rows each class has after resampling, and its share in %.
counter = Counter(y)
for label, count in counter.items():
    share = (count / len(y)) * 100
    print(f'class={label}, n={count} ({share}%)')

### Split the data into Train and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for the final evaluation.
# random_state makes the split reproducible between runs; stratify=y keeps
# the class proportions identical in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

## Modeling


In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold, learning_curve, GridSearchCV, RandomizedSearchCV,cross_validate
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import make_pipeline

from sklearn.metrics import confusion_matrix, classification_report,f1_score, recall_score, precision_score, make_scorer, accuracy_score

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier 
from sklearn.svm import SVC
import xgboost  as xgb

In [None]:
# Candidate pipelines, keyed by a short name used when printing scores.
# Every pipeline first runs SelectKBest with the ANOVA F-score (f_classif);
# the SVC pipeline additionally standardises the features beforehand.
models_dict = {

    'RFC_model': make_pipeline(
        SelectKBest(f_classif, k=13),
        RandomForestClassifier(random_state=0, n_jobs=-1)),

    # This entry used to share the key 'DTC_model' with the plain decision
    # tree below, so the later entry silently replaced it and AdaBoost was
    # never evaluated. It now has its own key.
    'ADA_model': make_pipeline(
        SelectKBest(f_classif, k=13),
        AdaBoostClassifier(DecisionTreeClassifier(), algorithm='SAMME')),

    'SVC_model': make_pipeline(
        StandardScaler(),
        SelectKBest(f_classif, k=13),
        SVC(C=5, kernel='rbf', gamma=5, degree=3, coef0=1)),

    'NB_model': make_pipeline(
        SelectKBest(f_classif, k=13),
        GaussianNB()),

    'DTC_model': make_pipeline(
        SelectKBest(f_classif, k=13),
        DecisionTreeClassifier()),

    # NOTE(review): the classification report in this notebook lists the
    # quality labels as 3..9; recent xgboost releases expect class labels
    # 0..num_class-1 — confirm whether the target needs encoding first.
    'XGB_model': make_pipeline(
        SelectKBest(f_classif, k=13),
        xgb.XGBClassifier(objective='multi:softmax', num_class=7)),

}

In [None]:
def cross_validation_score(model):
    """Return the mean micro-averaged F1 of ``model`` over 3 stratified folds.

    The folds are drawn from the module-level ``X_train`` / ``y_train``;
    the folds are evaluated in parallel (``n_jobs=-1``).
    """
    folds = StratifiedKFold(3)
    fold_scores = cross_val_score(
        model, X_train, y_train, cv=folds, scoring='f1_micro', n_jobs=-1)
    return fold_scores.mean()


In [None]:
# Baseline comparison: mean cross-validation score of every candidate pipeline.
for mod_n in models_dict:
    mod = models_dict[mod_n]
    print('{}: Validation score  {}'.format(mod_n, cross_validation_score(mod)))

## Random Forest Classifier

In [None]:
# Stand-alone random-forest pipeline used for the hyper-parameter search.
RFC_model = make_pipeline(
    SelectKBest(score_func=f_classif, k=13),
    RandomForestClassifier(n_jobs=-1, random_state=0),
)

In [None]:
def RandomizedSearchCV_(model, param_grid) :
    """Run a randomized hyper-parameter search and return the best estimator.

    Samples 30 parameter combinations from ``param_grid``, scores each with
    5-fold cross-validation (micro-averaged F1) on the module-level
    ``X_train`` / ``y_train``, then prints the best score and parameters.

    Parameters
    ----------
    model : estimator
        Estimator or pipeline to tune.
    param_grid : dict
        Mapping of parameter name to the candidate values to sample from.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search.
    """
    # The original built an unused StratifiedKFold(3) here; the search has
    # always run with cv=5, so the dead local is dropped and cv=5 is kept.
    randomSCV = RandomizedSearchCV(model, param_grid, n_iter=30, cv=5, scoring='f1_micro', random_state=42)

    randomSCV.fit(X_train, y_train)
    model_best_params = randomSCV.best_estimator_

    print('best score :', randomSCV.best_score_ )
    print('best params :', randomSCV.best_params_ )

    return model_best_params

In [None]:
# List the parameter names the pipeline exposes, to find the keys usable in
# the search space.
RFC_model.get_params().keys()

In [None]:
# Search space for the random-forest pipeline; the keys are the pipeline
# parameter names in '<step>__<parameter>' form.
param_grid = {
    'selectkbest__k': np.arange(5, 14),
    'randomforestclassifier__max_depth': np.arange(100, 300, 50),
    'randomforestclassifier__n_estimators': np.arange(300, 500, 50),
}

# Tune the hyper-parameters
RFC_model_best_hyp = RandomizedSearchCV_(RFC_model, param_grid)

In [None]:
# RFC_model_best_hyp is the search's best_estimator_, which RandomizedSearchCV
# (refit=True by default) has already fitted on X_train / y_train. The extra
# fit that used to be here only repeated that training, so it is removed.
y_pred = RFC_model_best_hyp.predict(X_test)

## Evaluate the Model:

In [None]:
# Confusion table of actual vs. predicted quality, shaded by count.
cm = sns.light_palette("green", as_cmap=True)

ct = pd.crosstab(y_test, y_pred, rownames=['actual'], colnames=['predicted'])
# Styler.set_precision was removed in pandas 2.0, and the cells here are
# integer counts, so no float-precision setting is needed.
ct.style.background_gradient(cmap=cm)


In [None]:
# Per-class precision, recall and F1 on the held-out test set.
quality_labels = list(range(3, 10))
print(classification_report(y_test, y_pred, labels=quality_labels))