## Building a model to predict red wine quality, treated as a classification problem

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply the 'fivethirtyeight' stylesheet to every figure drawn afterwards.
plt.style.use('fivethirtyeight')

In [None]:
# Read the red-wine quality CSV into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv",
)

### Checking the basic data

In [None]:
# Preview the first rows of the DataFrame to see the columns and sample values.
df.head()

In [None]:
# Print the column dtypes and non-null counts.
df.info()

In [None]:
# There are no null values here. Let's check for outliers, if any.

### Exploratory Data Analysis

In [None]:
# Scatter-plot matrix of every pair of columns.
sns.pairplot(data=df)

### Checking the correlation

In [None]:
# Annotated heatmap of the pairwise column correlations.
correlations = df.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(correlations, annot=True)

In [None]:
# Same correlation matrix as a table (Pearson is the pandas default method).
df.corr(method='pearson')

#### We can see some positive correlation between density and fixed acidity, and a negative correlation between pH and fixed acidity

In [None]:
# Draw one histogram per column.
# The figure is created inside the loop: plt.show() finishes the current
# figure, so a single figure created up front would size only the first plot.
for feature in df.columns:
    plt.figure(figsize=(10, 10))
    plt.hist(df[feature])
    plt.title(feature)
    plt.show()

### Binning the Quality into Good and Bad 

In [None]:
# Collapse the numeric quality score into two classes.
# pd.cut uses right-closed intervals by default, so (2, 6.5] -> 'bad'
# and (6.5, 8] -> 'good'.
group_names = ['bad', 'good']
bins = (2, 6.5, 8)
df['quality'] = pd.cut(x=df['quality'], bins=bins, labels=group_names)

In [None]:
from sklearn.preprocessing import  LabelEncoder
# Convert the 'bad'/'good' labels into integer class codes.
label_quality = LabelEncoder()
label_quality.fit(df['quality'])
df['quality'] = label_quality.transform(df['quality'])

In [None]:
# Preview the rows again to confirm 'quality' now holds the encoded labels.
df.head()

In [None]:
# Bar plot of each feature against the encoded quality class.
# 'quality' itself is skipped because plotting the target against itself
# carries no information. The figure is created per iteration so that every
# plot gets the requested size, not only the first.
for feature in df.columns.drop('quality'):
    plt.figure(figsize=(10, 10))
    sns.barplot(x=df['quality'], y=df[feature])
    plt.title(feature)
    plt.show()

#### Scaling the data to get better results

In [None]:
from sklearn.preprocessing import StandardScaler


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target vector: the encoded quality class.
y = df['quality']
# Feature matrix: every column except the target.
X = df.drop(columns=['quality'])

In [None]:
ms=StandardScaler()
X= ms.fit_transform(X)

### Splitting the data into train and test

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=101)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline random forest with default hyperparameters.
# Seeded with the same value as the train/test split so that the reported
# scores are reproducible between runs.
rf = RandomForestClassifier(random_state=101)

In [None]:
# Train the baseline forest on the training split.
rf.fit(X=X_train, y=y_train)

In [None]:
# Predicted classes for the held-out rows.
prediction = rf.predict(X=X_test)

In [None]:
# Score of the fitted forest on the data it was trained on.
train_score = rf.score(X_train, y_train)
print("Score on Train Set", train_score)

In [None]:
# Score of the fitted forest on the held-out split.
test_score = rf.score(X_test, y_test)
print("Score on Test Set", test_score)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# Per-class precision/recall/F1 for the baseline forest.
rf_report = classification_report(y_true=y_test, y_pred=prediction)
print("Classification Report on Random Forest\n", rf_report)

In [None]:
# Confusion matrix for the baseline forest (rows: true class, columns:
# predicted class). The heading now names what is actually printed.
print("Confusion Matrix on Random Forest\n", confusion_matrix(y_test, prediction))

### Random Forest with hyperparameter tuning using RandomizedSearchCV

In [None]:
# Candidate hyperparameter values for the random-forest randomized search.

# Number of trees in the forest: 100, 200, ..., 1200.
# Built from an integer range so no float-to-int truncation is involved.
n_estimators = list(range(100, 1201, 100))
# Number of features to consider at every split.
# 'auto' was removed in scikit-learn 1.3 and, for classifiers, was only an
# alias of 'sqrt'. None (all features) gives the search a genuinely
# different second option.
max_features = ['sqrt', None]
# Split-quality criterion
criterion = ['gini', 'entropy']
# Maximum number of levels in a tree: 5, 10, ..., 30
max_depth = list(range(5, 31, 5))
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]



In [None]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'criterion':criterion,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

In [None]:
# Base estimator for the randomized search.
# Seeded so that differences between candidates come from the hyperparameters
# rather than from run-to-run randomness in the forest itself.
randomforest = RandomForestClassifier(random_state=101)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Randomized search over random_grid: 100 sampled candidates, each evaluated
# with 5-fold cross-validation, run in a single process.
rf_random = RandomizedSearchCV(
    estimator=randomforest,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=101,
    n_jobs=1,
)

In [None]:
# Run the search on the training split.
rf_random.fit(X=X_train, y=y_train)

In [None]:
# Show the hyperparameter combination the search selected.
rf_random.best_params_

In [None]:
# Predict the held-out rows with the fitted search object.
prediction = rf_random.predict(X=X_test)

In [None]:
# Confusion matrix for the tuned forest. The heading now names what is
# actually printed.
print("Confusion Matrix on Random Forest\n", confusion_matrix(y_test, prediction))

In [None]:
# Per-class precision/recall/F1 for the tuned forest.
tuned_rf_report = classification_report(y_true=y_test, y_pred=prediction)
print("Classification Report on Random Forest\n", tuned_rf_report)

#### There is not much difference between the hyperparameter-tuned Random Forest and the default one

### XGBoost Model

In [None]:
from xgboost import XGBClassifier

In [None]:
# XGBoost classifier with its default hyperparameters.
xg=XGBClassifier()

In [None]:
# Train the XGBoost model on the training split.
xg.fit(X=X_train, y=y_train)

In [None]:
# Predicted classes for the held-out rows.
prediction = xg.predict(X=X_test)

In [None]:
# Confusion matrix for the XGBoost model. The heading now names what is
# actually printed.
print("Confusion Matrix on XGBoost\n", confusion_matrix(y_test, prediction))

In [None]:
# Per-class precision/recall/F1 for the XGBoost model.
xg_report = classification_report(y_true=y_test, y_pred=prediction)
print("Classification Report on XGBoost\n", xg_report)

### Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with its default hyperparameters.
svc=SVC()

In [None]:
# Train the SVC on the training split.
svc.fit(X=X_train, y=y_train)

In [None]:
# Predicted classes for the held-out rows.
prediction = svc.predict(X=X_test)

In [None]:
# Confusion matrix for the default SVC. The heading now names what is
# actually printed.
print("Confusion Matrix on SVC\n", confusion_matrix(y_test, prediction))

In [None]:
# Per-class precision/recall/F1 for the default SVC.
svc_report = classification_report(y_true=y_test, y_pred=prediction)
print("Classification Report on SVC\n", svc_report)

### Hyperparameter tuning with SVC

In [None]:
# Candidate SVC settings for the search: 5 C values x 5 gamma values x
# 3 kernels = 75 combinations.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['linear', 'rbf', 'sigmoid'],
)

In [None]:
# Fresh, unfitted SVC to serve as the base estimator for the search.
sv=SVC()

In [None]:
# Randomized search over the SVC grid with 5-fold cross-validation.
# NOTE: this rebinds rf_random, replacing the random-forest search object.
#
# param_grid contains only finite lists, so the search space is the product
# of their lengths. Asking for more iterations than that makes scikit-learn
# warn and fall back to the full grid, so cap n_iter explicitly.
search_space_size = int(np.prod([len(values) for values in param_grid.values()]))
rf_random = RandomizedSearchCV(
    estimator=sv,
    param_distributions=param_grid,
    n_iter=min(100, search_space_size),
    cv=5,
    verbose=2,
    random_state=101,
    n_jobs=1,
)

In [None]:
# Run the SVC search on the training split.
rf_random.fit(X=X_train, y=y_train)

In [None]:
# Show the hyperparameter combination the search selected.
rf_random.best_params_

In [None]:
# Predict the held-out rows with the fitted search object.
prediction = rf_random.predict(X=X_test)

In [None]:
# Confusion matrix for the tuned SVC. The heading now names what is
# actually printed.
print("Confusion Matrix on SVC\n", confusion_matrix(y_test, prediction))

In [None]:
# Per-class precision/recall/F1 for the tuned SVC.
tuned_svc_report = classification_report(y_true=y_test, y_pred=prediction)
print("Classification Report on SVC\n", tuned_svc_report)