# Red Wine Classification

In [None]:
# necessary imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

sns.set()
%matplotlib inline

In [None]:
# read the red-wine quality CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv('../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv') # loading data

In [None]:
# preview the first rows to check that the file was parsed as expected
df.head()

In [None]:
# column dtypes and non-null counts
df.info()

In [None]:
# summary statistics for the numeric columns
df.describe()

It looks like there are no missing values.

In [None]:
# number of missing values in each column
df.isna().sum()

In [None]:
# let's see how data is distributed for every column

plt.figure(figsize = (25, 20))

# one panel per column on a 4 x 3 grid; columns beyond the 12th are not drawn
# NOTE(review): distplot appears to be deprecated in newer seaborn releases
# (histplot/displot are the replacements) - confirm against the installed version.
for plotnumber, col in enumerate(df.columns[:12], start = 1):
    ax = plt.subplot(4, 3, plotnumber)
    sns.distplot(df[col])
    plt.xlabel(col, fontsize = 15)

plt.tight_layout()
plt.show()

In [None]:
# heatmap

# pairwise correlation between all columns
corr = df.corr()

# True marks the cells to hide: the upper triangle including the diagonal,
# so each pair of columns is shown only once
mask = np.triu(np.ones(corr.shape, dtype = bool))

plt.figure(figsize = (16, 7))
sns.heatmap(data = corr, mask = mask, annot = True, fmt = '0.2g', linewidths = 1)
plt.show()

In [None]:
# creating X and y

# target is the quality score; every remaining column is a feature
y = df['quality']
X = df.drop(columns = 'quality')

In [None]:
# splitting data into training and test set

from sklearn.model_selection import train_test_split

# 75/25 split, stratified on the target so both splits keep the same class proportions.
# random_state is fixed so the split (and therefore the scores reported below) is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify = y, random_state = 42)

In [None]:
# scaling data

from sklearn.preprocessing import StandardScaler

# learn the scaling statistics on the training split only, then apply that same
# transform to both splits so no test-set information is used during fitting
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# multinomial logistic regression with the lbfgs solver, fitted on the scaled training features
lr = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs')
lr.fit(X_train, y_train)

In [None]:
# accuracy score

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Accuracy of logistic regression on both splits.
# lr_acc (test accuracy) is reused in the model comparison table at the end.
lr_train_acc = accuracy_score(y_train, lr.predict(X_train))
lr_acc = accuracy_score(y_test, lr.predict(X_test))
print(f"Accuracy Score of Training Data is {lr_train_acc}")
print(f"Accuracy Score of Test Data is {lr_acc}\n")

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with the library's default settings
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [None]:
# accuracy score

# Accuracy of KNN on both splits.
# knn_acc (test accuracy) is reused in the model comparison table at the end.
knn_train_acc = accuracy_score(y_train, knn.predict(X_train))
knn_acc = accuracy_score(y_test, knn.predict(X_test))
print(f"Accuracy Score of Training Data is {knn_train_acc}")
print(f"Accuracy Score of Test Data is {knn_acc}\n")

### SVC

In [None]:
from sklearn.svm import SVC

# support vector classifier with the library's default settings
svc = SVC()
svc.fit(X_train, y_train)

In [None]:
# accuracy score

# Accuracy of the SVC on both splits.
# svc_acc (test accuracy) is reused in the model comparison table at the end.
svc_train_acc = accuracy_score(y_train, svc.predict(X_train))
svc_acc = accuracy_score(y_test, svc.predict(X_test))
print(f"Accuracy Score of Training Data is {svc_train_acc}")
print(f"Accuracy Score of Test Data is {svc_acc}\n")

### SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import GridSearchCV

# Exhaustive grid search over the SGD classifier: 5 alphas x 2 losses x 2 penalties
# = 20 candidate settings, each scored with 10-fold cross-validation on the training split.
# NOTE(review): newer scikit-learn releases appear to have renamed the 'log' loss to
# 'log_loss' - confirm against the installed version.
sgd = SGDClassifier()
parameters = {
    'alpha' : [0.0001, 0.001, 0.01, 0.1, 1],
    'loss' : ['hinge', 'log'],
    'penalty' : ['l1', 'l2']
}

# n_jobs = -1 asks for all available cores
grid_search = GridSearchCV(sgd, parameters, cv = 10, n_jobs = -1)
grid_search.fit(X_train, y_train)

In [None]:
# best parameter combination found by the SGD grid search and its mean cross-validated score

print(grid_search.best_params_)
print(grid_search.best_score_)

In [None]:
# Refit SGD with a fixed configuration.
# NOTE(review): these values are hard-coded, presumably from an earlier grid-search
# run - verify against grid_search.best_params_ printed above.
sgd = SGDClassifier(alpha = 0.01, loss = 'log', penalty = 'l2')
sgd.fit(X_train, y_train)

# training accuracy
print(accuracy_score(y_train, sgd.predict(X_train)))

# predict the test split once and reuse the result for the test accuracy;
# sgd_acc is reused in the model comparison table at the end
y_pred = sgd.predict(X_test)
sgd_acc = accuracy_score(y_test, y_pred)
print(sgd_acc)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# decision tree with the library's default settings; `dtc` is also passed to AdaBoost further down
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

In [None]:
# accuracy score

# Accuracy of the decision tree on both splits.
# dtc_acc (test accuracy) is reused in the model comparison table at the end.
dtc_train_acc = accuracy_score(y_train, dtc.predict(X_train))
dtc_acc = accuracy_score(y_test, dtc.predict(X_test))
print(f"Accuracy Score of Training Data is {dtc_train_acc}")
print(f"Accuracy Score of Test Data is {dtc_acc}\n")

In [None]:
# dtc = DecisionTreeClassifier(criterion = 'gini', max_depth = 11, min_samples_leaf = 1, min_samples_split = 2, splitter = 'best')
# dtc.fit(X_train, y_train)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# random forest with the library's default settings
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [None]:
# accuracy score

# Accuracy of the random forest on both splits.
# rf_acc (test accuracy) is reused in the model comparison table at the end.
rf_train_acc = accuracy_score(y_train, rf.predict(X_train))
rf_acc = accuracy_score(y_test, rf.predict(X_test))
print(f"Accuracy Score of Training Data is {rf_train_acc}")
print(f"Accuracy Score of Test Data is {rf_acc}\n")

### Ada Boost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost using the decision tree `dtc` as its base estimator, all other settings at their defaults
ada = AdaBoostClassifier(base_estimator = dtc)
ada.fit(X_train, y_train)

In [None]:
# accuracy score

# Accuracy of the untuned AdaBoost model on both splits.
# ada_acc is assigned again further down, after the tuned AdaBoost model is evaluated.
ada_train_acc = accuracy_score(y_train, ada.predict(X_train))
ada_acc = accuracy_score(y_test, ada.predict(X_test))
print(f"Accuracy Score of Training Data is {ada_train_acc}")
print(f"Accuracy Score of Test Data is {ada_acc}\n")

In [None]:
# Hyper-parameter tuning for AdaBoost using grid search CV:
# 5 x 6 x 2 = 60 candidate settings, each scored with 5-fold cross-validation.

grid_param = {
    'n_estimators' : [40, 50, 70, 80, 100],
    'learning_rate' : [0.01, 0.1, 0.05, 0.5, 1, 10],
    'algorithm' : ['SAMME', 'SAMME.R']
}

# reuses the name `grid_search`, replacing the SGD search object created earlier
grid_search = GridSearchCV(ada, grid_param, cv = 5, n_jobs = -1, verbose = 1)
grid_search.fit(X_train, y_train)

In [None]:
# best parameter combination found by the AdaBoost grid search and its mean cross-validated score

print(grid_search.best_params_)
print(grid_search.best_score_)

In [None]:
# Rebuild AdaBoost with tuned settings on top of the decision tree `dtc`.
# The base estimator must be the tree: passing `ada` itself would boost an
# already-boosted ensemble (AdaBoost nested inside AdaBoost).
# NOTE(review): the hyper-parameters are hard-coded, presumably from an earlier
# grid-search run - verify against grid_search.best_params_ printed above.
ada = AdaBoostClassifier(base_estimator = dtc, algorithm = 'SAMME.R', learning_rate = 1, n_estimators = 100)
ada.fit(X_train, y_train)

In [None]:
# accuracy score

# Accuracy of the tuned AdaBoost model on both splits.
# This ada_acc (test accuracy) is the value used in the model comparison table at the end.
ada_train_acc = accuracy_score(y_train, ada.predict(X_train))
ada_acc = accuracy_score(y_test, ada.predict(X_test))
print(f"Accuracy Score of Training Data is {ada_train_acc}")
print(f"Accuracy Score of Test Data is {ada_acc}\n")

### Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# gradient boosting with the library's default settings
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

In [None]:
# accuracy score

# Accuracy of gradient boosting on both splits.
# gb_acc (test accuracy) is reused in the model comparison table at the end.
gb_train_acc = accuracy_score(y_train, gb.predict(X_train))
gb_acc = accuracy_score(y_test, gb.predict(X_test))
print(f"Accuracy Score of Training Data is {gb_train_acc}")
print(f"Accuracy Score of Test Data is {gb_acc}\n")

### Stochastic Gradient Boosting (SGB)

In [None]:
# stochastic variant of gradient boosting: subsample = 0.9 fits each tree on a
# 90% sample of the training rows, max_features = 0.8 considers 80% of the features per split
sgb = GradientBoostingClassifier(subsample = 0.9, max_features = 0.8)
sgb.fit(X_train, y_train)

In [None]:
# accuracy score

# Accuracy of stochastic gradient boosting on both splits.
# sgb_acc (test accuracy) is reused in the model comparison table at the end.
sgb_train_acc = accuracy_score(y_train, sgb.predict(X_train))
sgb_acc = accuracy_score(y_test, sgb.predict(X_test))
print(f"Accuracy Score of Training Data is {sgb_train_acc}")
print(f"Accuracy Score of Test Data is {sgb_acc}\n")

### XgBoost

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier. The former `loss = 'deviance'` argument is dropped: it is a
# scikit-learn GradientBoostingClassifier option, not an XGBoost parameter.
# NOTE(review): recent XGBoost releases appear to require class labels encoded as
# 0..n_classes-1; the raw quality labels may need a LabelEncoder - confirm against
# the installed version.
xgb = XGBClassifier(learning_rate = 0.1, n_estimators = 100)
xgb.fit(X_train, y_train)

In [None]:
# accuracy score

# Accuracy of XGBoost on both splits.
# xgb_acc (test accuracy) is reused in the model comparison table at the end.
xgb_train_acc = accuracy_score(y_train, xgb.predict(X_train))
xgb_acc = accuracy_score(y_test, xgb.predict(X_test))

print(f"Accuracy Score of Training Data is {xgb_train_acc}")
print(f"Accuracy Score of Test Data is {xgb_acc}\n")

### Light Gradient Boosting Classifier

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier with the library's default settings
lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)

# lgbm_acc (test accuracy) is reused in the model comparison table at the end
lgbm_acc = accuracy_score(y_test, lgbm.predict(X_test))

# the messages name the model actually evaluated here (previously mislabelled as Decision Tree)
print(f"Training Accuracy of LightGBM Classifier is {accuracy_score(y_train, lgbm.predict(X_train))}")
print(f"Test Accuracy of LightGBM Classifier is {lgbm_acc} \n")

### Cat Boost Classifier

In [None]:
from catboost import CatBoostClassifier

# CatBoost classifier with the library's default settings
cat = CatBoostClassifier()
cat.fit(X_train, y_train)

In [None]:
# cat_acc (test accuracy) is reused in the model comparison table at the end
cat_acc = accuracy_score(y_test, cat.predict(X_test))

# the messages name the model actually evaluated here (previously mislabelled as Decision Tree)
print(f"Training Accuracy of Cat Boost Classifier is {accuracy_score(y_train, cat.predict(X_train))}")
print(f"Test Accuracy of Cat Boost Classifier is {cat_acc} \n")

### Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# extra-trees ensemble with the library's default settings
etc = ExtraTreesClassifier()
etc.fit(X_train, y_train)

In [None]:
# etc_acc (test accuracy) is reused in the model comparison table at the end
etc_acc = accuracy_score(y_test, etc.predict(X_test))

# the messages name the model actually evaluated here (previously mislabelled as Decision Tree)
print(f"Training Accuracy of Extra Trees Classifier is {accuracy_score(y_train, etc.predict(X_train))}")
print(f"Test Accuracy of Extra Trees Classifier is {etc_acc} \n")

In [None]:
# test accuracy of every model, keyed by display name so each label stays
# next to its own score
model_scores = {
    'Logistic Regression': lr_acc,
    'KNN': knn_acc,
    'SVC': svc_acc,
    'SGD': sgd_acc,
    'Decision Tree': dtc_acc,
    'Random Forest': rf_acc,
    'Ada Boost': ada_acc,
    'Gradient Boosting': gb_acc,
    'SGB': sgb_acc,
    'XgBoost': xgb_acc,
    'LGBM': lgbm_acc,
    'Cat Boost': cat_acc,
    'Extra Tree': etc_acc,
}

models = pd.DataFrame({
    'Model' : list(model_scores.keys()),
    'Score' : list(model_scores.values()),
})

# display the table ranked from best to worst; `models` itself keeps the original order
models.sort_values(by = 'Score', ascending = False)

In [None]:
# bar chart of test accuracy per model, in the order the models were trained
plt.figure(figsize = (20, 8))

ax = sns.barplot(data = models, x = 'Model', y = 'Score')
# the y-axis is restricted to [0.45, 0.75] so differences between models are easier to see
ax.set_ylim(0.45, 0.75)
plt.show()

#### If you like my kernel, please do upvote

### Thank You