# <center>My first Machine Learning project : <br>3 machine learning models to classify red wine</center>

<img src='https://hhp-blog.s3.amazonaws.com/2018/02/iStock-615737086-768x512.jpg'>

### Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from mlxtend.plotting import plot_confusion_matrix
from matplotlib.colors import LinearSegmentedColormap
from sklearn.decomposition import PCA

# Objective 1 : predict the quality score of the wine
### Dataset import

In [None]:
# Load the red-wine quality CSV into a DataFrame.
wine = pd.read_csv('../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')

In [None]:
# Preview the first rows to check the column names and value ranges.
wine.head()

### Dataset exploration

In [None]:
# Column summary: dtypes and non-null counts.
wine.info()

In [None]:
# Histogram of the target to see how the quality scores are distributed.
quality_scores = wine['quality']
quality_scores.hist(color="purple", figsize=(7, 5))
plt.show()

In [None]:
# Percentage of wines per quality score.
# normalize=True divides by the actual number of rows instead of a
# hard-coded 1599, so the result stays correct if the frame is filtered
# or a different file is loaded.
wine['quality'].value_counts(normalize=True, sort=False) * 100

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the target.
wine['quality'].describe()

> The quality scores range from 3/10 to 8/10. The target distribution is skewed to the right.<br>
The mean is 5.65/10, the median is 6/10.
96% of the wines have a score of 5/10 or more.<br>
So we can say that the wines in this dataset are favourably rated.

In [None]:
# Pairwise correlation between every column (features and target).
wine_corr = wine.corr()
# Symmetric colormap: strong correlations of either sign are drawn dark,
# values near the middle of the scale are drawn pink.
cmap = LinearSegmentedColormap.from_list(
    name='test',
    colors=['black', 'red', 'pink', 'red', 'black'])
# Mask the upper triangle (diagonal included) because the matrix is
# symmetric. The builtin `bool` replaces the `np.bool` alias, which was
# removed in NumPy 1.24 and raises AttributeError there.
mask = np.zeros_like(wine_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(wine_corr, xticklabels=wine_corr.columns,
            yticklabels=wine_corr.columns, cmap=cmap, linewidths=.5,
            annot=True, ax=ax, mask=mask, vmin=-1)
# Widen the y-limits by half a cell on each side.
# NOTE(review): presumably a workaround for heatmap rows being cut in half
# by some matplotlib releases; confirm it is still wanted.
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.show()

> Let's delete the "free sulfur dioxide" column because it looks only marginally correlated to the target and its information is already contained in the "total sulfur dioxide" column.<br>
The same goes for the "pH" column, whose information is also carried by the "fixed acidity" column (pH is a scale of acidity).

In [None]:
# Remove, in place, the two columns judged redundant above.
redundant_columns = ['free sulfur dioxide', 'pH']
wine.drop(columns=redundant_columns, inplace=True)

> As the "alcohol" column seems the most correlated to the target, let's see the lineplot

In [None]:
# Alcohol content as a function of the quality score, drawn on a wide figure.
f, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(
    data=wine,
    x="quality",
    y="alcohol",
    color='purple',
)

### Scaling

In [None]:
# Separate the target (quality score) from the explanatory features.
y = wine['quality']
X = wine.drop(columns=['quality'])

In [None]:
# Standardise every feature and keep the result as a DataFrame with the
# original column names.
# NOTE(review): the scaler is fitted on all rows before cross-validation,
# so fold statistics are not independent; a Pipeline would avoid that.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=X.columns)

### Machine learning models for classification

#### Linear SVC

In [None]:
# Out-of-fold predictions of a linear SVM over 10 cross-validation folds:
# each wine is predicted by a model that did not see it during training.
linearsvc = LinearSVC(dual=False)
pred_svc = cross_val_predict(estimator=linearsvc, X=X, y=y, cv=10)

In [None]:
# Colormap reused by the confusion-matrix plots (pink = low, purple = high).
cmap2 = LinearSegmentedColormap.from_list(name='test',
                                          colors=['pink', 'red', 'purple'])
# True quality scores versus Linear SVC out-of-fold predictions.
cm = confusion_matrix(y, pred_svc)
fig, ax = plot_confusion_matrix(
    conf_mat=cm,
    class_names=[str(score) for score in range(3, 9)],
    figsize=(10, 5),
    cmap=cmap2,
    colorbar=True,
)
plt.show()

In [None]:
# Fraction of out-of-fold Linear SVC predictions equal to the true score.
print('Accuracy score :',accuracy_score(y, pred_svc))

In [None]:
# Per-class precision, recall and F1 with 3 decimals; the printed label says
# "F1 score" but the full classification report is shown.
print('F1 score:\n',classification_report(y, pred_svc, digits=3))

#### RandomForest 

> Randomized search and Grid search to find the best hyperparameters

In [None]:
from time import time
from scipy.stats import randint as sp_randint

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# Base random forest (20 trees) handed to both hyper-parameter searches in
# this cell; the searches vary its other hyper-parameters.
clf = RandomForestClassifier(n_estimators=20)


# Utility function to report best scores
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Read through the keys 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params' (this notebook passes the
        ``cv_results_`` of a fitted search).
    n_top : int, default 3
        Highest rank to print; ranks 1..n_top are reported in order.
    """
    for rank in range(1, n_top + 1):
        # Several candidates can share a rank, so report every match.
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            header = "Model with rank: {0}".format(rank)
            score = "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][idx],
                results['std_test_score'][idx])
            params = "Parameters: {0}".format(results['params'][idx])
            # The trailing empty string yields the blank separator line.
            print(header, score, params, "", sep="\n")


# Specify parameters and distributions to sample from.
# sp_randint(low, high) draws integers in [low, high): max_features is
# sampled from 1..9 and min_samples_split from 2..10.
param_dist = {"max_depth": [1, 3, 5, None],
              "max_features": sp_randint(1, 10),
              "min_samples_split": sp_randint(2, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Run the randomized search: 20 sampled candidates, 10-fold CV each.
# The `iid` keyword is no longer passed: scikit-learn removed it from the
# search classes in 0.24 (passing it raises TypeError) and the built-in
# behaviour is the one iid=False used to select.
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=10)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

# Use a full grid over all parameters (4 * 3 * 3 * 2 * 2 = 144 candidates).
param_grid = {"max_depth": [1, 3, 5, None],
              "max_features": [2, 3, 9],
              "min_samples_split": [2, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Run the exhaustive grid search with 10-fold CV.
# The `iid` keyword is no longer passed: scikit-learn removed it from the
# search classes in 0.24 (passing it raises TypeError) and the built-in
# behaviour is the one iid=False used to select.
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=10)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

In [None]:
# Random forest with fixed hyper-parameters.
# NOTE(review): the values look copied from a search run; confirm they
# still match the best candidate reported by the searches.
rfc_params = dict(
    n_estimators=20,
    bootstrap=True,
    criterion='gini',
    max_depth=5,
    max_features=3,
    min_samples_split=10,
)
rfc = RandomForestClassifier(**rfc_params)
# Out-of-fold predictions over 10 cross-validation folds.
pred_rfc = cross_val_predict(estimator=rfc, X=X, y=y, cv=10)

In [None]:
# True quality scores versus random-forest out-of-fold predictions.
cm = confusion_matrix(y, pred_rfc)
fig, ax = plot_confusion_matrix(
    conf_mat=cm,
    class_names=[str(score) for score in range(3, 9)],
    figsize=(10, 5),
    cmap=cmap2,
    colorbar=True,
)
plt.show()

In [None]:
# Fraction of out-of-fold random-forest predictions equal to the true score.
print('Accuracy score :',accuracy_score(y, pred_rfc))

In [None]:
# Per-class precision, recall and F1 (3 decimals) for the random forest.
print('F1 score:\n',classification_report(y, pred_rfc, digits=3))

#### XGBoost

In [None]:
from sklearn.preprocessing import LabelEncoder

# Recent XGBoost releases require class labels to be exactly 0..n_classes-1
# and reject the raw quality scores (3..8); map them to 0..5 for training
# and map the predictions back afterwards.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# num_class is not passed any more: the value 10 did not match the six
# classes present, and the scikit-learn wrapper derives the class count
# from the labels it is fitted on.
xgboost = XGBClassifier(objective='multi:softmax',
                        n_jobs=-1, booster="gbtree", tree_method="hist",
                        grow_policy="depthwise")
# Out-of-fold predictions over 10 folds, expressed as quality scores again.
pred_xgboost = label_encoder.inverse_transform(
    cross_val_predict(xgboost, X, y_encoded, cv=10))

In [None]:
# True quality scores versus XGBoost out-of-fold predictions.
cm = confusion_matrix(y, pred_xgboost)
fig, ax = plot_confusion_matrix(
    conf_mat=cm,
    class_names=[str(score) for score in range(3, 9)],
    figsize=(10, 5),
    cmap=cmap2,
    colorbar=True,
)
plt.show()

In [None]:
# Fraction of out-of-fold XGBoost predictions equal to the true score.
print('Accuracy score :',accuracy_score(y, pred_xgboost))

In [None]:
# Per-class precision, recall and F1 (3 decimals) for XGBoost.
print('F1 score :\n',classification_report(y, pred_xgboost, digits=3))

#### Predicted target distribution 

In [None]:
# Store the Linear SVC predictions as a new column of X and plot their
# distribution.
# NOTE(review): X is also the feature matrix used for fitting, so this
# prediction column has to be kept out of model inputs.
X['pred_svc'] = pred_svc
X['pred_svc'].hist()

In [None]:
# Columns that hold model predictions rather than wine measurements.
# Fitting on them would leak the target into the model and show them as
# bogus "features" in the plot, so they are removed first
# (errors='ignore' keeps the cell valid whichever of them exist).
prediction_columns = ['pred_svc', 'pred_rfc', 'pred_xgboost']
features = X.drop(columns=prediction_columns, errors='ignore')

linearsvc.fit(features, y)
feature = features.columns
importance = linearsvc.coef_
# With more than two classes coef_ has one row per class; label the rows
# with the actual quality scores instead of 0..n-1.
feat_imp = pd.DataFrame(importance, index=linearsvc.classes_, columns=feature)
feat_imp.plot(kind="bar", figsize=(12, 4))

In [None]:
# Store the random-forest predictions as a new column of X and plot their
# distribution.
# NOTE(review): X is also the feature matrix used for fitting, so this
# prediction column has to be kept out of model inputs.
X['pred_rfc'] = pred_rfc
X['pred_rfc'].hist()

In [None]:
# Columns that hold model predictions rather than wine measurements.
# Fitting on them would leak the target into the forest and let them
# dominate the importances, so they are removed first
# (errors='ignore' keeps the cell valid whichever of them exist).
prediction_columns = ['pred_svc', 'pred_rfc', 'pred_xgboost']
features = X.drop(columns=prediction_columns, errors='ignore')

rfc.fit(features, y)
feature = features.columns
importance = rfc.feature_importances_
# One importance value per real feature, indexed by feature name.
feat_imp = pd.DataFrame(importance, feature, columns=['Importance'])
feat_imp.plot(kind="bar", figsize=(12, 4))

In [None]:
# Store the XGBoost predictions as a new column of X and plot their
# distribution.
# NOTE(review): X is also the feature matrix used for fitting, so this
# prediction column has to be kept out of model inputs.
X['pred_xgboost'] = pred_xgboost
X['pred_xgboost'].hist()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns that hold model predictions rather than wine measurements.
# Fitting on them would leak the target into the model and let them
# dominate the importances, so they are removed first
# (errors='ignore' keeps the cell valid whichever of them exist).
prediction_columns = ['pred_svc', 'pred_rfc', 'pred_xgboost']
features = X.drop(columns=prediction_columns, errors='ignore')

# Recent XGBoost releases require labels 0..n_classes-1, so the quality
# scores are encoded for the fit; importances do not depend on the naming
# of the classes.
xgboost.fit(features, LabelEncoder().fit_transform(y))
feature = features.columns
importance = xgboost.feature_importances_
# One importance value per real feature, indexed by feature name.
feat_imp = pd.DataFrame(importance, feature, columns=['Importance'])
feat_imp.plot(kind="bar", figsize=(12, 4))

# Objective n°2 : predict if a wine is good or bad

> This will allow us to get better scores

In [None]:
# Reload the CSV so that objective 2 starts from the full set of columns
# (two of them were dropped in place earlier in the notebook).
wine = pd.read_csv('../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
# Globally silence pandas' chained-assignment warning.
pd.set_option('mode.chained_assignment', None)

## Data transformation : binarizing the target to 0 (bad) and 1 (good)

In [None]:
# Binarise the target: 1 ("good") when quality > 5, otherwise 0 ("bad").
# A single vectorised assignment replaces the chained
# `wine.quality[mask] = value` writes, which pandas does not guarantee to
# reach the original frame (they do nothing under copy-on-write).
# The original integer dtype of the column is preserved.
is_good = wine['quality'] > 5
wine['quality'] = is_good.astype(wine['quality'].dtype)

In [None]:
# Check that the quality column now only contains 0 and 1.
wine.head()

### Scaling and feature/target split

In [None]:
# Separate the binary target from the explanatory features.
y = wine['quality']
X = wine.drop(columns=['quality'])

In [None]:
# Standardise every feature and keep the result as a DataFrame with the
# original column names.
# NOTE(review): the scaler is fitted on all rows before cross-validation,
# so fold statistics are not independent; a Pipeline would avoid that.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=X.columns)

### Machine learning models for classification

#### Linear SVC 

In [None]:
# Out-of-fold good/bad predictions of a linear SVM over 10 folds.
linearsvc = LinearSVC(dual=False)
pred_svc = cross_val_predict(estimator=linearsvc, X=X, y=y, cv=10)

In [None]:
# Colormap reused by the confusion-matrix plots (pink = low, purple = high).
cmap2 = LinearSegmentedColormap.from_list(name='test',
                                          colors=['pink', 'red', 'purple'])
# True labels versus Linear SVC out-of-fold predictions (0 = bad, 1 = good).
cm = confusion_matrix(y, pred_svc)
fig, ax = plot_confusion_matrix(
    conf_mat=cm,
    class_names=[str(label) for label in (0, 1)],
    figsize=(10, 5),
    cmap=cmap2,
    colorbar=True,
)
plt.show()

In [None]:
# Fraction of out-of-fold Linear SVC predictions equal to the true label.
print('Accuracy score :',accuracy_score(y, pred_svc))

In [None]:
# Precision, recall and F1 per class (3 decimals) for the Linear SVC.
print('F1 score :\n',classification_report(y, pred_svc, digits=3))

#### RandomForest 

In [None]:
from time import time
from scipy.stats import randint as sp_randint

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# Base random forest (20 trees) handed to both hyper-parameter searches in
# this cell; the searches vary its other hyper-parameters.
clf = RandomForestClassifier(n_estimators=20)


# Utility function to report best scores
def report(results, n_top=3):
    """Print the top-ranked candidates found by a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Read through the keys 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params' (this notebook passes the
        ``cv_results_`` of a fitted search).
    n_top : int, default 3
        Ranks 1 through n_top are printed, best first.
    """
    for rank in range(1, n_top + 1):
        # Candidates tied on a rank are all listed.
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            lines = [
                "Model with rank: {0}".format(rank),
                "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    results['mean_test_score'][idx],
                    results['std_test_score'][idx]),
                "Parameters: {0}".format(results['params'][idx]),
                "",  # blank separator line after each candidate
            ]
            print("\n".join(lines))


# Specify parameters and distributions to sample from.
# sp_randint(low, high) draws integers in [low, high): max_features is
# sampled from 1..9 and min_samples_split from 2..10.
param_dist = {"max_depth": [1, 3, 5, None],
              "max_features": sp_randint(1, 10),
              "min_samples_split": sp_randint(2, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Run the randomized search: 20 sampled candidates, 10-fold CV each.
# The `iid` keyword is no longer passed: scikit-learn removed it from the
# search classes in 0.24 (passing it raises TypeError) and the built-in
# behaviour is the one iid=False used to select.
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=10)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

# Use a full grid over all parameters (4 * 3 * 3 * 2 * 2 = 144 candidates).
param_grid = {"max_depth": [1, 3, 5, None],
              "max_features": [2, 3, 9],
              "min_samples_split": [2, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Run the exhaustive grid search with 10-fold CV.
# The `iid` keyword is no longer passed: scikit-learn removed it from the
# search classes in 0.24 (passing it raises TypeError) and the built-in
# behaviour is the one iid=False used to select.
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=10)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

In [None]:
# Random forest with fixed hyper-parameters.
# NOTE(review): the values look copied from a search run; confirm they
# still match the best candidate reported by the searches.
rfc_params = dict(
    n_estimators=20,
    bootstrap=True,
    criterion='gini',
    max_depth=3,
    max_features=4,
    min_samples_split=10,
)
rfc = RandomForestClassifier(**rfc_params)
# Out-of-fold predictions over 10 cross-validation folds.
pred_rfc = cross_val_predict(estimator=rfc, X=X, y=y, cv=10)

In [None]:
# True labels versus random-forest out-of-fold predictions (0 = bad, 1 = good).
cm = confusion_matrix(y, pred_rfc)
fig, ax = plot_confusion_matrix(
    conf_mat=cm,
    class_names=[str(label) for label in (0, 1)],
    figsize=(10, 5),
    cmap=cmap2,
    colorbar=True,
)
plt.show()

In [None]:
# Fraction of out-of-fold random-forest predictions equal to the true label.
print('Accuracy score :',accuracy_score(y, pred_rfc))

In [None]:
# Precision, recall and F1 per class (3 decimals) for the random forest.
print('F1 score:\n',classification_report(y, pred_rfc, digits=3))

#### XGBoost

In [None]:
# Binary good/bad classifier. 'binary:logistic' replaces the regression
# objective 'reg:squarederror': the model is now trained with the logistic
# loss and outputs class probabilities, instead of fitting the 0/1 labels
# by least squares.
xgboost = XGBClassifier(objective='binary:logistic', n_jobs=-1,
                        booster="gbtree", tree_method="hist",
                        grow_policy="depthwise")
# Out-of-fold predictions over 10 cross-validation folds.
pred_xgboost = cross_val_predict(xgboost, X, y, cv=10)

In [None]:
# True labels versus XGBoost out-of-fold predictions (0 = bad, 1 = good).
cm = confusion_matrix(y, pred_xgboost)
fig, ax = plot_confusion_matrix(
    conf_mat=cm,
    class_names=[str(label) for label in (0, 1)],
    figsize=(10, 5),
    cmap=cmap2,
    colorbar=True,
)
plt.show()

In [None]:
# Fraction of out-of-fold XGBoost predictions equal to the true label.
print('Accuracy score :',accuracy_score(y, pred_xgboost))

In [None]:
# Precision, recall and F1 per class (3 decimals) for XGBoost.
print('F1 score :\n',classification_report(y, pred_xgboost, digits=3))

#### Predicted target distribution

In [None]:
# Store the Linear SVC predictions as a new column of X and plot their
# distribution.
# NOTE(review): X is also the feature matrix used for fitting, so this
# prediction column has to be kept out of model inputs.
X['pred_svc'] = pred_svc
X['pred_svc'].hist()

In [None]:
# Columns that hold model predictions rather than wine measurements.
# Fitting on them would leak the target into the model and show them as
# bogus "features" in the plot, so they are removed first
# (errors='ignore' keeps the cell valid whichever of them exist).
prediction_columns = ['pred_svc', 'pred_rfc', 'pred_xgboost']
features = X.drop(columns=prediction_columns, errors='ignore')

linearsvc.fit(features, y)
feature = features.columns
importance = linearsvc.coef_
# One column per real feature, holding the fitted linear coefficients.
feat_imp = pd.DataFrame(importance, columns=feature)
feat_imp.plot(kind="bar", figsize=(12, 4))

In [None]:
# Store the random-forest predictions as a new column of X and plot their
# distribution.
# NOTE(review): X is also the feature matrix used for fitting, so this
# prediction column has to be kept out of model inputs.
X['pred_rfc'] = pred_rfc
X['pred_rfc'].hist()

In [None]:
# Columns that hold model predictions rather than wine measurements.
# Fitting on them would leak the target into the forest and let them
# dominate the importances, so they are removed first
# (errors='ignore' keeps the cell valid whichever of them exist).
prediction_columns = ['pred_svc', 'pred_rfc', 'pred_xgboost']
features = X.drop(columns=prediction_columns, errors='ignore')

rfc.fit(features, y)
feature = features.columns
importance = rfc.feature_importances_
# One importance value per real feature, indexed by feature name.
feat_imp = pd.DataFrame(importance, feature, columns=['Importance'])
feat_imp.plot(kind="bar", figsize=(12, 4))

In [None]:
# Store the XGBoost predictions as a new column of X and plot their
# distribution.
# NOTE(review): X is also the feature matrix used for fitting, so this
# prediction column has to be kept out of model inputs.
X['pred_xgboost'] = pred_xgboost
X['pred_xgboost'].hist()

In [None]:
# Columns that hold model predictions rather than wine measurements.
# Fitting on them would leak the target into the model and let them
# dominate the importances, so they are removed first
# (errors='ignore' keeps the cell valid whichever of them exist).
prediction_columns = ['pred_svc', 'pred_rfc', 'pred_xgboost']
features = X.drop(columns=prediction_columns, errors='ignore')

xgboost.fit(features, y)
feature = features.columns
importance = xgboost.feature_importances_
# One importance value per real feature, indexed by feature name.
feat_imp = pd.DataFrame(importance, feature, columns=['Importance'])
feat_imp.plot(kind="bar", figsize=(12, 4))

> To conclude: as expected, objective n°2 has better scores, because the target has become binary.<br>
The general quality of the wines in this dataset is positively skewed, but the Random Forest algorithm succeeded in rendering a similar distribution, in spite of the low quantity of data.

# Thank you for reading, please leave a comment if you see a way to improve my code, it's always fun to learn something new !