In [41]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

import xgboost as xgb
import lightgbm as lgb

import seaborn as sns
%matplotlib inline

In [42]:
# Relative path of the preprocessed dataset consumed by every model below.
DATAPATH = 'data/final_preprocessed.csv'

# Load the whole CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=DATAPATH)

In [43]:
# Rebuild `data` from the CSV instead of slicing the existing `data` in place,
# so this cell is idempotent: the in-place slice stripped one more column on
# every re-run. The first column is discarded; it looks like a saved row
# index -- TODO confirm against the preprocessing step.
data = pd.read_csv(DATAPATH).iloc[:, 1:]
data.head()

Unnamed: 0,quality,style,norm_fixed_acidity,norm_volatile_acidity,norm_citric_acid,norm_residual_sugar,norm_chlorides,norm_free_sulfur_dioxide,norm_total_sulfur_dioxide,norm_density,norm_pH,norm_sulphates,norm_alcohol
0,0,0,0.297521,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899
1,0,0,0.330579,0.533333,0.0,0.030675,0.147841,0.083333,0.140553,0.186813,0.372093,0.258427,0.26087
2,0,0,0.330579,0.453333,0.024096,0.026074,0.137874,0.048611,0.110599,0.190669,0.418605,0.241573,0.26087
3,1,0,0.61157,0.133333,0.337349,0.019939,0.109635,0.055556,0.124424,0.209948,0.341085,0.202247,0.26087
4,0,0,0.297521,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899


In [44]:
# Separate the feature matrix from the target: `quality` is the label,
# every remaining column is a predictor.
X = data.drop(labels=['quality'], axis=1)
y = data.loc[:, 'quality']

In [45]:
# Preview the first five feature rows.
X.head(n=5)

Unnamed: 0,style,norm_fixed_acidity,norm_volatile_acidity,norm_citric_acid,norm_residual_sugar,norm_chlorides,norm_free_sulfur_dioxide,norm_total_sulfur_dioxide,norm_density,norm_pH,norm_sulphates,norm_alcohol
0,0,0.297521,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899
1,0,0.330579,0.533333,0.0,0.030675,0.147841,0.083333,0.140553,0.186813,0.372093,0.258427,0.26087
2,0,0.330579,0.453333,0.024096,0.026074,0.137874,0.048611,0.110599,0.190669,0.418605,0.241573,0.26087
3,0,0.61157,0.133333,0.337349,0.019939,0.109635,0.055556,0.124424,0.209948,0.341085,0.202247,0.26087
4,0,0.297521,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899


In [46]:
# Preview the first five target labels.
y.head(n=5)

0    0
1    0
2    0
3    1
4    0
Name: quality, dtype: int64

### Let's jump into different classifiers

#### 1. Random Forest Classifier

In [47]:
# Random forest with 100 trees of depth <= 10; the seed is fixed so that
# re-runs give the same forest.
# Author's observation: increasing depth yields better accuracy.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=0,
)
rf_clf.fit(X=X, y=y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [48]:
# Class labels predicted for the very rows the forest was fitted on, so the
# figure printed below is a training accuracy.
rf_probs = rf_clf.predict(X)
print(
    'random forest training accuracy {}'.format(
        accuracy_score(y_true=y, y_pred=rf_probs)
    )
)

random forest training accuracy 0.8482376481452978


In [49]:
# Pair every feature name with the importance the fitted forest assigned to it.
[(feature, weight) for feature, weight in zip(X.columns, rf_clf.feature_importances_)]

[('style', 0.004379549937571535),
 ('norm_fixed_acidity', 0.059556179237053905),
 ('norm_volatile_acidity', 0.11792174426509526),
 ('norm_citric_acid', 0.07299350930921135),
 ('norm_residual_sugar', 0.0712487852979196),
 ('norm_chlorides', 0.07994144859693195),
 ('norm_free_sulfur_dioxide', 0.07593316794606204),
 ('norm_total_sulfur_dioxide', 0.07541473729302452),
 ('norm_density', 0.10189172654316538),
 ('norm_pH', 0.0675307898466862),
 ('norm_sulphates', 0.07517891236900988),
 ('norm_alcohol', 0.19800944935826834)]

In [50]:
# Confusion matrix of the random forest's training predictions, wrapped in a
# DataFrame so the notebook renders it as a table.
cm = pd.DataFrame(data=confusion_matrix(y_true=y, y_pred=rf_probs))
cm

Unnamed: 0,0,1,2
0,2067,302,15
1,255,2524,57
2,27,330,920


#### 2. Logistic Regression

In [51]:
# L2-regularised logistic regression with regularisation parameter C = 1.0.
lr_clf = LogisticRegression(
    C=1.0,
    penalty='l2',
)
lr_clf.fit(X=X, y=y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [52]:
# Class labels predicted by the logistic regression on its own training rows.
lr_probs = lr_clf.predict(X)
# The label previously read "random forest", a copy-paste slip from the
# random forest cell; this score belongs to the logistic regression model.
print('logistic regression training accuracy {}'.format(accuracy_score(y, lr_probs)))

random forest training accuracy 0.5838079113436971


#### 3. XGBoost

In [53]:
# This example uses the current build of XGBoost, from https://github.com/dmlc/xgboost
# Many shallow-ish trees with a small learning rate; rows and columns are
# each subsampled at 90% per tree.
xgb_clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=1000,
    learning_rate=0.01,
    subsample=0.9,
    nthread=4,
    colsample_bytree=0.9,
    min_child_weight=1,
)

In [54]:
# Fit the boosted trees on the full dataset.
xgb_clf.fit(X=X, y=y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=7,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=4,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.9)

In [55]:
# Class labels predicted by XGBoost on its own training rows.
xgb_probs = xgb_clf.predict(X)
print(
    'XGB training accuracy {}'.format(
        accuracy_score(y_true=y, y_pred=xgb_probs)
    )
)

XGB training accuracy 0.9287363398491612


  if diff:


#### 4. lightGBM

In [56]:
# LightGBM training parameters for the 3-class problem.
param = {
    'num_leaves': 150,
    'objective': 'multiclass',
    'max_depth': 9,
    'learning_rate': .05,
    'max_bin': 200,
    'num_threads': 4,
    'num_class': 3,
    'metric': 'multi_logloss',
}

In [57]:
# Number of boosting rounds to train.
num_round = 50

# Wrap the features and labels in LightGBM's own Dataset container, then train.
train_data = lgb.Dataset(data=X, label=y)
lgbm_clf = lgb.train(
    params=param,
    train_set=train_data,
    num_boost_round=num_round,
)

In [58]:
# With the 'multiclass' objective the booster returns one row of class
# probabilities per sample.
lgbm_probs = lgbm_clf.predict(X)
# Pick the most probable class for every row in a single vectorised call
# instead of a Python-level loop over the rows.
lgb_scores = np.argmax(lgbm_probs, axis=1)

In [59]:
# Share of training rows whose most probable class matches the true label.
print(
    'LightGBM training accuracy {}'.format(
        accuracy_score(y_true=y, y_pred=lgb_scores)
    )
)

LightGBM training accuracy 0.8351546867785131


#### Accuracy details<br>
Random Forest after better parameter tuning gives a training accuracy of 84.8% <br>
Logistic Regression after better parameter tuning gives a training accuracy of 58.4% <br>
XGBoost after better parameter tuning gives a training accuracy of 92.9% <br>
LightGBM after better parameter tuning gives a training accuracy of 83.5% <br>

In [60]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)

In [61]:
# Shapes of the four pieces produced by the split, in the order
# (X_train, y_train, X_test, y_test).
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((5847, 12), (5847,), (650, 12), (650,))

In [62]:
### Random Forest
# Same forest configuration as before, now fitted on the training split only
# and scored on both the training and the held-out rows.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=0,
)
rf_clf.fit(X=X_train, y=y_train)

rf_probs1 = rf_clf.predict(X_train)
print(
    'random forest training accuracy {}'.format(
        accuracy_score(y_true=y_train, y_pred=rf_probs1)
    )
)

rf_probs2 = rf_clf.predict(X_test)
print(
    'random forest testing accuracy {}'.format(
        accuracy_score(y_true=y_test, y_pred=rf_probs2)
    )
)

random forest training accuracy 0.8556524713528305
random forest testing accuracy 0.6584615384615384


In [63]:
### XGBoost
# Same booster configuration as before, now fitted on the training split only
# and scored on both the training and the held-out rows.
xgb_clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=1000,
    learning_rate=0.01,
    subsample=0.9,
    nthread=4,
    colsample_bytree=0.9,
    min_child_weight=1,
)
xgb_clf.fit(X=X_train, y=y_train)

xgb_probs1 = xgb_clf.predict(X_train)
print(
    'XGBoost training accuracy {}'.format(
        accuracy_score(y_true=y_train, y_pred=xgb_probs1)
    )
)

xgb_probs2 = xgb_clf.predict(X_test)
print(
    'XGBoost testing accuracy {}'.format(
        accuracy_score(y_true=y_test, y_pred=xgb_probs2)
    )
)

XGBoost training accuracy 0.9368907131862494
XGBoost testing accuracy 0.7138461538461538


  if diff:
  if diff:


In [64]:
# LightGBM: same parameters as before, now trained on the training split only
# and scored on both the training and the held-out rows.
param = {'num_leaves':150, 'objective':'multiclass','max_depth':9,'learning_rate':.05,'max_bin':200,'num_threads':4,
        'num_class':3, 'metric':'multi_logloss'}
num_round = 50
train_data = lgb.Dataset(X_train,label=y_train)
lgbm_clf = lgb.train(param,train_data,num_round)

# The booster returns one row of class probabilities per sample; take the
# most probable class per row with a single vectorised argmax instead of a
# Python-level loop over the rows.
lgbm_probs1 = lgbm_clf.predict(X_train)
lgb_scores1 = np.argmax(lgbm_probs1, axis=1)
print('LightGBM training accuracy {}'.format(accuracy_score(y_train, lgb_scores1)))

lgbm_probs2 = lgbm_clf.predict(X_test)
lgb_scores2 = np.argmax(lgbm_probs2, axis=1)
print('LightGBM testing accuracy {}'.format(accuracy_score(y_test, lgb_scores2)))

LightGBM training accuracy 0.8388917393535146
LightGBM testing accuracy 0.676923076923077


#### Accuracy details after train and test split 90-10 ratio<br>
Random Forest - train accuracy - 85.6% - test accuracy - 65.8%  <br>
XGBoost       - train accuracy - 93.7% - test accuracy - 71.4%  <br>
Light GBM     - train accuracy - 83.9% - test accuracy - 67.7%  <br>

#### Conclusion

The best model overall is XGBoost, although all of the models could still be improved with a larger amount of data. The relationship between red and white wine, and the feature importances with respect to style, are given above. Taking all of these analyses into account can help improve the quality of the wine and, in turn, its market.
