In this part of the file I'll write the model code and measure the accuracy of each model's predictions.

In [57]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


# GridSearchCV lives in sklearn.model_selection; the legacy sklearn.grid_search
# module was deprecated in scikit-learn 0.18 and removed in 0.20.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

import xgboost as xgb
import lightgbm as lgb

import seaborn as sns
%matplotlib inline

In [58]:
# Path of the preprocessed dataset (relative path, resolved against the working directory).
DATAPATH = 'data/basic_preprocessed.csv'

# Read the whole CSV into a DataFrame using pandas' default parsing options.
data = pd.read_csv(filepath_or_buffer=DATAPATH)

In [59]:
# Drop the leading column of the loaded CSV so the frame starts at the 'quality'
# target (presumably a saved index column -- confirm against the preprocessing step).
# The guard keeps this cell idempotent: an unconditional `data = data.iloc[:, 1:]`
# strips one more column on every re-run, silently removing 'quality' and then features.
if data.columns[0] != 'quality':
    data = data.iloc[:, 1:]
data.head()

Unnamed: 0,quality,norm_fixed_acidity,norm_volatile_acidity,norm_citric_acid,norm_residual_sugar,norm_chlorides,norm_free_sulfur_dioxide,norm_total_sulfur_dioxide,norm_density,norm_pH,norm_sulphates,norm_alcohol,red,white
0,1,0.297521,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899,1,0
1,1,0.330579,0.533333,0.0,0.030675,0.147841,0.083333,0.140553,0.186813,0.372093,0.258427,0.26087,1,0
2,1,0.330579,0.453333,0.024096,0.026074,0.137874,0.048611,0.110599,0.190669,0.418605,0.241573,0.26087,1,0
3,0,0.61157,0.133333,0.337349,0.019939,0.109635,0.055556,0.124424,0.209948,0.341085,0.202247,0.26087,1,0
4,1,0.297521,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899,1,0


In [60]:
# Feature matrix: every column except the target.
X = data.drop(columns=['quality'])
# Target vector: the 'quality' class label.
y = data['quality']

In [61]:
# Preview the first five feature rows.
X.head(n=5)

Unnamed: 0,norm_fixed_acidity,norm_volatile_acidity,norm_citric_acid,norm_residual_sugar,norm_chlorides,norm_free_sulfur_dioxide,norm_total_sulfur_dioxide,norm_density,norm_pH,norm_sulphates,norm_alcohol,red,white
0,0.297521,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899,1,0
1,0.330579,0.533333,0.0,0.030675,0.147841,0.083333,0.140553,0.186813,0.372093,0.258427,0.26087,1,0
2,0.330579,0.453333,0.024096,0.026074,0.137874,0.048611,0.110599,0.190669,0.418605,0.241573,0.26087,1,0
3,0.61157,0.133333,0.337349,0.019939,0.109635,0.055556,0.124424,0.209948,0.341085,0.202247,0.26087,1,0
4,0.297521,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899,1,0


In [62]:
# Preview the first five target labels.
y.head(n=5)

0    1
1    1
2    1
3    0
4    1
Name: quality, dtype: int64

### Let's jump into different classifiers

#### 1. Random Forest Classifier

In [63]:
# Small forest: ten trees capped at depth 2, seeded so repeated runs give the same model.
rf_params = {'n_estimators': 10, 'max_depth': 2, 'random_state': 0}
rf_clf = RandomForestClassifier(**rf_params)
rf_clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [64]:
# NOTE: despite its name, rf_probs holds predicted class labels (from predict), not probabilities.
rf_probs = rf_clf.predict(X)
# Accuracy is computed on the same X/y passed to predict above, i.e. it is an in-sample score.
rf_train_accuracy = accuracy_score(y, rf_probs)
print('random forest training accuracy {}'.format(rf_train_accuracy))

random forest training accuracy 0.7029398183777128


In [65]:
# Pair each feature column name with the importance the fitted forest assigned to it.
[(feature, importance) for feature, importance in zip(X.columns, rf_clf.feature_importances_)]

[('norm_fixed_acidity', 0.006336210636877726),
 ('norm_volatile_acidity', 0.1696735700512086),
 ('norm_citric_acid', 0.020544008540987254),
 ('norm_residual_sugar', 0.008552169994364908),
 ('norm_chlorides', 0.18514934204750633),
 ('norm_free_sulfur_dioxide', 0.020689439445125146),
 ('norm_total_sulfur_dioxide', 0.0),
 ('norm_density', 0.1817711645423561),
 ('norm_pH', 0.006321497695115736),
 ('norm_sulphates', 0.0),
 ('norm_alcohol', 0.4009625970464582),
 ('red', 0.0),
 ('white', 0.0)]

We can see from the feature importances above that columns like red, white, norm_sulphates, norm_total_sulfur_dioxide and
norm_fixed_acidity have little or no importance. So we could also run the training without these columns, which would
save computation time.

The accuracy above comes from a random forest trained on only the basic features; let's try other models like XGBoost, lightGBM, CatBoost

In [66]:
# Confusion matrix of the random-forest predictions against the labels
# (scikit-learn convention: rows are true classes, columns are predicted classes).
rf_confusion = confusion_matrix(y, rf_probs)
cm = pd.DataFrame(rf_confusion)
cm

Unnamed: 0,0,1,2
0,3466,449,0
1,1283,1101,0
2,196,2,0


#### 2. Logistic Regression

In [67]:
# L2-penalised logistic regression with C=1.0.
lr_params = {'penalty': 'l2', 'C': 1.0}
lr_clf = LogisticRegression(**lr_params)
lr_clf.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [68]:
# NOTE: despite its name, lr_probs holds predicted class labels (from predict), not probabilities.
lr_probs = lr_clf.predict(X)
# Accuracy is computed on the same X/y passed to predict above, i.e. it is an in-sample score.
lr_train_accuracy = accuracy_score(y, lr_probs)
print('LR training accuracy {}'.format(lr_train_accuracy))

LR training accuracy 0.7140218562413422


#### 3. XGBoost

In [69]:
# Uses the current build of XGBoost, from https://github.com/dmlc/xgboost
# Configuration: 1000 boosting rounds of depth-7 trees at learning rate 0.01,
# with row and per-tree column subsampling both set to 0.9, on 4 threads.
xgb_params = {
    'max_depth': 7,
    'n_estimators': 1000,
    'learning_rate': 0.01,
    'subsample': 0.9,
    'nthread': 4,
    'colsample_bytree': 0.9,
    'min_child_weight': 1,
}
xgb_clf = xgb.XGBClassifier(**xgb_params)

In [70]:
# Fit the boosted-tree classifier on the full feature matrix and labels.
xgb_clf.fit(X=X, y=y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=7,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=4,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.9)

In [71]:
# NOTE: despite its name, xgb_probs holds predicted class labels (from predict), not probabilities.
xgb_probs = xgb_clf.predict(X)
# Accuracy is computed on the same X/y passed to predict above, i.e. it is an in-sample score.
xgb_train_accuracy = accuracy_score(y, xgb_probs)
print('XGBoost training accuracy {}'.format(xgb_train_accuracy))

XGBoost training accuracy 0.9413575496382945


  if diff:


#### 4. lightGBM

In [72]:
# LightGBM training parameters: a 3-class 'multiclass' objective evaluated
# with multi-class log loss; tree size is bounded by num_leaves and max_depth.
param = {
    'num_leaves': 150,
    'objective': 'multiclass',
    'max_depth': 7,
    'learning_rate': 0.05,
    'max_bin': 200,
    'num_threads': 4,
    'num_class': 3,
    'metric': 'multi_logloss',
}

In [73]:
# Number of boosting iterations to run.
num_round = 50
# Wrap the features and labels in LightGBM's Dataset container, then train the booster.
train_data = lgb.Dataset(data=X, label=y)
lgbm_clf = lgb.train(params=param, train_set=train_data, num_boost_round=num_round)

In [74]:
# Booster output for every row of X; each row is reduced to a single label below.
lgbm_probs = lgbm_clf.predict(X)
# Predicted label = position of the largest value in each row, kept as a plain list.
lgb_scores = list(np.argmax(lgbm_probs, axis=1))

In [75]:
# Compare the argmax labels with the true labels and report the fraction that match.
lgbm_train_accuracy = accuracy_score(y, lgb_scores)
print('lightGBM training accuracy {}'.format(lgbm_train_accuracy))

lightGBM training accuracy 0.8263814068031399
