In [1]:
import sklearn
import numpy 
import pandas
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

### Load the data into Python:

In [2]:
db = pandas.read_csv('winequality-red.csv', header=0, delimiter=';')
db = db.drop(['Unnamed: 0'], axis=1)

### Split the labels and features; make a classification task instead of regression

In [3]:
y = db['quality']
x = db.drop(['quality'],axis=1)
dic = {1:False,2:False,3:False,4:False,5:False,6:True,7:True,8:True,9:True}
y = y.map(dic)

### Split the dataset for training and testing

In [4]:
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.75)



### Use Random Forest Classifier and find the most accurate settings by maximizing F_score

### Using `GridSearchCV` from `Scikit-learn`

In [5]:
parameters = {'max_features':('sqrt', 'log2', None), 'n_estimators':[100, 500, 1000], 'class_weight':({False:1,True:9}, {False:2,True:8}, {False:3,True:7}, {False:4,True:6}, {False:1,True:1}, {False:6,True:4}, {False:7,True:3}, {False:8,True:2}, {False:9,True:1})}
RFC = RandomForestClassifier()
RFC_GridSearch = GridSearchCV(RFC, parameters, scoring='f1_macro')
RFC_GridSearch.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'class_weight': ({False: 1, True: 9}, {False: 2, True: 8}, {False: 3, True: 7}, {False: 4, True: 6}, {False: 1, True: 1}, {False: 6, True: 4}, {False: 7, True: 3}, {False: 8, True: 2}, {False: 9, True: 1}), 'max_features': ('sqrt', 'log2', None), 'n_estimators': [100, 500, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_macro', verbose=0)

In [6]:
for dicts in RFC_GridSearch.grid_scores_:
    print ("Params: " , dicts[0], "Score: %.2f"%dicts[1])
print ( "\n Best f1_score: %.2f"%f1_score(y_test,RFC_GridSearch.best_estimator_.predict(X_test)), "\n Best estimator: ", RFC_GridSearch.best_estimator_)

Params:  {'class_weight': {False: 1, True: 9}, 'max_features': 'sqrt', 'n_estimators': 100} Score: 0.78
Params:  {'class_weight': {False: 1, True: 9}, 'max_features': 'sqrt', 'n_estimators': 500} Score: 0.79
Params:  {'class_weight': {False: 1, True: 9}, 'max_features': 'sqrt', 'n_estimators': 1000} Score: 0.79
Params:  {'class_weight': {False: 1, True: 9}, 'max_features': 'log2', 'n_estimators': 100} Score: 0.79
Params:  {'class_weight': {False: 1, True: 9}, 'max_features': 'log2', 'n_estimators': 500} Score: 0.78
Params:  {'class_weight': {False: 1, True: 9}, 'max_features': 'log2', 'n_estimators': 1000} Score: 0.78
Params:  {'class_weight': {False: 1, True: 9}, 'max_features': None, 'n_estimators': 100} Score: 0.77
Params:  {'class_weight': {False: 1, True: 9}, 'max_features': None, 'n_estimators': 500} Score: 0.77
Params:  {'class_weight': {False: 1, True: 9}, 'max_features': None, 'n_estimators': 1000} Score: 0.78
Params:  {'class_weight': {False: 2, True: 8}, 'max_features': 'sqr



### Using for loop

In [7]:
accuracy = 0.0
for max_features in ('sqrt','log2', None):
    for n_estimators in [100, 500, 1000]:
        for class_w_f in range(1,9):
            class_weight = {False:class_w_f,True:10 - class_w_f} #cutoff = [True]/([True]+[False])
            MyRFClassifier = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, class_weight=class_weight, warm_start=False)
            MyRFClassifier.fit(X_train,y_train)
            new_accuracy = f1_score(y_test, MyRFClassifier.predict(X_test), average='weighted')
            if (new_accuracy > accuracy):
                accuracy = new_accuracy
                cm = confusion_matrix(y_test,MyRFClassifier.predict(X_test))
                sensitivity = cm[0,0]/(cm[0,0]+cm[0,1])
                bestRFC = {"Num. of estimators: ": n_estimators,"Max. Features: ": max_features, "Class weight: ": class_weight}
                print ("New max accuracy: %.2f"%accuracy, "Sensitivity: %.2f"%sensitivity)
print ("Best setting: ", bestRFC, "\nAccuracy: %.2f"%accuracy, "Sensitivity: %.2f"%sensitivity)

New max accuracy:  0.8200900090009 Sensitivity:  0.8617021276595744
New max accuracy:  0.8276457518385228 Sensitivity:  0.8404255319148937
New max accuracy:  0.8301488430268918 Sensitivity:  0.8563829787234043
Best setting:  {'Max. Features: ': 'sqrt', 'Class weight: ': {False: 1, True: 9}, 'Num. of estimators: ': 500} 
Accuracy: 0.83 Sensitivity: 0.86


### Use SGDClassifier for training SVM

In [10]:
db_norm = pandas.read_csv('winequality-red.csv', header=0, delimiter=';')
db_norm = db_norm.drop(['Unnamed: 0'], axis=1)
#Normalize Data
for columns in db:
    if columns != "quality":
        db_norm[columns] = (db_norm[columns] - db_norm[columns].mean())/db_norm[columns].std()
y = db_norm['quality']
x = db_norm.drop(['quality'],axis=1)
dic = {1:False,2:False,3:False,4:False,5:False,6:True,7:True,8:True,9:True}
y = y.map(dic)
X_train_norm, X_test_norm, y_train_norm, y_test_norm = train_test_split(x, y, random_state=1, train_size=0.75)



In [16]:
parameters = {'alpha':[1.0/((2.0**(-3))*1199),1.0/((2.0**(-2))*1199), 1.0/((2.0**(-1))*1199),1.0/((2.0**0)*1199), 1.0/((2.0**1)*1199), 1.0/((2.0**2)*1199), 1.0/((2.0**3)*1199)], 'penalty':('none', 'l1', 'l2', 'elasticnet'), 'class_weight':({False:1,True:9}, {False:2,True:8}, {False:3,True:7}, {False:4,True:6}, {False:1,True:1}, {False:6,True:4}, {False:7,True:3}, {False:8,True:2}, {False:9,True:1})}
SVC = SGDClassifier(loss='hinge', tol=0.0001)
SVC_GridSearch = GridSearchCV(SVC, parameters, scoring='f1_macro')
SVC_GridSearch.fit(X_train_norm, y_train_norm)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


GridSearchCV(cv=None, error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ('none', 'l1', 'l2', 'elasticnet'), 'class_weight': ({False: 1, True: 9}, {False: 2, True: 8}, {False: 3, True: 7}, {False: 4, True: 6}, {False: 1, True: 1}, {False: 6, True: 4}, {False: 7, True: 3}, {False: 8, True: 2}, {False: 9, True: 1}), 'alpha': [0.006672226855713094, 0.003336113427856547, 0.0016680567139282735, 0.0008340283569641367, 0.0004170141784820684, 0.0002085070892410342, 0.0001042535446205171]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_macro', verbose=0)

In [22]:
for dicts in SVC_GridSearch.grid_scores_:
    None
    #print ("Params: " , dicts[0], "Score: %.2f"%dicts[1])
print ( "\n Best f1_score: %.2f"%f1_score(y_test_norm,SVC_GridSearch.best_estimator_.predict(X_test_norm)), "\n Best estimator: ", SVC_GridSearch.best_estimator_)


 Best f1_score: 0.75 
 Best estimator:  SGDClassifier(alpha=0.006672226855713094, average=False,
       class_weight={False: 1, True: 1}, epsilon=0.1, eta0=0.0,
       fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
       loss='hinge', max_iter=None, n_iter=None, n_jobs=1, penalty='none',
       power_t=0.5, random_state=None, shuffle=True, tol=0.0001, verbose=0,
       warm_start=False)




In [11]:
accuracy = 0.0
for alpha in range(-4,4):
    for penalty in ('none', 'l1', 'l2', 'elasticnet'):
        for class_w_t in range(1,9):
            for n_it in range(1,5):
                for lr in ('constant', 'optimal'):
                        alpha_ = 1.0/((2.0**alpha)*1199) #Default at 0.0001
                        n_iter = n_it*1000
                        class_weight = {False:10-class_w_t,True:class_w_t}
                        MySVMClassifier = SGDClassifier(loss='hinge', class_weight=class_weight, warm_start=False, max_iter=n_iter, alpha=alpha_, penalty = penalty, learning_rate=lr, eta0=0.000001)
                        MySVMClassifier.fit(X_train_norm,y_train_norm)
                        cm = confusion_matrix(y_test,MySVMClassifier.predict(X_test))
                        sensitivity = cm[0,0]/(cm[0,0]+cm[0,1])
                        new_accuracy = f1_score(y_test_norm, MySVMClassifier.predict(X_test_norm), average='weighted')
                        if (new_accuracy > accuracy):
                            accuracy = new_accuracy
                            bestSVM = {"Alpha:": alpha_,"max_iter:": n_iter, "Penalty:": penalty,"Learning rate:":lr, "Class weight:": class_weight}
                            print ("New max accuracy: %.2f"%accuracy, "Sensitivity: %.2f"%sensitivity)
print ("Best setting: ", bestSVM, "Accuracy: %.2f"%accuracy)

#MyClassifier.fit(X_train,y_train)
#f1_score(y_test, MyClassifier.predict(X_test))

  'precision', 'predicted', average, warn_for)


New max accuracy: 0.30 Sensitivity: 0.90
New max accuracy: 0.62 Sensitivity: 0.68
New max accuracy: 0.63 Sensitivity: 0.81
New max accuracy: 0.69 Sensitivity: 0.63
New max accuracy: 0.70 Sensitivity: 0.58
New max accuracy: 0.70 Sensitivity: 0.65
New max accuracy: 0.71 Sensitivity: 0.64
New max accuracy: 0.73 Sensitivity: 0.62
New max accuracy: 0.73 Sensitivity: 0.59
New max accuracy: 0.74 Sensitivity: 0.70
New max accuracy: 0.75 Sensitivity: 0.77
New max accuracy: 0.75 Sensitivity: 0.71
New max accuracy: 0.76 Sensitivity: 0.76
New max accuracy: 0.76 Sensitivity: 0.76
New max accuracy: 0.76 Sensitivity: 0.77
Best setting:  {'Learning rate:': 'optimal', 'max_iter:': 2000, 'Class weight:': {False: 4, True: 6}, 'Alpha:': 0.0004170141784820684, 'Penalty:': 'none'} Accuracy: 0.76


### Template on how to use the aquired dictionary

In [None]:
y_train
MyClassifier = SGDClassifier(alpha=bestSVM["Alpha:"], max_iter=bestSVM["max_iter:"], power_t=bestSVM["Power_t:"], class_weight=bestSVM["Class weight:"])
MyClassifier.fit(X_train, y_train)
MyClassifier.predict(X_test)

In [None]:
bestSVM

In [None]:
bestRFC

In [None]:
from sklearn.svm import SVC
accuracy = 0.0
for C in range(-5,5):
    for cache in range(1,5):
        for gamma in range(1,4):
            for tol in range(1,5):
                if gamma == 1:
                    gamma_p = 'auto'
                else:
                    gamma_p = 0.1*gamma
                C_val = 2.0**C
                cache_size = cache*1000
                tol_v = tol*0.001
                MySVMClassifier = SVC(C=C_val, cache_size=cache_size,gamma = gamma_p, tol=tol_v,  max_iter = -1 )
                MySVMClassifier.fit(X_train,y_train)
                cm = confusion_matrix(y_test,MySVMClassifier.predict(X_test))
                sensitivity = cm[0,0]/(cm[0,0]+cm[0,1])
                new_accuracy = f1_score(y_test, MySVMClassifier.predict(X_test), average='weighted')
                if accuracy < new_accuracy:
                    accuracy = new_accuracy
                    print ("accuracy: ", accuracy, "Sensitivity: ", sensitivity)
#print ("Best setting: ", bestSVM, "Accuracy: ", accuracy)

In [None]:
len(X_train)