# Ensemble Learning

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score

In [2]:
# Load the red-wine quality table from the working directory.
wineData = pd.read_csv('winequality-red.csv')

# Binary target kept on the frame: True where the quality score is 7 or more.
wineData['category'] = wineData['quality'] >= 7

# Feature matrix: the first 11 columns, picked by position.
X = wineData.iloc[:, 0:11].to_numpy()
# Labels: the boolean flag cast to int32 (False -> 0, True -> 1).
y = wineData['category'].to_numpy().astype(np.int32)

In [3]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

# Report the shape of each piece of the split, in the same order as before.
for label, part in (
    ('X train size: ', X_train),
    ('y train size: ', y_train),
    ('X test size: ', X_test),
    ('y test size: ', y_test),
):
    print(label, part.shape)

X train size:  (1119, 11)
y train size:  (1119,)
X test size:  (480, 11)
y test size:  (480,)


In [4]:
# Learn the scaling statistics from the training features only, then apply
# them to that same training matrix.
scaler = StandardScaler().fit(X_train)
X_train_stan = scaler.transform(X_train)

In [5]:
# Scale the held-out features with the existing `scaler` (transform only,
# no refit on the test data).
X_test_stan = scaler.transform(X_test)

# Single logistic-regression baseline fitted on the standardised training set.
logReg = LogisticRegression(solver='lbfgs', random_state=0).fit(X_train_stan, y_train)

# Score the baseline on the held-out set.
y_pred = logReg.predict(X_test_stan)
print('precision on the test set: ', precision_score(y_test, y_pred))
print('accuracy on the test set: ', accuracy_score(y_test, y_pred))

precision on the test set:  0.5263157894736842
accuracy on the test set:  0.8645833333333334


In [6]:
# Bagging ensemble of 500 logistic-regression models. oob_score=True asks the
# ensemble to compute an out-of-bag score during fit; random_state fixes the
# resampling.
bagClf = BaggingClassifier(
    LogisticRegression(solver='lbfgs', random_state=0),
    n_estimators=500,
    oob_score=True,
    random_state=90,
)

In [7]:
# Fit the ensemble on the standardised training data (fit returns the
# estimator itself) and print its out-of-bag score.
print(bagClf.fit(X_train_stan, y_train).oob_score_)

0.8784629133154602


In [8]:
# Column 1 of predict_proba, i.e. the second class listed in bagClf.classes_
# (presumably the positive class 1 -- confirm against bagClf.classes_).
phat = bagClf.predict_proba(X_test_stan)[:, 1]
# Hard class predictions from the bagged ensemble on the held-out set.
y_pred = bagClf.predict(X_test_stan)

# Same two metrics as reported for the single-model baseline.
print('precision on the test set: ', precision_score(y_test, y_pred))
print('accuracy on the test set: ', accuracy_score(y_test, y_pred))

precision on the test set:  0.5405405405405406
accuracy on the test set:  0.8666666666666667
