# Out of Bag Classification

## Estimating how a bagged ensemble performs on unseen data without cross-validation

### Import the necessary packages

In [1]:
import pyodbc 
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


### Fetch the data from SQL Server that will be used for training

In [2]:
# Open an ODBC connection to the source database. Authentication relies on
# Trusted_Connection, so no credentials are stored in the notebook.
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=LAPTOP-HVRMUNPF;'
                      'PORT=1433;'
                      'Database=RENTERS_STP;'
                      'Trusted_Connection=yes;'
                      )

# Pull every column of the classification table; the label column is
# separated from the features in a later cell.
query = 'SELECT  * from [RENTERS_STP].[dbo].[DR_DetailedRequest_classification]'

# Load the result set into a DataFrame and always release the connection,
# even when the query fails. (pyodbc's `with conn:` only commits on exit and
# does not close the connection, hence the explicit try/finally.)
try:
    df = pd.read_sql(query, conn)
finally:
    conn.close()

### Get the features and labels from the dataset, then split it into training and test sets

In [3]:
# Seed constant kept for the notebook's estimators.
SEED = 1

# Target: the boolean prediction column.
y = df['PREDICTION_VALUE_Y_BOOL']

# Features: every remaining column, one-hot encoded, with missing values
# replaced by 0.
X = (
    df.drop(columns=['PREDICTION_VALUE_Y_BOOL'])
      .pipe(pd.get_dummies)
      .fillna(0)
)

# Hold out 30% of the rows as a test set, keeping the class proportions of
# `y` the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

### Instantiate the classifiers

In [4]:
# Base learner: a decision tree whose leaves must each hold at least 8 samples.
dt = DecisionTreeClassifier(min_samples_leaf=8, random_state=1)

# Bagging ensemble of 50 trees with out-of-bag scoring enabled, so that
# `oob_score_` is available after fitting.
# The base estimator is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, whereas the first positional slot works on every version.
bc = BaggingClassifier(dt, n_estimators=50, oob_score=True, random_state=1)


### Fit the model and predict on the test set


In [5]:

# Train the bagging ensemble on the training partition, then label the
# held-out test rows. `fit` returns the fitted estimator itself, so the two
# steps can be chained; `bc` is left fitted for the evaluation cell.
y_pred = bc.fit(X_train, y_train).predict(X_test)


### Model Evaluation

In [6]:
# Out-of-bag accuracy recorded by the ensemble during fitting
# (available because the classifier was created with oob_score=True).
acc_oob = bc.oob_score_

# Accuracy of the predictions on the held-out test set.
acc_test = accuracy_score(y_true=y_test, y_pred=y_pred)

# Show both figures side by side for comparison.
print('Test set accuracy: {:.6f}, OOB accuracy: {:.6f}'.format(acc_test, acc_oob))

Test set accuracy: 0.990323, OOB accuracy: 0.990715
