# Bagging and Hyperparameter tuning  

<img src="./images/bags.jpg" alt="The bagging operation" width="400" align='center'/>

## The Spam vs. not-Spam Dataset
<img src="./images/spam_or_not_spam.png" alt="Spam vs. not-Spam" width="300" align='center'/>

### About the (Spam vs. not-Spam) dataset:
* The dataset tags email messages as spam or not-spam.
* Classes (=categories): there are two possible classes: not_spam, spam
* Attributes (=features): there are 57 features, including two types of features:
  * word frequencies - the feature name contains a word, with the suffix '_wordFreq' e.g.: 'make_wordFreq'  
    * The word could appear 0 times in a specific email, once or a few times.
    * The values of this feature could be between 0 and 1 (the frequency is relative).
  * Capital Letter pattern attributes - 3 features regarding capital letter patterns. e.g.: 'capitalLet_long' - the length of the longest sequence of capital letters in the email.

In [1]:
# IMPORT (PACKAGES) CELL
# --------------------------------------------------------

import sys
import os
import pathlib 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
get_ipython().run_line_magic('matplotlib', 'inline')

### load the 'spam vs not-spam' dataset
<img src="./images/load_dataframe.jpg" alt="load dataframe" width="100" align='center'/>

In [2]:
def loadDataset(fileName):
    df_spam=pd.read_csv(fileName)
    return (df_spam)

In [3]:
# -----------------------------------------------------------------
folderName = '.' + os.sep + 'data' 
datasetCsvFileName = folderName + os.sep + 'spam_notSpam.csv'
# --------------------- 
print ("Testing your implementation of the 'loadDataset' method ...")
dataset_forTesting = loadDataset(datasetCsvFileName)
assert dataset_forTesting is not None, 'spam_notSpam object not initialized'
assert isinstance(dataset_forTesting, pd.DataFrame), 'spam_notSpam object is not a dataframe'
assert dataset_forTesting.empty == False, 'spam_notSpam dataframe object is empty'
assert len(dataset_forTesting.columns)==58, 'spam_notSpam dataframe object is missing columns'
assert len(dataset_forTesting.index)==4601, 'spam_notSpam dataframe object is missing rows'
print ("----> The 'spam vs not-spam' dataframe object was loaded successfuly :-) \n")
dataset_forTesting=None
print ('The beginning (the head) of the dataframe:')
loadDataset(datasetCsvFileName).head()
# --------------------------------------------------------

Testing your implementation of the 'loadDataset' method ...
----> The 'spam vs not-spam' dataframe object was loaded successfuly :-) 

The beginning (the head) of the dataframe:


Unnamed: 0,make_wordFreq,address_wordFreq,all_wordFreq,3d_wordFreq,our_wordFreq,over_wordFreq,remove_wordFreq,internet_wordFreq,order_wordFreq,mail_wordFreq,...,semicol_wordFreq,paren_wordFreq,bracket_wordFreq,bang_wordFreq,dollar_wordFreq,pound_wordFreq,capitalLet_avg,capitalLet_long,capitalLet_total,class
0,0.0,0.0,0.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.178,0.0,0.044,0.0,0.0,1.666,10,180,not_spam
1,0.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.125,0.0,0.0,0.0,0.0,1.51,10,74,not_spam
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.718,11,55,not_spam
3,0.33,0.44,0.37,0.0,0.14,0.11,0.0,0.07,0.97,1.16,...,0.006,0.159,0.0,0.069,0.221,0.11,3.426,72,819,spam
4,0.0,2.08,0.0,0.0,3.12,0.0,1.04,0.0,0.0,0.0,...,0.0,0.0,0.0,0.263,0.0,0.0,1.428,4,20,spam


In [4]:
# return values (comma separated):
# - X_featureVectores - a dataframe containing all feature vectors. 
#                       It should contain the input dataframe after removing the class column.
#                       The index of the X_featureVectores should be the same as the input dataframe parameter.
# - y_Categories      - a series of containing all class values per instance.
#                       The index of the y_Categories series should be the same as the input dataframe.
# --------------------- 
def separateTo_X_and_y(dataset, classColName):
    xInstances=dataset.drop(columns=classColName)
    yCategories=dataset.loc[:,classColName]
    return xInstances,yCategories

In [5]:
# --------------------------------------------------------
folderName = '.' + os.sep + 'data' 
datasetCsvFileName = folderName + os.sep + 'spam_notSpam.csv'
# --------------------------------------------------------

# --------------------------------------------------------
print ("Testing your implementation of the 'separateTo_X_and_y' method ...")
# --------------------- 
dataset_forTesting = loadDataset(datasetCsvFileName)
X_vectors, y_categories = separateTo_X_and_y(dataset_forTesting, 'class')
# ---------------------
sameIndexes = lambda obj1,obj2: False not in (obj1.index.tolist()[n] == obj2.index.tolist()[n] for n in range(len(obj1.index))) 
assert X_vectors is not None, 'X_vectors object not initialized'
assert isinstance(X_vectors, pd.DataFrame), 'X_vectors object is not a dataframe'
assert X_vectors.empty == False, 'X_vectors dataframe object is empty'
assert sameIndexes(X_vectors,dataset_forTesting), 'X_vectors should share the same indexes as the dataset'
assert y_categories is not None, 'y_categories object not initialized'
assert isinstance(y_categories, pd.Series), 'y_categories object is not a series'
assert y_categories.empty == False, 'y_categories dataframe object is empty'
assert sameIndexes(X_vectors,y_categories), 'X_vectors should share the same indexes as y_categories'
print ("----> The 'separateTo_X_and_y' test passed successfully :-) \n")
dataset_forTesting = None
X_vectors = None
y_categories = None
# --------------------------------------------------------

Testing your implementation of the 'separateTo_X_and_y' method ...
----> The 'separateTo_X_and_y' test passed successfully :-) 



In [6]:
# input parameters:
# - y_Categories -  a dataframe structure, containing the dataset.
# - positiveClassName - the name of the category (string) of the positive class
#                       * if the value of a cell does not equal positiveClassName, 
#                         it should be considered as negative.
# ------------
# return value: 
# - y_NumCategories - a series of containing all class values with numeric values per instance.
#                       * each cell value, which equals the negativeClassName will recieve a value of 0 in the
#                         output series.
#                       * each cell value, which equals the positiveClassName will recieve a value of 1 in the
#                         output series.
#                       * the index should be the same of the input series.
# --------------------- 
def datasetCategoriesToNums(y_Categories, positiveClassName):
    y_NumCategories=pd.Series(index=y_Categories.index)
    for index in y_Categories.index:
        if y_Categories.iloc[index]==positiveClassName:
            y_NumCategories[index]=1
        else:
            y_NumCategories[index]=0
    return y_NumCategories

In [7]:
# Test of:
### --- Graded tests for the 'datasetCategoriesToNums' method 
# --------------------------------------------------------

# --------------------------------------------------------
folderName = '.' + os.sep + 'data' 
datasetCsvFileName = folderName + os.sep + 'spam_notSpam.csv'
# --------------------------------------------------------

# --------------------------------------------------------
print ("Testing your implementation of the 'datasetCategoriesToNums' method ...")
# --------------------- 
dataset_forTesting = loadDataset(datasetCsvFileName)
X_vectors, y_categoriesB4 = separateTo_X_and_y(dataset_forTesting, 'class')
y_categories = datasetCategoriesToNums(y_categoriesB4, 'spam')
# --------------------- 
allValidVals = lambda arrValues,validVals: False not in (val in validVals for val in arrValues) 
sameIndexes = lambda obj1,obj2: False not in (obj1.index.tolist()[n] == obj2.index.tolist()[n] for n in range(len(obj1.index))) 
assert y_categories is not None, 'y_categories object not initialized'
assert isinstance(y_categories, pd.Series), 'y_categories object is not a series'
assert y_categories.empty == False, 'y_categories dataframe object is empty'
assert sameIndexes(y_categories,y_categoriesB4), 'y_categories should share the same indexes as y_categoriesB4'
assert allValidVals(np.unique(y_categories.values),(0,1)), 'classes should be only 0 or 1 (as an integer number)'
print ("----> The 'datasetCategoriesToNums' test passed successfully :-) \n")
dataset_forTesting = None
X_vectors = None
y_categories = None
# --------------------------------------------------------

Testing your implementation of the 'datasetCategoriesToNums' method ...
----> The 'datasetCategoriesToNums' test passed successfully :-) 



### split the dataset to a train-set and a test-set
<img src="./images/train-test-split.png" alt="train-test split" width="100" align='center'/>

In [8]:
# --------------------------------------------------------
# method name1: trainTestSplit
# --------------------------------------------------------

# --------------------------------------------------------
# What does the method do?
# --- the method split the input dataset into train and test. 
#     - It does so using the sklearn built in method
# ------------
# input parameters:
# - X_vectors - a dataframe containing all feature vectors. 
# - y_categories - a series of containing all class values per instance.
# - test_size_ratio - a number (0<number<1) of the wanted ratio of the dataset out of the dataset 
# - rand_state - a number, in order to guarantee reproducible results 
# ------------
# return values (comma separated):
# - X_train -  a dataframe containing all feature vectors of the train set
# - X_test -  a dataframe containing all feature vectors of the test set
# - y_train - a series of containing all class values per train instance
# - y_test - a series of containing all class values per test instance
# --------------------- 
def trainTestSplit(X_vectors, y_categories, test_size_ratio, rand_state):
    return train_test_split(X_vectors, y_categories, test_size=test_size_ratio, random_state=rand_state)


# --------------------------------------------------------
# method name: reAttachTrainSet
# --------------------------------------------------------

# --------------------------------------------------------
# What does the method do?
# --- the method split the input dataset into train and test. 
#     - It does so using the sklearn built in method
# ------------
# input parameters:
# - X_train -  a dataframe containing all feature vectors of the train set
# - y_train - a series of containing all class values per train instance
# ------------
# return value:
# - train_set - a reattached dataframe, consisting both feature vectors and categories.
# --------------------- 
def reAttachTrainSet(X_train, y_train):
    return pd.concat((X_train, y_train), axis=1)

# --------------------------------------------------------
# Temporary imports:
import sklearn
from sklearn.model_selection import train_test_split
# --------
folderName = '.' + os.sep + 'data' 
datasetCsvFileName = folderName + os.sep + 'spam_notSpam.csv'
# --------------------- 
dataset_forTesting = loadDataset(datasetCsvFileName)
X_vectors, y_categories = separateTo_X_and_y(dataset_forTesting, 'class')
y_categories = datasetCategoriesToNums(y_categories, 'spam')
X_train, X_test, y_train, y_test = trainTestSplit(X_vectors, y_categories, 0.2, 25)
# --------------------------------------------------------

# --------------------------------------------------------
if 'datasets' in sys.modules:
    del (datasets)
if 'train_test_split' in sys.modules:
    del (train_test_split)
sys_modules = list(sys.modules.keys())
for mdl in sys_modules:
    if mdl.startswith('sklearn.'):
        del(sys.modules[mdl]) 
del (sklearn)
if 'sklearn' in sys.modules:
    del (sys.modules['sklearn'])
# --------------------------------------------------------

# --------------------------------------------------------
# display train-test split information
# --------------------------------------------------------
print ('Information after train-test split:')
print('The train-set includes %d instances and %d corresponding categories\n' %(X_train.shape[0],y_train.shape[0]))
print('The test-set includes %d instances and %d corresponding categories\n' %(X_test.shape[0],y_test.shape[0]))

# --------------------------------------------------------
## concatinate the X_train and y_train:
# --------------------------------------------------------
train_set = reAttachTrainSet(X_train, y_train)

# --------------------------------------------------------
dataset_forTesting = None
X_vectors = None
y_categories = None
X_train = None
X_test = None
y_train = None
y_test = None
# --------------------------------------------------------

# --------------------------------------------------------
# Display the first few rows of the training-set:
# --------------------------------------------------------
print('First few rows of unified train-set:')
train_set.head()

Information after train-test split:
The train-set includes 3680 instances and 3680 corresponding categories

The test-set includes 921 instances and 921 corresponding categories

First few rows of unified train-set:


Unnamed: 0,make_wordFreq,address_wordFreq,all_wordFreq,3d_wordFreq,our_wordFreq,over_wordFreq,remove_wordFreq,internet_wordFreq,order_wordFreq,mail_wordFreq,...,semicol_wordFreq,paren_wordFreq,bracket_wordFreq,bang_wordFreq,dollar_wordFreq,pound_wordFreq,capitalLet_avg,capitalLet_long,capitalLet_total,0
2776,0.0,0.0,1.23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.139,0.0,0.279,0.0,0.0,1.736,10,66,0.0
908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.336,0.0,0.0,0.0,0.0,2.352,15,40,0.0
4540,0.44,0.0,0.0,0.0,0.89,0.0,0.0,0.0,0.0,0.44,...,0.0,0.0,0.0,0.944,0.145,0.072,2.451,28,152,1.0
788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.186,0.0,0.0,0.0,0.0,2.823,38,240,1.0
2186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.284,0.0,0.0,1.357,5,19,1.0


### Building a Bagging Classifier
<img src="./images/bagging-classifier.jpeg" alt="bagging-classifier" width="400" align='center'/>

** The following cells perform 2 methods:
* step 1 - instance bootstrap
* step 2 - feature bootstrap

In [9]:
# --------------------------------------------------------
# method name: instanceBootStrap
# --------------------------------------------------------

# --------------------------------------------------------
# What does the method do?
# --- the method samples the dataset uniformly with replacements. 
#     * this means that there is a chance we could get the same instance more than once
# ------------
# input parameters:
# - X_train - a dataframe containing all training feature vectors. 
# - y_train - a series of containing all training class values per instance.
# - sampleRatio - the ratio of the sampeling out of training set 
#               * we will derive the number of instances to sample from sampleRatio
# ------------
# return values (comma separated):
# - X_trainSampled - a dataframe containing sampled training feature vectors. 
# - y_trainSampled - a series of containing Sampled training class values per instance.
# Note: X_trainSampled should have the same index as y_trainSampled 
# --------------------- 
def instanceBootStrap(X_train, y_train, sampleRatio):
    helpLi=list(X_train.index.values) 
    howMany=int(len(y_train)*sampleRatio)
    helpLi2=[]
    X_trainSampled=pd.DataFrame()
    y_trainSampled=pd.Series()
    for x in range(howMany):
        value=random.randrange(len(helpLi))
        helpLi2.append(helpLi[value])
    
    X_trainSampled=X_train.loc[helpLi2]
    y_trainSampled=y_train.loc[helpLi2]
    
    return X_trainSampled,y_trainSampled  

In [10]:
# --------------------------------------------------------
# Test of:
### --- Graded tests for the 'instanceBootStrap' method 
# Important Note: additional test might also be taken from our side.

# --------------------------------------------------------
# Temporary imports:
import sklearn
from sklearn.model_selection import train_test_split
# --------
folderName = '.' + os.sep + 'data' 
datasetCsvFileName = folderName + os.sep + 'spam_notSpam.csv'
# --------------------------------------------------------
print ("Testing your implementation of the 'instanceBootStrap' method ...")
dataset_forTesting = loadDataset(datasetCsvFileName)
X_vectors, y_categories = separateTo_X_and_y(dataset_forTesting, 'class')
y_categories = datasetCategoriesToNums(y_categories, 'spam')
X_train, X_test, y_train, y_test = trainTestSplit(X_vectors, y_categories, 0.2, 35)
# --------------------------------------------------------

# --------------------------------------------------------
if 'datasets' in sys.modules:
    del (datasets)
if 'train_test_split' in sys.modules:
    del (train_test_split)
sys_modules = list(sys.modules.keys())
for mdl in sys_modules:
    if mdl.startswith('sklearn.'):
        del(sys.modules[mdl]) 
del (sklearn)
if 'sklearn' in sys.modules:
    del (sys.modules['sklearn'])
# --------------------------------------------------------

# --------------------------------------------------------
allValidVals = lambda arrValues,validVals: False not in (val in validVals for val in arrValues) 
sameIndexes = lambda obj1,obj2: False not in (obj1.index.tolist()[n] == obj2.index.tolist()[n] for n in range(len(obj1.index))) 
X_train_100 = X_train.iloc[:100,:]
y_train_100 = y_train.iloc[:100]
sampleRatio = 0.5
X_trainSampled, y_trainSampled = instanceBootStrap(X_train_100, y_train_100, sampleRatio)
# --------------------------------------------------------

assert X_trainSampled is not None, 'X_trainSampled object not initialized'
assert isinstance(X_trainSampled, pd.DataFrame), 'X_trainSampled object is not a dataframe'
assert X_trainSampled.empty == False, 'X_trainSampled dataframe object is empty'
assert y_trainSampled is not None, 'y_trainSampled object not initialized'
assert isinstance(y_trainSampled, pd.Series), 'y_trainSampled object is not a series'
assert y_trainSampled.empty == False, 'y_trainSampled series object is empty'
assert sameIndexes(X_trainSampled,y_trainSampled), 'X_trainSampled should share the same indexes as y_trainSampled'
assert allValidVals(np.unique(y_categories.values),(0,1)), "y_trainSampled's value should be only 0 or 1 (as an integer number)"
print ("----> The 'instanceBootStrap' test passed successfully :-) \n")
dataset_forTesting = None
X_vectors = None
y_categories = None
X_train = None
X_test = None
y_train = None
y_test = None
X_train_100 = None
y_train_100 = None
sampleRatio = None
X_trainSampled = None
y_trainSampled = None

Testing your implementation of the 'instanceBootStrap' method ...
----> The 'instanceBootStrap' test passed successfully :-) 



In [11]:
# --------------------------------------------------------
# method name: featureBootStrap
# --------------------------------------------------------

# --------------------------------------------------------
# What does the method do?
# --- the method samples the featureset uniformly with replacements. 
#     * this means that there is a chance we could choose the same feature twice 
#     * note that in the output dataframe, any feature which was already chosen,
#            will be disregarded. In other words, such a case will result in less
#            output features. For instance if we have 50 input features, and the sample
#            ratio is 0.5, we should expect 25 sampled feature columns. But if one of the features
#            was selected twice, we expect only 24 feature columns.
# ------------
# input parameters:
# - X_instances - a dataframe containing all feature vectors from the dataset.
# - sampleRatio - the ratio of the sampeling out of feature set,
#               * we will derive the number of features from sampleRatio
# ------------
# return value:
# - X_instances_featureSampled - a dataframe containing feature vectors with sampled features. 
#                               * note the instances refer to the same instances as the input dataframe.
#                               * the difference is in the columns (the selected features). 
# --------------------- 
def featureBootStrap(X_instances, sampleRatio):
    size=int((X_instances.shape[1])*sampleRatio)
    helpLi=[]
    X_instances_featureSampled=pd.DataFrame()
    for x in range(size):
        chosenOne=X_instances.iloc[:,random.randrange(X_instances.shape[1])]
        helpLi.append(chosenOne.name)
        if helpLi[x] not in X_instances_featureSampled:
            X_instances_featureSampled[helpLi[x]]=X_instances.loc[:,helpLi[x]].values
    return X_instances_featureSampled
    

In [12]:
# --------------------------------------------------------
# Test of:
### --- Graded tests for the 'featureBootStrap' method 
# --------------------------------------------------------
folderName = '.' + os.sep + 'data' 
datasetCsvFileName = folderName + os.sep + 'spam_notSpam.csv'
# --------------------------------------------------------
print ("Testing your implementation of the 'featureBootStrap' method ...")
dataset_forTesting = loadDataset(datasetCsvFileName)
X_vectors, y_categories = separateTo_X_and_y(dataset_forTesting, 'class')
# --------------------------------------------------------

# --------------------------------------------------------
allValidVals = lambda arrValues,validVals: False not in (val in validVals for val in arrValues) 
sameIndexes = lambda obj1,obj2: False not in (obj1.index.tolist()[n] == obj2.index.tolist()[n] for n in range(len(obj1.index))) 

columns = X_vectors.columns
columns_50 = columns[:50] 
X_vectors_columns_50 = X_vectors[columns_50]
sampleRatio = 0.5
X_Sampled = featureBootStrap(X_vectors_columns_50, sampleRatio)
# --------------------------------------------------------

assert X_Sampled is not None, 'X_Sampled object not initialized'
assert isinstance(X_Sampled, pd.DataFrame), 'X_Sampled object is not a dataframe'
assert X_Sampled.empty == False, 'X_Sampled dataframe object is empty'
assert X_Sampled.shape[1] >10, 'X_Sampled dataframe object has a wrong number of sampled features'

print ("\n----> The 'featureBootStrap' test passed successfully :-) \n")
dataset_forTesting = None
X_vectors = None
y_categories = None
columns = None
columns_50 = None
X_vectors_columns_50 = None
sampleRatio = None
X_Sampled = None
# --------------------------------------------------------

# --------------------------------------------------------

Testing your implementation of the 'featureBootStrap' method ...

----> The 'featureBootStrap' test passed successfully :-) 



#### Building a model
The following cells perform the following:
* step 1 - decision stumps (and other classification utilities)
* step 2 - Bagging (fit)

### decision stumps (and other classification utilities)
<img src="./images/treeStumps.jpg" alt="bagging-classifier" width="400" align='center'/>

In [13]:
# --------------------------------------------------------
# class 1: DecisionStump
# --------------------------------------------------------

# --------------------------------------------------------
# What does the method do?
# --- wraps an api for decision stumps, so we have a unified api
# ------------
class DecisionStump():
    def __init__(self):
        self._decisionStump = DecisionTreeClassifier(max_depth=1)
        
    def fit(self,X_train,y_train):
        self._decisionStump.fit(X_train,y_train)

    def predict(self,X_test):
        return self._decisionStump.predict(X_test)


# --------------------------------------------------------
# class2: ClassiferInstGen
# --------------------------------------------------------

# --------------------------------------------------------
# What does the method do?
# --- utility classifier to generate objects of the classification algorithm class   
# ------------
class ClassiferInstGen():
    def __init__(self,classifierPyClass):
        self._classifierPyClass = classifierPyClass
        
    def getNewClassifierPyObj(self):
        return self._classifierPyClass()

# --------------------------------------------------------
# class3: BootstrapFeatureClassifer
# --------------------------------------------------------

# --------------------------------------------------------
# What does the method do?
# --- container classifier to save model and bootstrap features    
# ------------
class BootstrapFeatureClassifer():
    def __init__(self,trainModel,featureNames):
        self.model = trainModel
        self.featureNames = featureNames

        
# --------------------------------------------------------
# method name1: trainBootstrapedClassificationModel
# --------------------------------------------------------

# --------------------------------------------------------
# What does the method do?
# --- trains a classification model which matches the sklearn API of fit and predict.
#     - It does so using the sklearn built in method
# ------------
# input parameters:
# - X_sampled_train -  a dataframe containing all feature vectors of the train set
# - y_train - a series of containing all class values per train instance
# Note: the X_sampled_train, y_train parameters could be instance bootsraped, feature bootsraped, both or none
# - classificationAlgo_pyClass - the python 'class' parameter of a classification algorithm 
#                                For instance, the above 'DecisionStump' class.
#                                Note: passing the pyton class is similar to passing a method
#                                      as a parameter. Each call to: classificationAlgo_pyClass()
#                                      creates a new object (also called instance of the class)
#                                      of the type of the class.  
# ------------
# return value:
# - featuresTrainModelObj - an object including trained model and feature names 
# --------------------- 
def trainBootstrapedClassificationModel(X_sampled_train, y_train,classificationAlgo_pyClass):
    classiferInstGen = ClassiferInstGen(classificationAlgo_pyClass)
    classificationObj = classiferInstGen.getNewClassifierPyObj()
    classificationObj.fit(X_sampled_train, y_train)
    featuresTrainModelObj = BootstrapFeatureClassifer(classificationObj,X_sampled_train.columns)
    return featuresTrainModelObj

# --------------------------------------------------------
# method name2: classifierPredict
# --------------------------------------------------------

# --------------------------------------------------------
# What does the method do?
# --- predict test examples using a classification model (which corresponds to sklearn classifier APIs)
#     - It does so using the sklearn built in method
# ------------
# input parameters:
# - X_test -  a dataframe containing feature vectors of the test set
# - classification_obj - trained classificaion model (which corresponds to sklearn classifier APIs)
# ------------
# return value:
# - yHat - a series of the predictions for each test instance  
# --------------------- trainBootstrapedFeatureModel(X_sampled_train, y_train,classificationAlgo_pyClass)
def classifierPredict(X_test, featuresTrainModelObj):
    X_adapted = X_test[featuresTrainModelObj.featureNames]
    predictions = featuresTrainModelObj.model.predict(X_adapted)
    yHat = pd.Series(data=predictions,index=X_test.index)
    return yHat

# --------------------------------------------------------
# Temporary imports:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# --------
folderName = '.' + os.sep + 'data' 
datasetCsvFileName = folderName + os.sep + 'spam_notSpam.csv'
# --------------------- 
dataset_forTesting = loadDataset(datasetCsvFileName)
X_vectors, y_categories = separateTo_X_and_y(dataset_forTesting, 'class')
y_categories = datasetCategoriesToNums(y_categories, 'spam')
X_train, X_test, y_train, y_test = trainTestSplit(X_vectors, y_categories, 0.2, 11)
X_train_1st_k = X_train.iloc[:1000,:] 
y_train_1st_k = y_train.iloc[:1000]
X_train_2nd_k = X_train.iloc[1000:2000,:] 
y_train_2nd_k = y_train.iloc[1000:2000]
classificationModel1=trainBootstrapedClassificationModel(X_train_1st_k, y_train_1st_k,DecisionStump)
classificationModel2=trainBootstrapedClassificationModel(X_train_2nd_k, y_train_2nd_k,DecisionStump)
# --------------------------------------------------------


# --------------------------------------------------------
# show usages of functions
# --------------------------------------------------------
print ('A few prediction examples:')
nExamples=5

y_hat1 = classifierPredict(X_test.iloc[:nExamples,:], classificationModel1)
y_hat2 = classifierPredict(X_test.iloc[:nExamples,:], classificationModel2)
for nInd in range(nExamples):
    print ('test instance [%d]: actual: %r, prediction(classifier1): %r, prediction(classifier2): %r' %(nInd,y_test.iloc[nInd],y_hat1.iloc[nInd],y_hat2.iloc[nInd]))
# --------------------------------------------------------
if 'datasets' in sys.modules:
    del (datasets)
if 'train_test_split' in sys.modules:
    del (train_test_split)
if 'DecisionTreeClassifier' in sys.modules:
    del (DecisionTreeClassifier)
sys_modules = list(sys.modules.keys())
for mdl in sys_modules:
    if mdl.startswith('sklearn.'):
        del(sys.modules[mdl]) 
del (sklearn)
if 'sklearn' in sys.modules:
    del (sys.modules['sklearn'])
# --------------------------------------------------------

# --------------------------------------------------------
dataset_forTesting = None
X_vectors = None
y_categories = None
X_train = None
X_test = None
y_train = None
y_test = None
y_hat1 = None
y_hat2 = None
X_train_1st_k = None 
y_train_1st_k = None
X_train_2nd_k = None 
y_train_2nd_k = None
classificationModel1=None
classificationModel2=None

# --------------------------------------------------------

A few prediction examples:
test instance [0]: actual: 0.0, prediction(classifier1): 0.0, prediction(classifier2): 0.0
test instance [1]: actual: 1.0, prediction(classifier1): 0.0, prediction(classifier2): 0.0
test instance [2]: actual: 0.0, prediction(classifier1): 0.0, prediction(classifier2): 0.0
test instance [3]: actual: 0.0, prediction(classifier1): 1.0, prediction(classifier2): 0.0
test instance [4]: actual: 0.0, prediction(classifier1): 0.0, prediction(classifier2): 0.0


#### Bagging (fit)
<img src="./images/bags.jpg" alt="The bagging operation" width="200" align='center'/>

In [15]:
#--------------------------------------------------------
# method name: baggingFit
# --------------------------------------------------------

# --------------------------------------------------------
# The following is expected:
# --- the method needs to return the a list of models created after bootstraping instances & features.
# ------------
# input parameters:
# - X_train - a dataframe containing all feature vectors of the train set
# - y_train - a series of containing all class values per train instance
# - instanceSampleRatio - the ratio of the sampeling out of training set, 
#                        * we will pass it on as a parameter, in order to sample instances.
#                        - if instanceSampleRatio<=0, no instance bootstrap is done, and we leave
#                          the training instances with no change.
# - featureSampleRatio - the ratio of the sampeling out of feature set,
#                        * we will pass it on as a parameter, in order to sample features.
#                        - if featureSampleRatio<=0, no feature bootstrap is done, and we leave
#                          the features with no change.
# - classificationAlgo_pyClass - the python 'class' parameter of a classification algorithm 
#                                For instance, the above 'DecisionStump' class.
#                                Note: passing the pyton class is similar to passing a method
#                                      as a parameter.    
# - numModels - number of models to train in bagging
# Note: the X_train, y_train parameters could be instance bootsraped, feature bootsraped or both
# ------------
# return value:
# - a list of trained models 
#  notes:
#        * the number of models are detrminded by the input parameter 
# ---------------------
def baggingFit(X_train, y_train, instanceSampleRatio, featureSampleRatio, classificationAlgo_pyClass, numModels):
    res_li=[]
    for x in range(numModels):
        if instanceSampleRatio>0:
            X_train_bootstraped,y_train_corrolated_labels=instanceBootStrap(X_train, y_train, instanceSampleRatio)
            classificationModel=trainBootstrapedClassificationModel(X_train_bootstraped, y_train_corrolated_labels,classificationAlgo_pyClass)
            res_li.append(classificationModel)
        if featureSampleRatio>0:
            X_train_bootstraped,y_train_corrolated_labels=instanceBootStrap(X_train, y_train, featureSampleRatio)
            classificationModel=trainBootstrapedClassificationModel(X_train_bootstraped, y_train_corrolated_labels,classificationAlgo_pyClass)
            res_li.append(classificationModel)
    return (res_li)
    

In [16]:
# --------------------------------------------------------
# Test of:
### --- Graded tests for the 'baggingFit' method 
# Important Note: additional test might also be taken from our side.

# --------------------------------------------------------
# Temporary imports:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# --------
folderName = '.' + os.sep + 'data' 
datasetCsvFileName = folderName + os.sep + 'spam_notSpam.csv'
# --------------------- 
print ("Testing your implementation of the 'baggingFit' method ...")
dataset_forTesting = loadDataset(datasetCsvFileName)
X_vectors, y_categories = separateTo_X_and_y(dataset_forTesting, 'class')
y_categories = datasetCategoriesToNums(y_categories, 'spam')
X_train, X_test, y_train, y_test = trainTestSplit(X_vectors, y_categories, 0.2, 25)
# --------------------------------------------------------

# --------------------------------------------------------
allValidVals = lambda arrValues,validVals: False not in (val in validVals for val in arrValues) 
sameIndexes = lambda obj1,obj2: False not in (obj1.index.tolist()[n] == obj2.index.tolist()[n] for n in range(len(obj1.index))) 
innerElementTupple = lambda arrValues: False not in (isinstance(elem,tuple) for elem in arrValues)
# --------------------------------------------------------
instanceSampleRatio = 0.1
featureSampleRatio = 0.5
numModels = 3
classification_PyClass = DecisionStump
baggedModels =  baggingFit(X_train, y_train, instanceSampleRatio, featureSampleRatio, classification_PyClass, numModels)
# --------------------------------------------------------

print ("check basic 'baggingFit' output validation ...")
assert baggedModels is not None, 'baggedModels object not initialized'
assert isinstance(baggedModels, list), 'baggedModels object is not a list'
assert None not in baggedModels, 'baggedModels should not include None elements'
assert False not in [isinstance(elem,BootstrapFeatureClassifer) for elem in baggedModels], 'baggedModels should include only DecisionStump objects'

print ('trying to create inner bagged models prediction ...')
nExamples=20
y_hat_arr = [classifierPredict(X_test.iloc[:nExamples],model) for model in baggedModels]
assert False not in [allValidVals(y_hat.values,(0,1)) for y_hat in y_hat_arr], 'something went wrong with bagging inner models - invalid values found (valid values: 0 or 1)'

print ("----> The 'baggingFit' test passed successfully :-) \n")

print ('\n----------------------')
print ('A few prediction examples:')
for nInd in range(nExamples):
    predictions = '; '.join('[bag-model %d]: %r' %(nPred,y_hat_arr[nPred].iloc[nInd]) for nPred in range(len(y_hat_arr)))
    print ('test instance [%d]: actual: %r, predictions: %r' %(nInd,y_test.iloc[nInd],predictions))


# --------------------------------------------------------
if 'datasets' in sys.modules:
    del (datasets)
if 'train_test_split' in sys.modules:
    del (train_test_split)
if 'DecisionTreeClassifier' in sys.modules:
    del (DecisionTreeClassifier)
sys_modules = list(sys.modules.keys())
for mdl in sys_modules:
    if mdl.startswith('sklearn.'):
        del(sys.modules[mdl]) 
del (sklearn)
if 'sklearn' in sys.modules:
    del (sys.modules['sklearn'])
# --------------------------------------------------------

# --------------------------------------------------------
dataset_forTesting = None
X_vectors = None
y_categories = None
X_train = None
X_test = None
y_train = None
y_test = None
y_hat = None

instanceSampleRatio = None
featureSampleRatio = None
numModels = None
classification_PyClass = None
baggedModels = None

nExamples = None
y_hat_arr = None
# --------------------------------------------------------

# --------------------------------------------------------

Testing your implementation of the 'baggingFit' method ...
check basic 'baggingFit' output validation ...
trying to create inner bagged models prediction ...
----> The 'baggingFit' test passed successfully :-) 


----------------------
A few prediction examples:
test instance [0]: actual: 1.0, predictions: '[bag-model 0]: 0.0; [bag-model 1]: 1.0; [bag-model 2]: 0.0; [bag-model 3]: 0.0; [bag-model 4]: 1.0; [bag-model 5]: 1.0'
test instance [1]: actual: 1.0, predictions: '[bag-model 0]: 0.0; [bag-model 1]: 1.0; [bag-model 2]: 0.0; [bag-model 3]: 0.0; [bag-model 4]: 1.0; [bag-model 5]: 1.0'
test instance [2]: actual: 0.0, predictions: '[bag-model 0]: 0.0; [bag-model 1]: 0.0; [bag-model 2]: 0.0; [bag-model 3]: 0.0; [bag-model 4]: 0.0; [bag-model 5]: 0.0'
test instance [3]: actual: 1.0, predictions: '[bag-model 0]: 0.0; [bag-model 1]: 1.0; [bag-model 2]: 0.0; [bag-model 3]: 0.0; [bag-model 4]: 1.0; [bag-model 5]: 1.0'
test instance [4]: actual: 0.0, predictions: '[bag-model 0]: 0.0; [bag-mo

### Prediction
The following cells perform the following:
* step 1 - predict (voting)

In [17]:
# --------------------------------------------------------
# method name: baggingPredict
# --------------------------------------------------------

# --------------------------------------------------------
# The following is expected:
# --- the method needs to return the predicted value for each test instances, 
#     based on the bagged trained models 
# ------------
# input parameters:
# - X_test -  a dataframe containing feature vectors of the test set
# - baggingModels - the list of bagged models returned from 'baggingFit'
# ------------
# return value:
# - y_hat - a series of the prediction, with the same index as X_test 
#  notes:
#        * the number of models are detrminded by the input parameter 
# ---------------------
def baggingPredict(X_test, baggingModels):
    y_hat=pd.Series(index=X_test.index)
    y_help=pd.DataFrame(index=X_test.index)
    for baggedModel in baggingModels:
        y_test=classifierPredict(X_test, baggedModel)
        y_help[baggedModel]=y_test
    counter=0
    for index in y_help.index:
        for cat in y_help[baggingModels]:
            if y_help.loc[index,cat]==0:
                counter-=1
            else:
                counter+=1
        if counter>0:
            y_hat[index]=1
        else:
            y_hat[index]=0
        counter=0
    return (y_hat)

In [18]:
# --------------------------------------------------------
# Test of:
### --- Graded tests for the 'baggingPredict' method 
# --------------------------------------------------------
# Temporary imports:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# --------
folderName = '.' + os.sep + 'data' 
datasetCsvFileName = folderName + os.sep + 'spam_notSpam.csv'
# --------------------- 
dataset_forTesting = loadDataset(datasetCsvFileName)
X_vectors, y_categories = separateTo_X_and_y(dataset_forTesting, 'class')
y_categories = datasetCategoriesToNums(y_categories, 'spam')
X_train, X_test, y_train, y_test = trainTestSplit(X_vectors, y_categories, 0.2, 25)
# --------------------------------------------------------

# --------------------------------------------------------
allValidVals = lambda arrValues,validVals: False not in (val in validVals for val in arrValues) 
sameIndexes = lambda obj1,obj2: False not in (obj1.index.tolist()[n] == obj2.index.tolist()[n] for n in range(len(obj1.index))) 
innerElementTupple = lambda arrValues: False not in (isinstance(elem,tuple) for elem in arrValues)
# --------------------------------------------------------

# --------------------------------------------------------

print ("check basic 'baggingPredict' output validation (part 1) ...")
instanceSampleRatio = 0.1
featureSampleRatio = 0.5
numModels = 10
classification_PyClass = DecisionStump
baggedModels =  baggingFit(X_train, y_train, instanceSampleRatio, featureSampleRatio, classification_PyClass, numModels)
yHat = baggingPredict(X_test, baggedModels)
assert yHat is not None, 'yHat object not initialized'
assert isinstance(yHat, pd.Series), 'yHat object is not a pandas series'
assert None not in yHat.values, 'yHat should not include None elements'

print ("----> The 'baggingPredict' test passed successfully :-) \n")
# --------------------------------------------------------
print ("check basic 'baggingPredict' output validation (part 2) ...")
assert allValidVals(yHat.values,(0,1)), 'prediction err - something went wrong with bagging inner models - invalid values found (valid values: 0 or 1)'

print ("----> The 'baggingPredict' test passed successfully :-) \n")
print ('\n----------------------')
print ('A few prediction examples:')
nExamples = 20
for nInd in range(nExamples):
    print ('test instance [%d]: actual: %r, predictions: %r' %(nInd,y_test.iloc[nInd],yHat.iloc[nInd]))


# --------------------------------------------------------
if 'datasets' in sys.modules:
    del (datasets)
if 'train_test_split' in sys.modules:
    del (train_test_split)
if 'DecisionTreeClassifier' in sys.modules:
    del (DecisionTreeClassifier)
sys_modules = list(sys.modules.keys())
for mdl in sys_modules:
    if mdl.startswith('sklearn.'):
        del(sys.modules[mdl]) 
del (sklearn)
if 'sklearn' in sys.modules:
    del (sys.modules['sklearn'])
# --------------------------------------------------------

# --------------------------------------------------------
dataset_forTesting = None
X_vectors = None
y_categories = None
X_train = None
X_test = None
y_train = None
y_test = None
y_hat = None
yHat = None

instanceSampleRatio = None
featureSampleRatio = None
numModels = None
classification_PyClass = None
baggedModels = None

nExamples = None
y_hat_arr = None
# --------------------------------------------------------


check basic 'baggingPredict' output validation (part 1) ...
----> The 'baggingPredict' test passed successfully :-) 

check basic 'baggingPredict' output validation (part 2) ...
----> The 'baggingPredict' test passed successfully :-) 


----------------------
A few prediction examples:
test instance [0]: actual: 1.0, predictions: 1.0
test instance [1]: actual: 1.0, predictions: 1.0
test instance [2]: actual: 0.0, predictions: 0.0
test instance [3]: actual: 1.0, predictions: 1.0
test instance [4]: actual: 0.0, predictions: 0.0
test instance [5]: actual: 0.0, predictions: 0.0
test instance [6]: actual: 1.0, predictions: 1.0
test instance [7]: actual: 0.0, predictions: 1.0
test instance [8]: actual: 0.0, predictions: 0.0
test instance [9]: actual: 0.0, predictions: 1.0
test instance [10]: actual: 0.0, predictions: 0.0
test instance [11]: actual: 0.0, predictions: 0.0
test instance [12]: actual: 1.0, predictions: 1.0
test instance [13]: actual: 1.0, predictions: 1.0
test instance [14]: act

### Evaluation
The following cells perform the following:
* step 1 - evaluate confusion matrix
* step 2 - evaluate precision
* step 3 - evaluate recall

#### evaluate confusion matrix
<img src="./images/confusion_matrix.jpg" alt="confusion_matrix" width="300" align='center'/>

In [19]:
# --------------------------------------------------------
# methos: getConfusionMatrix
# --------------------------------------------------------

# --------------------------------------------------------
# What does the method do?
# --- compute the 4 values of the confusion matrix: TN, FP, FN, TP
#     - It does so using the sklearn built in method
# ------------
# input parameters:
# - y_hat -  a series of containing all predicted class values per test instance
# - y_test - a series of containing all class values per test instance
# ------------
# return values (comma seperated):
# - TN - True negatives - number of instances, for which the actual value (from y_test) 
#                         is 0 (negative class, not_spam in our dataset) and 
#                         the predicted value (from y_hat) is also 0.
# - FP - True negatives - number of instances, for which the actual value (from y_test) is 0, but 
#                         the predicted value (from y_hat) is 1 (positive class, spam in our dataset).
# - FN - False negatives - number of instances, for which the actual value (from y_test) is 1, but                          
#                         the predicted value (from y_hat) is 0.
# - TP - False negatives - number of instances, for which the actual value (from y_test) is 1 and                          
#                         the predicted value (from y_hat) is also 1.
# --------------------- 
def getConfusionMatrix(y_hat,y_test):
    TN = len([1 for ind in range(len(y_test)) if y_test.iloc[ind]==0 and y_hat.iloc[ind]==0])
    FP = len([1 for ind in range(len(y_test)) if y_test.iloc[ind]==0 and y_hat.iloc[ind]==1])
    FN = len([1 for ind in range(len(y_test)) if y_test.iloc[ind]==1 and y_hat.iloc[ind]==0])
    TP = len([1 for ind in range(len(y_test)) if y_test.iloc[ind]==1 and y_hat.iloc[ind]==1])
    return TN,FP,FN,TP


# --------------------------------------------------------
# Temporary imports:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# --------
folderName = '.' + os.sep + 'data' 
datasetCsvFileName = folderName + os.sep + 'spam_notSpam.csv'
# --------------------- 
dataset_forTesting = loadDataset(datasetCsvFileName)
X_vectors, y_categories = separateTo_X_and_y(dataset_forTesting, 'class')
y_categories = datasetCategoriesToNums(y_categories, 'spam')
X_train, X_test, y_train, y_test = trainTestSplit(X_vectors, y_categories, 0.2, 11)
instanceSampleRatio = 0.1
featureSampleRatio = 0.5
numModels = 10
classification_PyClass = DecisionStump
baggedModels =  baggingFit(X_train, y_train, instanceSampleRatio, featureSampleRatio, classification_PyClass, numModels)
y_hat = baggingPredict(X_test, baggedModels)
# --------------------------------------------------------

# --------------------------------------------------------
numTN,numFP,numFN,numTP = getConfusionMatrix(y_hat,y_test)
print ('confusion matrix:')
print ('\tpred-0\tpred-1')
print ('is-0\tTN=%d\tFP=%d' %(numTN,numFP))
print ('is-1\tFN=%d\tTP=%d' %(numFN,numTP))
# --------------------------------------------------------
if 'datasets' in sys.modules:
    del (datasets)
if 'train_test_split' in sys.modules:
    del (train_test_split)
if 'DecisionTreeClassifier' in sys.modules:
    del (DecisionTreeClassifier)
sys_modules = list(sys.modules.keys())
for mdl in sys_modules:
    if mdl.startswith('sklearn.'):
        del(sys.modules[mdl]) 
del (sklearn)
if 'sklearn' in sys.modules:
    del (sys.modules['sklearn'])
# --------------------------------------------------------

# --------------------------------------------------------
dataset_forTesting = None
X_vectors = None
y_categories = None
X_train = None
X_test = None
y_train = None
y_test = None

yHat = None

instanceSampleRatio = None
featureSampleRatio = None
numModels = None
classification_PyClass = None
baggedModels = None

nExamples = None

# --------------------------------------------------------

confusion matrix:
	pred-0	pred-1
is-0	TN=534	FP=11
is-1	FN=184	TP=192


#### evaluate precision
<img src="./images/confusion_matrix-precision-recall.jpg" alt="confusion_matrix-precision-recall" width="300" align='center'/>

In [20]:
# --------------------------------------------------------
# method name: getPrecision
# --------------------------------------------------------

# --------------------------------------------------------
# The following is expected:
# --- the method needs to return the precision for class 1 (spam in out dataset)
# ------------
# input parameters:
# - y_hat -  a series of containing all predicted class values per test instance
# - y_test - a series of containing all class values per test instance
# ------------
# return value:
# - precision value
# ---------------------
def getPrecision(y_hat,y_test):
    test=getConfusionMatrix(y_hat,y_test)
    #TN,FP,FN,TP
    precision=(test[3]/(test[3]+test[1]))
    return (precision)
    

In [21]:
# --------------------------------------------------------
# Test of:
### --- Graded tests for the 'getPrecision' method 
# --------------------------------------------------------
# Temporary imports:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# --------
folderName = '.' + os.sep + 'data' 
datasetCsvFileName = folderName + os.sep + 'spam_notSpam.csv'
# --------------------- 
dataset_forTesting = loadDataset(datasetCsvFileName)
X_vectors, y_categories = separateTo_X_and_y(dataset_forTesting, 'class')
y_categories = datasetCategoriesToNums(y_categories, 'spam')
X_train, X_test, y_train, y_test = trainTestSplit(X_vectors, y_categories, 0.2, 25)
instanceSampleRatio = 0.1
featureSampleRatio = 0.5
numModels = 5
classification_PyClass = DecisionStump
baggedModels =  baggingFit(X_train, y_train, instanceSampleRatio, featureSampleRatio, classification_PyClass, numModels)
yHat = baggingPredict(X_test, baggedModels)
# --------------------------------------------------------

# --------------------------------------------------------
allValidVals = lambda arrValues,validVals: False not in (val in validVals for val in arrValues) 
sameIndexes = lambda obj1,obj2: False not in (obj1.index.tolist()[n] == obj2.index.tolist()[n] for n in range(len(obj1.index))) 
innerElementTupple = lambda arrValues: False not in (isinstance(elem,tuple) for elem in arrValues)
# --------------------------------------------------------

# --------------------------------------------------------

print ("check basic 'getPrecision' output validation ...")
precision = getPrecision(yHat,y_test)
assert precision is not None, 'precision not initialized'
assert precision>0, 'precision should not be 0'
assert precision<=1, 'precision should not be more than 1 (=100%)'
assert precision>0.7, 'precision should not be >0.7 (more than 70%)'
print ('pricision=%r' %(precision))
print ("----> The 'getPrecision' test passed successfully :-) \n")
print ('\n----------------------')


# --------------------------------------------------------
if 'datasets' in sys.modules:
    del (datasets)
if 'train_test_split' in sys.modules:
    del (train_test_split)
if 'DecisionTreeClassifier' in sys.modules:
    del (DecisionTreeClassifier)
sys_modules = list(sys.modules.keys())
for mdl in sys_modules:
    if mdl.startswith('sklearn.'):
        del(sys.modules[mdl]) 
del (sklearn)
if 'sklearn' in sys.modules:
    del (sys.modules['sklearn'])
# --------------------------------------------------------

# --------------------------------------------------------
dataset_forTesting = None
X_vectors = None
y_categories = None
X_train = None
X_test = None
y_train = None
y_test = None
y_hat = None
yHat = None

instanceSampleRatio = None
featureSampleRatio = None
numModels = None
classification_PyClass = None
baggedModels = None

nExamples = None
y_hat_arr = None
# --------------------------------------------------------

# --------------------------------------------------------

check basic 'getPrecision' output validation ...
pricision=0.9455445544554455
----> The 'getPrecision' test passed successfully :-) 


----------------------


#### evaluate recall
<img src="./images/confusion_matrix-precision-recall.jpg" alt="confusion_matrix-precision-recall" width="300" align='center'/>

In [22]:
# --------------------------------------------------------
# method name: getRecall
# --------------------------------------------------------

# --------------------------------------------------------
# The following is expected:
# --- the method needs to return the recall for class 1 (spam in out dataset)
# ------------
# input parameters:
# - y_hat -  a series of containing all predicted class values per test instance
# - y_test - a series of containing all class values per test instance
# ------------
# return value:
# - recall value
# ---------------------
def getRecall(y_hat,y_test):
    test=getConfusionMatrix(y_hat,y_test)
    #TN,FP,FN,TP
    recall=test[3]/(test[3]+test[2])
    return recall

In [23]:
# --------------------------------------------------------
# Test of:
### --- Graded tests for the 'getRecall' method 
# --------------------------------------------------------
# Temporary imports:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# --------
folderName = '.' + os.sep + 'data' 
datasetCsvFileName = folderName + os.sep + 'spam_notSpam.csv'
# --------------------- 
dataset_forTesting = loadDataset(datasetCsvFileName)
X_vectors, y_categories = separateTo_X_and_y(dataset_forTesting, 'class')
y_categories = datasetCategoriesToNums(y_categories, 'spam')
X_train, X_test, y_train, y_test = trainTestSplit(X_vectors, y_categories, 0.2, 25)
instanceSampleRatio = 0.1
featureSampleRatio = 0.5
numModels = 10
classification_PyClass = DecisionStump
baggedModels =  baggingFit(X_train, y_train, instanceSampleRatio, featureSampleRatio, classification_PyClass, numModels)
yHat = baggingPredict(X_test, baggedModels)
# --------------------------------------------------------

# --------------------------------------------------------
allValidVals = lambda arrValues,validVals: False not in (val in validVals for val in arrValues) 
sameIndexes = lambda obj1,obj2: False not in (obj1.index.tolist()[n] == obj2.index.tolist()[n] for n in range(len(obj1.index))) 
innerElementTupple = lambda arrValues: False not in (isinstance(elem,tuple) for elem in arrValues)
# --------------------------------------------------------

# --------------------------------------------------------

print ("check basic 'getRecall' output validation ...")
recall = getRecall(yHat,y_test)
assert recall is not None, 'precision not initialized'
assert recall>0, 'recall should not be 0'
assert recall<=1, 'recall should not be more than 1 (=100%)'
assert recall>0.3, 'recall should not be >0.5 (more than 50%)'
print ('recall=%r' %(recall))
print ("----> The 'getRecall' test passed successfully :-) \n")

print ('\n----------------------')


# --------------------------------------------------------
if 'datasets' in sys.modules:
    del (datasets)
if 'train_test_split' in sys.modules:
    del (train_test_split)
if 'DecisionTreeClassifier' in sys.modules:
    del (DecisionTreeClassifier)
sys_modules = list(sys.modules.keys())
for mdl in sys_modules:
    if mdl.startswith('sklearn.'):
        del(sys.modules[mdl]) 
del (sklearn)
if 'sklearn' in sys.modules:
    del (sys.modules['sklearn'])
# --------------------------------------------------------

# --------------------------------------------------------
dataset_forTesting = None
X_vectors = None
y_categories = None
X_train = None
X_test = None
y_train = None
y_test = None
y_hat = None
yHat = None

instanceSampleRatio = None
featureSampleRatio = None
numModels = None
classification_PyClass = None
baggedModels = None

nExamples = None
y_hat_arr = None
# --------------------------------------------------------

# --------------------------------------------------------

check basic 'getRecall' output validation ...
recall=0.7298850574712644
----> The 'getRecall' test passed successfully :-) 


----------------------


### The following cells perform the following:
* step 1 - evaluate accuracy
* step 2 - split the data-set to train-set, validation-set and test-set
* step 3 - bagging hyperparameters tuning

### evaluate accuracy

In [24]:
# --------------------------------------------------------
# methos: getConfusionMatrix
# --------------------------------------------------------

# --------------------------------------------------------
# What does the method do?
# --- computes the accuracy of the classifier
# ------------
# input parameters:
# - y_hat -  a series of containing all predicted class values per test instance
# - y_test - a series of containing all class values per test instance
# ------------
# return value:
# - accuracy value
# --------------------- 
def getAccuracy(y_hat,y_test):
    correct = float(len([1 for ind in range(len(y_test)) if y_test.iloc[ind]==y_hat.iloc[ind]]))
    return correct/float(len(y_test))


# --------------------------------------------------------
# Temporary imports:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# --------
folderName = '.' + os.sep + 'data' 
datasetCsvFileName = folderName + os.sep + 'spam_notSpam.csv'
# --------------------- 
dataset_forTesting = loadDataset(datasetCsvFileName)
X_vectors, y_categories = separateTo_X_and_y(dataset_forTesting, 'class')
y_categories = datasetCategoriesToNums(y_categories, 'spam')
X_train, X_test, y_train, y_test = trainTestSplit(X_vectors, y_categories, 0.2, 11)
instanceSampleRatio = 0.1
featureSampleRatio = 0.5
numModels = 5
classification_PyClass = DecisionStump
baggedModels =  baggingFit(X_train, y_train, instanceSampleRatio, featureSampleRatio, classification_PyClass, numModels)
y_hat = baggingPredict(X_test, baggedModels)
# --------------------------------------------------------

# --------------------------------------------------------
accuracy = getAccuracy(y_hat,y_test)
print ('accuracy: %r' %(accuracy))
# --------------------------------------------------------
if 'datasets' in sys.modules:
    del (datasets)
if 'train_test_split' in sys.modules:
    del (train_test_split)
if 'DecisionTreeClassifier' in sys.modules:
    del (DecisionTreeClassifier)
sys_modules = list(sys.modules.keys())
for mdl in sys_modules:
    if mdl.startswith('sklearn.'):
        del(sys.modules[mdl]) 
del (sklearn)
if 'sklearn' in sys.modules:
    del (sys.modules['sklearn'])
# --------------------------------------------------------

# --------------------------------------------------------
dataset_forTesting = None
X_vectors = None
y_categories = None
X_train = None
X_test = None
y_train = None
y_test = None

yHat = None

instanceSampleRatio = None
featureSampleRatio = None
numModels = None
classification_PyClass = None
baggedModels = None

nExamples = None

# --------------------------------------------------------

accuracy: 0.7806731813246471


#### split the data-set to train-set, validation-set and test-set
<img src="./images/train-validation-test-split.png" alt="train-validation-test split" width="300" align='center'/>

In [25]:
#--------------------------------------------------------
# method name: trainValidationTestSplit
# --------------------------------------------------------

# --------------------------------------------------------
# What does the method do?
# --- the method split the input dataset into train and test. 
#     - It does so using the sklearn built in method
# ------------
# input parameters:
# - X_vectors - a dataframe containing all feature vectors. 
# - y_categories - a series of containing all class values per instance.
# - test_size_ratio - a number (0<number<1) of the wanted ratio of the dataset out of the dataset 
# - rand_state - a number, in order to guarantee reproducible results 
# ------------
# return values (comma separated):
# - X_train -  a dataframe containing all feature vectors of the train set
# - X_test -  a dataframe containing all feature vectors of the test set
# - y_train - a series of containing all class values per train instance
# - y_test - a series of containing all class values per test instance
# --------------------- 
def trainValidationTestSplit(X_vectors, y_categories, validation_or_test_size_ratio, rand_state):
    X_train, X_test, y_train, y_test = train_test_split(X_vectors, y_categories, test_size=validation_or_test_size_ratio, random_state=rand_state)
    validationRatio = validation_or_test_size_ratio / (1-validation_or_test_size_ratio)
    X_train, X_validation, y_train, y_validation =  train_test_split(X_train, y_train, test_size=validationRatio, random_state=rand_state)
    return X_train, X_validation, X_test, y_train, y_validation, y_test  

# --------------------------------------------------------
# Temporary imports:
import sklearn
from sklearn.model_selection import train_test_split
# --------
folderName = '.' + os.sep + 'data' 
datasetCsvFileName = folderName + os.sep + 'spam_notSpam.csv'
# --------------------- 
dataset_forTesting = loadDataset(datasetCsvFileName)
X_vectors, y_categories = separateTo_X_and_y(dataset_forTesting, 'class')
y_categories = datasetCategoriesToNums(y_categories, 'spam')
X_train, X_validation, X_test, y_train, y_validation, y_test =  trainValidationTestSplit(X_vectors, y_categories, 0.2, 43)

# --------------------------------------------------------

# --------------------------------------------------------
if 'datasets' in sys.modules:
    del (datasets)
if 'train_test_split' in sys.modules:
    del (train_test_split)
sys_modules = list(sys.modules.keys())
for mdl in sys_modules:
    if mdl.startswith('sklearn.'):
        del(sys.modules[mdl]) 
del (sklearn)
if 'sklearn' in sys.modules:
    del (sys.modules['sklearn'])
# --------------------------------------------------------

# --------------------------------------------------------
# display train-test split information
# --------------------------------------------------------
print ('Information after train-test split:')
print('* The train-set includes %d instances and %d corresponding categories' %(X_train.shape[0],y_train.shape[0]))
print('* The validation-set includes %d instances and %d corresponding categories' %(X_validation.shape[0],y_validation.shape[0]))
print('* The test-set includes %d instances and %d corresponding categories' %(X_test.shape[0],y_test.shape[0]))


Information after train-test split:
* The train-set includes 2760 instances and 2760 corresponding categories
* The validation-set includes 920 instances and 920 corresponding categories
* The test-set includes 921 instances and 921 corresponding categories


#### bagging hyperparameters tuning (using grid search)
<img src="./images/grid_search.png" alt="train-validation-test split" width="100" align='center'/>

In [26]:
# --------------------------------------------------------
# method name: baggingFit
# --------------------------------------------------------

# --------------------------------------------------------
# The following is expected:
# --- the method chooses the best permutation of bagging model hyperparameters, using grid search
# ------------
# input parameters:
# - X_train - a dataframe containing all feature vectors of the train set
# - y_train - a series of containing all class values per train instance
# - instanceSampleRatio - the ratio of the sampeling out of training set, 
#                        * we will pass it on as a parameter, in order to sample instances.
#                        - if instanceSampleRatio<=0, no instance bootstrap is done, and we leave
#                          the training instances with no change.
# - featureSampleRatio - the ratio of the sampeling out of feature set,
#                        * we will pass it on as a parameter, in order to sample features.
#                        - if featureSampleRatio<=0, no feature bootstrap is done, and we leave
#                          the features with no change.
# - classificationAlgo_pyClass_Arr - an array of the python 'class' parameters of a classification algorithm to choose from,
#                                For instance, the above [DecisionStump, NaiveBayes].
#                                Note: passing the pyton class is similar to passing a method
#                                      as a parameter.    
# - numModels_Arr - an array of the options of the number of bagging models to train, for which we need to choose from
#                   For instance: [3,5,10]
# Note: the X_train, y_train parameters could be instance bootsraped, feature bootsraped or both
# ------------
# return values (comma seperated):
# - allBaggedModels - an array of all trainedBaggedModels,
# - best_baggedModels - array of the baggedModels ensembles (from baggeningFit), reaching the highest accuracy
# - bestAccuracy - the highest accuracy (number) of all baggedModels ensemble
# - best_classificationAlgo_pyClass - the classificationAlgo_pyClass (e.g. DecisionStump) of
#                                     baggedModels ensemble with highest accuracy
# - best_numModels - the num of 'bagged' Models (e.g. 5) of
#                    baggedModels ensemble with highest accuracy
def gridSearchBaggingModel(X_train, y_train, X_validation, y_validation, instanceSampleRatio, 
                           featureSampleRatio, classificationAlgo_pyClass_Arr, numModels_Arr):
    help_df=pd.DataFrame(columns=['baggedModels','acc','numModels'])
    allBaggedModels=[]
    best_baggedModels=[]
    counter=0
    for x in numModels_Arr:   
        for y in classificationAlgo_pyClass_Arr:
            fit=baggingFit(X_train, y_train, instanceSampleRatio, featureSampleRatio, y, x)
            predict=baggingPredict(X_validation,fit)
            acc=getAccuracy(predict,y_validation)
            for m in fit:
                help_df.loc[counter,'baggedModels']=m
                help_df.loc[counter,'algo']=y
                help_df.loc[counter,'acc']=acc
                help_df.loc[counter,'numModels']=x
                counter+=1
                allBaggedModels.append(m)
    bestAccuracy=help_df['acc'].max()
    best_baggedModels=help_df.loc[help_df['acc']==bestAccuracy,'baggedModels']
    best_baggedModels=list(best_baggedModels)
    best_classificationAlgo_pyClass=help_df.loc[help_df['acc']==bestAccuracy,'algo'].iloc[0]
    best_numModels=help_df.loc[help_df['acc']==bestAccuracy,'numModels'].iloc[0]
    return (allBaggedModels,best_baggedModels,bestAccuracy,best_classificationAlgo_pyClass,best_numModels)

In [27]:
# --------------------------------------------------------
# Test of:
### --- Graded tests for the 'getRecall' method 
# --------------------------------------------------------
# Temporary imports:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
# --------
folderName = '.' + os.sep + 'data' 
datasetCsvFileName = folderName + os.sep + 'spam_notSpam.csv'
# --------------------- 
dataset_forTesting = loadDataset(datasetCsvFileName)
X_vectors, y_categories = separateTo_X_and_y(dataset_forTesting, 'class')
y_categories = datasetCategoriesToNums(y_categories, 'spam')
X_train, X_validation, X_test, y_train, y_validation, y_test =  trainValidationTestSplit(X_vectors, y_categories, 0.2, 19)
# --------------------------------------------------------

# --------------------------------------------------------
allValidVals = lambda arrValues,validVals: False not in (val in validVals for val in arrValues) 
sameIndexes = lambda obj1,obj2: False not in (obj1.index.tolist()[n] == obj2.index.tolist()[n] for n in range(len(obj1.index))) 
innerElementTupple = lambda arrValues: False not in (isinstance(elem,tuple) for elem in arrValues)
# --------------------------------------------------------

# --------------------------------------------------------
instanceSampleRatio = 0.1
featureSampleRatio = 0.5
numModels_Arr = [5,10,40]
classificationAlgo_pyClass_Arr = [DecisionStump,GaussianNB]
allBaggedModels,best_baggedModels,bestAccuracy, best_classificationAlgo_pyClass,best_numOfModels = gridSearchBaggingModel(
                        X_train, y_train, X_validation, y_validation, instanceSampleRatio, featureSampleRatio, 
                                                         classificationAlgo_pyClass_Arr, numModels_Arr)

print ("check basic 'gridSearchBaggingModel' output validation ...")
assert allBaggedModels is not None, 'allBaggedModels not initialized'
assert best_baggedModels is not None, 'best_baggedModels not initialized'
assert bestAccuracy is not None, 'bestAccuracy not initialized'
assert best_classificationAlgo_pyClass is not None, 'best_classificationAlgo_pyClass not initialized'
assert best_numOfModels is not None, 'best_numOfModels not initialized'
assert isinstance(allBaggedModels, list), 'allBaggedModels object is not a list'
assert isinstance(best_baggedModels, list), 'best_baggedModels object is not a list'
assert bestAccuracy>0, 'bestAccuracy should not be 0'
assert bestAccuracy<=1, 'bestAccuracy should not be more than 1 (=100%)'
assert bestAccuracy>0.6, 'bestAccuracy should not be >0.6 (more than 60%)'
assert None not in allBaggedModels, 'allBaggedModels should not include None elements'
assert None not in best_baggedModels, 'best_baggedModels should not include None elements'

print ("----> The 'gridSearchBaggingModel' test passed successfully :-) \n")

print ('\n----------------------')
print ('Best bagged Classification algo: %r, num of models: %r, accuracy: %r' %(best_classificationAlgo_pyClass.__name__,best_numOfModels,bestAccuracy))


# --------------------------------------------------------
if 'datasets' in sys.modules:
    del (datasets)
if 'train_test_split' in sys.modules:
    del (train_test_split)
if 'DecisionTreeClassifier' in sys.modules:
    del (DecisionTreeClassifier)
sys_modules = list(sys.modules.keys())
for mdl in sys_modules:
    if mdl.startswith('sklearn.'):
        del(sys.modules[mdl]) 
del (sklearn)
if 'sklearn' in sys.modules:
    del (sys.modules['sklearn'])
# --------------------------------------------------------

# --------------------------------------------------------
dataset_forTesting = None
X_vectors = None
y_categories = None
X_train = None
X_test = None
y_train = None
y_test = None
y_hat = None
yHat = None

instanceSampleRatio = None
featureSampleRatio = None
numModels = None
classification_PyClass = None
baggedModels = None

nExamples = None
y_hat_arr = None
# --------------------------------------------------------


check basic 'gridSearchBaggingModel' output validation ...
----> The 'gridSearchBaggingModel' test passed successfully :-) 


----------------------
Best bagged Classification algo: 'GaussianNB', num of models: 40, accuracy: 0.8391304347826087
