In [1]:
#Naive Bayes  
## The Iris flowers Dataset


<h3><u/>Classes</u>- types of Iris flower</h3><br/><br/>
<table align='left'>
    <tr><div style="text-align:left;font-size:125%"><u>There are 3 possible flower types:</u></div></tr>
    <tr><th>class 0 - Iris Setosa</th><th>class 1 - Iris Virginica</th><th>class 2 - Iris Versicolor</th></tr>
    <tr>
        <td><img src="./images/Iris-versicolor.jpg" alt="Iris Versicolor" style="width: 200px;text-align:left"/></td>
        <td><img src="./images/iris_setosa.jpg" alt="Iris Setosa" style="width: 200px;text-align:left"/></td>
        <td><img src="./images/iris_virginica.jpg" alt="Iris Virginica" style="width: 200px;text-align:left"/></td>
    </tr>
</table> 

<h3><u>Feature Set</u></h3>
<table align='left' width="200">
    <tr><td>
        <img src="./images/iris_petal_sepal.png" alt="Iris" width="200" align='left'/>
    </td></tr>
    <tr><td align='left'>
        <div style="text-align:left;font-size:125%"><u>Dataset includes 4 features</u>:</div><br/>
        <ul style="list-style-type:circle;text-align:left;font-size:115%">
            <li>sepal-length</li>
            <li>sepal-width</li>
            <li>petal-length</li>
            <li>petal-width</li>
        </ul>
    </td></tr>
</table>    

In [2]:
##imports:
import pandas as pd
import numpy as np
import sys
%matplotlib inline

###Loading datasets.
The following cells perform 2 things:

load the Iris dataset
split the dataset to a train-set and a test-set

In [3]:
from sklearn import datasets
# --------------------------------------------------------
data_iris = datasets.load_iris()
X_iris = pd.DataFrame(data_iris['data'], columns=data_iris['feature_names'])
y_iris = pd.DataFrame(data_iris['target'], columns=['target'])
# --------------------------------------------------------
# display the iris class information:
# --------------------------------------------------------
classNames = data_iris['target_names']
iris_classes = np.unique(y_iris.values)
strClasses = ["%d (representing '%s')" %(iris_class,classNames[iris_class]) for iris_class in iris_classes]
print ('There are %d Iris classes' %(len(strClasses)))
print ('iris classes:')
print(strClasses)
# --------------------------------------------------------
# display the feature information:
# --------------------------------------------------------
featureNames = [col for col in X_iris.columns]
print ('\nThere are %d features in the feature set (each feature vector has a value for every feature)' %(len(featureNames)))
print('Feature names:')
print(featureNames)
# --------------------------------------------------------
# display the first few rows of the dataset vectors:
# --------------------------------------------------------
print('\nThe dataset includes %d instances (aka feature vectors)' %(X_iris.shape[0]))
print('first few instances:')
print(X_iris.head())
print('\nfirst few corresponding categories:')
print(y_iris.head())

There are 3 Iris classes
iris classes:
["0 (representing 'setosa')", "1 (representing 'versicolor')", "2 (representing 'virginica')"]

There are 4 features in the feature set (each feature vector has a value for every feature)
Feature names:
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

The dataset includes 150 instances (aka feature vectors)
first few instances:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2

first few corresponding categories:
   target
0       0
1       0
2       0
3       0
4       0


In [4]:
# --------------------------------------------------------
### The Following section splits the iris dataset directly from sklean
# --------------------------------------------------------
import sklearn
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_iris, y_iris, test_size=0.2, random_state=42)

# --------------------------------------------------------
# display train-test split information
# --------------------------------------------------------
print ('Information after train-test split:')
print('The train-set includes %d instances and %d corresponding categories\n' %(X_train.shape[0],y_train.shape[0]))
print('The test-set includes %d instances and %d corresponding categories\n' %(X_test.shape[0],y_test.shape[0]))

# --------------------------------------------------------
## concatinate the X_train and y_train for Naive Bayes training:
# --------------------------------------------------------
train_set = pd.concat((X_train, y_train), axis=1)
# --------------------------------------------------------
if 'datasets' in sys.modules:
    del (datasets)
if 'train_test_split' in sys.modules:
    del (train_test_split)
sys_modules = list(sys.modules.keys())
for mdl in sys_modules:
    if mdl.startswith('sklearn.'):
        del(sys.modules[mdl]) 
del (sklearn)
if 'sklearn' in sys.modules:
    del (sys.modules['sklearn'])
# --------------------------------------------------------
# Display the first few rows of the training-set:
# --------------------------------------------------------
print('First few rows of unified train-set:')
train_set.head()

Information after train-test split:
The train-set includes 120 instances and 120 corresponding categories

The test-set includes 30 instances and 30 corresponding categories

First few rows of unified train-set:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
22,4.6,3.6,1.0,0.2,0
15,5.7,4.4,1.5,0.4,0
65,6.7,3.1,4.4,1.4,1
11,4.8,3.4,1.6,0.2,0
42,4.4,3.2,1.3,0.2,0


## The naive bayes Classifier

<img src="./images/bayes.PNG" alt="Naive Bayes Classifier" align='left'/>

## Training a (Guassian) Naive bayes model 
We perform the following during the training step:
1. Calculate Priors
2. Calculate Gaussian Likelihood 'mean' parameter
3. Calculate Gaussian Likelihood 'std' parameter 
4. organize the call to the training steps in the 'fit' method

train step 1 - calculate category priors:
for each class (0,1,2) you need to calculate the prior.<br/><br/>
<b>prior(y=0)</b>=p(y=0)=count(y=0 in train-set)/count(number-of-instances in train-set)<br/><br/>
<b> do this for each class (0,1,2) </b>

In [5]:
def calcCategoryPriors(trainingSet):
    
    yTrain = trainingSet[['target']]
    total=yTrain.shape[0]
    uniqueClasses = np.unique(yTrain['target'].values)
    helpLi=[]
    li=[]
    for m in uniqueClasses:
        helpLi.append(0)
        li.append(0)
    
    for i in yTrain['target']:
        for x in uniqueClasses:
            if i==uniqueClasses[x]:
                helpLi[x]+=1
    for x in range(len(li)):
        li[x]=helpLi[x]/total
    return li
        

In [6]:
arrPriors = calcCategoryPriors(train_set)
priorClass_0 = arrPriors[0]
print ('testing for expected class prior ...')
assert int(priorClass_0*100)==33,'wrong prior for class-0'
print ("... 'calcCategoryPriors' test passed successfully :-)")
print ('prior for category 0: %f' %(priorClass_0))

testing for expected class prior ...
... 'calcCategoryPriors' test passed successfully :-)
prior for category 0: 0.333333


###Calculate Gaussian Likelihood 'mean' parameter:
for each feature calculate mean value for the feature in each for each of the class values (0,1,2) seperatly.<br/><br/>
<b>for class 0 (y=0)</b> take the rows consisting 'target' value of 0, and calculate the mean <br/>
To calculate mean use: dataframe[colName].mean() <br/><br/>
<b>Do this for each feature for each class</b>

In [7]:
def calcMeanLikelihood(trainingSet):
    yTrain = trainingSet[['target']]
    meandf=pd.DataFrame(index=np.unique(yTrain['target'].values),columns=trainingSet.columns)
    meandf=meandf.drop(columns=['target'])
    for i in range(len(meandf)):
        meandf0=trainingSet.loc[trainingSet['target'].values == i]
        meandf0=meandf0.drop(columns=['target'])
        for col in meandf0.columns:
            meandf.loc[i,col]=meandf0[col].mean()
    return meandf

In [8]:
meanLiklihoodDf = calcMeanLikelihood(train_set)
likelihood_petalLength_class1 = meanLiklihoodDf.iloc[1,2]
print ('testing for expected mean likelihood estimation ...')
assert int(likelihood_petalLength_class1*10)==42,'wrong mean likelihood estimation for petalLength_class1'
print ("... 'calcMeanLikelihood' test passed successfully :-)")
print ('likelihood for the mean of petal length for category 1 is estimated as: %f' %(likelihood_petalLength_class1))

testing for expected mean likelihood estimation ...
... 'calcMeanLikelihood' test passed successfully :-)
likelihood for the mean of petal length for category 1 is estimated as: 4.241463


###Calculate Gaussian Likelihood 'std' parameter:
for each feature calculate std value for the feature in each for each of the class values (0,1,2) seperatly.<br/><br/>
<b>for class 0 (y=0)</b> take the rows consisting 'target' value of 0, and calculate the std <br/>
To calculate std use: dataframe[colName].std() <br/><br/>
<b>Do this for each feature for each class</b>

In [9]:
def calcStdLikelihood(trainingSet):
    yTrain = trainingSet[['target']]
    std=pd.DataFrame(index=np.unique(yTrain['target'].values),columns=trainingSet.columns)
    std=std.drop(columns=['target'])
    for i in range(len(std)):
        std0=trainingSet.loc[trainingSet['target'].values == i]
        std0=std0.drop(columns=['target'])
        for col in std0.columns:
            std.loc[i,col]=std0[col].std()
    return std

In [10]:
stdLiklihoodDf = calcStdLikelihood(train_set)
likelihood_petalLength_class1 = stdLiklihoodDf.iloc[1,2]
print ('testing for expected std likelihood estimation ...')
assert int(likelihood_petalLength_class1*10)==4,'wrong std likelihood estimation for petalLength_class1'
print ("... 'calcStdLikelihood' test passed successfully :-)")
print ('likelihood for the std of petal length for category 1 is estimated as: %f' %(likelihood_petalLength_class1))

testing for expected std likelihood estimation ...
... 'calcStdLikelihood' test passed successfully :-)
likelihood for the std of petal length for category 1 is estimated as: 0.481132


## the fit method:
<br>The fit method uses the previous 3 methods for a full (Gaussian) Naive Bayes model training step.<br/>

In [18]:
def fit(trainingSet):
    """
    1. Calculate the class priors of the training set, using the 'calcCategoryPriors' method.
    2. Calculate the mean of the training set per feature per class, using the 'calcMeanLikelihood' method.
    3. Calculate the std of the training set per feature per class, using the 'stdLiklihoodDf' method.
    """
    arrPriors = calcCategoryPriors(trainingSet)
    meanLiklihoodDf = calcMeanLikelihood(trainingSet)
    stdLiklihoodDf = calcStdLikelihood(trainingSet)
    
    return meanLiklihoodDf, stdLiklihoodDf, arrPriors

## Predicting a class for a new example using the (Guassian) Naive bayes model 
We perform the following during the training step:
1. Calculate Guassian likelihood probability, for a given feature value, mean and std
2. Calculate a posteriori probabilities for each training example
3. prdict class for for each training example, given a posteriori probabilities
4. a full predict method using the above

<img src="./images/bayes.PNG" alt="Naive Bayes Classifier" align='left'/>

## the 'calcGaussianProb' method:
The 'calcGaussianProb' method uses the training methods and returns the Gaussian probablilty <br/>
of that feature value (for a specifc class).<br/>

<img src="./images/gausianProb.PNG" alt="Gausian likelihood probability" align='left'/>

In [19]:
"""
given a specific feature value and the trained mean & std (per a specific class) 
We assume normal (Guassian distribution) and we return the density value 
or the Gaussian Probability for that given value
Note: the input parameters are all numbers (scalars)
"""
def calcGaussianProb(xFeatureVal, mean, std):
    exponent = np.exp(-((xFeatureVal-mean)**2 / (2 * std**2 )))
    return (1 / ((2 * np.pi)**(1/2) * std)) * exponent

## the 'calcAposteriorProbs' method:
The 'calcAposteriorProbs' method uses the training parameters to predict the a posteriori probability <br/>
for every test instance, per class <br/>

In [20]:
"""
    1. Create a probability matrix to store the results
    2. Update each label's probability using the Gaussian probability function
"""
def calcAposteriorProbs(XTest, arrTrainedClassPriors, dfTrainedMean, dfTrainedStd, categories):
    numClasses = len(categories)
    dfProbPerTestInstPerClass = pd.DataFrame(np.zeros((XTest.shape[0], numClasses)), columns=categories, index=XTest.index)
    for category in categories:
        classPrior = arrTrainedClassPriors[category]
        dfProbPerTestInstPerClass[category]=classPrior
        # Check for each row
        for nRow in range(XTest.shape[0]):

            # Multiply the current given probability by the newly calculated probability for the given event (feature)
            for nCol in range(XTest.shape[1]):
                xFeatureVal=XTest.iloc[nRow, nCol]
                mean=dfTrainedMean.iloc[category,nCol]
                std=dfTrainedStd.iloc[category,nCol]
                gaussianProb = calcGaussianProb(xFeatureVal, mean, std)
                # multiple the prior class probability with the gausian likelihood:
                dfProbPerTestInstPerClass.iloc[nRow, category] *= gaussianProb
    return dfProbPerTestInstPerClass            

## the 'predictClasses' method:
The 'predictClasses' method uses the calculated  a posteriori probabilites <br/>
for every test instance, to calculate the most probable class for each test instance <br/>


In [21]:
def predictClasses(df_probPerTestInstPerClass):
    res=pd.Series(index=df_probPerTestInstPerClass.index)
    for row in df_probPerTestInstPerClass.index:
        res[row]=df_probPerTestInstPerClass.loc[row].idxmax()
    print (res)
    return res

## the 'predict' method:
The 'predict' method is a Guasian Naive Bayes classifier <br/>
It uses the above methods to predict test instances <br/>


In [22]:
def predict(XTest, arrTrainedClassPriors, dfTrainedMean, dfTrainedStd, categories):
    # 1. calculate a posterior probabities:
    dfProbPerTestInstPerClass = calcAposteriorProbs(XTest, arrTrainedClassPriors, dfTrainedMean, dfTrainedStd, categories)

    # 2. predict classes using the a posterior probabities:
    results = predictClasses(dfProbPerTestInstPerClass)
    
    return results

In [23]:
def evaluate_accuracy(y_true, y_pred):
    """
    Compare how many predictions were correct (compare the y_hat to y)
    """
    accuracy_score = pd.Series(y_true.values == y_pred.values).value_counts() * 100 / y_true.shape[0]
    return accuracy_score.iloc[0]

In [24]:
# --------------------- 
## The Following tests the the predict, using the accuracy function
# --------------------- 
meanLiklihoodDf, stdLiklihoodDf, arrPriors=fit(train_set)
iris_classes = np.unique(train_set['target'].values)
y_hat = predict(X_test, arrPriors, meanLiklihoodDf, stdLiklihoodDf, iris_classes)
accuracy_score = evaluate_accuracy(y_test['target'], y_hat)
assert accuracy_score == 100, "accuracy should be 100"
print("Accuracy Score: {}".format(accuracy_score))

73     1.0
18     0.0
118    2.0
78     1.0
76     1.0
31     0.0
64     1.0
141    2.0
68     1.0
82     1.0
110    2.0
12     0.0
36     0.0
9      0.0
19     0.0
56     1.0
104    2.0
69     1.0
55     1.0
132    2.0
29     0.0
127    2.0
26     0.0
128    2.0
131    2.0
145    2.0
108    2.0
143    2.0
45     0.0
30     0.0
dtype: float64
Accuracy Score: 100.0
