### Machine Learning Estimation

There are two cages - A and B. The probability of drawing ball N from A is 2/3 and from B is 1/2. First, we choose cage A with prior probability p (and cage B otherwise). Then we draw 6 balls from it with replacement and record the number of Ns drawn, denoted X. We repeat these trials many times. The (prior, n_draws, sample_size) constitute the experimental conditions and (cages, draws) constitute the training data. 

Part 1: Build a machine learning algorithm that predicts cages from draws. Observe the test accuracy - i.e. the P(predict=A|cage=A). And compare this with Bayes rule. Bayes rule when $P(A)$ is the prior and $N$ is the number of N balls observed, distributed as $N|A \sim BIN(6, 2/3)$ and $N|B \sim BIN(6, 1/2)$:

 $P(A|N=n) = \frac{P(N=n|A)P(A)}{P(N=n|A)P(A)+P(N=n|B)P(B)}$

In [8]:
import numpy as np
import pandas as pd

In [9]:
def training_data_generator(n_draws, prior, sample_size, p_a=2/3, p_b=1/2):
    """Simulate cage choices and the number of N balls drawn in each trial.

    Parameters
    ----------
    n_draws : int
        Number of balls drawn (with replacement) in every trial.
    prior : float
        Probability that a trial uses cage A.
    sample_size : int
        Number of independent trials to simulate.
    p_a : float, optional
        Probability of drawing an N ball from cage A (default 2/3).
    p_b : float, optional
        Probability of drawing an N ball from cage B (default 1/2).

    Returns
    -------
    cages : numpy.ndarray
        One entry per trial: 1 when cage A was chosen, 0 when cage B was.
    draws : numpy.ndarray
        One entry per trial: how many of the ``n_draws`` balls were N.

    Notes
    -----
    Sampling uses NumPy's global random state, so call ``np.random.seed``
    beforehand for reproducible output.
    """
    cages = np.random.binomial(1, prior, size=sample_size)
    # Sample a count for *both* cages on every trial and pick the relevant one
    # afterwards. The three calls stay in this order so that a seeded run
    # yields the same data as before.
    draws_a = np.random.binomial(n_draws, p_a, size=sample_size)
    draws_b = np.random.binomial(n_draws, p_b, size=sample_size)
    draws = np.where(cages == 1, draws_a, draws_b)
    return cages, draws

# Simulate 10,000 trials of 6 draws each with a 1/2 prior on cage A.
cages, draws = training_data_generator(n_draws=6, prior=1/2, sample_size=10000)

In [10]:
def preprocessing(cages, draws):
    """Split simulated trials into a feature frame and a label series.

    Parameters
    ----------
    cages : array-like
        Cage label per trial (the classification target).
    draws : array-like
        Number of N balls observed per trial; must be as long as ``cages``.

    Returns
    -------
    x : pandas.DataFrame
        Single-column frame named ``'draws'`` holding the feature.
    y : pandas.Series
        The ``'cage'`` labels cast to integer dtype.
    """
    df = pd.DataFrame(cages, columns=['cage'])
    df['draws'] = draws
    # Features are everything except the label column.
    x = df[['draws']]
    y = df['cage'].astype('int')
    return x, y

# Turn the simulated trials into a feature frame (x) and label series (y).
x, y = preprocessing(cages=cages, draws=draws)

In [11]:
def machineLearning(n_draws, prior, sample_size):
    """Print the mean cross-validated accuracy of several classifiers.

    A fresh data set is simulated with ``training_data_generator`` and each
    classifier is scored with ``cross_val_score(..., cv=10)`` on the task of
    predicting the cage from the number of N balls drawn.

    Parameters
    ----------
    n_draws : int
        Number of balls drawn per trial.
    prior : float
        Prior probability of choosing cage A.
    sample_size : int
        Number of simulated trials.

    Returns
    -------
    None
        The results table is printed, not returned.
    """
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from prettytable import PrettyTable
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.neural_network import MLPClassifier
    from catboost import CatBoostClassifier
    from xgboost import XGBClassifier
    from sklearn.model_selection import cross_val_score
    import warnings

    # NOTE: this silences warnings for the rest of the session, not only for
    # the duration of this call.
    warnings.filterwarnings('ignore')
    # Fix NumPy's global seed so the simulated data set is the same on every run.
    np.random.seed(3293423)

    cages, draws = training_data_generator(n_draws, prior, sample_size)
    x, y = preprocessing(cages, draws)

    # (row label, estimator) pairs, in the order they appear in the table.
    estimators = [
        ('Logistic Regression', LogisticRegressionCV()),
        ('Random Forest Classifier', RandomForestClassifier(n_estimators=100)),
        ('Gradient Boosting Classifier', GradientBoostingClassifier(n_estimators=100)),
        ('Neural Network Classifier', MLPClassifier(max_iter=5000, activation='tanh')),
        ('XGBoost Classifier', XGBClassifier(max_depth=2, n_estimators=100)),
        ('CatBoost Classifer', CatBoostClassifier(max_depth=2, n_estimators=100, verbose=0)),
    ]

    table = PrettyTable()
    table.field_names = ['Estimator', 'Avg Cross-Validated Test-Accuracy']
    for label, estimator in estimators:
        table.add_row([label, cross_val_score(estimator, x, y, cv=10).mean()])
    table.float_format = '0.3'
    print(table)

# Score every classifier on 1,000 simulated trials of 6 draws, prior 2/3.
machineLearning(n_draws=6, prior=2/3, sample_size=1000)


+------------------------------+-----------------------------------+
|          Estimator           | Avg Cross-Validated Test-Accuracy |
+------------------------------+-----------------------------------+
|     Logistic Regression      |               0.714               |
|   Random Forest Classifier   |               0.714               |
| Gradient Boosting Classifier |               0.714               |
|  Neural Network Classifier   |               0.714               |
|      XGBoost Classifier      |               0.714               |
|      CatBoost Classifer      |               0.714               |
+------------------------------+-----------------------------------+


In [12]:
def bayesRule(n, p, N, p_a=2/3, p_b=1/2):
    """Return the posterior probability of cage A given ``n`` N balls.

    Applies Bayes' rule with binomial likelihoods:

        P(A | n) = P(n | A) P(A) / (P(n | A) P(A) + P(n | B) P(B))

    Parameters
    ----------
    n : int
        Number of N balls observed in the current trial.
    p : float
        Prior probability of cage A (cage B gets ``1 - p``).
    N : int
        Total number of balls drawn in every trial.
    p_a : float, optional
        Probability of drawing an N ball from cage A (default 2/3).
    p_b : float, optional
        Probability of drawing an N ball from cage B (default 1/2).

    Returns
    -------
    float
        Posterior probability P(A | n).
    """
    from scipy.stats import binom

    likelihood_a = binom.pmf(n, N, p_a)
    likelihood_b = binom.pmf(n, N, p_b)
    # Total probability of seeing n N balls, marginalised over both cages.
    evidence = likelihood_a * p + likelihood_b * (1 - p)
    return likelihood_a * p / evidence

# Posterior of cage A for three (N balls seen, prior) pairs, 6 draws per trial.
p1, p2, p3 = (
    bayesRule(n_seen, prior_a, 6)
    for n_seen, prior_a in [(5, 1/3), (4, 1/2), (3, 2/3)]
)
print(p1, p2, p3)

0.5841414717626923 0.5841414717626924 0.5841414717626925


In [13]:
def posteriorOdds(draws, priors):
    """Tabulate the posterior probability of cage A for every prior and outcome.

    Despite the name, the ``posterior`` column holds probabilities (the value
    returned by ``bayesRule``), not odds.

    Parameters
    ----------
    draws : int
        Total number of balls drawn per trial.
    priors : iterable of float
        Prior probabilities of cage A to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per (prior, n) pair with columns ``'posterior'``, ``'prior'``,
        ``'draws of N'`` and ``'total draws'``. ``'prior'`` is stored as a
        string.
    """
    # n runs over 0..draws inclusive: seeing N on every draw is a valid outcome.
    rows = [
        [bayesRule(n, prior, draws), prior, n, draws]
        for prior in priors
        for n in range(draws + 1)
    ]
    df = pd.DataFrame(rows, columns=['posterior', 'prior', 'draws of N', 'total draws'])
    df['prior'] = df['prior'].astype(str)
    return df

def plotOdds(df):
    """Show a line chart of ``posterior`` against ``draws of N``, one line per ``prior``.

    ``df`` must provide the columns ``'draws of N'``, ``'posterior'`` and
    ``'prior'``. The figure is displayed and nothing is returned.
    """
    import plotly.express as px

    figure = px.line(
        df,
        x="draws of N",
        y="posterior",
        color="prior",
        markers=True,
    )
    figure.show()


# Tabulate the posteriors for three priors on cage A (6 draws), then plot them.
df = posteriorOdds(draws=6, priors=[1/3, 1/2, 2/3])
plotOdds(df)

In [14]:
def compareMLwithBayes(n_draws, prior, sample_size):
    """Print Bayes-rule posteriors next to classifier-predicted probabilities.

    A data set is simulated, three classifiers are each fitted once on it, and
    for every possible count of N balls (0..``n_draws``) the table shows the
    exact posterior from ``bayesRule`` alongside each model's predicted
    probability of cage A.

    Parameters
    ----------
    n_draws : int
        Number of balls drawn per trial.
    prior : float
        Prior probability of choosing cage A.
    sample_size : int
        Number of simulated trials used for training.

    Returns
    -------
    None
        The comparison table is printed, not returned.
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.ensemble import RandomForestClassifier
    from catboost import CatBoostClassifier
    from prettytable import PrettyTable

    models = [
        LogisticRegressionCV(),
        RandomForestClassifier(max_depth=2),
        CatBoostClassifier(max_depth=2, verbose=0),
    ]
    cages, draws = training_data_generator(n_draws, prior, sample_size)
    x, y = preprocessing(cages, draws)

    # Every possible outcome, including drawing N on all n_draws balls. Built
    # with the training column names so the models see matching features.
    outcomes = list(range(n_draws + 1))
    query = pd.DataFrame(outcomes, columns=x.columns)

    # Fit each model once; column 1 of predict_proba is the label-1 (cage A)
    # probability.
    ml_probs = [model.fit(x, y).predict_proba(query)[:, 1] for model in models]

    table = PrettyTable()
    table.field_names = ['#of N', 'Bayes Rule: P(A|n)', 'Logistic Regression: P(A|n)', 'Random Forest: P(A|n)', 'CatBoost Classifier: P(A|n)']
    for i, n in enumerate(outcomes):
        # Use the same number of draws for the exact posterior as for the data.
        bayes_prob = bayesRule(n, prior, n_draws)
        table.add_row([n, bayes_prob] + [probs[i] for probs in ml_probs])
    table.float_format = '0.3'
    print(table)

# Compare exact posteriors with model predictions: 1,000 trials, 6 draws, prior 1/2.
compareMLwithBayes(n_draws=6, prior=1/2, sample_size=1000)

+-------+--------------------+-----------------------------+-----------------------+-----------------------------+
| #of N | Bayes Rule: P(A|n) | Logistic Regression: P(A|n) | Random Forest: P(A|n) | CatBoost Classifier: P(A|n) |
+-------+--------------------+-----------------------------+-----------------------+-----------------------------+
|   0   |       0.081        |            0.371            |         0.199         |            0.095            |
|   1   |       0.149        |            0.407            |         0.199         |            0.159            |
|   2   |       0.260        |            0.444            |         0.224         |            0.247            |
|   3   |       0.413        |            0.482            |         0.424         |            0.412            |
|   4   |       0.584        |            0.520            |         0.581         |            0.587            |
|   5   |       0.737        |            0.557            |         0.755      