# Natural Language Processing - Predict Headlines for Blogs and Tweets

In [2]:
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB

np.set_printoptions(linewidth=140, precision=4, suppress=True)

%matplotlib inline

In [3]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/siqili/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/siqili/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/siqili/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
# Import data for control and experiment groups
control = pd.read_excel("labeled_headlines.xlsx", header=None)
control.columns= ["article", "headline", "viral", "author"]
control_c = control[["headline", "article", "viral"]]
control_c

Unnamed: 0,headline,article,viral
0,Consuming Refined Carbohydrates is Associated ...,2,0
1,"High dietary glycemic index (GI), high glycemi...",2,0
2,High dietary glycemic index may increase insom...,2,0
3,Eating your greens: How a compound in broccoli...,3,0
4,The findings of the two consecutive studies ar...,6,2
...,...,...,...
787,New study finds that carbon impacts from the l...,7,1
788,Rocking to sleep: Study shows gentle motion he...,1,2
789,Compound in broccoli can help suppress tumor g...,3,0
790,Rocking motion during sleep improves sleep qua...,1,0


In [5]:
experiment = pd.read_excel("Altmetric_blogs_demo.xlsx")
experiment_c = experiment[["Mediaheadline", "Article"]]
experiment_c

Unnamed: 0,Mediaheadline,Article
0,The Evolution of Insomnia: Unveiling Our Obses...,Article 2
1,Menopause and insomnia: Could a low-GI diet help?,Article 2
2,Certain Foods May Help Postmenopausal Women Sl...,Article 2
3,Sugar and white bread tied to older women’s in...,Article 2
4,"Prescription: More Broccoli, Cut Down On Carbs...",Article 3
...,...,...
92,Elephants’ diets help forests to thrive,Article 8
93,Gardeners and guardians of biodiversity in the...,Article 8
94,How Elephants May Help Africa’s Rainforests Fi...,Article 8
95,Elephants Boost Carbon Storage in Rain Forests,Article 8


In [6]:
experiment_p1 = pd.read_excel("paper1.xlsx")
experiment_t = pd.read_excel("Altmetric_tweets.xlsx")
experiment_tc = experiment_t[["mediaheadline", "#article"]]
experiment_p1

Unnamed: 0,Article,Mediaheadline
0,1,A Hammock-Like Rocking Motion Helps People Sleep
1,1,"SOMMEIL : Pour bien dormir, laissez-vous bercer !"
2,1,The Neuroscience of ‘Rock-a-Bye Baby’
3,1,Dimostrato che dondolarsi può migliorare il so...
4,1,Rock-A-Bye Adult – Study Shows Grown-ups Enjoy...
5,1,Gentle Rocking Motion Improves Sleep Quality
6,1,Rocking Encourages Deeper Sleep and Better Memory
7,1,Scientific study shows that adults sleep bette...
8,1,Even Adults Sleep Better After Being Rocked to...
9,1,Убаюкивание удлинило глубокий сон и улучшило п...


In [7]:
# Normalize headlines for all data sets
def normalizer(text):
    # Set all text to lowercase
    text = text.lower()

    # Remove anywhere with more than two consecutive whitespaces
    text = re.sub("\s\s+", " ", text)

    # Lemmatize words
    lemm = WordNetLemmatizer()
    text = " ".join(
        map(lemm.lemmatize, text.split(" "))
    )
    return text 

experiment_c["clean_headline"] = experiment_c["Mediaheadline"].map(normalizer)
control_c["clean_headline"] = control_c["headline"].map(normalizer)
experiment_p1["clean_headline"] = experiment_p1["Mediaheadline"].map(normalizer)
experiment_tc["clean_headline"] = experiment_tc["mediaheadline"].map(normalizer)
experiment_c

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  experiment_c["clean_headline"] = experiment_c["Mediaheadline"].map(normalizer)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  experiment_tc["clean_headline"] = experiment_tc["mediaheadline"].map(normalizer)


Unnamed: 0,Mediaheadline,Article,clean_headline
0,The Evolution of Insomnia: Unveiling Our Obses...,Article 2,the evolution of insomnia: unveiling our obses...
1,Menopause and insomnia: Could a low-GI diet help?,Article 2,menopause and insomnia: could a low-gi diet help?
2,Certain Foods May Help Postmenopausal Women Sl...,Article 2,certain food may help postmenopausal woman sle...
3,Sugar and white bread tied to older women’s in...,Article 2,sugar and white bread tied to older women’s in...
4,"Prescription: More Broccoli, Cut Down On Carbs...",Article 3,"prescription: more broccoli, cut down on carbs..."
...,...,...,...
92,Elephants’ diets help forests to thrive,Article 8,elephants’ diet help forest to thrive
93,Gardeners and guardians of biodiversity in the...,Article 8,gardener and guardian of biodiversity in the c...
94,How Elephants May Help Africa’s Rainforests Fi...,Article 8,how elephant may help africa’s rainforest figh...
95,Elephants Boost Carbon Storage in Rain Forests,Article 8,elephant boost carbon storage in rain forest


## Predict for Blogs 

In [8]:
# Divide 'experiment_c' and 'control_c' into 8 saperate datasets according to their article numbers, and make predictions for each article
# Article 1
control1 = control_c.loc[control_c["article"] == 1].reset_index(drop = True)
experiment_p1["viral"] = 0

def dataset_article(narticle):
    control = control_c.loc[control_c["article"] == narticle].reset_index(drop = True)
    print(f"Article {narticle}")
    experiment = experiment_c.loc[experiment_c["Article"] == f"Article {narticle}"].reset_index(drop = True)
    experiment["viral"] = 0
    return control, experiment
    
# Article 2
control2 = dataset_article(2)[0]
experiment2 = dataset_article(2)[1]

# Article 3
control3 = dataset_article(3)[0]
experiment3 = dataset_article(3)[1]

# Article 4
control4 = dataset_article(4)[0]
experiment4 = dataset_article(4)[1]

# Article 5
control5 = dataset_article(5)[0]
experiment5 = dataset_article(5)[1]

# Article 6
control6 = dataset_article(6)[0]
experiment6 = dataset_article(6)[1]

# Article 7
control7 = dataset_article(7)[0]
experiment7 = dataset_article(7)[1]

# Article 8
control8 = dataset_article(8)[0]
experiment8 = dataset_article(8)[1]

Article 2
Article 2
Article 3
Article 3
Article 4
Article 4
Article 5
Article 5
Article 6
Article 6
Article 7
Article 7
Article 8
Article 8


### Binary Presence Approach


In [10]:
def predict_bp(control, experiment):
    all_words = list(
        set(
            control["clean_headline"].map(
                lambda x: x.split(" ")
            ).sum()
        )
    )
    all_words.sort()

    # Choose the vocabulary
    out = np.zeros(len(all_words), dtype=bool)
    for (iw, word) in enumerate(all_words):
        count_headlines = (
            control["clean_headline"]
            .str
            .contains(word, regex=False)
            .sum()
        )
        out[iw] = count_headlines >= 3

    headline_vocabulary = [
        word for (iw, word) in enumerate(all_words) if out[iw]
    ]
    
    # Prepare the control data
    nobs = control.shape[0]
    nwords = len(headline_vocabulary)

    virals = {
        0: 0,
        1: 1,
        2: 2
    }

    X = np.zeros((nobs, nwords), dtype=int)
    y = np.zeros(nobs, dtype=int)

    for i in range(nobs):
        y[i] = virals[control.at[i, "viral"]]

        for (j, word) in enumerate(headline_vocabulary):
            X[i, j] = word in control.at[i, "clean_headline"]
            
    # Fit the model
    mnb = MultinomialNB(fit_prior=False)
    mnb.fit(X, y)
    
    # Predict for experiment data
    noos = experiment.shape[0]
    X_oos = np.zeros((noos, nwords), dtype=int)
    y_oos = np.zeros(noos, dtype=int)

    for i in range(noos):
        y_oos[i] = virals[experiment.at[i, "viral"]]

        for (j, word) in enumerate(headline_vocabulary):
            X_oos[i, j] = word in experiment.at[i, "clean_headline"]
    
    result = mnb.predict(X_oos)
    result_prob = mnb.predict_proba(X_oos)
    
    return result, result_prob

### Bag of Words Approach

In [11]:
def predict_bw(control, experiment):
    
    # Count the total words for each post
    control["nwords"] = control["clean_headline"].str.split(" ").map(lambda x: len(x))
    experiment["nwords"] = experiment["clean_headline"].str.split(" ").map(lambda x: len(x))
    
    # Choose the vocabulary
    all_words = list(
        set(
            control2["clean_headline"].map(
                lambda x: x.split(" ")
            ).sum()
        )
    )
    all_words.sort()
    
    out = np.zeros(len(all_words), dtype=bool)
    for (iw, word) in enumerate(all_words):
        count_headlines = (
            control2["clean_headline"]
            .str
            .contains(word, regex=False)
            .sum()
        )
        out[iw] = count_headlines >= 5

    headline_vocabulary = [
        word for (iw, word) in enumerate(all_words) if out[iw]
    ]
    
    # Check whether the word is in our text
    def bag_of_words(vocabulary, text):
        dcount = {word: 0 for word in vocabulary}
        for word in text.split(" "):
            if word in vocabulary:
                dcount[word] += 1

        out = np.array([dcount[word] for word in vocabulary])

        return out

    # Prepare the control data
    nobs = control.shape[0]
    nvocab = len(headline_vocabulary)

    virals = {
        0: 0,
        1: 1,
        2: 2
    }

    X = np.zeros((nobs, nvocab), dtype=int)
    y = np.zeros(nobs, dtype=int)

    for i in range(nobs):
        y[i] = virals[control.at[i, "viral"]]
        X[i, :] = bag_of_words(
            headline_vocabulary,
            control.at[i, "clean_headline"]
        )

    nwords = control.loc[:, "nwords"].to_numpy()
    X = np.round(10 * X/nwords[:, None]).astype(int)
    
    # Fit the model
    mnb = MultinomialNB(fit_prior=False)
    mnb.fit(X, y)
    
    # Predict for the experiment data
    noos = experiment.shape[0]

    X_oos = np.zeros((noos, nvocab), dtype=int)

    for i in range(noos):
        X_oos[i, :] = bag_of_words(
            headline_vocabulary,
            experiment.at[i, "clean_headline"]
        )

    nwords = experiment.loc[:, "nwords"].to_numpy()
    X_oos = np.round(10 * X_oos/nwords[:, None]).astype(int)
    
    result = mnb.predict(X_oos)
    result_prob = mnb.predict_proba(X_oos)
    
    
    return result, result_prob

### Predict for 7 articles

In [12]:
# Article 1
predict_bp(control1, experiment_p1)
predict_bw(control1, experiment_p1)

(array([0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0]),
 array([[0.463 , 0.2853, 0.2517],
        [0.3333, 0.3333, 0.3333],
        [0.1608, 0.2894, 0.5498],
        [0.3333, 0.3333, 0.3333],
        [0.6022, 0.273 , 0.1248],
        [0.4323, 0.3425, 0.2252],
        [0.4351, 0.3126, 0.2523],
        [0.4323, 0.1214, 0.4462],
        [0.3699, 0.2982, 0.3318],
        [0.3333, 0.3333, 0.3333],
        [0.0654, 0.3336, 0.601 ],
        [0.2001, 0.3403, 0.4597],
        [0.3664, 0.2678, 0.3658]]))

In [13]:
# Article 2
predict_bp(control2, experiment2)
predict_bw(control2, experiment2)

(array([0, 1, 2, 1]),
 array([[0.3795, 0.3518, 0.2687],
        [0.128 , 0.5096, 0.3625],
        [0.3118, 0.0842, 0.604 ],
        [0.2463, 0.5402, 0.2136]]))

In [14]:
# Article 3
predict_bp(control3, experiment3)
predict_bw(control3, experiment3)

(array([0]), array([[0.4619, 0.1435, 0.3946]]))

In [15]:
# Article 4
predict_bp(control4, experiment4)
predict_bw(control4, experiment4)

(array([2, 0, 1, 2]),
 array([[0.0776, 0.3892, 0.5331],
        [0.4064, 0.3834, 0.2102],
        [0.199 , 0.6889, 0.1121],
        [0.2095, 0.173 , 0.6175]]))

In [16]:
# Article 5
predict_bw(control4, experiment4)
predict_bp(control5, experiment5)

(array([2, 0, 2, 2, 0, 2, 0, 2, 2, 1, 2, 0, 1, 1, 0, 2, 2, 2]),
 array([[0.2234, 0.1772, 0.5994],
        [0.7008, 0.1842, 0.115 ],
        [0.0199, 0.0501, 0.9299],
        [0.0774, 0.0115, 0.9112],
        [0.5159, 0.1476, 0.3366],
        [0.262 , 0.0852, 0.6528],
        [0.6805, 0.0065, 0.313 ],
        [0.1601, 0.0272, 0.8127],
        [0.111 , 0.0657, 0.8233],
        [0.1174, 0.6618, 0.2208],
        [0.1459, 0.2336, 0.6205],
        [0.6339, 0.0325, 0.3336],
        [0.2177, 0.4329, 0.3494],
        [0.0019, 0.505 , 0.4931],
        [0.3993, 0.2122, 0.3885],
        [0.0215, 0.3607, 0.6178],
        [0.19  , 0.3707, 0.4393],
        [0.2737, 0.0344, 0.6918]]))

In [17]:
# Article 6
predict_bp(control6, experiment6)
predict_bw(control6, experiment6)

(array([2, 2, 1, 2, 0, 2, 2, 2, 0, 2, 2, 1, 1, 1, 1, 2, 0, 2, 0, 2, 2, 0, 2, 0, 1, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 1,
        1, 1, 0]),
 array([[0.2548, 0.1333, 0.612 ],
        [0.343 , 0.222 , 0.4349],
        [0.2328, 0.3876, 0.3796],
        [0.3304, 0.1384, 0.5311],
        [0.3333, 0.3333, 0.3333],
        [0.2499, 0.1367, 0.6134],
        [0.3096, 0.2194, 0.471 ],
        [0.3683, 0.037 , 0.5946],
        [0.3926, 0.2971, 0.3103],
        [0.1297, 0.3864, 0.484 ],
        [0.343 , 0.222 , 0.4349],
        [0.293 , 0.3609, 0.3461],
        [0.1443, 0.5528, 0.3029],
        [0.1805, 0.4105, 0.4091],
        [0.1805, 0.4105, 0.4091],
        [0.1653, 0.3162, 0.5185],
        [0.434 , 0.2633, 0.3027],
        [0.3158, 0.3111, 0.373 ],
        [0.3333, 0.3333, 0.3333],
        [0.1973, 0.2964, 0.5063],
        [0.3304, 0.1384, 0.5311],
        [0.5571, 0.1794, 0.2634],
        [0.2635, 0.0932, 0.6433],
        [0.5241, 0.0836, 0.3924],
        [0.2931, 0.6667, 

In [18]:
# Article 7
predict_bp(control7, experiment7)
predict_bw(control7, experiment7)

(array([2, 0, 2, 1, 0, 1, 1, 0]),
 array([[0.3031, 0.1593, 0.5375],
        [0.3624, 0.33  , 0.3076],
        [0.1641, 0.3486, 0.4873],
        [0.3149, 0.5441, 0.141 ],
        [0.3333, 0.3333, 0.3333],
        [0.2797, 0.4688, 0.2516],
        [0.3282, 0.378 , 0.2938],
        [0.5064, 0.2829, 0.2107]]))

In [19]:
# Article 8
predict_bp(control8, experiment8)
predict_bw(control8, experiment8)

(array([1, 0, 0, 2, 2, 0, 2, 1, 0, 0, 0, 2, 2, 0, 0]),
 array([[0.4002, 0.4023, 0.1974],
        [0.3486, 0.3135, 0.3379],
        [0.3586, 0.3204, 0.321 ],
        [0.2132, 0.2693, 0.5176],
        [0.2783, 0.2654, 0.4563],
        [0.4762, 0.2634, 0.2603],
        [0.3708, 0.1325, 0.4966],
        [0.0894, 0.5683, 0.3422],
        [0.511 , 0.1817, 0.3073],
        [0.3333, 0.3333, 0.3333],
        [0.3359, 0.3352, 0.3289],
        [0.2952, 0.2608, 0.444 ],
        [0.331 , 0.248 , 0.4211],
        [0.345 , 0.3273, 0.3277],
        [0.6885, 0.0481, 0.2633]]))

## Predict for Tweets

In [20]:
# Article 2
experiment_t2 = experiment_tc.loc[experiment_tc["#article"] == "Article 2"].reset_index(drop = True)
experiment_t2["viral"] = 0

# Article 3
experiment_t3 = experiment_tc.loc[experiment_tc["#article"] == "Article 3"].reset_index(drop = True)
experiment_t3["viral"] = 0

# Article 4
experiment_t4 = experiment_tc.loc[experiment_tc["#article"] == "Article 4"].reset_index(drop = True)
experiment_t4["viral"] = 0

# Article 5
experiment_t5 = experiment_tc.loc[experiment_tc["#article"] == "Article 5"].reset_index(drop = True)
experiment_t5["viral"] = 0

# Article 6
experiment_t6 = experiment_tc.loc[experiment_tc["#article"] == "Article 6"].reset_index(drop = True)
experiment_t6["viral"] = 0

# Article 7
experiment_t7 = experiment_tc.loc[experiment_tc["#article"] == "Article 7"].reset_index(drop = True)
experiment_t7["viral"] = 0

# Article 8
experiment_t8 = experiment_tc.loc[experiment_tc["#article"] == "Article 8"].reset_index(drop = True)
experiment_t8["viral"] = 0

### Bag of Words Approach

In [21]:
def predict_bw(control, experiment):
    
    # Count the total words for each post
    control["nwords"] = control["clean_headline"].str.split(" ").map(lambda x: len(x))
    experiment["nwords"] = experiment["clean_headline"].str.split(" ").map(lambda x: len(x))
    
    # Choose the vocabulary
    all_words = list(
        set(
            control2["clean_headline"].map(
                lambda x: x.split(" ")
            ).sum()
        )
    )
    all_words.sort()
    
    out = np.zeros(len(all_words), dtype=bool)
    for (iw, word) in enumerate(all_words):
        count_headlines = (
            control2["clean_headline"]
            .str
            .contains(word, regex=False)
            .sum()
        )
        out[iw] = count_headlines >= 5

    headline_vocabulary = [
        word for (iw, word) in enumerate(all_words) if out[iw]
    ]
    
    # Check whether the word is in our text
    def bag_of_words(vocabulary, text):
        dcount = {word: 0 for word in vocabulary}
        for word in text.split(" "):
            if word in vocabulary:
                dcount[word] += 1

        out = np.array([dcount[word] for word in vocabulary])

        return out

    # Prepare the control data
    nobs = control.shape[0]
    nvocab = len(headline_vocabulary)

    virals = {
        0: 0,
        1: 1,
        2: 2
    }

    X = np.zeros((nobs, nvocab), dtype=int)
    y = np.zeros(nobs, dtype=int)

    for i in range(nobs):
        y[i] = virals[control.at[i, "viral"]]
        X[i, :] = bag_of_words(
            headline_vocabulary,
            control.at[i, "clean_headline"]
        )

    nwords = control.loc[:, "nwords"].to_numpy()
    X = np.round(10 * X/nwords[:, None]).astype(int)
    
    # Fit the model
    mnb = MultinomialNB(fit_prior=False)
    mnb.fit(X, y)
    
    # Predict for the experiment data
    noos = experiment.shape[0]

    X_oos = np.zeros((noos, nvocab), dtype=int)

    for i in range(noos):
        X_oos[i, :] = bag_of_words(
            headline_vocabulary,
            experiment.at[i, "clean_headline"]
        )

    nwords = experiment.loc[:, "nwords"].to_numpy()
    X_oos = np.round(10 * X_oos/nwords[:, None]).astype(int)
    
    result = mnb.predict(X_oos)
    result_prob = mnb.predict_proba(X_oos)
    
    return result, result_prob

### Predict for 7 articles

In [22]:
# Article 2
predict_bw(control2, experiment_t2)

(array([2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2,
        0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0]),
 array([[0.0391, 0.0698, 0.8911],
        [0.3786, 0.1539, 0.4674],
        [0.3633, 0.3376, 0.2991],
        [0.1072, 0.2778, 0.6149],
        [0.1072, 0.2778, 0.6149],
        [0.3786, 0.1539, 0.4674],
        [0.1807, 0.2687, 0.5507],
        [0.1807, 0.2687, 0.5507],
        [0.3333, 0.3333, 0.3333],
        [0.3633, 0.3376, 0.2991],
        [0.3633, 0.3376, 0.2991],
        [0.3333, 0.3333, 0.3333],
        [0.3311, 0.1642, 0.5047],
        [0.1072, 0.2778, 0.6149],
        [0.3333, 0.3333, 0.3333],
        [0.3333, 0.3333, 0.3333],
        [0.6037, 0.2736, 0.1227],
        [0.3333, 0.3333, 0.3333],
        [0.3333, 0.3333, 0.3333],
        [0.2893, 0.3024, 0.4082],
        [0.3333, 0.3333, 0.3333],
        [0

In [23]:
# Article 3
predict_bw(control3, experiment_t3)

(array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 2, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 2, 0, 0, 2, 2, 0, 1, 2, 1, 0, 2, 0,
        0, 1, 2, 2, 1, 0, 0, 0, 0, 1, 2, 1, 0, 1, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 2, 0, 1,
        0, 1, 0, 1, 2, 1, 1, 1, 2, 2, 0, 1, 1, 1, 1, 2, 1, 0, 2, 1, 1, 2, 2, 1, 2, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1, 0, 0, 1, 0, 2, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 2, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 0, 1, 2, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2]),
 array([[0.3333, 0.3333, 0.3333],
        [0.4861, 0.1479, 0.366 ],
        [0.1859, 0.6397, 0.1745],
        [0.1721

In [24]:
# Article 4
predict_bw(control4, experiment_t4)

(array([0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 2, 2, 2, 0, 2, 0, 2, 0]),
 array([[0.7555, 0.1888, 0.0557],
        [0.3914, 0.3703, 0.2383],
        [0.3333, 0.3333, 0.3333],
        [0.4059, 0.3092, 0.2849],
        [0.4059, 0.3092, 0.2849],
        [0.4059, 0.3092, 0.2849],
        [0.5701, 0.2136, 0.2163],
        [0.3333, 0.3333, 0.3333],
        [0.1633, 0.2818, 0.5549],
        [0.1633, 0.2818, 0.5549],
        [0.3333, 0.3333, 0.3333],
        [0.4059, 0.3092, 0.2849],
        [0.2873, 0.3634, 0.3494],
        [0.2873, 0.3634, 0.3494],
        [0.2873, 0.3634, 0.3494],
        [0.4059, 0.3092, 0.2849],
        [0.3333, 0.3333, 0.3333],
        [0.4059, 0.3092, 0.2849],
        [0.4059, 0.3092, 0.2849],
        [0.4059, 0.3092, 0.2849],
        [0.5701, 0.2136, 0.2163],
        [0.3333, 0.3333, 0.3333],
        [0.326 , 0.4812, 0.1928],
        [0.3333, 0.3333, 0.33

In [25]:
# Article 5
predict_bw(control5, experiment_t5)

(array([2, 0, 2, ..., 2, 0, 2]),
 array([[0.3741, 0.1662, 0.4597],
        [0.4599, 0.1588, 0.3812],
        [0.3548, 0.2286, 0.4166],
        ...,
        [0.2433, 0.1961, 0.5606],
        [0.6541, 0.2343, 0.1116],
        [0.37  , 0.2557, 0.3744]]))

In [26]:
# Article 6
predict_bw(control6, experiment_t6)

(array([1, 2, 1, ..., 1, 2, 2]),
 array([[0.285 , 0.5042, 0.2108],
        [0.3362, 0.2985, 0.3654],
        [0.2955, 0.5458, 0.1586],
        ...,
        [0.3683, 0.4712, 0.1605],
        [0.1762, 0.3351, 0.4887],
        [0.1762, 0.3351, 0.4887]]))

In [27]:
# Article 7
predict_bw(control7, experiment_t7)

(array([0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 2, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 1, 0, 0, 1, 2, 0, 2, 2, 2, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 2, 1, 2, 0, 2, 0, 0, 0, 2, 0,
        2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 2, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
        0, 2, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0,
        2, 0, 0, 0, 

In [28]:
# Article 8
predict_bw(control8, experiment_t8)

(array([0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 2, 1, 1, 1, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0]),
 array([[0.3333, 0.3333, 0.3333],
        [0.3333, 0.3333, 0.3333],
        [0.3935, 0.3957, 0.2108],
        [0.3333, 0.3333, 0.3333],
        [0.5547, 0.0545, 0.3908],
        [0.5547, 0.0545, 0.3908],
        [0.5547, 0.0545, 0.3908],
        [0.3333, 0.3333, 0.3333],
        [0.3935, 0.3957, 0.2108],
        [0.5547, 0.0545, 0.3908],
        [0.3935, 0.3957, 0.2108],
        [0.3333, 0.3333, 0.3333],
        [0.3799, 0.3524, 0.2678],
        [0.2474, 0.1931, 0.5595],
        [0.256 , 0.4071, 0.3369],
        [0.256 , 0.4071, 0.3369],
        [0.256 , 0.4071, 0.3369],
        [0.256 , 0.4071,