In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from vowpalwabbit import pyvw
from vowpalwabbit.DFtoVW import (
    DFtoVW,
    Feature,
    MulticlassLabel,
    MultiLabel,
    Namespace,
    SimpleLabel
)
import tensorboardX as tx
from datetime import datetime

In [2]:
from vowpalwabbit import DFtoVW as v
help(v)

Help on module vowpalwabbit.DFtoVW in vowpalwabbit:

NAME
    vowpalwabbit.DFtoVW

CLASSES
    builtins.object
        AttributeDescriptor
        DFtoVW
        Feature
        MultiLabel
        MulticlassLabel
        Namespace
        SimpleLabel
    
    class AttributeDescriptor(builtins.object)
     |  AttributeDescriptor(attribute_name, expected_type, min_value=None)
     |  
     |  This descriptor class add type and value checking informations to the _Col
     |  instance for future usage in the DFtoVW class. Indeed, the type and value checking
     |  can only be done once the dataframe is known (i.e in DFtoVW class). This descriptor
     |  class is used in the following managed class: SimpleLabel, MulticlassLabel, Feature, etc.
     |  
     |  Methods defined here:
     |  
     |  __init__(self, attribute_name, expected_type, min_value=None)
     |      Initialize an AttributeDescriptor instance
     |      
     |      Parameters
     |      ----------
     |      attri

In [3]:
df = pd.read_csv('./Iris.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [5]:
# converting "Species" categorical column to integer
def categorical_to_int(specie):
    if specie == 'Iris-setosa':
        return 1
    elif specie == 'Iris-versicolor':
        return 2
    elif specie == 'Iris-virginica':
        return 3
    
df['Species'] = df['Species'].apply(categorical_to_int)
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,1
1,2,4.9,3.0,1.4,0.2,1
2,3,4.7,3.2,1.3,0.2,1
3,4,4.6,3.1,1.5,0.2,1
4,5,5.0,3.6,1.4,0.2,1


In [6]:
train, test = train_test_split(df, test_size=0.1)  # Splitting train test 

In [7]:
print("Length of train:", len(train))
train

Length of train: 135


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
60,61,5.0,2.0,3.5,1.0,2
142,143,5.8,2.7,5.1,1.9,3
67,68,5.8,2.7,4.1,1.0,2
125,126,7.2,3.2,6.0,1.8,3
72,73,6.3,2.5,4.9,1.5,2
...,...,...,...,...,...,...
137,138,6.4,3.1,5.5,1.8,3
41,42,4.5,2.3,1.3,0.3,1
83,84,6.0,2.7,5.1,1.6,2
112,113,6.8,3.0,5.5,2.1,3


In [8]:
print("Length of test:", len(test))
test

Length of test: 15


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
135,136,7.7,3.0,6.1,2.3,3
53,54,5.5,2.3,4.0,1.3,2
21,22,5.1,3.7,1.5,0.4,1
57,58,4.9,2.4,3.3,1.0,2
49,50,5.0,3.3,1.4,0.2,1
33,34,5.5,4.2,1.4,0.2,1
126,127,6.2,2.8,4.8,1.8,3
2,3,4.7,3.2,1.3,0.2,1
127,128,6.1,3.0,4.9,1.8,3
146,147,6.3,2.5,5.0,1.9,3


In [9]:
sum(train.Species.value_counts())

135

In [10]:
sum(test.Species.value_counts())

15

In [11]:
target_col = "Species"
# features = [Feature(col) for col in df.columns if col != target_col]  # target column not part of Features
features = [Feature(col) for col in df.columns if col != target_col and col != 'Id']  # 'Id' column also not part of Features along with target col
label = MulticlassLabel(label=target_col)  # target column is a multi class label column
tag = 'Id'

df_to_vw = DFtoVW(df=df, features=features, label=label, tag=tag)

In [12]:
vw_formatted_data = df_to_vw.convert_df()   # Converting dataframe to vw required string format
print("Total vw formatted strings:", len(vw_formatted_data))
vw_formatted_data

Total vw formatted strings: 150


['1 1| SepalLengthCm:5.1 SepalWidthCm:3.5 PetalLengthCm:1.4 PetalWidthCm:0.2',
 '1 2| SepalLengthCm:4.9 SepalWidthCm:3.0 PetalLengthCm:1.4 PetalWidthCm:0.2',
 '1 3| SepalLengthCm:4.7 SepalWidthCm:3.2 PetalLengthCm:1.3 PetalWidthCm:0.2',
 '1 4| SepalLengthCm:4.6 SepalWidthCm:3.1 PetalLengthCm:1.5 PetalWidthCm:0.2',
 '1 5| SepalLengthCm:5.0 SepalWidthCm:3.6 PetalLengthCm:1.4 PetalWidthCm:0.2',
 '1 6| SepalLengthCm:5.4 SepalWidthCm:3.9 PetalLengthCm:1.7 PetalWidthCm:0.4',
 '1 7| SepalLengthCm:4.6 SepalWidthCm:3.4 PetalLengthCm:1.4 PetalWidthCm:0.3',
 '1 8| SepalLengthCm:5.0 SepalWidthCm:3.4 PetalLengthCm:1.5 PetalWidthCm:0.2',
 '1 9| SepalLengthCm:4.4 SepalWidthCm:2.9 PetalLengthCm:1.4 PetalWidthCm:0.2',
 '1 10| SepalLengthCm:4.9 SepalWidthCm:3.1 PetalLengthCm:1.5 PetalWidthCm:0.1',
 '1 11| SepalLengthCm:5.4 SepalWidthCm:3.7 PetalLengthCm:1.5 PetalWidthCm:0.2',
 '1 12| SepalLengthCm:4.8 SepalWidthCm:3.4 PetalLengthCm:1.6 PetalWidthCm:0.2',
 '1 13| SepalLengthCm:4.8 SepalWidthCm:3.0 PetalL

In [13]:
def get_label(example, label_type):
    switch_label_type = {
        pyvw.vw.lDefault: None,
        pyvw.vw.lBinary: example.get_simplelabel_label,
        pyvw.vw.lMulticlass: example.get_multiclass_label,
        pyvw.vw.lCostSensitive: example.get_costsensitive_class,
        pyvw.vw.lContextualBandit: example.get_cbandits_class
#         pyvw.vw.lConditionalContextualBandit:
#         pyvw.vw.lSlates:
#         pyvw.vw.lContinuous:
    }
    return switch_label_type[label_type]()

def calculate_average_loss(sum_loss, weighted_examples):
    try:
        return sum_loss / weighted_examples
    
    except ZeroDivisionError:
        return 0.

def calculate_since_last(sum_loss_since_last, weighted_examples_since_last):
    try:
        return sum_loss_since_last / weighted_examples_since_last
    
    except ZeroDivisionError:
        return 0.

In [17]:
vw = pyvw.vw('--oaa 3 -P 1')  # -oaa is One Agent All algo for multi class problem (seems supervised) -P 1 outputs metrics for each example

In [18]:
logdir = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")  # logs directory
file_writer = tx.SummaryWriter(logdir+'/iris')   # creating file writer
sum_loss = 0.
weighted_examples = 0.

for ind, iteration in zip(df.index, range(len(df))):
    vw_format = vw_formatted_data[ind]   # get the string format of specific example
    example = vw.parse(vw_format) 
    vw.learn(example)

    label = get_label(example, vw.get_label_type())
    prediction = pyvw.get_prediction(example, vw.get_prediction_type())
    features = example.get_feature_number()    

    vw.finish_example(example)  # Any use of vw object should be done after this

    sum_loss_since_last = vw.get_sum_loss() - sum_loss  # vw.get_sum_loss() return current sum loss, sum_loss variable right now holds sum loss of previous iteration
    weighted_examples_since_last = vw.get_weighted_examples() - weighted_examples  # vw.get_weighted_examples() return current weighted examples(sum),  weighted_examples variable right now holds weighted examples of previous iteration
        
    sum_loss= vw.get_sum_loss()  # Now sum_loss no longer hold previous iteration's sum_loss
    weighted_examples = vw.get_weighted_examples()  # Now weighted_examples no longer hold previous iteration's weighted examples
    
    average_loss = calculate_average_loss(sum_loss, weighted_examples)
    since_last = calculate_since_last(sum_loss_since_last, weighted_examples_since_last)
    
    print( 'average_loss:{:.6f}'.format(average_loss) , end='\t')
    print('since_last:{:.6f}'.format(since_last), end='\t')
    print('label:', label, end='\t')    
    print('prediction:', prediction, end='\t')    
    print('features:', features)  
    
    file_writer.add_scalar('average_loss', average_loss, iteration)  # logging average_loss on each iteration
    file_writer.add_scalar('since_last', since_last, iteration)   # logging since_last on each iteration
#     file_writer.add_scalar('label' , label, iteration)
#     file_writer.add_scalar('prediction', prediction, iteration)
#     file_writer.add_histogram('label-prediction', [label, prediction], iteration)
# #     file_writer.add_histogram('label', label, iteration)
# #     file_writer.add_histogram('prediction', prediction, iteration)
#     file_writer.add_scalar('features', features, iteration)

average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	features: 5

In [16]:
# logdir = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")  # logs directory
# file_writer = tx.SummaryWriter(logdir + "/iris-train")   # creating file writer
# sum_loss = 0.
# weighted_examples = 0.

# for ind, iteration in zip(train.index, range(len(df))):
#     vw_format = vw_formatted_data[ind]
#     vw.learn(vw_format)
    
#     sum_loss_since_last = vw.get_sum_loss() - sum_loss   
#     weighted_examples_since_last = vw.get_weighted_examples() - weighted_examples  
        
#     sum_loss= vw.get_sum_loss()
#     weighted_examples = vw.get_weighted_examples()
    
#     average_loss = calculate_average_loss(sum_loss, weighted_examples)
#     since_last = calculate_since_last(sum_loss_since_last, weighted_examples_since_last)

#     print( 'average_loss:{:.6f}'.format(average_loss) , end='\t')
#     print('since_last:{:.6f}'.format(since_last))
#     file_writer.add_scalar('average_loss', average_loss, iteration)
#     file_writer.add_scalar('since_last', since_last, iteration)

In [17]:
logdir = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")  # logs directory
file_writer = tx.SummaryWriter(logdir + "/test")   # creating file writer

for ind, iteration in zip(test.index, range(len(test))):
    vw_format = vw_formatted_data[ind]
    pipe_char_index = vw_format.index('|')
    vw_format_predict = vw_format[pipe_char_index:]
    
    prediction = vw.predict(vw_format_predict)
    print(vw_format[0], prediction)
#     print(vw_format[0] == prediction)
    if vw_format[0] == prediction:
        is_correct = 1
    else:
        is_correct = 0
    
    file_writer.add_scalar('correct_prediction', is_correct, iteration)   #  scalar value of ctr in this iteration

file_writer.close()

3 3
1 3
2 3
2 3
1 3
2 3
1 3
3 3
2 3
3 3
2 3
2 3
3 3
1 3
1 3


In [18]:
from DFtoVWtoTensorboard import DFtoVWtoTensorboard
to_tensorboard = DFtoVWtoTensorboard(df=df, df_to_vw=df_to_vw)

vw2 = pyvw.vw('--oaa 3 -P 1')  # -oaa is One Agent All algo for multi class problem (seems supervised) -P 1 outputs metrics for each example
to_tensorboard.fit(vw2, tensorboard=True)

average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.000000	since_

In [11]:
df_simple = pd.DataFrame({
        "f1": [1, 2, 3],
        "f2": [1.2, 2.3, 3.4],
        "f3": [0.2, 0.8, .01],
        "l": [1, 2, 3],
})

label = MulticlassLabel(label="l")
features = [Feature(col) for col in ["f1", "f2", "f3"]]

df_to_vw = DFtoVW(df=df_simple, label=label, features=features)
vw = pyvw.vw('--oaa 3 -P 1')
vw_formatted_data = df_to_vw.convert_df()

In [12]:
logdir = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")  # logs directory
file_writer = tx.SummaryWriter(logdir + "/iris")   # creating file writer
sum_loss = 0.
weighted_examples = 0.

for ind, iteration in zip(df_simple.index, range(len(df_simple))):
    print(ind, iteration)
    vw_format = vw_formatted_data[ind]   # get the string format of specific example
    example = vw.parse(vw_format) 
    vw.learn(example)

    label = get_label(example, vw.get_label_type())
    prediction = pyvw.get_prediction(example, vw.get_prediction_type())
    features = example.get_feature_number()    

    vw.finish_example(example)  # Any use of vw object should be done after this

    sum_loss_since_last = vw.get_sum_loss() - sum_loss  # vw.get_sum_loss() return current sum loss, sum_loss variable right now holds sum loss of previous iteration
    weighted_examples_since_last = vw.get_weighted_examples() - weighted_examples  # vw.get_weighted_examples() return current weighted examples(sum),  weighted_examples variable right now holds weighted examples of previous iteration
        
    sum_loss= vw.get_sum_loss()  # Now sum_loss no longer hold previous iteration's sum_loss
    weighted_examples = vw.get_weighted_examples()  # Now weighted_examples no longer hold previous iteration's weighted examples
    
    average_loss = calculate_average_loss(sum_loss, weighted_examples)
    since_last = calculate_since_last(sum_loss_since_last, weighted_examples_since_last)
    
    print( 'average_loss:{:.6f}'.format(average_loss) , end='\t')
    print('since_last:{:.6f}'.format(since_last), end='\t')
    print('label:', label, end='\t')    
    print('prediction:', prediction, end='\t')    
    print('features:', features)  
    
    file_writer.add_scalar('average_loss', average_loss, iteration)  # logging average_loss on each iteration
    file_writer.add_scalar('since_last', since_last, iteration)   # logging since_last on each iteration
#     file_writer.add_scalar('label' , label, iteration)
#     file_writer.add_scalar('prediction', prediction, iteration)
#     file_writer.add_histogram('label-prediction', [label, prediction], iteration)
# #     file_writer.add_histogram('label', label, iteration)
# #     file_writer.add_histogram('prediction', prediction, iteration)
#     file_writer.add_scalar('features', features, iteration)

0 0
average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	features: 4
1 1
average_loss:0.500000	since_last:1.000000	label: 2	prediction: 1	features: 4
2 2
average_loss:0.666667	since_last:1.000000	label: 3	prediction: 2	features: 4
