In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from vowpalwabbit import pyvw
from vowpalwabbit.DFtoVW import (
    DFtoVW,
    Feature,
    MulticlassLabel,
    SimpleLabel
)
# import tensorboardX as tx
# from datetime import datetime
from vowpalwabbit.DFtoVWtoTensorboard import VWtoTensorboard, DFtoVWtoTensorboard

In [2]:
df = pd.read_csv('./Iris.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [4]:
# converting "Species" categorical column to integer
def categorical_to_int(specie):
    if specie == 'Iris-setosa':
        return 1
    elif specie == 'Iris-versicolor':
        return 2
    elif specie == 'Iris-virginica':
        return 3
    
df['Species'] = df['Species'].apply(categorical_to_int)
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,1
1,2,4.9,3.0,1.4,0.2,1
2,3,4.7,3.2,1.3,0.2,1
3,4,4.6,3.1,1.5,0.2,1
4,5,5.0,3.6,1.4,0.2,1


In [5]:
# Splitting train test
train, test = train_test_split(df, test_size=0.1)   

In [6]:
print("Length of test:", len(train))
test

Length of test: 135


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
126,127,6.2,2.8,4.8,1.8,3
117,118,7.7,3.8,6.7,2.2,3
76,77,6.8,2.8,4.8,1.4,2
70,71,5.9,3.2,4.8,1.8,2
79,80,5.7,2.6,3.5,1.0,2
45,46,4.8,3.0,1.4,0.3,1
97,98,6.2,2.9,4.3,1.3,2
61,62,5.9,3.0,4.2,1.5,2
58,59,6.6,2.9,4.6,1.3,2
80,81,5.5,2.4,3.8,1.1,2


In [7]:
print("Length of test:", len(test))
test

Length of test: 15


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
126,127,6.2,2.8,4.8,1.8,3
117,118,7.7,3.8,6.7,2.2,3
76,77,6.8,2.8,4.8,1.4,2
70,71,5.9,3.2,4.8,1.8,2
79,80,5.7,2.6,3.5,1.0,2
45,46,4.8,3.0,1.4,0.3,1
97,98,6.2,2.9,4.3,1.3,2
61,62,5.9,3.0,4.2,1.5,2
58,59,6.6,2.9,4.6,1.3,2
80,81,5.5,2.4,3.8,1.1,2


In [8]:
target_col = "Species"

# features = [Feature(col) for col in df.columns if col != target_col]  # target column not part of Features
features = [Feature(col) for col in df.columns if col != target_col and col != 'Id']  # 'Id' column also not part of Features along with target col
label = MulticlassLabel(label=target_col)  # target column is a multi class label column
tag = 'Id'

df_to_vw = DFtoVW(df=df, features=features, label=label, tag=tag)

In [9]:
vw = pyvw.vw('--oaa 3 -P 1')  # -oaa is One Agent All algo for multi class problem (seems supervised) -P 1 outputs metrics for each example

In [13]:
# Before running this run command `rm -rf ./logs` in this directory to remove any previous logs for tensorboard
logdir = 'logs/scalars'
vw_to_tb = VWtoTensorboard(logdir)

df_to_tb = DFtoVWtoTensorboard(df_to_vw, vw)
df_to_tb.fit(train, vw_to_tb)  # If VWtoTensorboard object passed as parameter then run in the current directory command `tensorboard --logdir ./logs`

average_loss:0.039735	since_last:0.039735	label: 1	prediction: 3	num_features: 5
average_loss:0.046053	since_last:1.000000	label: 1	prediction: 3	num_features: 5
average_loss:0.052288	since_last:1.000000	label: 1	prediction: 3	num_features: 5
average_loss:0.051948	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.051613	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.051282	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.050955	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.050633	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.050314	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.050000	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.049689	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.049383	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.049080	since_

average_loss:0.056738	since_last:0.000000	label: 3	prediction: 3	num_features: 5
average_loss:0.056537	since_last:0.000000	label: 3	prediction: 3	num_features: 5
average_loss:0.056338	since_last:0.000000	label: 3	prediction: 3	num_features: 5
average_loss:0.056140	since_last:0.000000	label: 3	prediction: 3	num_features: 5
average_loss:0.055944	since_last:0.000000	label: 3	prediction: 3	num_features: 5
average_loss:0.055749	since_last:0.000000	label: 3	prediction: 3	num_features: 5
average_loss:0.055556	since_last:0.000000	label: 3	prediction: 3	num_features: 5
average_loss:0.055363	since_last:0.000000	label: 3	prediction: 3	num_features: 5
average_loss:0.055172	since_last:0.000000	label: 3	prediction: 3	num_features: 5
average_loss:0.054983	since_last:0.000000	label: 3	prediction: 3	num_features: 5
average_loss:0.054795	since_last:0.000000	label: 3	prediction: 3	num_features: 5
average_loss:0.054608	since_last:0.000000	label: 3	prediction: 3	num_features: 5
average_loss:0.054422	since_

In [11]:
df_simple = pd.DataFrame({
        "f1": [1, 2, 3],
        "f2": [1.2, 2.3, 3.4],
        "f3": [0.2, 0.8, .01],
        "l": [1, 2, 3],
})

label = MulticlassLabel(label="l")
features = [Feature(col) for col in ["f1", "f2", "f3"]]

df_to_vw = DFtoVW(df=df_simple, label=label, features=features)
vw = pyvw.vw('--oaa 3 -P 1')

df_to_tb = DFtoVWtoTensorboard(df_to_vw, vw)
df_to_tb.fit(df_simple)  

average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	num_features: 4
average_loss:0.500000	since_last:1.000000	label: 2	prediction: 1	num_features: 4
average_loss:0.666667	since_last:1.000000	label: 3	prediction: 2	num_features: 4


In [12]:
for i, x in zip(df_simple.index, range(len(df_simple))):
    print(i, x)

0 0
1 1
2 2
