In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from vowpalwabbit import pyvw
from vowpalwabbit.DFtoVW import (
    DFtoVW,
    Feature,
    MulticlassLabel,
    MultiLabel,
    Namespace,
    SimpleLabel
)
import tensorboardX as tx
from datetime import datetime

In [2]:
from vowpalwabbit import DFtoVW as v
help(v)

Help on module vowpalwabbit.DFtoVW in vowpalwabbit:

NAME
    vowpalwabbit.DFtoVW

CLASSES
    builtins.object
        AttributeDescriptor
        DFtoVW
        Feature
        MultiLabel
        MulticlassLabel
        Namespace
        SimpleLabel
    
    class AttributeDescriptor(builtins.object)
     |  AttributeDescriptor(attribute_name, expected_type, min_value=None)
     |  
     |  This descriptor class add type and value checking informations to the _Col
     |  instance for future usage in the DFtoVW class. Indeed, the type and value checking
     |  can only be done once the dataframe is known (i.e in DFtoVW class). This descriptor
     |  class is used in the following managed class: SimpleLabel, MulticlassLabel, Feature, etc.
     |  
     |  Methods defined here:
     |  
     |  __init__(self, attribute_name, expected_type, min_value=None)
     |      Initialize an AttributeDescriptor instance
     |      
     |      Parameters
     |      ----------
     |      attri

In [3]:
df = pd.read_csv('./Iris.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [5]:
def categorical_to_int(specie):
    if specie == 'Iris-setosa':
        return 1
    elif specie == 'Iris-versicolor':
        return 2
    elif specie == 'Iris-virginica':
        return 3
    
df['Species'] = df['Species'].apply(categorical_to_int)
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,1
1,2,4.9,3.0,1.4,0.2,1
2,3,4.7,3.2,1.3,0.2,1
3,4,4.6,3.1,1.5,0.2,1
4,5,5.0,3.6,1.4,0.2,1


In [6]:
train, test = train_test_split(df, test_size=0.1)

In [7]:
print("Length of train:", len(train))
train

Length of train: 135


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
14,15,5.8,4.0,1.2,0.2,1
99,100,5.7,2.8,4.1,1.3,2
52,53,6.9,3.1,4.9,1.5,2
71,72,6.1,2.8,4.0,1.3,2
119,120,6.0,2.2,5.0,1.5,3
...,...,...,...,...,...,...
66,67,5.6,3.0,4.5,1.5,2
68,69,6.2,2.2,4.5,1.5,2
33,34,5.5,4.2,1.4,0.2,1
108,109,6.7,2.5,5.8,1.8,3


In [8]:
print("Length of test:", len(test))
test

Length of test: 15


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
7,8,5.0,3.4,1.5,0.2,1
107,108,7.3,2.9,6.3,1.8,3
148,149,6.2,3.4,5.4,2.3,3
123,124,6.3,2.7,4.9,1.8,3
45,46,4.8,3.0,1.4,0.3,1
138,139,6.0,3.0,4.8,1.8,3
35,36,5.0,3.2,1.2,0.2,1
36,37,5.5,3.5,1.3,0.2,1
93,94,5.0,2.3,3.3,1.0,2
48,49,5.3,3.7,1.5,0.2,1


In [9]:
train.Species.value_counts()

2    48
3    44
1    43
Name: Species, dtype: int64

In [10]:
test.Species.value_counts()

1    7
3    6
2    2
Name: Species, dtype: int64

In [11]:
target_col = "Species"
# features = [Feature(col) for col in df.columns if col != target_col]
features = [Feature(col) for col in df.columns if col != target_col and col != 'Id']
label = MulticlassLabel(label=target_col)
tag = 'Id'

df_to_vw = DFtoVW(df=df, features=features, label=label, tag=tag)

In [12]:
vw_formatted_data = df_to_vw.convert_df()
print("Total vw formatted strings:", len(vw_formatted_data))
vw_formatted_data

Total vw formatted strings: 150


['1 1| SepalLengthCm:5.1 SepalWidthCm:3.5 PetalLengthCm:1.4 PetalWidthCm:0.2',
 '1 2| SepalLengthCm:4.9 SepalWidthCm:3.0 PetalLengthCm:1.4 PetalWidthCm:0.2',
 '1 3| SepalLengthCm:4.7 SepalWidthCm:3.2 PetalLengthCm:1.3 PetalWidthCm:0.2',
 '1 4| SepalLengthCm:4.6 SepalWidthCm:3.1 PetalLengthCm:1.5 PetalWidthCm:0.2',
 '1 5| SepalLengthCm:5.0 SepalWidthCm:3.6 PetalLengthCm:1.4 PetalWidthCm:0.2',
 '1 6| SepalLengthCm:5.4 SepalWidthCm:3.9 PetalLengthCm:1.7 PetalWidthCm:0.4',
 '1 7| SepalLengthCm:4.6 SepalWidthCm:3.4 PetalLengthCm:1.4 PetalWidthCm:0.3',
 '1 8| SepalLengthCm:5.0 SepalWidthCm:3.4 PetalLengthCm:1.5 PetalWidthCm:0.2',
 '1 9| SepalLengthCm:4.4 SepalWidthCm:2.9 PetalLengthCm:1.4 PetalWidthCm:0.2',
 '1 10| SepalLengthCm:4.9 SepalWidthCm:3.1 PetalLengthCm:1.5 PetalWidthCm:0.1',
 '1 11| SepalLengthCm:5.4 SepalWidthCm:3.7 PetalLengthCm:1.5 PetalWidthCm:0.2',
 '1 12| SepalLengthCm:4.8 SepalWidthCm:3.4 PetalLengthCm:1.6 PetalWidthCm:0.2',
 '1 13| SepalLengthCm:4.8 SepalWidthCm:3.0 PetalL

In [None]:
def average_loss(sum_loss, weighted_examples):
    return sum_loss / weighted_examples

def since_last()

In [266]:
vw = pyvw.vw('--oaa 3 -P 1')

In [267]:
sum_loss_since_last = 0.
sum_loss = 0.
weighted_examples = 0.
for i in df.index:
    vw_format = vw_formatted_data[i]
    vw.learn(vw_format)
    
    sum_loss_since_last = vw.get_sum_loss() - sum_loss
    weighted_examples_since_last = weighted_examples  
        
    sum_loss= vw.get_sum_loss()
    weighted_examples = vw.get_weighted_examples()
    
    average_loss = (sum_loss/weighted_examples)
#     print('loss:',sum_loss, 'since_last:',sum_loss_since_last, 'wei:',weighted_examples, 'since_last:',weighted_examples_since_last)
#     print('loss:',sum_loss, 'since_last:',sum_loss_since_last)
#     print('wei:',weighted_examples, 'since_last:',weighted_examples_since_last)

    diff_weighted_examples = weighted_examples-weighted_examples_since_last
    if i in [0, 1, 3, 7, 15, 31, 63, 127]:
#     if diff_weighted_examples > 0:
        print( 'average_loss:{:.6f}'.format(average_loss) , end='\t')
        print('since_last:{:.6f}'.format((sum_loss_since_last/diff_weighted_examples)))

average_loss:0.000000	since_last:0.000000
average_loss:0.000000	since_last:0.000000
average_loss:0.000000	since_last:0.000000
average_loss:0.000000	since_last:0.000000
average_loss:0.000000	since_last:0.000000
average_loss:0.000000	since_last:0.000000
average_loss:0.031250	since_last:0.000000
average_loss:0.039062	since_last:0.000000


In [158]:
logdir = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")  # logs directory
file_writer = tx.SummaryWriter(logdir + "/test")   # creating file writer

for ind, iteration in zip(test.index, range(len(test))):
    vw_format = vw_formatted_data[ind]
    pipe_char_index = vw_format.index('|')
    vw_format_predict = vw_format[pipe_char_index:]
    
    prediction = vw.predict(vw_format_predict)
    print(vw_format[0], prediction)
#     print(vw_format[0] == prediction)
    if vw_format[0] == prediction:
        is_correct = 1
    else:
        is_correct = 0
    
    file_writer.add_scalar('correct_prediction', is_correct, iteration)   #  scalar value of ctr in this iteration

file_writer.close()

1 3
3 3
3 3
3 3
1 3
3 3
1 3
1 3
2 3
1 3
3 3
1 3
3 3
1 3
2 3


In [19]:
df_2 = pd.read_csv('Iris.csv')

In [20]:
df_2.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [22]:
df_2 = df_2.drop('Id', axis=1)
df_2.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [26]:
df_2.to_csv('~/Downloads/Iris-2.csv', index=False)

In [25]:
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SepalLengthCm  150 non-null    float64
 1   SepalWidthCm   150 non-null    float64
 2   PetalLengthCm  150 non-null    float64
 3   PetalWidthCm   150 non-null    float64
 4   Species        150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [88]:
x = 0
x, y = 1, x
print(x, y)

1 0


In [251]:
0.1234 > 0

True