In [1]:
import pandas as pd
import numpy as np
from xlearn._sklearn import write_data_to_xlearn_format
# from utils import _convert_to_ffm

In [2]:
# Toy click-through sample: a 0/1 `Click` column, one numeric column and two
# categorical (string) columns, three rows each.
data = dict(
    Click=[0, 1, 0],
    Numeric=[10, 20, 80],
    Advertiser=['Nike', 'ESPN', 'Nike'],
    Publisher=['CNN', 'BBC', 'BBC'],
)

In [3]:
# Turn the column dictionary into a DataFrame and display it.
df = pd.DataFrame(data=data)
df

Unnamed: 0,Click,Numeric,Advertiser,Publisher
0,0,10,Nike,CNN
1,1,20,ESPN,BBC
2,0,80,Nike,BBC


In [4]:
# Keep every column except the first one (`Click`), then display the result.
X = df[df.columns[1:]]
X

Unnamed: 0,Numeric,Advertiser,Publisher
0,10,Nike,CNN
1,20,ESPN,BBC
2,80,Nike,BBC


In [5]:
# The remaining original column names are the "fields"; keep them as a
# plain Python list so they can be enumerated later.
field_cols = X.columns.tolist()
field_cols

['Numeric', 'Advertiser', 'Publisher']

## Generating Features and Values

In [7]:
# One-hot encode the string columns of `df`; numeric columns pass through
# unchanged. `dtype=int` pins the indicator columns to 0/1 integers:
# pandas >= 2.0 defaults to bool indicators, which would make the later
# `df_dummy.to_numpy()` an object array mixing ints and bools instead of
# the plain integer matrix shown below.
df_dummy = pd.get_dummies(df, dtype=int)
df_dummy

Unnamed: 0,Click,Numeric,Advertiser_ESPN,Advertiser_Nike,Publisher_BBC,Publisher_CNN
0,0,10,0,1,0,1
1,1,20,1,0,1,0
2,0,80,0,1,1,0


In [8]:
# Drop the first column (`Click`) from the encoded frame so that only the
# feature columns remain, then display them.
X_dummy = df_dummy[df_dummy.columns[1:]]
X_dummy

Unnamed: 0,Numeric,Advertiser_ESPN,Advertiser_Nike,Publisher_BBC,Publisher_CNN
0,10,0,1,0,1
1,20,1,0,1,0
2,80,0,1,1,0


In [9]:
# Column index of the encoded feature frame, plus the same names as a list.
features = X_dummy.columns
features_cols = features.tolist()
features_cols

['Numeric',
 'Advertiser_ESPN',
 'Advertiser_Nike',
 'Publisher_BBC',
 'Publisher_CNN']

In [13]:
from difflib import SequenceMatcher
def similar(a, b):
    """Return the difflib similarity ratio of two sequences.

    Args:
        a: First sequence (used here with column-name strings).
        b: Second sequence.

    Returns:
        float: ``SequenceMatcher(None, a, b).ratio()`` -- 1.0 for identical
        sequences, 0.0 when nothing matches.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

In [14]:
# Map every encoded feature column to the index of the field (original
# column) it came from, producing exactly one entry per feature, in feature
# column order.
#
# Matching is done on names rather than fuzzy string similarity: a feature
# belongs to a field when it IS that column (numeric pass-through) or when it
# is a dummy column named "<field>_<category>" (the default `get_dummies`
# naming). A similarity threshold can silently drop a feature whose category
# text is long, or assign one feature to several fields.
fields = []
for feat_col in features_cols:
    candidates = [
        index
        for index, field_col in enumerate(field_cols)
        if feat_col == field_col or feat_col.startswith(field_col + '_')
    ]
    if not candidates:
        raise ValueError(
            'no field found for feature column {!r}'.format(feat_col)
        )
    # If one field name is a prefix of another, prefer the longest
    # (most specific) field name; an exact match is always the longest.
    fields.append(max(candidates, key=lambda index: len(field_cols[index])))

[0, 1, 1, 2, 2]

In [23]:
# Display the feature-to-field index mapping.
fields

array([0, 1, 1, 2, 2])

In [None]:
# data = {
#     'Click' : [0, 1, 0],
#     'Numeric': [10, 20, 80],
#     'Advertiser-Nike': [1, 0, 1],
#     'Publisher-CNN': [1, 0, 0],
#     'Advertiser-ESPN': [0, 1, 0],
#     'Publisher-BBC': [0, 1, 1]
# }

In [None]:
# df = pd.DataFrame(data)
# df

## Example: the first row

| Field   | Numeric | Advertiser     | Advertiser     | Publisher     | Publisher |
|---------|---------|----------------|---------------|-----------------|-----------|
|         | 0       | 1              | 1             | 2               | 2         |
| Feature | Numeric | Advertiser-ESPN | Advertiser-Nike | Publisher-BBC | Publisher-CNN |
|         | 0       | 1              | 2             | 3               | 4         |
| Value   | --      | --             | --            | --              | --        |
|         | 10      | 0              | 1             | 0               | 1         |

## LIBFFM line required for the first row

Full form, with every feature written out:

`0 0:0:10 1:1:0 1:2:1 2:3:0 2:4:1`

Compact form, omitting the categorical features whose value is 0:

`0 0:0:10 1:2:1 2:4:1`

In [15]:
# Convert the encoded frame (first column `Click`, then the features) into a
# NumPy array and display it. `copy=False` is the default, spelled out.
data = df_dummy.to_numpy(copy=False)
data

array([[ 0, 10,  0,  1,  0,  1],
       [ 1, 20,  1,  0,  1,  0],
       [ 0, 80,  0,  1,  1,  0]])

In [16]:
# Split the array after its first column: `y` keeps column 0 as a 2-D
# single-column array, `X` holds all remaining columns.
y, X = np.split(data, [1], axis=1)

In [17]:
# Display the feature matrix (every column of `data` after the first).
X

array([[10,  0,  1,  0,  1],
       [20,  1,  0,  1,  0],
       [80,  0,  1,  1,  0]])

In [18]:
# Display the first column of `data`, kept as a single-column 2-D array.
y

array([[0],
       [1],
       [0]])

In [21]:
# Turn the list of field indices into an integer NumPy array.
fields = np.array(fields, dtype=np.int_)

In [22]:
# NOTE(review): `data` is rebound to the raw, un-encoded frame here but is not
# read again in the visible cells -- looks like leftover code; confirm before removing.
data = df.to_numpy()

# Write the encoded features `X`, the labels `y` and the per-column field
# indices to 'sample_ffm_data.txt'. Presumably passing `fields` selects the
# libffm `field:feature:value` layout -- confirm against the xlearn helper.
write_data_to_xlearn_format(X=X, y=y, filepath='sample_ffm_data.txt', fields=fields)

# LIBFFM Format Data
#### `label  <field>:<feature>:<value>  <field>:<feature>:<value> <field>:<feature>:<value> ...`

## Saved data

```
0 0:0:10 1:2:1 2:4:1
1 0:0:20 1:1:1 2:3:1
0 0:0:80 1:2:1 2:3:1
```