# [Quick Practice] 
-----
-----
## Learning Process - Factorization Machine
#### Using xlearn python-wrapper module (https://xlearn-doc.readthedocs.io/en/latest/)

- Using Titanic dataset. (https://www.kaggle.com/c/titanic)
- Original paper (Steffen Rendle, https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
- Basic conceptual description of FM, in Korean (http://yamalab.tistory.com/107)

-----
-----
## Step 1 : transform features (to FM's format)
- train/test split on dataframe

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [8]:
# Locations of the two Titanic CSV files that are loaded and stacked below.
df1_path, df2_path = (
    "../dataset/titanic_dataset.csv",
    "../dataset/titanic_answer.csv",
)

In [9]:
# Load both CSV files and stack their rows into a single frame.
df1 = pd.read_csv(df1_path)
df2 = pd.read_csv(df2_path)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and, like append, keeps each frame's original row index.
df = pd.concat([df1, df2])

In [10]:
# Show the (rows, columns) size of the combined frame, then preview it.
print((len(df.index), len(df.columns)))
df.head(5)

(1309, 13)


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,body,home.dest
0,2,1,"Mellinger, Miss. Madeleine Violet",female,13.0,0,1,250644,19.5,,S,,"England / Bennington, VT"
1,2,1,"Wells, Miss. Joan",female,4.0,1,1,29103,23.0,,S,,"Cornwall / Akron, OH"
2,2,1,"Duran y More, Miss. Florentina",female,30.0,1,0,SC/PARIS 2148,13.8583,,C,,"Barcelona, Spain / Havana, Cuba"
3,3,0,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q,,
4,3,1,"Bradley, Miss. Bridget Delia",female,22.0,0,0,334914,7.725,,Q,,"Kingwilliamstown, Co Cork, Ireland Glens Falls..."


In [11]:
def age_discretize(x):
    """Map an age in years to a decade-bucket label.

    Parameters
    ----------
    x : number, NaN or None
        Age in years. Fractional ages are truncated with ``int``.

    Returns
    -------
    str
        ``'1'`` for ages below 10, ``'2'`` for 10-19, ..., ``'9'`` for 80-89,
        and ``'10'`` both for ages of 90 or more and for missing ages.
    """
    # NaN never compares equal to anything (itself included), so the former
    # `x == np.nan` test was always False and a missing age crashed in int().
    if pd.isna(x):
        return '10'
    years = int(x)
    if years < 10:
        return '1'
    if years >= 90:
        return '10'
    # 10-19 -> '2', 20-29 -> '3', ..., 80-89 -> '9'
    return str(years // 10 + 1)

def fare_discretize(x):
    """Return the fare-bucket label for ``x``.

    Buckets are 10 wide: below 10 gives ``'1'``, 10 up to (not including) 20
    gives ``'2'``, ..., 80 up to 90 gives ``'9'``. Any value that is not below
    90 falls through to ``'10'``.
    """
    # Walk the upper bounds in ascending order; the first bound above x wins.
    for upper in range(10, 100, 10):
        if x < upper:
            return str(upper // 10)
    return '10'

-----
-----
- feature discretize

In [12]:
# Keep the label plus the six model features, then discard any row that has
# a missing value in one of them.
df = (
    df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']]
    .dropna()
)

In [13]:
# Recode the raw columns as string labels: sex becomes a '1'/'0' flag
# ('1' for "female"), age and fare become bucket labels.
df['sex'] = df['sex'].apply(lambda sex: '0' if sex != "female" else '1')
df['age'] = df['age'].apply(age_discretize)
df['fare'] = df['fare'].apply(lambda fare: fare_discretize(int(fare)))

In [14]:
# Store the label and the remaining numeric columns as strings, like the
# already-discretized sex / age / fare columns.
df = df.astype({
    'survived': 'str',
    'pclass': 'str',
    'sibsp': 'str',
    'parch': 'str',
})

In [15]:
# Preview the frame after discretization and string conversion.
df.head(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare
0,1,2,1,2,0,1,2
1,1,2,1,1,1,1,3
2,1,2,1,4,1,0,2
4,1,3,1,3,0,0,1
6,1,1,1,4,0,0,6


- feature encoding (each feature's discretized values must be in the range 1 ~ n)

In [16]:
# make discretized label to 1~n dict
def make_encoding_label_dict(col_unique):
    """Assign consecutive 1-based codes to the labels in ``col_unique``.

    The first label gets code 1, the second code 2, and so on. If a label
    occurs more than once, the code of its last occurrence is kept.
    """
    return {label: code for code, label in enumerate(col_unique, start=1)}


# get new encoding result
def get_newcode(key, label_dict):
    """Look up the 1-based code of ``key`` in ``label_dict``.

    Parameters
    ----------
    key : hashable scalar
        The discretized label to encode.
    label_dict : dict
        Mapping from label to code, as built by ``make_encoding_label_dict``.

    Returns
    -------
    int or None
        ``None`` when ``key`` is missing (NaN / None), the mapped code when
        ``key`` is known, otherwise ``len(label_dict) + 1`` as a shared code
        for unseen labels.
    """
    # `key == np.nan` is always False (NaN is not equal to itself), so the
    # missing-value branch was unreachable; pd.isna performs the real check.
    if pd.isna(key):
        return None
    return label_dict.get(key, len(label_dict) + 1)

In [17]:
# Give every distinct sibsp / parch label a 1-based code, numbered in the
# order value_counts() lists them.
encoded_sibsp_dict = make_encoding_label_dict(list(df['sibsp'].value_counts().index))
encoded_parch_dict = make_encoding_label_dict(list(df['parch'].value_counts().index))

In [18]:
# Replace each sibsp / parch label with its code from the matching dict.
df['sibsp'] = df['sibsp'].apply(get_newcode, args=(encoded_sibsp_dict,))
df['parch'] = df['parch'].apply(get_newcode, args=(encoded_parch_dict,))

In [19]:
# Preview the frame after sibsp / parch have been re-encoded.
df.head(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare
0,1,2,1,2,1,2,2
1,1,2,1,1,2,2,3
2,1,2,1,4,2,1,2
4,1,3,1,3,1,1,1
6,1,1,1,4,1,1,6


In [20]:
# Count how many rows carry each sex flag.
df['sex'].value_counts()

0    658
1    388
Name: sex, dtype: int64

-----
-----
### save as xlearn's fm input type
###### output = label index_1:value_1 index_2:value_2 ... index_n:value_n

- make dicts for sparse marking

In [21]:
# Number of feature slots reserved for each column. A width of 1 marks a
# column that the writer cells treat as a single flag slot.
# NOTE(review): age_discretize can return '10', which needs a 10th slot; with
# a width of 9 that bucket would land on the same index as the first sibsp
# slot -- confirm no such age reaches this point.
col_len_dict = {'pclass': 3, 'sex': 1, 'age': 9, 'sibsp': 7, 'parch': 7, 'fare': 10}

# Starting index of each column's slots: the running sum of the widths of
# the columns that come before it (dict insertion order).
col_accum_index_dict = {}
cumulative = 0
for key, value in col_len_dict.items():
    col_accum_index_dict[key] = cumulative
    cumulative = cumulative + value

# Seed the shuffle so the 80/20 split, and every metric computed from it,
# is the same on each run.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

- mark idx:val, with sparse matrix format

In [22]:
# Write the training split, one row per line, as
#   label index_1:value_1 index_2:value_2 ...
# The `with` block closes the file even if a row fails to encode.
with open('../dataset/train.txt', 'w') as txt_file:
    for idx, row in train_df.iterrows():
        vec = [str(row['survived'])]
        for key, value in row.drop(labels=['survived']).items():
            if col_len_dict[key] == 1:
                # Single-slot column: written only when it is not '0',
                # carrying its own value.
                if value != '0':
                    vec.append(str(col_accum_index_dict[key]) + ":" + str(value))
            else:
                # Multi-slot column: code v (1-based) switches on the slot
                # at offset + v - 1 with value 1.
                col_idx = col_accum_index_dict[key] + (int(value) - 1)
                vec.append(str(col_idx) + ":" + str(1))
        txt_file.write("%s\n" % " ".join(vec))

In [23]:
# Write the test split, one row per line, as
#   label index_1:value_1 index_2:value_2 ...
# The `with` block closes the file even if a row fails to encode.
with open('../dataset/test.txt', 'w') as txt_file:
    for idx, row in test_df.iterrows():
        vec = [str(row['survived'])]
        for key, value in row.drop(labels=['survived']).items():
            if col_len_dict[key] == 1:
                # Single-slot column: written only when it is not '0',
                # carrying its own value.
                if value != '0':
                    vec.append(str(col_accum_index_dict[key]) + ":" + str(value))
            else:
                # Multi-slot column: code v (1-based) switches on the slot
                # at offset + v - 1 with value 1.
                col_idx = col_accum_index_dict[key] + (int(value) - 1)
                vec.append(str(col_idx) + ":" + str(1))
        txt_file.write("%s\n" % " ".join(vec))

-----
-----
## Step 2 : Train Factorization Machine
- test accuracy/auc is just fine. process works well!

In [24]:
import xlearn as xl

def test():
    """Train an FM model on the exported files and score the test file.

    Reads ``../dataset/train.txt`` and ``../dataset/test.txt``, stores the
    trained model in ``./model.out`` (with a text copy requested at
    ``./model.txt``) and writes the predictions to ``./output.txt``.
    """
    fm_model = xl.create_fm()

    train_path = '../dataset/train.txt'
    test_path = '../dataset/test.txt'

    fm_model.setTrain(train_path)
    # NOTE: the validation file is the same file that is predicted on below.
    fm_model.setValidate(test_path)
    # Request the text copy before fit(): xlearn's documented usage sets it
    # ahead of training; calling it after fit() came too late for that run.
    fm_model.setTXTModel('./model.txt')

    # Parameters:
    param = {'task': 'binary',
             'epoch': 10,
             'lr': 0.2,
             'lambda': 0.002,
             'metric': 'auc'}

    # Start to train
    # The trained model will be stored in model.out
    fm_model.fit(param, './model.out')

    # Prediction task
    fm_model.setTest(test_path)  # Set the path of test dataset
    fm_model.setSigmoid()        # Convert output to 0-1

    # Start to predict
    # The output result will be stored in output.txt
    fm_model.predict("./model.out", "./output.txt")

In [25]:
# Train the FM model and write its predictions for the test file to ./output.txt.
test()

In [26]:
# [------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
# [   10%      ]     1            0.585323            0.517724            0.861508                0.07
# [   20%      ]     2            0.532670            0.479222            0.878329                0.03
# [   30%      ]     3            0.514772            0.464327            0.884684                0.03
# [   40%      ]     4            0.505971            0.457235            0.886833                0.03
# [   50%      ]     5            0.500067            0.451201            0.889263                0.03
# [   60%      ]     6            0.495272            0.446077            0.890384                0.03
# [   70%      ]     7            0.490828            0.445277            0.890571                0.03
# [   80%      ]     8            0.489121            0.440604            0.891973                0.03
# [   90%      ]     9            0.487465            0.440109            0.891879                0.03
# [  100%      ]    10            0.485841            0.437609            0.894309                0.03

-----
-----
## Step 3 : FFM (Field aware Factorization Machine) Practice
- Using xlearn module, train ffm model same as fm.

-----
### The input format is different from that of FM or LM.
###### output = label field_1:index_1:value_1 field_2:index_2:value_2 ...

In [27]:
# Field id (as a string) for each named feature group.
field_dict = dict(Financial="0", Demography="1", Family="2")

# Which group each feature column belongs to.
mapping_dict = {
    column: group
    for group, columns in (
        ("Financial", ("pclass", "fare")),
        ("Demography", ("sex", "age")),
        ("Family", ("sibsp", "parch")),
    )
    for column in columns
}

# Number of feature slots reserved per column, and the index at which each
# column's slots start (running sum of the preceding widths).
col_len_dict = dict(pclass=3, sex=1, age=9, sibsp=7, parch=7, fare=10)
col_accum_index_dict = {}
cumulative = 0
for key, value in col_len_dict.items():
    col_accum_index_dict.update({key: cumulative})
    cumulative += value

In [28]:
# Write the training split, one row per line, as
#   label field_1:index_1:value_1 field_2:index_2:value_2 ...
# The `with` block closes the file even if a row fails to encode.
with open('../dataset/train_ffm.txt', 'w') as txt_file:
    for idx, row in train_df.iterrows():
        vec = [str(row['survived'])]
        for key, value in row.drop(labels=['survived']).items():
            field_id = field_dict[mapping_dict[key]]
            if col_len_dict[key] == 1:
                # Single-slot column: written only when it is not '0',
                # carrying its own value.
                if value != '0':
                    vec.append(field_id + ":" + str(col_accum_index_dict[key]) + ":" + str(value))
            else:
                # Multi-slot column: code v (1-based) switches on the slot
                # at offset + v - 1 with value 1.
                col_idx = col_accum_index_dict[key] + (int(value) - 1)
                vec.append(field_id + ":" + str(col_idx) + ":" + str(1))
        txt_file.write("%s\n" % " ".join(vec))

In [29]:
# Write the test split, one row per line, as
#   label field_1:index_1:value_1 field_2:index_2:value_2 ...
# The `with` block closes the file even if a row fails to encode.
with open('../dataset/test_ffm.txt', 'w') as txt_file:
    for idx, row in test_df.iterrows():
        vec = [str(row['survived'])]
        for key, value in row.drop(labels=['survived']).items():
            field_id = field_dict[mapping_dict[key]]
            if col_len_dict[key] == 1:
                # Single-slot column: written only when it is not '0',
                # carrying its own value.
                if value != '0':
                    vec.append(field_id + ":" + str(col_accum_index_dict[key]) + ":" + str(value))
            else:
                # Multi-slot column: code v (1-based) switches on the slot
                # at offset + v - 1 with value 1.
                col_idx = col_accum_index_dict[key] + (int(value) - 1)
                vec.append(field_id + ":" + str(col_idx) + ":" + str(1))
        txt_file.write("%s\n" % " ".join(vec))

-----
-----
- train FFM model : it works well too. 
- FFM models work better than FM if the fields are clearly separated

In [30]:
import xlearn as xl

def test2():
    """Train an FFM model on the exported files and score the test file.

    Reads ``../dataset/train_ffm.txt`` and ``../dataset/test_ffm.txt``,
    stores the trained model in ``./model.out`` (with a text copy requested
    at ``./model.txt``) and writes the predictions to ``./output.txt``.
    """
    ffm_model = xl.create_ffm()

    train_path = '../dataset/train_ffm.txt'
    test_path = '../dataset/test_ffm.txt'

    ffm_model.setTrain(train_path)
    # NOTE: the validation file is the same file that is predicted on below.
    ffm_model.setValidate(test_path)
    # Request the text copy before fit(): xlearn's documented usage sets it
    # ahead of training; calling it after fit() came too late for that run.
    ffm_model.setTXTModel('./model.txt')

    # Parameters:
    param = {'task': 'binary',
             'epoch': 10,
             'lr': 0.2,
             'lambda': 0.002,
             'metric': 'auc',
             'opt': 'sgd'}

    # Start to train
    # The trained model will be stored in model.out
    ffm_model.fit(param, './model.out')

    # Prediction task
    ffm_model.setTest(test_path)  # Set the path of test dataset
    ffm_model.setSigmoid()        # Convert output to 0-1

    # Start to predict
    # The output result will be stored in output.txt
    ffm_model.predict("./model.out", "./output.txt")

In [31]:
# Train the FFM model and write its predictions for the FFM test file to ./output.txt.
test2()

In [32]:
# [------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
# [   10%      ]     1            0.557265            0.439284            0.893001                0.05
# [   20%      ]     2            0.495380            0.413676            0.900850                0.03
# [   30%      ]     3            0.467982            0.444183            0.905616                0.03
# [   40%      ]     4            0.463426            0.427557            0.909354                0.03
# [   50%      ]     5            0.459838            0.446707            0.901131                0.03
# [   60%      ]     6            0.452319            0.447469            0.908233                0.03
# [   70%      ]     7            0.447234            0.445043            0.902719                0.03
# [   80%      ]     8            0.451769            0.393457            0.903187                0.03
# [   90%      ]     9            0.443482            0.397095            0.899542                0.03
# [  100%      ]    10            0.441640            0.384864            0.905616                0.03

-----
-----
### Step 4 : Hyper-parameter Tuning
- various tuning methods

In [27]:
import xlearn as xl

def test3():
    """Cross-validate, train and score an FFM model with tuned settings.

    Runs k-fold cross-validation with the parameters below, then trains on
    ``../dataset/train_ffm.txt``, stores the model in ``./model.out`` (with a
    text copy requested at ``./model.txt``) and writes the predictions for
    ``../dataset/test_ffm.txt`` to ``./output.txt``.
    """
    ffm_model = xl.create_ffm()

    train_path = '../dataset/train_ffm.txt'
    test_path = '../dataset/test_ffm.txt'

    ffm_model.setTrain(train_path)
    # NOTE: the validation file is the same file that is predicted on below.
    ffm_model.setValidate(test_path)

    # Parameters:
    param = {'task': 'binary',
             'epoch': 30,
             'lr': 0.2,  # learning rate
             'lambda': 0.002,  # L2 parameter
             'metric': 'auc',
             'opt': 'sgd',  # optimizer
             'stop_window': 3,  # early-stopping window size
             'fold': 3}  # k-fold parameter

    # k-fold cross-validation
    ffm_model.cv(param)

    # Request the text copy before fit(): xlearn's documented usage sets it
    # ahead of training; calling it after fit() came too late for that run.
    ffm_model.setTXTModel('./model.txt')

    # Start to train
    # The trained model will be stored in model.out
    ffm_model.fit(param, './model.out')

    # Prediction task
    ffm_model.setTest(test_path)  # Set the path of test dataset
    ffm_model.setSigmoid()        # Convert output to 0-1

    # Start to predict
    # The output result will be stored in output.txt
    ffm_model.predict("./model.out", "./output.txt")

In [28]:
# Cross-validate and train the FFM model, then write its predictions to ./output.txt.
test3()

In [29]:
# [------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
# [    3%      ]     1            0.546296            0.432907            0.890291                0.04
# [    6%      ]     2            0.493454            0.432527            0.903000                0.03
# [   10%      ]     3            0.469055            0.429376            0.897486                0.03
# [   13%      ]     4            0.460795            0.497076            0.905243                0.03
# [   16%      ]     5            0.460834            0.399953            0.910289                0.03
# [   20%      ]     6            0.454425            0.403551            0.906270                0.03
# [   23%      ]     7            0.451007            0.393322            0.899168                0.03
# [   26%      ]     8            0.445749            0.461010            0.886366                0.03
# [   30%      ]     9            0.447690            0.406205            0.901691                0.03
# [   33%      ]    10            0.441299            0.400010            0.900570                0.04
# [   36%      ]    11            0.443144            0.414139            0.902252                0.04
# [   40%      ]    12            0.438947            0.396157            0.903560                0.04
# [   43%      ]    13            0.442649            0.389896            0.904588                0.03
# [   46%      ]    14            0.446762            0.385900            0.911690                0.03
# [   50%      ]    15            0.440703            0.402541            0.904215                0.03
# [   53%      ]    16            0.443890            0.401996            0.913653                0.03
# [   56%      ]    17            0.436985            0.397561            0.903841                0.03
# [   60%      ]    18            0.436276            0.398204            0.900383                0.03
# [   63%      ]    19            0.436306            0.391577            0.909448                0.03
# [   66%      ]    20            0.441049            0.382979            0.905336                0.03
# [   70%      ]    21            0.441182            0.406859            0.905803                0.03
# [   73%      ]    22            0.432891            0.397659            0.907111                0.03
# [   76%      ]    23            0.429748            0.411565            0.911690                0.03
# [   80%      ]    24            0.429877            0.388347            0.909822                0.03
# [   83%      ]    25            0.437763            0.387214            0.911597                0.03
# [   86%      ]    26            0.429573            0.419523            0.911223                0.03
# [   90%      ]    27            0.429604            0.396298            0.908513                0.03
# [   93%      ]    28            0.428271            0.396823            0.908139                0.03
# [   96%      ]    29            0.426560            0.421723            0.908046                0.03
# [  100%      ]    30            0.428501            0.401748            0.904682                0.03

-----
-----
### Step 5 : Calculate AUC

In [50]:
# File holding the true labels (first token of each line) and file holding
# the model's predicted scores.
validate_data_file, predicted_file = "../dataset/test.txt", "output.txt"

In [57]:
# True labels: the first whitespace-separated token of every line. Reading
# the whole token (not just line[0]) also handles labels longer than one
# character, and blank lines are skipped instead of crashing int().
with open(validate_data_file) as fp:
    valid_data_ndarray = np.array(
        [int(line.split()[0]) for line in fp if line.strip()]
    )

# Predicted scores: one float per non-blank line.
with open(predicted_file) as fp:
    predicted_data_ndarray = np.array(
        [float(line.strip()) for line in fp if line.strip()]
    )

In [58]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve of the predicted scores against the true labels.
roc_auc_score(y_true=valid_data_ndarray, y_score=predicted_data_ndarray)

0.8806647985312591

-----
-----
### Step 6 : Logging learning process

In [70]:
import subprocess
import sys

log_path = './log.txt'

# Hand the log file straight to the child process. Rebinding sys.stdout to
# the file (as before) was never undone, so every later cell lost its output
# and the handle was never closed.
# NOTE(review): interpreter and script are hard-coded absolute paths from one
# machine -- consider making them configurable.
with open(log_path, 'w') as log_file:
    # An argument list needs no shell, so shell=True is dropped.
    exit_code = subprocess.call(
        ['/anaconda3/envs/yoon/bin/python3.6',
         '/Users/admin/Documents/github-workspace/recommender-system/factorization-machine/runner.py'],
        stdout=log_file, stderr=subprocess.STDOUT)

# Show the child's exit status as the cell output.
exit_code

0