In [6]:
import os
import gc
import argparse
import pandas as pd
import numpy as np
from lightfm import LightFM
from scipy import sparse
from util import s_to_time_format, string_to_datetime, hour_to_range

In [8]:
#---------------------------------
# load dataset
#---------------------------------
# Both CSVs are read the same way; only the first 5000 rows of each are kept.
df_train, df_test = (
    pd.read_csv(csv_path).head(5000) for csv_path in ("train.csv", "test.csv")
)

In [9]:
df_train

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,...,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey
0,6881,113261,38038,513.80,5,0,N,0,N,N,...,0,33,172652.0,457,59333,N,0,102,0,516056
1,0,134508,45725,465.62,5,0,N,2,N,N,...,0,9,105114.0,451,0,N,5817,102,0,4376
2,6881,15408,188328,513.80,5,0,N,0,N,N,...,0,6,152458.0,457,59333,N,0,102,0,483434
3,6716,157159,29967,1016.11,5,62,N,5,N,N,...,0,5,172946.0,247,50436,N,3281,102,0,1407164
4,5975,105985,81305,713.66,5,62,N,4,N,N,...,0,6,182129.0,263,93775,N,5817,102,0,1051004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,6881,29914,210026,513.80,5,0,N,0,N,N,...,0,56,64519.0,457,59369,N,0,102,0,541518
4996,0,21984,205835,465.62,5,0,N,2,N,N,...,0,46,105823.0,451,0,N,5817,102,0,22138
4997,6767,68907,87723,841.46,5,62,N,5,N,N,...,0,56,122934.0,241,19197,N,5817,102,0,1224434
4998,6769,156310,201805,346.96,5,62,N,5,N,N,...,0,59,185421.0,251,81372,N,5858,102,0,462689


In [10]:
# Authorization time (`loctm`) of each test transaction
df_test["loctm"]

0       215328.0
1       222007.0
2       170013.0
3       165914.0
4       215311.0
          ...   
4995    170724.0
4996    161132.0
4997    162321.0
4998    162436.0
4999     82125.0
Name: loctm, Length: 5000, dtype: float64

In [11]:
for frame in [df_train, df_test]:
    # Run the raw `loctm` value (cast to int, then to str) through the two util
    # helpers. The parsed values are only needed while deriving features, so they
    # live in a local Series instead of a scratch column on the frame.
    parsed_time = (
        frame.loctm.astype(int).astype(str)
        .apply(s_to_time_format)
        .apply(string_to_datetime)
    )
    # Time-of-day components, one value per transaction.
    frame["loctm_hour_of_day"] = parsed_time.apply(lambda t: t.hour).astype('category')
    frame["loctm_minute_of_hour"] = parsed_time.apply(lambda t: t.minute)
    frame["loctm_second_of_min"] = parsed_time.apply(lambda t: t.second)
    # Bucketed hour, as produced by the util helper `hour_to_range`.
    frame["hour_range"] = parsed_time.apply(lambda t: hour_to_range(t.hour)).astype("category")
    # Auxiliary string keys: day + hour + minute, and the same with seconds appended.
    frame["day_hr_min"] = [
        "{}:{}:{}".format(*parts)
        for parts in zip(frame.locdt, frame.loctm_hour_of_day, frame.loctm_minute_of_hour)
    ]
    frame["day_hr_min_sec"] = [
        "{}:{}:{}:{}".format(*parts)
        for parts in zip(
            frame.locdt,
            frame.loctm_hour_of_day,
            frame.loctm_minute_of_hour,
            frame.loctm_second_of_min,
        )
    ]

In [12]:
# Stack the train and test rows into a single frame.
# `sort` is passed explicitly: the two frames do not have identical columns, and
# the pandas default for sorting the non-concatenation axis differs between
# versions (hence the FutureWarning). Pinning it keeps the column order stable.
df = pd.concat([df_train, df_test], axis = 0, sort = False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [13]:
#---------------------------------
# prepare bacno-cano count matrix
#---------------------------------
# Key columns of the pairing: the cells below group by these two columns, then
# use `bacno` as the row axis and `cano` as the column axis of the matrix.
ls = ["bacno", "cano"]


In [14]:
# Number of transactions (non-null `loctm` values) for every (bacno, cano) pair,
# returned as a flat frame with the count in `num_count`.
interactions = (
    df.groupby(ls)["loctm"]
      .count()
      .reset_index(name="num_count")
)
interactions

Unnamed: 0,bacno,cano,num_count
0,70,82961,1
1,125,116506,2
2,125,116507,1
3,128,5069,1
4,161,191815,1
...,...,...,...
5170,163706,182766,3
5171,163712,144174,1
5172,163797,193898,1
5173,163798,101298,1


In [15]:
# min-max normalization of `num_count` into [0, 1]
max_ = interactions.num_count.max()
min_ = interactions.num_count.min()
count_range = max_ - min_
if count_range == 0:
    # Every pair has the same count: the scaled value is undefined (0/0), so use
    # 0.0 for all rows instead of filling the column with NaN.
    interactions["num_count"] = 0.0
else:
    # Vectorised form of (x - min) / (max - min); the result is float64.
    interactions["num_count"] = (interactions["num_count"] - min_) / count_range
# NOTE(review): pairs with the minimum count are scaled to exactly 0.0, which a
# sparse matrix built from these values will not store - confirm this is intended.


In [16]:
interactions

Unnamed: 0,bacno,cano,num_count
0,70,82961,0.000000
1,125,116506,0.013699
2,125,116507,0.000000
3,128,5069,0.000000
4,161,191815,0.000000
...,...,...,...
5170,163706,182766,0.027397
5171,163712,144174,0.000000
5172,163797,193898,0.000000
5173,163798,101298,0.000000


In [17]:
# Row axis: distinct bacno values in order of first appearance.
bacno_values = interactions["bacno"].unique()
num_bacno = interactions["bacno"].nunique()
bacno_dict = dict(zip(bacno_values, range(len(bacno_values))))  # bacno -> row index
bacno_dict_inv = dict(enumerate(bacno_values))                  # row index -> bacno
# Column axis: distinct cano values in order of first appearance.
cano_values = interactions["cano"].unique()
num_cano = interactions["cano"].nunique()
cano_dict = dict(zip(cano_values, range(len(cano_values))))     # cano -> column index
cano_dict_inv = dict(enumerate(cano_values))                    # column index -> cano


In [28]:
cano_dict

{82961: 0,
 116506: 1,
 116507: 2,
 5069: 3,
 191815: 4,
 88782: 5,
 7893: 6,
 116147: 7,
 145429: 8,
 137858: 9,
 198678: 10,
 210526: 11,
 190809: 12,
 119867: 13,
 100877: 14,
 42909: 15,
 42910: 16,
 172319: 17,
 61540: 18,
 82321: 19,
 72051: 20,
 90255: 21,
 41484: 22,
 182282: 23,
 81961: 24,
 90870: 25,
 173375: 26,
 28889: 27,
 34085: 28,
 171503: 29,
 118985: 30,
 209599: 31,
 101951: 32,
 39769: 33,
 49940: 34,
 49941: 35,
 95810: 36,
 177968: 37,
 72071: 38,
 159052: 39,
 191953: 40,
 12224: 41,
 202201: 42,
 66374: 43,
 198941: 44,
 190346: 45,
 108369: 46,
 209574: 47,
 103532: 48,
 82138: 49,
 29461: 50,
 6098: 51,
 104243: 52,
 13734: 53,
 48935: 54,
 151140: 55,
 114606: 56,
 113100: 57,
 84265: 58,
 26280: 59,
 177281: 60,
 209008: 61,
 195594: 62,
 207924: 63,
 26858: 64,
 150462: 65,
 53624: 66,
 200790: 67,
 83638: 68,
 38062: 69,
 90266: 70,
 100638: 71,
 80836: 72,
 106249: 73,
 29178: 74,
 158746: 75,
 158747: 76,
 207767: 77,
 104892: 78,
 146787: 79,
 136346: 

In [27]:
cano_dict_inv

{0: 82961,
 1: 116506,
 2: 116507,
 3: 5069,
 4: 191815,
 5: 88782,
 6: 7893,
 7: 116147,
 8: 145429,
 9: 137858,
 10: 198678,
 11: 210526,
 12: 190809,
 13: 119867,
 14: 100877,
 15: 42909,
 16: 42910,
 17: 172319,
 18: 61540,
 19: 82321,
 20: 72051,
 21: 90255,
 22: 41484,
 23: 182282,
 24: 81961,
 25: 90870,
 26: 173375,
 27: 28889,
 28: 34085,
 29: 171503,
 30: 118985,
 31: 209599,
 32: 101951,
 33: 39769,
 34: 49940,
 35: 49941,
 36: 95810,
 37: 177968,
 38: 72071,
 39: 159052,
 40: 191953,
 41: 12224,
 42: 202201,
 43: 66374,
 44: 198941,
 45: 190346,
 46: 108369,
 47: 209574,
 48: 103532,
 49: 82138,
 50: 29461,
 51: 6098,
 52: 104243,
 53: 13734,
 54: 48935,
 55: 151140,
 56: 114606,
 57: 113100,
 58: 84265,
 59: 26280,
 60: 177281,
 61: 209008,
 62: 195594,
 63: 207924,
 64: 26858,
 65: 150462,
 66: 53624,
 67: 200790,
 68: 83638,
 69: 38062,
 70: 90266,
 71: 100638,
 72: 80836,
 73: 106249,
 74: 29178,
 75: 158746,
 76: 158747,
 77: 207767,
 78: 104892,
 79: 146787,
 80: 1363

In [44]:
# NOTE(review): this cell writes into `data`, but reading the notebook top to
# bottom `data` is first created in the cell that follows, so on a fresh kernel
# this cell raises NameError. Its execution count (44) is also out of sequence,
# and the loop repeats the one in the following cell - it looks like a leftover
# that can be removed; confirm before deleting.
# Copies each (bacno, cano) pair's `num_count` into its matrix position.
for ix, row in interactions.iterrows():
    bacno_index = bacno_dict[row["bacno"]] # row
    cano_index = cano_dict[row["cano"]] # column
    data[bacno_index,cano_index] = row.num_count

In [18]:
# Build the bacno x cano matrix directly in sparse form from coordinate triplets,
# rather than allocating a dense (num_bacno, num_cano) array and filling it one
# row at a time with iterrows().
row_idx = interactions["bacno"].map(bacno_dict).values    # matrix row of each pair
col_idx = interactions["cano"].map(cano_dict).values      # matrix column of each pair
weights = interactions["num_count"].values.astype(np.float32)
# Each (bacno, cano) pair occurs once in `interactions` (it comes from a groupby
# on those two columns), so the duplicate-summing done by this constructor never
# applies here.
data = sparse.csr_matrix((weights, (row_idx, col_idx)),
                         shape = (num_bacno, num_cano),
                         dtype = np.float32)
# Drop explicitly stored zeros so the stored entries match what converting a
# dense array would give (pairs whose num_count is 0.0 are not stored).
data.eliminate_zeros()

In [19]:
data

<4851x5175 sparse matrix of type '<class 'numpy.float32'>'
	with 835 stored elements in Compressed Sparse Row format>

In [20]:
no_components = 10  # dimensionality of the learned user/item embeddings
# Instantiate and train the model.
# A fixed `random_state` seeds the model's random number generator so the learned
# embeddings do not change arbitrarily between runs.
# NOTE(review): with num_threads > 1, small run-to-run differences may remain -
# confirm against the LightFM docs if exact reproducibility is required.
model = LightFM(loss='logistic', no_components=no_components, random_state=42)
model.fit(interactions = data,
          epochs=100,
          num_threads=2,
          verbose = True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57
Epoch 58
Epoch 59
Epoch 60
Epoch 61
Epoch 62
Epoch 63
Epoch 64
Epoch 65
Epoch 66
Epoch 67
Epoch 68
Epoch 69
Epoch 70
Epoch 71
Epoch 72
Epoch 73
Epoch 74
Epoch 75
Epoch 76
Epoch 77
Epoch 78
Epoch 79
Epoch 80
Epoch 81
Epoch 82
Epoch 83
Epoch 84
Epoch 85
Epoch 86
Epoch 87
Epoch 88
Epoch 89
Epoch 90
Epoch 91
Epoch 92
Epoch 93
Epoch 94
Epoch 95
Epoch 96
Epoch 97
Epoch 98
Epoch 99


<lightfm.lightfm.LightFM at 0x13b8f4048>

In [22]:
cano_dict_inv.values()

dict_values([82961, 116506, 116507, 5069, 191815, 88782, 7893, 116147, 145429, 137858, 198678, 210526, 190809, 119867, 100877, 42909, 42910, 172319, 61540, 82321, 72051, 90255, 41484, 182282, 81961, 90870, 173375, 28889, 34085, 171503, 118985, 209599, 101951, 39769, 49940, 49941, 95810, 177968, 72071, 159052, 191953, 12224, 202201, 66374, 198941, 190346, 108369, 209574, 103532, 82138, 29461, 6098, 104243, 13734, 48935, 151140, 114606, 113100, 84265, 26280, 177281, 209008, 195594, 207924, 26858, 150462, 53624, 200790, 83638, 38062, 90266, 100638, 80836, 106249, 29178, 158746, 158747, 207767, 104892, 146787, 136346, 153536, 168876, 179693, 68933, 195472, 118052, 42894, 191445, 181540, 124006, 73106, 56390, 118766, 155896, 172165, 134352, 150363, 72417, 24787, 108456, 108457, 9196, 171229, 143874, 203332, 194142, 143930, 108905, 185006, 207010, 134127, 164098, 167548, 125643, 115025, 48427, 25199, 190175, 140639, 140640, 125328, 55298, 55299, 66520, 39859, 1935, 197375, 173178, 185567, 53

In [21]:
pd.DataFrame({"cano":list(cano_dict_inv.values())})

Unnamed: 0,cano
0,82961
1,116506
2,116507
3,5069
4,191815
...,...
5170,182766
5171,144174
5172,193898
5173,101298


In [None]:
# Preview the item (cano) embeddings with one named column per latent dimension.
# The original line was missing its closing parenthesis and read the name parts
# from an `args` object that this notebook never defines; the literals below
# match the ones used when the item-embedding frame is built further down.
# NOTE(review): the "banco" prefix looks like a typo for "bacno" - confirm before
# renaming, since the column names may be consumed elsewhere.
pd.DataFrame(model.item_embeddings,
             columns = ["{}_{}_latent_features_{}".format("banco","cano",i) for i in range(no_components)])

In [25]:
model.item_embeddings.shape

(5175, 10)

In [29]:
# Item side: pair every cano id (in column-index order) with its embedding,
# one column per latent dimension.
item_columns = ["{}_{}_latent_features_{}".format("banco","cano",i) for i in range(no_components)]
cano_ids = pd.DataFrame({"cano":list(cano_dict_inv.values())})
item_factors = pd.DataFrame(model.item_embeddings, columns = item_columns)
df = pd.concat([cano_ids, item_factors], axis = 1)


In [30]:
df

Unnamed: 0,cano,banco_cano_latent_features_0,banco_cano_latent_features_1,banco_cano_latent_features_2,banco_cano_latent_features_3,banco_cano_latent_features_4,banco_cano_latent_features_5,banco_cano_latent_features_6,banco_cano_latent_features_7,banco_cano_latent_features_8,banco_cano_latent_features_9
0,82961,-0.041659,-0.005795,0.008697,0.030713,0.047357,-0.001996,-0.039418,-0.041652,0.005269,0.002169
1,116506,-0.069522,0.060653,-0.010312,-0.149855,0.080473,-0.004878,0.019878,0.098262,-0.019535,-0.015909
2,116507,-0.014295,-0.021832,0.043965,0.036877,0.003445,-0.000224,-0.020185,0.036281,0.036708,0.012506
3,5069,-0.014660,-0.004137,0.031677,-0.012055,-0.029014,0.037831,-0.008868,0.016385,0.046227,0.004796
4,191815,-0.002727,-0.049361,-0.018496,-0.002055,-0.019128,0.019731,0.049103,0.016106,-0.025091,-0.046504
...,...,...,...,...,...,...,...,...,...,...,...
5170,182766,0.097715,-0.015086,0.132893,0.036083,-0.037336,0.065521,0.036871,-0.133321,0.099808,-0.083901
5171,144174,0.031130,0.025435,-0.036554,-0.032263,-0.020106,-0.019017,0.034482,-0.046537,-0.043241,0.004887
5172,193898,-0.018823,0.028132,0.045902,0.011737,-0.038504,0.013983,0.042775,0.037913,-0.031290,-0.025570
5173,101298,0.021931,0.012146,-0.035871,-0.032394,-0.009214,-0.030697,0.038599,0.011280,-0.030990,-0.045419


In [31]:
# User side: pair every bacno id (in row-index order) with its embedding,
# one column per latent dimension.
user_columns = ["{}_latent_features_{}_w_{}".format("banco",i,"cano") for i in range(no_components)]
bacno_ids = pd.DataFrame({"bacno":list(bacno_dict_inv.values())})
user_factors = pd.DataFrame(model.user_embeddings, columns = user_columns)
df = pd.concat([bacno_ids, user_factors], axis = 1)
df

Unnamed: 0,bacno,banco_latent_features_0_w_cano,banco_latent_features_1_w_cano,banco_latent_features_2_w_cano,banco_latent_features_3_w_cano,banco_latent_features_4_w_cano,banco_latent_features_5_w_cano,banco_latent_features_6_w_cano,banco_latent_features_7_w_cano,banco_latent_features_8_w_cano,banco_latent_features_9_w_cano
0,70,-0.014077,0.027563,-0.033580,-0.018040,-0.012554,0.027483,0.029210,0.007625,-0.010605,0.027616
1,125,-0.071265,0.066990,0.001854,-0.154694,0.075393,-0.020334,0.037906,0.107656,-0.022787,-0.024139
2,128,-0.009409,0.029137,0.023151,-0.033875,-0.033875,-0.037104,0.044356,0.004312,-0.039971,-0.019456
3,161,0.020382,0.014846,0.016243,0.039151,0.048083,0.012126,0.021841,0.008633,0.006324,-0.007819
4,162,0.032590,-0.002122,0.000517,0.049208,0.046610,0.030299,-0.023445,-0.043323,0.038268,0.007631
...,...,...,...,...,...,...,...,...,...,...,...
4846,163706,0.100727,0.006208,0.141136,0.023403,-0.036015,0.069312,0.054557,-0.137569,0.090113,-0.072139
4847,163712,0.034529,0.043029,0.044956,-0.007002,0.034158,0.003935,0.014452,-0.043395,0.007068,0.022195
4848,163797,-0.015624,-0.009171,0.024134,-0.039143,-0.003623,-0.004200,-0.009365,-0.033203,0.017396,0.013820
4849,163798,-0.008719,0.027074,0.016100,-0.012978,-0.048459,-0.019470,0.014238,0.040154,0.042444,0.019807
