In [1]:
import numpy as np
import pandas as pd
import subprocess
import argparse

  from pandas.core import (


# Preprocessing CareerBuilder 2012

For the CareerBuilder 2012 dataset we first need to artificially create sessions out of the user internactions

In [2]:
def make_sessions(data, 
                  session_th=30 * 60, 
                  is_ordered=False, 
                  user_key='user_id', 
                  item_key='item_id', 
                  time_key='ts'):
    """Assigns session ids to the events in data without grouping keys"""
    if not is_ordered:
        # sort data by user and time
        data.sort_values(by=[user_key, time_key], ascending=True, inplace=True)
    # compute the time difference between queries
    tdiff = np.diff(data[time_key].values)
    # check which of them are bigger then session_th
    split_session = tdiff > session_th
    split_session = np.r_[True, split_session]
    # check when the user changes his data
    new_user = data['user_id'].values[1:] != data['user_id'].values[:-1]
    new_user = np.r_[True, new_user]
    # a new sessions starts when at least one of the two conditions is verified
    new_session = np.logical_or(new_user, split_session)
    # compute the session ids
    session_ids = np.cumsum(new_session)
    data['session_id'] = session_ids
    return data

# Test set

A test set can be either created by (1) adding the last session of every user to be tested or, (2) making a time-based split.

In [3]:
def last_n_days_out_split(data, n=1,
                          user_key='user_id',
                          item_key='item_id',
                          session_key='session_id',
                          time_key='ts',
                          clean_test=True,
                          min_session_length=2):
    """
    last n-days out split
    assign the sessions in the last n days to the test set and remaining to the training one
    """
    DAY = 24 * 60 * 60
    data.sort_values(by=[user_key, time_key], inplace=True)
    # start times of all sessions
    #sessions_start = data.groupby(session_key)[time_key].agg('min')
    # extract test start and end time
    end_time = data[time_key].max()
    test_start = end_time - n * DAY
    
    # get train and test indicies
    session_max_times = data.groupby(session_key)[time_key].max()
    session_train = session_max_times[session_max_times < test_start].index
    session_test = session_max_times[session_max_times >= test_start].index
    
    # in1d: Returns a boolean array the same length as ar1 that is True where 
    # an element of ar1 is in ar2 and False otherwise.
    train = data[
        np.in1d(
            data[session_key], 
            session_train
        )
    ].copy()
    test = data[
        np.in1d(
            data[session_key], 
            session_test
        )
    ].copy()

    if clean_test:
        print("Train item count: " + str(len(train[item_key].unique())))
        before_items = len(test[item_key].unique())
        # remove items which do not occur in the train set
        test = test[np.in1d(test[item_key], train[item_key])]
        after_items = len(test[item_key].unique())
        print("Test item count - Before filtering items which do not occur in the train set: " + str(before_items))
        print("Test item count - After filtering items which do not occur in the train set: " + str(after_items))
        
        #  remove sessions in test shorter than min_session_length
        tslength = test.groupby(session_key).size()
        test = test[
           np.in1d(
                test[session_key], 
                tslength[tslength >= min_session_length].index
            )
        ].copy()
    

    return train, test

#  1. Career Builder 12 processing

In [4]:
path =  "../../data/"
dataset = "cb12/"

raw_path = path + dataset + "raw/" 
interim_path = path + dataset + "interim/"
processed_path = path + dataset + "processed/"

For the Kaggle Career Builder 2012 dataset:
* Only have the **application** interaction is available

Sessions are partitioned by a **30-minute** idle time

Keep all sessions: users with >= 1 sessions and also overly active ones (< 200,000 sessions)

Link to dataset: https://www.kaggle.com/c/job-recommendation/data

In [5]:
pd.to_datetime(1333, unit="s")

Timestamp('1970-01-01 00:22:13')

In [6]:
pd.to_datetime(1333554983, unit="s")

Timestamp('2012-04-04 15:56:23')

In [7]:
interactions = pd.read_csv(raw_path + "apps.tsv", header=0, sep='\t')
interactions = interactions.rename(columns={"UserID": "user_id", "JobID": "item_id", "ApplicationDate": "created_at"})
interactions["interaction_type"] = 0 # no different interactions
interactions = interactions.drop(columns=["WindowID", "Split"])
interactions['created_at'] = interactions['created_at'].astype("datetime64[ms]").astype(np.int64) // 10**3 #// 10**9
interactions.to_csv(raw_path + "interactions.csv", sep='\t')
interactions.head()

Unnamed: 0,user_id,created_at,item_id,interaction_type
0,47,1333554983,169528,0
1,47,1333674180,284009,0
2,47,1333593627,2121,0
3,47,1333593422,848187,0
4,47,1333665846,733748,0


In [8]:
# Read data
jobs = pd.read_csv(raw_path + "jobs.tsv", header=0, sep='\t', on_bad_lines='warn') #, error_bad_lines=False)
jobs = jobs.rename(columns={"JobID": "item_id", "State": "state", "Country": "country", "City": "city", "Zip5": "zip5"})
#jobs = jobs.set_index("item_id")


  jobs = pd.read_csv(raw_path + "jobs.tsv", header=0, sep='\t', on_bad_lines='warn') #, error_bad_lines=False)

  jobs = pd.read_csv(raw_path + "jobs.tsv", header=0, sep='\t', on_bad_lines='warn') #, error_bad_lines=False)

  jobs = pd.read_csv(raw_path + "jobs.tsv", header=0, sep='\t', on_bad_lines='warn') #, error_bad_lines=False)
  jobs = pd.read_csv(raw_path + "jobs.tsv", header=0, sep='\t', on_bad_lines='warn') #, error_bad_lines=False)


In [9]:
# Get unique item_ids from interactions
unique_item_ids = jobs['item_id'].unique()
print(len(unique_item_ids))

print(len(interactions))
# Filter items where 'item_id' is in unique_item_ids
interactions = interactions[interactions["item_id"].isin(unique_item_ids)]
print(len(interactions))

1091923
1603111
1603078


In [10]:
del jobs

In [11]:
print("Start Time: {}".format(pd.to_datetime(interactions["created_at"].min(), unit="s")))
print("Start Time: {}".format(pd.to_datetime(interactions["created_at"].max(), unit="s")))

# remove NaN values
interactions = interactions[np.isfinite(interactions['created_at'])]
# convert back to long from float
interactions['created_at'] = interactions['created_at'].astype(np.int64)


interactions['interaction_type'] = interactions['interaction_type'].fillna(0).astype('int')


print('Building sessions')
# partition interactions into sessions with 30-minutes idle time
interactions = make_sessions(interactions, session_th=30 * 60, time_key='created_at', is_ordered=False)


display(interactions.head(3))
# drop duplicate interactions
interactions = interactions.drop_duplicates(['session_id','created_at'])

print('Original data:')
print('Num interactions: {}'.format(len(interactions)))
print('Num sessions: {}'.format(interactions.session_id.nunique()))
print('Num items: {}'.format(interactions.item_id.nunique()))
print('Num users: {}'.format(interactions.user_id.nunique()))

print("")
print('Filtering data')
# drop duplicate interactions within the same session
interactions.drop_duplicates(subset=['item_id', 'session_id', 'interaction_type'], keep='first', inplace=True)

# keep items with >=1 interactions
item_pop = interactions.item_id.value_counts()
#good_items = item_pop[item_pop >= 5].index
good_items = item_pop[item_pop >= 1].index
inter_dense = interactions[interactions.item_id.isin(good_items)]

# remove sessions with length < 3
session_length = inter_dense.session_id.value_counts()
good_sessions = session_length[session_length >= 3].index
inter_dense = inter_dense[inter_dense.session_id.isin(good_sessions)]


## Keep all sessions: users with >= 1 sessions and also overly active ones (< 200,000 sessions)
sess_per_user = inter_dense.groupby('user_id')['session_id'].nunique()
good_users = sess_per_user[(sess_per_user >= 1) & (sess_per_user < 200000)].index
inter_dense = inter_dense[inter_dense.user_id.isin(good_users)]

print('Filtered data:')
print('Num interactions: {}'.format(len(inter_dense)))
print('Num sessions: {}'.format(inter_dense.session_id.nunique()))
print('Num items: {}'.format(inter_dense.item_id.nunique()))
print('Num users: {}'.format(inter_dense.user_id.nunique()))

#inter_dense.to_csv(interim_path + "interactions.csv", sep='\t')

Start Time: 2012-04-01 00:00:21
Start Time: 2012-06-26 23:59:55
Building sessions


Unnamed: 0,user_id,created_at,item_id,interaction_type,session_id
563238,7,1335082466,309823,0,1
563239,7,1335615478,703889,0,2
780806,9,1337034642,809208,0,3


Original data:
Num interactions: 1328810
Num sessions: 639527
Num items: 354285
Num users: 321231

Filtering data
Filtered data:
Num interactions: 697744
Num sessions: 123743
Num items: 213433
Num users: 91544


# 2. Create train and test set by doing a time-based (2 weeks) split

In [12]:
print('Partitioning data')
# last-session-out partitioning
## remove items which do not occur in the test set
## remove sessions in test shorter than min_session_length=2

print("len(inter_dense)")
print(len(inter_dense))
print(" -")
train_full_sessions, test_full_sessions = last_n_days_out_split(inter_dense, n=14,
                                                            user_key='user_id',
                                                            item_key='item_id',
                                                            session_key='session_id',
                                                            time_key='created_at',
                                                            clean_test=True)
print("")



Partitioning data
len(inter_dense)
697744
 -
Train item count: 197578
Test item count - Before filtering items which do not occur in the train set: 29534
Test item count - After filtering items which do not occur in the train set: 13679



In [13]:
train_full_sessions.head()

Unnamed: 0,user_id,created_at,item_id,interaction_type,session_id
780806,9,1337034642,809208,0,3
780807,9,1337035117,136489,0,3
780808,9,1337035689,617374,0,3
780812,14,1337092157,206046,0,7
780813,14,1337092390,787741,0,7


In [14]:
## filtering the valid unique_sessions because the dataset is too big for my computer hardware

unique_sessions = train_full_sessions.session_id.unique()

train_valid_sessions = train_full_sessions.copy()

train_valid_sessions = train_valid_sessions[train_valid_sessions["session_id"].isin( unique_sessions[:11000])]

print("len(train_valid_sessions)")
print(len(train_valid_sessions))
print(" -")
train_valid_sessions, test_valid_sessions = last_n_days_out_split(train_valid_sessions, n=14, #train_full_sessions
                                                              user_key='user_id',
                                                              item_key='item_id',
                                                              session_key='session_id',
                                                              time_key='created_at',
                                                              clean_test=True)

# remove sessions with length < 3
train_valid_session_length = train_valid_sessions.session_id.value_counts()
train_valid_good_sessions = train_valid_session_length[train_valid_session_length >= 3].index
train_valid_sessions = train_valid_sessions[train_valid_sessions.session_id.isin(train_valid_good_sessions)]

test_valid_session_length = test_valid_sessions.session_id.value_counts()
test_valid_good_sessions = test_valid_session_length[test_valid_session_length >= 3].index
test_valid_sessions = test_valid_sessions[test_valid_sessions.session_id.isin(test_valid_good_sessions)]

len(train_valid_sessions)
60636
 -
Train item count: 34880
Test item count - Before filtering items which do not occur in the train set: 10820
Test item count - After filtering items which do not occur in the train set: 1861


In [15]:
train_full_sessions.head()

Unnamed: 0,user_id,created_at,item_id,interaction_type,session_id
780806,9,1337034642,809208,0,3
780807,9,1337035117,136489,0,3
780808,9,1337035689,617374,0,3
780812,14,1337092157,206046,0,7
780813,14,1337092390,787741,0,7


In [16]:
# print statistics
"""
train_len = len(train_full_sessions.session_id.unique())
train_len = len(train_full_sessions.item_id.unique())

test_len = len(test_full_sessions.session_id.unique())
test_len = len(test_full_sessions.item_id.unique())

#merged_full = train_full_sessions.append(test_full_sessions, ignore_index=True)
merged_full = pd.concat([train_full_sessions, test_full_sessions], ignore_index=True)
merged_full = len(merged_full.item_id.unique())
"""
merged_full = pd.concat([train_full_sessions, test_full_sessions], ignore_index=True)

print("FULL DATA - TRAIN+TEST")
print("Train + Test - Interractions: " + str(len(merged_full)))
print("Train + Test - Sessions: " + str(len(train_full_sessions.session_id.unique()) + len(test_full_sessions.session_id.unique())))
print("Train + Test - Sessions: " + str(len(merged_full.session_id.unique())))
print("Train + Test - Items: " + str(len(merged_full.item_id.unique())))
print("Train + Test - Users: " + str(len(merged_full.user_id.unique())))

print("")
print("FULL DATA - TRAIN")
print("Training - Interractions: " + str(len(train_full_sessions)))
print("Training - Sessions: " + str(len(train_full_sessions.session_id.unique())))
print("Training - Items: " + str(len(train_full_sessions.item_id.unique())))
print("Training - Users: " + str(len(train_full_sessions.user_id.unique())))


print("")
print("FULL DATA - TEST")
print("Training - Interractions: " + str(len(train_full_sessions)))
print("Testing - Sessions: " + str(len(test_full_sessions.session_id.unique())))
print("Testing - Items: " + str(len(test_full_sessions.item_id.unique())))
print("Testing - Users: " + str(len(test_full_sessions.user_id.unique())))

print("the case of the much larger CarrerBuilder12 dataset, the train set contains 108, 783 sessions, whereas the test set has 11, 364 sessions")

FULL DATA - TRAIN+TEST
Train + Test - Interractions: 661892
Train + Test - Sessions: 120144
Train + Test - Sessions: 120144
Train + Test - Items: 197578
Train + Test - Users: 88981

FULL DATA - TRAIN
Training - Interractions: 599928
Training - Sessions: 108780
Training - Items: 197578
Training - Users: 81214

FULL DATA - TEST
Training - Interractions: 599928
Testing - Sessions: 11364
Testing - Items: 13361
Testing - Users: 9852
the case of the much larger CarrerBuilder12 dataset, the train set contains 108, 783 sessions, whereas the test set has 11, 364 sessions


In [17]:
merged_valid = pd.concat([train_valid_sessions, test_valid_sessions], ignore_index=True)

print("VALID DATA - TRAIN+TEST")
print("Train + Test - Interractions: " + str(len(merged_valid)))
print("Train + Test - Sessions: " + str(len(train_valid_sessions.session_id.unique()) + len(test_valid_sessions.session_id.unique())))
print("Train + Test - Sessions: " + str(len(merged_valid.session_id.unique())))
print("Train + Test - Items: " + str(len(merged_valid.item_id.unique())))
print("Train + Test - Users: " + str(len(merged_valid.user_id.unique())))

print("")
print("VALID DATA - TRAIN")
print("Training - Interractions: " + str(len(train_valid_sessions)))
print("Training - Sessions: " + str(len(train_valid_sessions.session_id.unique())))
print("Training - Items: " + str(len(train_valid_sessions.item_id.unique())))
print("Training - Users: " + str(len(train_valid_sessions.user_id.unique())))


print("")
print("VALID DATA - TEST")
print("Training - Interractions: " + str(len(test_valid_sessions)))
print("Testing - Sessions: " + str(len(test_valid_sessions.session_id.unique())))
print("Testing - Items: " + str(len(test_valid_sessions.item_id.unique())))
print("Testing - Users: " + str(len(test_valid_sessions.user_id.unique())))

merged_valid.to_csv(interim_path + "merged_valid_interactions.csv", sep='\t')

VALID DATA - TRAIN+TEST
Train + Test - Interractions: 47423
Train + Test - Sessions: 9094
Train + Test - Sessions: 9094
Train + Test - Items: 34880
Train + Test - Users: 6988

VALID DATA - TRAIN
Training - Interractions: 45392
Training - Sessions: 8753
Training - Items: 34880
Training - Users: 6783

VALID DATA - TEST
Training - Interractions: 2031
Testing - Sessions: 341
Testing - Items: 1426
Testing - Users: 309


In [18]:
merged_valid.head()

Unnamed: 0,user_id,created_at,item_id,interaction_type,session_id
0,9,1337034642,809208,0,3
1,9,1337035117,136489,0,3
2,9,1337035689,617374,0,3
3,14,1337092157,206046,0,7
4,14,1337092390,787741,0,7


In [19]:
count=0

for i in test_valid_sessions["item_id"].unique():
    if i not in train_valid_sessions["item_id"].unique():
        print(i)
        count+=1
if count==0:
    print("all items=job IDs from test are in train")
print("END")


all items=job IDs from test are in train
END


In [20]:
count=0
test_list = [926802, 44221]
for i in test_list:
    if i not in test_valid_sessions["item_id"].unique():
        print(i)
        count+=1
if count==0:
    print("all items=job IDs from test are in test_valid_sessions")
print("END")


926802
44221
END


In [21]:
count=0
test_list = [926802, 44221]
for i in test_list:
    if i not in train_valid_sessions["item_id"].unique():
        print(i)
        count+=1
if count==0:
    print("all items=job IDs from test are in train_valid_sessions")
print("END")


all items=job IDs from test are in train_valid_sessions
END


In [22]:
#test to concat train_full_sessions, test_full_sessions to get the same stat as Lacic
# comparison with original paper

merged_full.head()
#CareerBuilder12 661,910 120,147 197,590
print('Num interractions: {}'.format(merged_full.interaction_type.count()))
print('Num interractions: 661,910')
print('')
print('Num sessions: {}'.format(merged_full.session_id.nunique()))
print('Num sessions: 120,147')
print('')
print('Num items: {}'.format(merged_full.item_id.nunique()))
print('Num items: 197,590')
print('')
print('Num users: {}'.format(merged_full.user_id.nunique()))

merged_full.to_csv(interim_path + "merged_full_interactions.csv", sep='\t')

Num interractions: 661892
Num interractions: 661,910

Num sessions: 120144
Num sessions: 120,147

Num items: 197578
Num items: 197,590

Num users: 88981


# 3. Store train and test sets

In [23]:
train_full_sessions.to_csv(processed_path + "train_14d.csv", sep='\t')
test_full_sessions.to_csv(processed_path + "test_14d.csv", sep='\t')
train_valid_sessions.to_csv(processed_path + "train_valid_14d.csv", sep='\t')
test_valid_sessions.to_csv(processed_path + "test_valid_14d.csv", sep='\t')

# 4. Create train and test session vectors

In [24]:
# Create vocabulary from train set
unique_train_items = train_valid_sessions.item_id.unique()
# store (or load)
unique_train_items_df = pd.DataFrame(unique_train_items, columns=["item_id"])
print(len(unique_train_items_df))

unique_train_items_df.to_csv(interim_path + 'valid_vocabulary.csv', header=True)
unique_train_items_df = pd.read_csv(interim_path + 'valid_vocabulary.csv', index_col=0)

34880


In [25]:
# Create vocabulary from train set
unique_train_items = train_full_sessions.item_id.unique()
# store (or load)
unique_train_items_df = pd.DataFrame(unique_train_items, columns=["item_id"])
print(len(unique_train_items_df))

unique_train_items_df.to_csv(interim_path + 'vocabulary.csv', header=True)
unique_train_items_df = pd.read_csv(interim_path + 'vocabulary.csv', index_col=0)

197578


In [26]:
unique_train_items_df.head()

Unnamed: 0,item_id
0,809208
1,136489
2,617374
3,206046
4,787741


In [27]:
unique_train_items_dict = unique_train_items_df.to_dict('dict')["item_id"]
# inverse that item_id is key and index is value
unique_train_items_dict_inv = {v: k for k, v in unique_train_items_dict.items()}

session_ids = train_full_sessions.session_id.unique()

In [28]:
unique_train_items_dict_inv

{809208: 0,
 136489: 1,
 617374: 2,
 206046: 3,
 787741: 4,
 663552: 5,
 787477: 6,
 828603: 7,
 187311: 8,
 1083186: 9,
 516837: 10,
 507614: 11,
 754917: 12,
 686406: 13,
 1058896: 14,
 335132: 15,
 576958: 16,
 848187: 17,
 602298: 18,
 2121: 19,
 586119: 20,
 785425: 21,
 97989: 22,
 281940: 23,
 666073: 24,
 337025: 25,
 849501: 26,
 1016452: 27,
 911643: 28,
 453411: 29,
 425464: 30,
 883817: 31,
 816846: 32,
 13175: 33,
 758079: 34,
 148976: 35,
 1032422: 36,
 896906: 37,
 178574: 38,
 790333: 39,
 533790: 40,
 900480: 41,
 57774: 42,
 507238: 43,
 614418: 44,
 67951: 45,
 779830: 46,
 285976: 47,
 499860: 48,
 213199: 49,
 674313: 50,
 23313: 51,
 524716: 52,
 23620: 53,
 57771: 54,
 460843: 55,
 758714: 56,
 634527: 57,
 1088828: 58,
 455213: 59,
 775441: 60,
 155390: 61,
 590247: 62,
 450025: 63,
 151834: 64,
 1021186: 65,
 1021171: 66,
 167313: 67,
 500312: 68,
 280681: 69,
 1114576: 70,
 662253: 71,
 366888: 72,
 449169: 73,
 153632: 74,
 1008042: 75,
 680718: 76,
 473911: 