In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw training and test session logs from the working directory.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

## Function to convert a date-time string to a Unix timestamp

In [3]:
import datetime
def time_to_timestamp(time_string):
    """Convert a ``'D/M/YY H:MM'`` date-time string to a POSIX timestamp.

    The string is parsed as day/month/two-digit-year, a space, then
    hour:minute. The year is prefixed with ``'20'``, so ``'14'`` means 2014.
    The seconds field is fixed at 30 and microseconds at 0, because only the
    hour and minute are read from the input.

    The parsed wall-clock value is pinned to UTC instead of the machine's
    local time zone. The result is therefore the same on every machine, and
    the difference between two timestamps is always the plain wall-clock
    difference (no one-hour jumps around daylight-saving transitions).

    Parameters
    ----------
    time_string : str
        Date-time text such as ``'15/12/14 18:11'``.

    Returns
    -------
    float
        Seconds since the Unix epoch for the parsed moment, read as UTC.

    Raises
    ------
    ValueError
        If a field is not an integer or the values do not form a valid date.
    IndexError
        If the string has fewer ``'/'`` or ``':'`` separated fields than expected.
    """
    date_fields = time_string.split('/')
    day = int(date_fields[0])
    month = int(date_fields[1])

    # The last '/'-separated field holds "<YY> <H:MM>".
    year_and_clock = date_fields[-1].split(' ')
    year = int('20' + year_and_clock[0])

    clock_fields = year_and_clock[-1].split(':')
    hour = int(clock_fields[0])
    minute = int(clock_fields[1])

    seconds = 30  # constant placeholder; cancels out when two timestamps are subtracted
    microseconds = 0

    moment = datetime.datetime(
        year, month, day, hour, minute, seconds, microseconds,
        tzinfo=datetime.timezone.utc,
    )
    return moment.timestamp()


## Data Remodeling for training data.

In [4]:


# Number of sessions (rows) in the raw training frame.
df_len = df_train.shape[0]

# One entry per session, appended in row order; these lists become the
# columns of the remodeled frame below.
ids, durations, genders = [], [], []
categories_column, sub_categories_column = [], []
sub_sub_categories_column, products_column = [], []
distinct_categories, distinct_sub_categories = [], []
distinct_sub_sub_categories, distinct_products = [], []

# Columns are read by position: 0 = session id, 1 = start time, 2 = end time,
# 3 = ';'-separated product paths, last = gender label.
# itertuples(name=None) yields plain tuples and avoids a scalar .iloc lookup
# for every cell of every row.
for row in df_train.itertuples(index=False, name=None):
    ids.append(row[0])
    genders.append(row[-1])

    duration = time_to_timestamp(row[2]) - time_to_timestamp(row[1])
    # Start and end often fall in the same minute (the input carries no
    # seconds), which gives 0; such sessions are recorded as 1 second.
    if duration == 0.0:
        duration = 1.0
    durations.append(duration)

    cat, subcat, subsubcat, prod = [], [], [], []
    for product in row[3].split(';'):
        # Each path is "<category>/<sub>/<sub-sub>/<product>..."; split it once
        # and drop the one-character prefix of each level to get the numeric id.
        levels = product.split('/')
        cat.append(int(levels[0][1:]))
        subcat.append(int(levels[1][1:]))
        subsubcat.append(int(levels[2][1:]))
        prod.append(int(levels[3][1:]))

    categories_column.append(cat)
    sub_categories_column.append(subcat)
    sub_sub_categories_column.append(subsubcat)
    products_column.append(prod)

    # Count of distinct ids visited at each hierarchy level in this session.
    distinct_categories.append(len(set(cat)))
    distinct_sub_categories.append(len(set(subcat)))
    distinct_sub_sub_categories.append(len(set(subsubcat)))
    distinct_products.append(len(set(prod)))


train_data_dict = {
    "sessionId": ids,
    "duration": durations,

    "visited-Products": products_column,
    "unique-Products": distinct_products,

    "visited-Categories": categories_column,
    "unique-Categories": distinct_categories,

    "visited-Sub-categories": sub_categories_column,
    "unique-Sub-Categories": distinct_sub_categories,

    "visited-Sub-sub-categories": sub_sub_categories_column,
    "unique-Sub-sub-categories": distinct_sub_sub_categories,

    "gender": genders
}

train_data = pd.DataFrame(train_data_dict)
# Every raw session must produce exactly one remodeled row.
assert len(train_data) == df_len
train_data.to_csv("train_data.csv", index=False)
train_data.head()

Unnamed: 0,sessionId,duration,visited-Products,unique-Products,visited-Categories,unique-Categories,visited-Sub-categories,unique-Sub-Categories,visited-Sub-sub-categories,unique-Sub-sub-categories,gender
0,u16159,60.0,"[28435, 2554, 28436, 28437]",4,"[2, 2, 2, 2]",1,"[3, 3, 3, 3]",1,"[6, 6, 6, 6]",1,female
1,u10253,360.0,"[29404, 2617, 29407, 29410, 29411, 25444, 29418]",7,"[1, 1, 1, 1, 1, 1, 1]",1,"[9, 9, 9, 9, 9, 9, 9]",1,"[31, 31, 31, 31, 31, 31, 31]",1,male
2,u19037,1.0,[16944],1,[2],1,[1],1,[20],1,female
3,u14556,180.0,"[10284, 10285, 10286]",3,"[2, 2, 2]",1,"[4, 4, 4]",1,"[18, 18, 18]",1,female
4,u24295,120.0,"[30805, 30806]",2,"[1, 1]",1,"[1, 1]",1,"[12, 12]",1,male


## Data Remodeling for testing data.

In [5]:


# Number of sessions (rows) in the raw test frame.
df_len = df_test.shape[0]

# One entry per session, appended in row order; these lists become the
# columns of the remodeled frame below.
ids, durations = [], []
categories_column, sub_categories_column = [], []
sub_sub_categories_column, products_column = [], []
distinct_categories, distinct_sub_categories = [], []
distinct_sub_sub_categories, distinct_products = [], []

# Columns are read by position: 0 = session id, 1 = start time, 2 = end time,
# 3 = ';'-separated product paths.
# itertuples(name=None) yields plain tuples and avoids a scalar .iloc lookup
# for every cell of every row.
for row in df_test.itertuples(index=False, name=None):
    ids.append(row[0])

    duration = time_to_timestamp(row[2]) - time_to_timestamp(row[1])
    # Start and end often fall in the same minute (the input carries no
    # seconds), which gives 0; such sessions are recorded as 1 second.
    if duration == 0.0:
        duration = 1.0
    durations.append(duration)

    cat, subcat, subsubcat, prod = [], [], [], []
    for product in row[3].split(';'):
        # Each path is "<category>/<sub>/<sub-sub>/<product>..."; split it once
        # and drop the one-character prefix of each level to get the numeric id.
        levels = product.split('/')
        cat.append(int(levels[0][1:]))
        subcat.append(int(levels[1][1:]))
        subsubcat.append(int(levels[2][1:]))
        prod.append(int(levels[3][1:]))

    categories_column.append(cat)
    sub_categories_column.append(subcat)
    sub_sub_categories_column.append(subsubcat)
    products_column.append(prod)

    # Count of distinct ids visited at each hierarchy level in this session.
    distinct_categories.append(len(set(cat)))
    distinct_sub_categories.append(len(set(subcat)))
    distinct_sub_sub_categories.append(len(set(subsubcat)))
    distinct_products.append(len(set(prod)))


test_data_dict = {
    "sessionId": ids,
    "duration": durations,

    "visited-Products": products_column,
    "unique-Products": distinct_products,

    "visited-Categories": categories_column,
    "unique-Categories": distinct_categories,

    "visited-Sub-categories": sub_categories_column,
    "unique-Sub-Categories": distinct_sub_categories,

    "visited-Sub-sub-categories": sub_sub_categories_column,
    "unique-Sub-sub-categories": distinct_sub_sub_categories,
}

test_data = pd.DataFrame(test_data_dict)
# Every raw session must produce exactly one remodeled row.
assert len(test_data) == df_len
test_data.to_csv("test_data.csv", index=False)
test_data.head()

Unnamed: 0,sessionId,duration,visited-Products,unique-Products,visited-Categories,unique-Categories,visited-Sub-categories,unique-Sub-Categories,visited-Sub-sub-categories,unique-Sub-sub-categories
0,u12112,1.0,[19956],1,[2],1,[3],1,[6],1
1,u19725,1.0,[2026],1,[2],1,[5],1,[67],1
2,u11795,1.0,[12538],1,[2],1,[2],1,[4],1
3,u22639,180.0,"[22781, 22782, 19325, 22786]",4,"[2, 2, 2, 2]",1,"[3, 3, 3, 3]",1,"[79, 79, 79, 79]",1
4,u18034,1.0,[23419],1,[2],1,[1],1,[10],1
