In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import calendar

from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

In [2]:
train = pd.read_csv(r"train_8wry4cB.csv")

In [3]:
train.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male
2,u19037,01/12/14 15:58,01/12/14 15:58,A00002/B00001/C00020/D16944/,female
3,u14556,23/11/14 2:57,23/11/14 3:00,A00002/B00004/C00018/D10284/;A00002/B00004/C00...,female
4,u24295,17/12/14 16:44,17/12/14 16:46,A00001/B00001/C00012/D30805/;A00001/B00001/C00...,male


In [4]:
train.dtypes

session_id     object
startTime      object
endTime        object
ProductList    object
gender         object
dtype: object

In [5]:
ids = train['session_id']
del train['session_id']
targets = train['gender']

In [6]:
def sess_time(obj1, obj2): #Session time hoping that no one browses more than 24 hrs :)

  time1 = datetime.strptime(str(obj1), '%d/%m/%y %H:%M')
  time2 = datetime.strptime(str(obj2), '%d/%m/%y %H:%M')

  if time1.hour == 23 and time2.hour != 23:
    return abs(time2.hour*60 + (60-time1.minute+time2.minute))
  else:
    return abs((time1.hour-time2.hour)*60 + (time1.minute-time2.minute))    

In [7]:
#Feature Engineering on Date-Time objects
# month, day and time added as features along with session time

time_diff = []
months = []
time_of_day = []
days = []
weekdays = []
for index in range(len(train)):  
  
  time_start = datetime.strptime(str(train['startTime'][index]), '%d/%m/%y %H:%M')
  time_end   = datetime.strptime(str(train['endTime'][index]), '%d/%m/%y %H:%M')

  time_diff.append(sess_time(train['startTime'][index],train['endTime'][index]))
  months.append(time_start.month)
  time_of_day.append((time_start.hour+time_end.hour)/2)
  days.append(time_start.day)
  weekdays.append(calendar.day_name[time_start.weekday()])

In [8]:
# Adding new columns to the dataset after feature engineering
train['session_time'] = time_diff
train['month'] = months
train['time_of_day'] = time_of_day
train['day'] = days
train['weekday'] = weekdays

In [9]:
# Removing original columns from dataset
train.drop(columns=['startTime','endTime'],inplace=True)

In [10]:
num_pdts = [] # List of number of products searched in a session
products = []

for index in range(len(train)):
    s = str(train['ProductList'][index])
    lst = s.split(';')
    num_pdts.append(len(lst))
    products.append(lst)
    
print(max(num_pdts))   # 36 is the maximum total number of products searched by a person


36


In [11]:
all_pdts = [y for x in products for y in x]
unique_pdts = []
unique_pdts = [x for x in all_pdts if x not in unique_pdts]
print(len(all_pdts),len(unique_pdts))
# Since we can see that all products are seemingly unique we cant distinguish between them

23251 23251


In [84]:
cat_A = []
cat_B = []
cat_C = []
cat_D = []

for index in range(len(products)):
    temp_A = []
    temp_B = []
    temp_C = []
    temp_D = []
    for lst in products[index]:
        x = lst.split('/')
        temp_A.append(x[0])
        temp_B.append(x[1])
        temp_C.append(x[2])
        temp_D.append(x[3])

    cat_A.append(temp_A)
    cat_B.append(temp_B)
    cat_C.append(temp_C)
    cat_D.append(temp_D)
        

In [85]:
def padlist(lst,pad_index): 
    # Pads the list at the end it helps us to get required length pf variables later
    while len(lst) != pad_index:
        lst.append('0')

def mkUniq(lst,desired_length): # make all the lists unique
    out_list = []
    for index, x in enumerate(lst):
        list_set = set(x) 
        unique_list = (list(list_set)) 
        padlist(unique_list,desired_length)        
        out_list.append(np.array(unique_list).astype('object'))
    return np.array(out_list)

def most_common(lst,n): # Returns first n elements from the list with most common occurence

def maxlen(lst): # gives out maximum length of sublist
    max = 0
    for x in lst:
        if len(x) > max:
            max = len(x)
    return max

In [86]:
# 3 6 13 36
cat_A = mkUniq(cat_A,3)     
cat_B = mkUniq(cat_B,6)
cat_C = mkUniq(cat_C,13)
cat_D = mkUniq(cat_D,36)

In [87]:
# Check maximum number of unique elements in a category
#print(maxlen(cat_A),maxlen(cat_B),maxlen(cat_C),maxlen(cat_D))
# Output was 3 6 13 36 for A, B, C, D respectively

In [88]:
# Convert the above arrays as dataframesto add them as variables
def create_list(letter,length):
    lst = []
    for i in range(length):
        str = letter+'{:2d}'.format(i)
        lst.append(str)
    return lst

cols_A = create_list('A',3)
cat_A = pd.DataFrame(cat_A,columns=cols_A)    

cols_B = create_list('B',6)
cat_B = pd.DataFrame(cat_B,columns=cols_B)     

cols_C = create_list('C',13)
cat_C = pd.DataFrame(cat_C,columns=cols_C)     

cols_D = create_list('D',36)
cat_D = pd.DataFrame(cat_D,columns=cols_D)

In [89]:
cat_D

Unnamed: 0,D 0,D 1,D 2,D 3,D 4,D 5,D 6,D 7,D 8,D 9,...,D26,D27,D28,D29,D30,D31,D32,D33,D34,D35
0,D28435,D02554,D28436,D28437,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,D29407,D25444,D29404,D02617,D29411,D29418,D29410,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,D16944,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,D10284,D10285,D10286,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,D30805,D30806,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10495,D06407,D06409,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10496,D11660,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10497,D18028,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10498,D09454,D09453,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
