In [1]:
import pandas as pd
import numpy as np
from operator import itemgetter
from itertools import product
from sklearn.metrics import roc_auc_score

In [2]:
people = pd.read_csv('people.csv')
train = pd.read_csv('act_train.csv')
test = pd.read_csv('act_test.csv')

In [None]:
#String in dataset are in the form of "str int"
#We only want to keep the int part

#Get start with people
names = []
for name in people.columns:
    if 'char' in name:
        if type(people[name][0]) == str:
            people[name] = people[name].str.replace('type ','')
        if type(people[name][0]) == np.bool_:
            people[name] = people[name]+0
    if 'group' in name:
        people[name] = people[name].str.replace('group ', '')
    if 'people_id' in name:
        people[name] = people[name].str.replace('ppl_', '')
        names.append(name)
    else:
        names.append('ppl_' + name)

#transform date to date variable
people['date'] = pd.to_datetime(people['date'])

#change column names
people.columns = names

In [None]:
#Then transform act dataset
#fill NA
train = train.fillna('type 0')
test = test.fillna('type 0')
test['outcome'] = None

train = train.append(test)

#transfrom date
train['date'] = pd.to_datetime(train['date'])

names = []
for name in train.columns:
    if name == 'outcome':
        names.append(name)
        continue
    if 'char' in name or 'category' in name:
        train[name] = train[name].str.replace('type ','')
    if 'activity_id' in name:
        train[name] = train[name].str.replace('act2_','')
        train[name] = train[name].str.replace('act1_','')
    if 'people_id' in name:
        train[name] = train[name].str.replace('ppl_', '')
        names.append(name)
    else:
        names.append('act_' + name)
        
#change columns name
train.columns = names

#split
test = train[train['outcome'].isnull()]
train = train[~train['outcome'].isnull()]

In [None]:
#merge act and people
train = pd.merge(train, people, on = 'people_id')
test = pd.merge(test, people, on = 'people_id')

In [None]:
train.to_csv('train.csv',index= False)
test.to_csv('test.csv', index = False)

Then we want to apply what we find in EDA: some groups only hava same outcome.
To avoid occurance, we will choose those group who occued over 100 times.

******************************
******************************
******************************

In [None]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [None]:
all_zero_group = []
all_one_group = []
other_group = []
count = 0
#get different group and store them in list
for g in train.groupby('ppl_group_1'):
    if len(g[1]) > 100:
        outcome_mean = g[1]['outcome'].mean()
        if outcome_mean == 1:
            all_one_group.append( g[0] )
        elif outcome_mean == 0:
            all_zero_group.append( g[0] )
        else:
            other_group.append( g[0] )
            count += len(g[1])
    else:
        other_group.append( g[0] )
        count += len(g[1])

In [None]:
data = train.append(test)

In [None]:
#get all leak data position
#leak_one = data['ppl_group_1'].isin(all_zero_group)
#leak_zero = data['ppl_group_1'].isin(all_one_group)
#data['outcome_leak1'] = data['outcome'].copy()
data['outcome_leak1'] = None
data['outcome_leak1'][data['ppl_group_1'].isin(all_zero_group)] = 0.05
data['outcome_leak1'][data['ppl_group_1'].isin(all_one_group)] = 0.95

In [None]:
#Split data into train and test again
train = data[~data['outcome'].isnull()]
test = data[data['outcome'].isnull()]

In [None]:
#We want to build cross validation dataset
#By EDA, there are duplicated data in our dataset
#We want there are same number of valid observation in each fold.
names = []
for name in train.columns:
    if name != 'act_activity_id':
        names.append(name)
#unique_train contrains all valid information
unique_train = train[~train.duplicated(subset = names)]

In [None]:
#Based on discussion on the forum, it's a good choice to create cv set by people_id
#cv_people contains people_id and its observation number and its mean score (used for stratify)
cv_people = []
for g in unique_train.groupby('people_id'):
    cv_people.append([g[0], len(g[1]), g[1]['outcome'].mean()])
#sort outcome mean since we want to get stratified set based on outcome
cv_people = sorted(cv_people, key = itemgetter(2), reverse = True)

In [None]:
#Then create cv set based on id and corresponding number of non-repeated activity
#create 10-fold containing outcome stratified people_id
cv_people_id = []
cv_group_count = np.zeros(10)
for i in range(10):
    cv_people_id.append( [])
for g in cv_people:
    need_add_index = np.argsort(cv_group_count)[0]
    cv_people_id[need_add_index].append(g[0])
    cv_group_count[need_add_index] += g[1]

In [None]:
#create train and cv stratified dataset
#cv_train contains unqiue information
#cv_eval contains all information
cv_train = []
cv_eval = []
cv_eval_tgt = []

#cv_eval_dup = []
#??
#cv_eval_tgt_dup = []

for g in cv_people_id:
    cv_train.append(unique_train[~unique_train.people_id.isin(g)])
    
    #cv_eval.append(unique_train[unique_train.people_id.isin(g)])
    #cv_eval_tgt.append(unique_train[unique_train.people_id.isin(g)][['act_activity_id', 'outcome']].copy())
    
    cv_eval.append(train[train.people_id.isin(g)])
    cv_eval_tgt.append(train[train.people_id.isin(g)][['act_activity_id', 'outcome'] ].copy())


In [None]:
#According to EDA, combination of group and date map to unique outcome
#Thus, we want to create a group,date table
train['act_date'] = pd.to_datetime(train['act_date'])

alldays = pd.date_range(min(train['act_date']),  max(train['act_date']), freq='D')

In [1]:
#This function is provided is the kaggle forum
def interpolateFun0(x):
    """Original script author's function rewritten in Python.
    The author interpolates between two known values by averaging them. We
    can think of this as 0th order interpolation. """

    ## TODO: This function could use some optimization. The R version is much faster...
    x = x.reset_index(drop=True)
    g = x['outcome'].copy() ## g should be a list or a pandas Series.
    
    global fv

    if (g.shape[0] < 3): ## If we have at most two rows.
        x['outcome_filled'] = g ## Will be replaced by a mean.
#        x['outcome'] = x['filled']
        return x
    
    if np.sum(g.isnull()) == 0:
        x['outcome_filled'] = g
        return x
    
    out = g.values.copy()
    value_locs = np.where(~g.isnull())[0]
    
    if len(value_locs) == 0:
        x['outcome_filled'] = np.full_like(out, np.nan)
#        x['outcome'] = x['filled']
        return x
    
    if len(value_locs) == 1:
        fillval = .89 if (g[value_locs[0]] == 1) else .13
        fv.append((g[value_locs[0]], fillval))
        g[g.isnull()] = fillval

        x['outcome_filled'] = g
#        x['outcome'] = x['filled']

        return x        
    
    # Fill in beginning (if needed)
    if value_locs[0]:
        
        fillval = .89 if (g[value_locs[0]] == 1) else .13
        fv.append((g[value_locs[0]], fillval))
        
        out[0:value_locs[0]] = fillval

    # Interpolate holes in the middle
    for i in range(0, len(value_locs) - 1):
        beg = value_locs[i]
        end = value_locs[i + 1]
        
        if g[beg] != g[end]:
            out[beg+1:end] = np.interp(range(beg+1, end), [beg, end], [g[beg], g[end]])
        else:
            out[beg+1:end] = g[beg]

    # Fill in end (if needed)
    if end < (len(g) - 1):
        beg = value_locs[-1]
        fillval = .89 if (g[beg] == 1) else .13
        fv.append((g[beg], fillval))

        out[beg+1:] = fillval

    x['outcome_filled'] = out
#    x['outcome'] = x['filled']
    
    return x

In [18]:
#Apply this rule via interpolation on this each cross-validation set and check its power
fv = []
for i in range(10):
    #create group-date table for cv set used to be interpolation
    #get all group in a cv set
    group_name = cv_eval[i]['ppl_group_1'].unique()
    cv_gd_table = pd.DataFrame.from_records(product(group_name, alldays))
    cv_gd_table.columns = ['ppl_group_1','act_date']

    #get known gt-table in the train set
    train_gd_table = cv_train[i].groupby(['ppl_group_1', 'act_date'])['outcome'].agg('mean').to_frame().reset_index()
    train_gd_table['act_date'] = pd.to_datetime(train_gd_table['act_date'])
    #fill cv table by train table
    cv_gd_table = pd.merge(cv_gd_table, train_gd_table,on =['ppl_group_1','act_date'],  how = 'left')
    
    #insert value to the None date in the same group
    cv_gd_table = cv_gd_table.groupby('ppl_group_1').apply(interpolateFun0)
    cv_gd_table.columns = ['ppl_group_1', 'act_date', 'outcome_leak2', 'outcome_ip']
    
    cv_eval[i]['act_date'] = pd.to_datetime(cv_eval[i]['act_date'] )
    cv_eval[i] = pd.merge(cv_eval[i], cv_gd_table, on = ['ppl_group_1', 'act_date'], how = 'left')
    
    #Since sometimes, groups in cv set are not in train set. In this case, we can't fill th value. Instead, we can fill all Na with the
    #mean value in training set
    cv_eval[i]['outcome_filled'] = cv_eval[i]['outcome_ip'].fillna(cv_train[i]['outcome'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [19]:
del  cv_gd_table
del train_gd_table

In [20]:
#We should append all we get to trainset as new feature
#concat eval set together and merge with original train set
temp = pd.concat(cv_eval)[['act_activity_id','outcome_leak2', 'outcome_ip', 'outcome_filled']]
train = pd.merge(train, temp, on='act_activity_id', how = 'left')
del temp

In [21]:
group_name = test['ppl_group_1'].unique()
cv_gd_table = pd.DataFrame.from_records(product(group_name, alldays))
cv_gd_table.columns = ['ppl_group_1','act_date']

train_gd_table = train.groupby(['ppl_group_1', 'act_date'])['outcome'].agg('mean').to_frame().reset_index()
train_gd_table['act_date'] = pd.to_datetime(train_gd_table['act_date'])

cv_gd_table = pd.merge(cv_gd_table, train_gd_table,on =['ppl_group_1','act_date'],  how = 'left')


cv_gd_table = cv_gd_table.groupby('ppl_group_1').apply(interpolateFun0)
cv_gd_table.columns = ['ppl_group_1', 'act_date', 'outcome_leak2', 'outcome_ip']

test['act_date'] = pd.to_datetime(test['act_date'] )
test = pd.merge(test, cv_gd_table, on = ['ppl_group_1', 'act_date'], how = 'left')

test['outcome_filled'] = test['outcome_ip'].fillna(train['outcome'].mean())

del  cv_gd_table
del train_gd_table

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [22]:
train.to_csv("leak_train.csv")
test.to_csv("leak_test.csv")

******************************
******************************
******************************

In [None]:
train = pd.read_csv("leak_train.csv")
test = pd.read_csv("leak_test.csv")

In [24]:
#To fit our data into classifier, we need to transform our category variables via one-hot or tf-idf.
data = train.append(test)

In [25]:
#According to EDA, category variables include 
#act: "act_activity_category", 'act_char_1' --'act_char_10'
#people: 'ppl_group_1', 'ppl_group_1' -- 'ppl_group_9'
#'ppl_group_1' -'ppl_group_9' are the detailed activity, 'ppl_group_10' is more vague. Thus it containes more 
#much more tyep than any one of others. We need to filter some its value.
char_10_count = train.groupby('act_char_10')['ppl_char_10'].agg('count').order(ascending = False)



In [26]:
print ('Char_10 has {} types '.format(len(char_10_count)))
print ('Maximum count is {0}, minimum count is {1} '.format(max(char_10_count), min(char_10_count) ))
print ('{} observations occured over 100 times '.format(  len(char_10_count[char_10_count>100]) ))
print ('They contains {}% of whole data '.format(  sum(char_10_count[char_10_count>100]) / float(sum(char_10_count)) ))


Char_10 has 6516 types 
Maximum count is 935241, minimum count is 1 
1076 observations occured over 100 times 
They contains 0.95612089102% of whole data 


In [27]:
#transform all low-frequency type as -1
a = data.act_char_10.copy()
a[a.isin(char_10_count[char_10_count <= 100].index)] = -1
data.act_char_10 = a
del a

In [28]:
#need to be done

In [29]:
#backup
data.act_date = pd.to_datetime(data.act_date).values.astype('datetime64[D]')
data.ppl_date = pd.to_datetime(data.ppl_date).values.astype('datetime64[D]')

In [30]:
#manage date-related variable, mutate them to create more variables
#extract the weekday information
data['act_dayofweek'] = data['act_date'].copy()
data['act_dayofweek'] = data['act_dayofweek'].dt.dayofweek

In [31]:
#get the day difference starting with the minimum day

#first for act
minimum_day = data['act_date'].min()
data['act_date_diff'] = data['act_date'].copy()
data['act_date_diff'] = (data['act_date_diff'] - minimum_day) / np.timedelta64(1, 'D') 

#Then for people
minimum_day = data['ppl_date'].min()
data['ppl_date_diff'] = data['ppl_date'].copy()
data['ppl_date_diff'] = (data['ppl_date_diff'] - minimum_day) / np.timedelta64(1, 'D')
#Besides, we assume there may be some connection between ppl_date and corresponding act_ppl
data['ap_date_diff'] = (data['act_date'] - data['ppl_date']) /  np.timedelta64(1, 'D')

In [29]:
######
#need to be done


In [32]:
data.head()

Unnamed: 0,people_id,act_activity_id,act_date,act_activity_category,act_char_1,act_char_2,act_char_3,act_char_4,act_char_5,act_char_6,...,ppl_char_37,ppl_char_38,outcome_leak1,outcome_leak2,outcome_ip,outcome_filled,act_dayofweek,act_date_diff,ppl_date_diff,ap_date_diff
0,100.0,1734928.0,2023-08-26,4,0,0,0,0,0,0,...,0,36,0.05,0.0,0.0,0.0,5,405.0,407.0,788.0
1,100.0,2434093.0,2022-09-27,2,0,0,0,0,0,0,...,0,36,0.05,0.0,0.0,0.0,1,72.0,407.0,455.0
2,100.0,3404049.0,2022-09-27,2,0,0,0,0,0,0,...,0,36,0.05,0.0,0.0,0.0,1,72.0,407.0,455.0
3,100.0,3651215.0,2023-08-04,2,0,0,0,0,0,0,...,0,36,0.05,0.0,0.0,0.0,4,383.0,407.0,766.0
4,100.0,4109017.0,2023-08-26,2,0,0,0,0,0,0,...,0,36,0.05,0.0,0.0,0.0,5,405.0,407.0,788.0


In [33]:
#End of this step
train = data[~data['outcome'].isnull()]
test = data[data['outcome'].isnull()]
train.to_csv("leak_train.csv")
test.to_csv("leak_test.csv")