In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import PCA
from datetime import datetime
import time
import matplotlib.pyplot as plt
from scipy.stats import mode

In [None]:
# Training tables are tab-separated dumps without a header row, so column names
# are supplied explicitly. Time/date fields are read as text rather than numbers
# (presumably to keep any leading zeros; they are padded and sliced further down).
uid_train = pd.read_csv(
    'data/train/uid_train.txt',
    sep='\t',
    header=None,
    names=('uid', 'label'),
)
voice_train = pd.read_csv(
    'data/train/voice_train.txt',
    sep='\t',
    header=None,
    names=('uid', 'opp_num', 'opp_head', 'opp_len', 'start_time', 'end_time', 'call_type', 'in_out'),
    dtype={'start_time': str, 'end_time': str},
)
sms_train = pd.read_csv(
    'data/train/sms_train.txt',
    sep='\t',
    header=None,
    names=('uid', 'opp_num', 'opp_head', 'opp_len', 'start_time', 'in_out'),
    dtype={'start_time': str},
)
wa_train = pd.read_csv(
    'data/train/wa_train.txt',
    sep='\t',
    header=None,
    names=('uid', 'wa_name', 'visit_cnt', 'visit_dura', 'up_flow', 'down_flow', 'wa_type', 'date'),
    dtype={'date': str},
)

In [None]:
# Test tables: tab-separated, no header row, time/date fields read as text.
voice_test = pd.read_csv(
    'data/test/voice_test_b.txt',
    sep='\t',
    header=None,
    names=('uid', 'opp_num', 'opp_head', 'opp_len', 'start_time', 'end_time', 'call_type', 'in_out'),
    dtype={'start_time': str, 'end_time': str},
)
sms_test = pd.read_csv(
    'data/test/sms_test_b.txt',
    sep='\t',
    header=None,
    names=('uid', 'opp_num', 'opp_head', 'opp_len', 'start_time', 'in_out'),
    dtype={'start_time': str},
)
wa_test = pd.read_csv(
    'data/test/wa_test_b.txt',
    sep='\t',
    header=None,
    names=('uid', 'wa_name', 'visit_cnt', 'visit_dura', 'up_flow', 'down_flow', 'wa_type', 'date'),
    dtype={'date': str},
)

In [None]:
# Build the list of test uids from every test table. Taking uids from wa_test
# alone would silently drop test users that have calls or messages but no
# web/app records. wa_test is listed first, so its uids keep their original
# order and any additional uids are appended after them.
uid_test = pd.DataFrame({
    'uid': pd.unique(
        pd.concat(
            [wa_test['uid'], voice_test['uid'], sms_test['uid']],
            ignore_index=True,
        )
    )
})
uid_test.to_csv('ref/uid_test_a.txt', index=False)

In [None]:
# Stack the train and test records of each table so features are built over
# both at once. ignore_index=True gives each combined frame a fresh 0..n-1
# index; without it the row labels of the train and test parts collide, and any
# later label-based alignment on these frames becomes ambiguous.
voice = pd.concat([voice_train, voice_test], axis=0, ignore_index=True)
sms = pd.concat([sms_train, sms_test], axis=0, ignore_index=True)
wa = pd.concat([wa_train, wa_test], axis=0, ignore_index=True)

In [None]:
# start_time is handled as an 8-character DDHHMMSS string; shorter values are
# left-padded with zeros (zfill leaves longer values untouched).
sms['start_time'] = sms['start_time'].map(lambda raw: str(raw).zfill(8))
# First two characters -> day number.
sms['start_date'] = sms['start_time'].map(lambda stamp: int(stamp[:2]))
# Remaining six characters -> time of day (parsed without any date part).
sms['start_time'] = sms['start_time'].map(
    lambda stamp: datetime.strptime(':'.join((stamp[2:4], stamp[4:6], stamp[6:8])), "%H:%M:%S")
)
sms['start_hour'] = sms['start_time'].map(lambda moment: moment.hour)
# 0/1 flags: hour within 8..18 inclusive, 3-character opp_head, opp_len above 5.
sms['is_work_time'] = sms['start_hour'].map(lambda hour: int(8 <= hour <= 18))
sms['is_long_head'] = sms['opp_head'].map(lambda head: int(len(str(head)) == 3))
sms['is_long_len'] = sms['opp_len'].map(lambda length: int(length > 5))

In [None]:
# Fit a label encoder on the distinct opp_num values seen in the SMS table.
# `encoded` holds the integer code of each distinct value and `in_encoded` the
# values those codes decode back to (same order as `encoded`).
le = LabelEncoder()
encoded = le.fit_transform([*set(sms['opp_num'])])
in_encoded = le.inverse_transform(encoded)

In [None]:
def sms_feat(sms):
    """Aggregate SMS records into one feature row per uid.

    Parameters
    ----------
    sms : pandas.DataFrame
        SMS records. The columns read here are ``uid``, ``opp_num``,
        ``opp_head``, ``opp_len``, ``start_time``, ``is_long_head`` and
        ``is_work_time`` (the last two are expected to hold 0/1 flags).

    Returns
    -------
    pandas.DataFrame
        One row per uid (sorted by uid) with ``uid`` as the first column,
        followed by, in this order:

        * ``sms_cnt`` - number of records
        * ``sms_cnt_num`` - distinct ``opp_num`` values
        * ``sms_head_cnt``, ``sms_head_max``, ``sms_head_min``,
          ``sms_head_mode`` - distinct count / max / min / mode of ``opp_head``
        * ``sms_cnt_long_head``, ``sms_cnt_not_long_head`` - records with
          ``is_long_head`` equal to 1 / 0
        * ``sms_cnt_work``, ``sms_cnt_not_work`` - records with
          ``is_work_time`` equal to 1 / 0
        * ``sms_opp_len_cnt``, ``sms_opp_len_std``, ``sms_opp_len_max``,
          ``sms_opp_len_min``, ``sms_opp_len_mean``, ``sms_opp_len_median``,
          ``sms_opp_len_mode`` - statistics of ``opp_len``

        Undefined statistics (e.g. the std of a single record) are reported
        as 0.
    """
    def smallest_mode(values):
        # Series.mode() lists every tied value in ascending order; taking the
        # first one resolves ties to the smallest value. An all-missing group
        # has no mode and is reported as 0, like other undefined statistics.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else 0

    by_uid = sms.groupby('uid')

    # Every feature below is a Series/DataFrame indexed by uid, so assignments
    # are aligned on uid instead of relying on matching row positions.
    feats = pd.DataFrame({'sms_cnt': by_uid.size()})
    # dropna=False counts a missing value once, as one distinct value.
    feats['sms_cnt_num'] = by_uid['opp_num'].nunique(dropna=False)

    feats['sms_head_cnt'] = by_uid['opp_head'].nunique(dropna=False)
    feats = feats.join(
        by_uid['opp_head'].agg(['max', 'min']).add_prefix('sms_head_').fillna(0)
    )
    feats['sms_head_mode'] = by_uid['opp_head'].apply(smallest_mode)

    def flag_counts(flag, counted):
        # Non-null `counted` values per (uid, flag value), one column per flag
        # value. reindex guarantees that both the 0 and the 1 column exist even
        # when the input only contains one of the two flag values.
        table = sms.groupby(['uid', flag])[counted].count().unstack(fill_value=0)
        return table.reindex(index=feats.index, columns=[0, 1], fill_value=0)

    long_head = flag_counts('is_long_head', 'opp_head')
    feats['sms_cnt_long_head'] = long_head[1]
    feats['sms_cnt_not_long_head'] = long_head[0]

    work_time = flag_counts('is_work_time', 'start_time')
    feats['sms_cnt_work'] = work_time[1]
    feats['sms_cnt_not_work'] = work_time[0]

    feats['sms_opp_len_cnt'] = by_uid['opp_len'].nunique(dropna=False)
    feats = feats.join(
        by_uid['opp_len']
        .agg(['std', 'max', 'min', 'mean', 'median'])
        .add_prefix('sms_opp_len_')
        .fillna(0)
    )
    feats['sms_opp_len_mode'] = by_uid['opp_len'].apply(smallest_mode)

    # Move uid from the index back to the first column.
    return feats.reset_index()

In [None]:
total_feature = pd.DataFrame()

# SMS features are computed separately for each 16-day window of start_date
# (days 0-15, 16-31, 32-47) and each in_out value, then outer-joined on uid
# into one wide table.
for day_from in (0, 16, 32):
    in_window = (sms['start_date'] >= day_from) & (sms['start_date'] < day_from + 16)
    sms_window = sms[in_window]
    for in_out_value, direction in ((0, 'out'), (1, 'in')):
        sms_part = sms_window[sms_window['in_out'] == in_out_value]
        part_uids = sms_part.groupby('uid')['uid'].count().index
        # Drop the leading uid column before suffixing the feature names, then
        # put uid back as the first column.
        sms_feature = sms_feat(sms_part).iloc[:, 1:].add_suffix(
            "_day{}_{}".format(day_from, direction)
        )
        sms_feature.insert(0, 'uid', part_uids)
        if day_from == 0 and in_out_value == 0:
            total_feature = sms_feature
        else:
            total_feature = total_feature.merge(
                sms_feature, on='uid', how='outer'
            ).reset_index(drop=True)

# A uid missing from some window/direction gets 0 for that block of features.
total_feature = total_feature.fillna(0)

In [None]:
# 'DDD' is the one non-numeric opp_head marker handled here: it is mapped to 0
# so that the whole column can be converted to int.
voice['opp_head'] = voice['opp_head'].replace('DDD', 0).map(int)

# Both timestamps are handled as 8-character DDHHMMSS strings; shorter values
# are left-padded with zeros.
for stamp_column in ('start_time', 'end_time'):
    voice[stamp_column] = voice[stamp_column].map(lambda raw: str(raw).zfill(8))

# First two characters of the start stamp -> day number.
voice['start_date'] = voice['start_time'].map(lambda stamp: int(stamp[:2]))

# Remaining six characters -> time of day (parsed without any date part).
for stamp_column in ('start_time', 'end_time'):
    voice[stamp_column] = voice[stamp_column].map(
        lambda stamp: datetime.strptime(':'.join((stamp[2:4], stamp[4:6], stamp[6:8])), "%H:%M:%S")
    )

voice['start_hour'] = voice['start_time'].map(lambda moment: moment.hour)
# 0/1 flag: call started within hours 8..18 inclusive.
voice['is_work_time'] = voice['start_hour'].map(lambda hour: int(8 <= hour <= 18))

In [None]:
# Call length in seconds. The difference of the two time-of-day values can be
# negative; `.seconds` is the non-negative seconds component (0..86399) of
# that difference.
voice['time_dura'] = (voice['end_time'] - voice['start_time']).map(lambda span: span.seconds)

In [None]:
def voice_feat(voice):
    """Aggregate call records into one feature row per uid.

    Parameters
    ----------
    voice : pandas.DataFrame
        Call records. The columns read here are ``uid``, ``opp_num``,
        ``opp_head``, ``opp_len``, ``time_dura``, ``is_work_time`` (expected
        to hold 0/1 flags) and ``call_type``.

    Returns
    -------
    pandas.DataFrame
        One row per uid (sorted by uid) with ``uid`` as the first column,
        followed by, in this order:

        * ``voice_cnt`` - number of records
        * ``voice_cnt_num`` - distinct ``opp_num`` values
        * ``voice_head_cnt``, ``voice_head_max``, ``voice_head_min``,
          ``voice_head_mode`` - distinct count / max / min / mode of
          ``opp_head``
        * ``voice_dura_{std,max,min,mean,median,sum}`` - statistics of
          ``time_dura``
        * ``voice_cnt_work``, ``vocie_cnt_not_work`` - records with
          ``is_work_time`` equal to 1 / 0
        * ``voice_opp_len_cnt``, ``voice_opp_len_{std,max,min,mean,median}``,
          ``voice_opp_len_mode`` - statistics of ``opp_len``
        * ``voice_cnt_type<k>`` - records per ``call_type`` value ``k``, one
          column for each value present in the input

        Undefined statistics (e.g. the std of a single record) are reported
        as 0.
    """
    def smallest_mode(values):
        # Series.mode() lists every tied value in ascending order; taking the
        # first one resolves ties to the smallest value. An all-missing group
        # has no mode and is reported as 0, like other undefined statistics.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else 0

    by_uid = voice.groupby('uid')

    # Every feature below is a Series/DataFrame indexed by uid, so assignments
    # are aligned on uid instead of relying on matching row positions.
    feats = pd.DataFrame({'voice_cnt': by_uid.size()})
    # dropna=False counts a missing value once, as one distinct value.
    feats['voice_cnt_num'] = by_uid['opp_num'].nunique(dropna=False)

    feats['voice_head_cnt'] = by_uid['opp_head'].nunique(dropna=False)
    feats = feats.join(
        by_uid['opp_head'].agg(['max', 'min']).add_prefix('voice_head_').fillna(0)
    )
    feats['voice_head_mode'] = by_uid['opp_head'].apply(smallest_mode)

    feats = feats.join(
        by_uid['time_dura']
        .agg(['std', 'max', 'min', 'mean', 'median', 'sum'])
        .add_prefix('voice_dura_')
        .fillna(0)
    )

    # Calls per (uid, is_work_time). reindex guarantees that both the 0 and the
    # 1 column exist even when the input only contains one of the two values.
    work_time = (
        voice.groupby(['uid', 'is_work_time'])['opp_num'].count()
        .unstack(fill_value=0)
        .reindex(index=feats.index, columns=[0, 1], fill_value=0)
    )
    # is_work_time == 1 marks work-time calls, so column 1 is the work count.
    feats['voice_cnt_work'] = work_time[1]
    # The misspelled column name is kept as-is: it is part of the output schema.
    feats['vocie_cnt_not_work'] = work_time[0]

    feats['voice_opp_len_cnt'] = by_uid['opp_len'].nunique(dropna=False)
    feats = feats.join(
        by_uid['opp_len']
        .agg(['std', 'max', 'min', 'mean', 'median'])
        .add_prefix('voice_opp_len_')
        .fillna(0)
    )
    feats['voice_opp_len_mode'] = by_uid['opp_len'].apply(smallest_mode)

    # One count column per call_type value; a uid without calls of a given
    # type gets 0 in that column.
    per_type = (
        voice.groupby(['uid', 'call_type'])['opp_num'].count()
        .unstack()
        .add_prefix('voice_cnt_type')
        .fillna(0)
    )
    feats = feats.join(per_type)

    # Move uid from the index back to the first column.
    return feats.reset_index()

In [None]:
# Call features are computed separately for each 16-day window of start_date
# (days 0-15, 16-31, 32-47) and each in_out value, then outer-joined on uid
# onto the feature table built so far.
for day_from in (0, 16, 32):
    in_window = (voice['start_date'] >= day_from) & (voice['start_date'] < day_from + 16)
    voice_window = voice[in_window]
    for in_out_value, direction in ((0, 'out'), (1, 'in')):
        voice_part = voice_window[voice_window['in_out'] == in_out_value]
        part_uids = voice_part.groupby('uid')['uid'].count().index
        # Drop the leading uid column before suffixing the feature names, then
        # put uid back as the first column.
        voice_feature = voice_feat(voice_part).iloc[:, 1:].add_suffix(
            "_day{}_{}".format(day_from, direction)
        )
        voice_feature.insert(0, 'uid', part_uids)
        total_feature = total_feature.merge(
            voice_feature, on='uid', how='outer'
        ).reset_index(drop=True)

# A uid missing from some window/direction gets 0 for that block of features.
total_feature = total_feature.fillna(0)

In [None]:
# Per-record ratios. A zero denominator yields +/-inf (NaN for 0/0); infinities
# are mapped to 0 for BOTH ratios so they cannot leak into the per-user
# std/max/mean aggregates computed from these columns.
wa['flow_per_sec'] = (
    (wa['up_flow'] + wa['down_flow']) / wa['visit_dura']
).replace([np.inf, -np.inf], 0)
wa['dura_per_visit'] = (
    wa['visit_dura'] / wa['visit_cnt']
).replace([np.inf, -np.inf], 0)

# Missing measurements count as 0. This runs after the ratios are computed, so
# a ratio built from a missing input is NaN first; flow_per_sec is then zeroed
# below, while dura_per_visit keeps its NaN.
wa['visit_dura'] = wa['visit_dura'].fillna(0)
wa['up_flow'] = wa['up_flow'].fillna(0)
wa['down_flow'] = wa['down_flow'].fillna(0)
wa['flow_per_sec'] = wa['flow_per_sec'].fillna(0)

# A missing date becomes day 0; all other values are converted to int.
wa['date'] = wa['date'].fillna(0).apply(lambda x: int(x))

In [None]:
def wa_feat(wa):
    """Aggregate web/app records into one feature row per uid.

    Reads the columns ``uid``, ``wa_name``, ``visit_cnt``, ``visit_dura``,
    ``dura_per_visit``, ``up_flow``, ``down_flow`` and ``flow_per_sec`` of
    ``wa`` and returns a DataFrame with ``uid`` first, then ``wa_cnt`` (records
    per uid), ``wa_name_cnt`` (distinct ``wa_name`` values) and one group of
    prefixed statistics per numeric column. Statistics that come out as NaN
    are replaced by 0.
    """
    per_user = wa.groupby('uid')

    record_counts = per_user['uid'].apply(lambda ids: ids.count())
    distinct_names = per_user['wa_name'].apply(lambda names: len(set(names)))
    result = pd.DataFrame({'uid': record_counts.index, 'wa_cnt': record_counts.values})
    result['wa_name_cnt'] = distinct_names.values

    with_sum = ['std', 'max', 'min', 'mean', 'median', 'sum']
    without_sum = ['std', 'max', 'min', 'mean', 'median']
    # (source column, output prefix, statistics), in output column order.
    plan = (
        ('visit_cnt', 'wa_visit_cnt_', with_sum),
        ('visit_dura', 'wa_visit_dura_', with_sum),
        ('dura_per_visit', 'dura_per_visit_', without_sum),
        ('up_flow', 'wa_up_flow_dura_', with_sum),
        ('down_flow', 'wa_down_flow_dura_', with_sum),
        ('flow_per_sec', 'wa_flow_per_sec_', without_sum),
    )
    for column, prefix, stats in plan:
        summary = per_user[column].agg(stats).add_prefix(prefix).reset_index().fillna(0)
        result = result.merge(summary, on='uid', how='left').reset_index(drop=True)

    return result

In [None]:
# Web/app features are computed separately for each 16-day window of date
# (days 0-15, 16-31, 32-47) and each wa_type value (0, 1), then outer-joined on
# uid onto the feature table built so far.
for day_from in (0, 16, 32):
    in_window = (wa['date'] >= day_from) & (wa['date'] < day_from + 16)
    wa_window = wa[in_window]
    for type_value in (0, 1):
        wa_part = wa_window[wa_window['wa_type'] == type_value]
        part_uids = wa_part.groupby('uid')['uid'].count().index
        # Drop the leading uid column before suffixing the feature names, then
        # put uid back as the first column.
        wa_feature = wa_feat(wa_part).iloc[:, 1:].add_suffix(
            "_day{}_type{}".format(day_from, type_value)
        )
        wa_feature.insert(0, 'uid', part_uids)
        total_feature = total_feature.merge(
            wa_feature, on='uid', how='outer'
        ).reset_index(drop=True)

# A uid missing from some window/type gets 0 for that block of features.
total_feature = total_feature.fillna(0)

In [None]:
#pca = PCA(n_components=100)
#new_pca = pca.fit_transform(total_feature.iloc[:,1:])

In [None]:
#total_feature.iloc[:,1:] = preprocessing.scale(total_feature.iloc[:,1:])

In [None]:
#total_feature.iloc[:,1:] = Normalizer().fit_transform(total_feature.iloc[:,1:])

In [None]:
# Attach the feature table to the labelled training uids. The left join keeps
# every row of uid_train; a uid without any features ends up with NaN there.
train_feature = uid_train.merge(total_feature, how='left', on='uid')

In [None]:
# Attach the feature table to the test uids. The left join keeps every row of
# uid_test; a uid without any features ends up with NaN there.
test_feature = uid_test.merge(total_feature, how='left', on='uid')

In [None]:
# Persist both feature tables as CSV without the row-index column.
train_feature.to_csv('train_feature.csv', index=False)
test_feature.to_csv('test_feature.csv', index=False)

In [None]:
# Number of columns in the combined feature table (uid included).
print(total_feature.shape[1])