In [2]:
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
from sklearn import model_selection,preprocessing,metrics
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

import re
from plotly import tools
import plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
pd.options.display.max_columns = 500

import ast # Abstract Syntax Trees : The ast module helps Python applications to process trees of the Python abstract syntax grammar.
import datetime as dt
import pickle
import gc   # Garbage Collector : gc exposes the underlying memory management mechanism of Python
gc.enable()



In [4]:
def get_date(data):
    """Parse the ``date`` column and add calendar features, mutating ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``date`` column parseable with the ``%Y%m%d`` format.

    Returns
    -------
    None
        The frame is modified in place: ``year``, ``month``, ``week`` (ISO week
        number) and ``weekday`` (Monday == 0) are added, and ``date`` is
        replaced by plain ``datetime.date`` objects.
    """
    parsed = pd.to_datetime(data['date'], format='%Y%m%d')

    data['year'] = parsed.dt.year
    data['month'] = parsed.dt.month
    # ``Series.dt.week`` was deprecated in pandas 1.1 and removed in 2.0;
    # ``isocalendar().week`` yields the same ISO week number. It comes back as
    # UInt32, so cast to the plain int64 the old accessor produced.
    data['week'] = parsed.dt.isocalendar().week.astype('int64')
    data['weekday'] = parsed.dt.weekday

    # Vectorised equivalent of building a datetime.date per row with apply().
    data['date'] = parsed.dt.date


In [5]:
def get_date_feature(data):
    """Attach per-visitor visit-count features and return the merged frame.

    Added columns (all keyed on ``fullVisitorId``):

    * ``monthly_visit``   - number of non-null ``month`` values for the visitor.
    * ``weekly_visit``    - number of non-null ``week`` values for the visitor.
    * ``weekdaily_visit`` - mean, over the visitor's (year, week) groups, of the
      number of non-null ``weekday`` values in each group.

    The input frame is not modified; a new frame is returned.
    """
    per_visitor = data.groupby(['fullVisitorId'])
    monthly = per_visitor['month'].count().reset_index(name='monthly_visit')
    weekly = per_visitor['week'].count().reset_index(name='weekly_visit')

    # First count rows per visitor-week, then average those counts per visitor.
    rows_per_week = (
        data.groupby(['fullVisitorId', 'year', 'week'])['weekday']
        .count()
        .reset_index()
    )
    weekdaily = (
        rows_per_week.groupby(['fullVisitorId'])['weekday']
        .mean()
        .reset_index(name='weekdaily_visit')
    )

    enriched = data
    for feature in (monthly, weekly, weekdaily):
        enriched = enriched.merge(feature, on='fullVisitorId', how='left')
    return enriched


In [6]:
def get_retention_rate(data):
    """Attach per-visitor retention features and return the merged frame.

    For every ``fullVisitorId`` the distinct visit dates are collected and the
    following columns are derived:

    * ``revisit_dur_time``      - days between the first visit date and the
      earliest later visit date (NaN when the visitor has a single visit date).
    * ``total_life_time``       - days between the first and the last visit
      date (NaN when the visitor has a single visit date).
    * ``agg_total_visit_count`` - number of distinct visit dates minus one.
    * ``stickness``             - ``agg_total_visit_count / total_life_time``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``fullVisitorId`` and ``date`` columns. ``date`` may hold
        ``datetime.date`` objects or timestamps.

    Returns
    -------
    pandas.DataFrame
        ``data`` left-joined with the four columns above. The input frame is
        not modified.
    """
    # Normalise to datetime64 so the subtractions below always yield
    # timedelta64 and ``.dt.days`` is valid; this avoids relying on how pandas
    # handles arithmetic between object-dtype columns of datetime.date values.
    visit_days = (
        data[['fullVisitorId', 'date']]
        .dropna()
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
        .drop_duplicates()
    )

    # Mark every date strictly after the visitor's first date; first-visit rows
    # become NaT so min()/max() over ``return_date`` ignore them.
    first_seen = visit_days.groupby('fullVisitorId')['date'].transform('min')
    visit_days = visit_days.assign(
        return_date=visit_days['date'].where(visit_days['date'] > first_seen)
    )

    by_visitor = visit_days.groupby('fullVisitorId')
    first_visit = by_visitor['date'].min()
    second_visit = by_visitor['return_date'].min()
    last_visit = by_visitor['return_date'].max()

    retention = pd.DataFrame({
        'revisit_dur_time': (second_visit - first_visit).dt.days,
        'total_life_time': (last_visit - first_visit).dt.days,
        # Return visits only: the first visit date is not counted.
        'agg_total_visit_count': by_visitor['date'].count() - 1,
    })
    # total_life_time is NaN for single-visit visitors, so stickness is NaN too.
    retention['stickness'] = (
        retention['agg_total_visit_count'] / retention['total_life_time']
    )

    return data.merge(retention.reset_index(), on='fullVisitorId', how='left')

In [7]:
def get_entropy(data, numerator, denumerator):
    """Compute a per-visitor ``-p * log2(p)`` score for one column pair.

    For every ``fullVisitorId``, ``p`` is the count of non-null ``numerator``
    values divided by the sum of ``denumerator``. The returned score is
    ``abs(-p * log2(p))``, with ``p == 0`` mapped to 0. Note this is a single
    ``p * log2(p)`` term, not a sum over a distribution.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``fullVisitorId``, ``numerator`` and ``denumerator`` columns.
    numerator : str
        Column whose non-null values are counted per visitor.
    denumerator : str
        Column summed per visitor; the sum is cast to float before dividing.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``fullVisitorId`` and ``f'{numerator}_Ent'``, one row per
        visitor. ``data`` itself is not modified.
    """
    per_visitor = (
        data[['fullVisitorId', denumerator, numerator]]
        .groupby(['fullVisitorId'])
        .agg({numerator: 'count', denumerator: 'sum'})
        .reset_index()
    )

    ratio = per_visitor[numerator] / per_visitor[denumerator].astype('float')

    # log2(0) is -inf; those rows are replaced by 0 in np.where, so silence the
    # floating-point warnings raised while evaluating the discarded branch.
    with np.errstate(divide='ignore', invalid='ignore'):
        score = np.where(ratio == 0, 0, ratio * np.log2(ratio) * -1)

    # Build the result as a fresh frame rather than assigning into a column
    # slice, which is what triggers pandas' SettingWithCopyWarning.
    return pd.DataFrame({
        'fullVisitorId': per_visitor['fullVisitorId'],
        f'{numerator}_Ent': np.abs(score),
    })

In [8]:
# Column names collected under the label "numeric".
# NOTE(review): presumably the columns handled as numeric features downstream;
# the consumer of this list is not visible in this part of the notebook - confirm.
# Several entries are produced by helpers defined above: monthly_visit /
# weekly_visit / weekdaily_visit (get_date_feature), revisit_dur_time /
# total_life_time / agg_total_visit_count / stickness (get_retention_rate),
# and the '*_Ent' names follow get_entropy's f'{numerator}_Ent' pattern.
# 'hour' is not created by get_date as shown here - verify where it comes from.
numeric_list = [
'hits_hitNumber',
'hits_hour',
'totals_timeOnSite',
'totals_hits',
'totals_sessionQualityDim',
'totals_transactions',
'totals_pageviews',
'hits_minute',
'visitNumber',
'totals_bounces',
'hits_time',
'hits_latencyTracking.redirectionTime',
'hour',
'month',
'weekday',
'year',
'monthly_visit',
'weekly_visit',
'weekdaily_visit',
'revisit_dur_time',
'total_life_time',
'agg_total_visit_count',
'stickness',
'hits_contentGroup.contentGroup2_Ent',
'trafficSource_adwordsClickInfo.gclId_Ent',
'week',
'hits_latencyTracking.pageLoadTime',
'hits_latencyTracking.pageDownloadTime',
'hits_latencyTracking.domainLookupTime',
'hits_latencyTracking.domContentLoadedTime',
'hits_latencyTracking.serverResponseTime',]

In [9]:
# Column names collected under the label "del".
# NOTE(review): presumably columns to be dropped before modelling; the code
# that consumes this list is not visible in this part of the notebook - confirm.
del_list = [
'hits_social.socialInteractionNetworkAction','hits_experiment','hits_publisher_infos',
'hits_page.pageTitle',
'hits_page.pagePath',
'hits_page.pagePathLevel1',
'hits_page.pagePathLevel3',
'hits_page.pagePathLevel2',
'hits_page.pagePathLevel4',
'hits_appInfo.landingScreenName',
'hits_eventInfo.eventLabel',
'trafficSource_keyword',
'hits_customDimensions',
'hits_customVariables',
'hits_customMetrics',
'trafficSource_adwordsClickInfo.gclId',
'hits_latencyTracking.speedMetricsSample'
]

In [10]:
def aggregate(df, col, leave):
    """Collapse every category of ``df[col]`` not listed in ``leave`` into ``"grouped"``.

    The column is first cast to ``str`` (so missing values become strings and
    are grouped as well unless their string form is in ``leave``). ``df`` is
    modified in place and also returned.
    """
    df[col] = df[col].astype('str')

    kept = set(leave)
    # Distinct values present in the column that the caller did not ask to keep.
    to_collapse = [value for value in df[col].unique() if value not in kept]

    df.loc[df[col].isin(to_collapse), col] = "grouped"
    return df

def aggregate_opposite(df, col, regs, group_name):
    """Relabel the categories of ``df[col]`` that match a regex with a group name.

    For each position ``k`` in ``group_name``, every distinct value of the
    column in which ``regs[k]`` is found (regex search anywhere in the string)
    is replaced by ``group_name[k]``. Candidate values are collected once, from
    the column as it was before any relabelling, so a value already relabelled
    by an earlier pattern is not touched again by a later one.

    The column is cast to ``str`` first. ``df`` is modified in place and also
    returned.
    """
    df[col] = df[col].astype('str')
    original_values = df[col].unique().tolist()

    for k, label in enumerate(group_name):
        pattern = regs[k]
        matched = [value for value in original_values if re.search(pattern, value)]
        df.loc[df[col].isin(matched), col] = label

    return df


def get_nth_rev(col, nth, data=None):
    """Return the ``nth`` categories of ``col`` with the most non-zero revenue rows.

    Rows are grouped by ``col`` and, per group, the entries of
    ``totals_transactionRevenue`` that ``np.count_nonzero`` treats as non-zero
    are counted. The group keys of the ``nth`` largest counts are returned.

    Parameters
    ----------
    col : str
        Categorical column to rank.
    nth : int
        Number of categories to return.
    data : pandas.DataFrame, optional
        Frame to rank. When omitted, the module-level ``data`` variable is used,
        which is what this function always did before the parameter existed.

    Returns
    -------
    list
        Category values, highest non-zero-revenue count first.

    Raises
    ------
    NameError
        If ``data`` is omitted and no module-level ``data`` exists.
    """
    if data is None:
        # Legacy behaviour: fall back to the notebook-global frame.
        try:
            data = globals()['data']
        except KeyError:
            raise NameError("name 'data' is not defined") from None

    # NOTE(review): np.count_nonzero counts NaN as non-zero, so missing revenue
    # inflates the count unless it was filled with 0 upstream - confirm.
    ranked = (
        data.groupby(col)['totals_transactionRevenue']
        .agg(lambda revenue: np.count_nonzero(revenue))
        .to_frame('count of non-zero revenue')
        .sort_values(by='count of non-zero revenue', ascending=False)
        .head(nth)
        .reset_index()
    )
    return list(ranked[col])


def reduce_cat(data):
    """Reduce the cardinality of several categorical columns.

    Three strategies are applied, each to its own set of columns:

    1. Regex grouping: values matching a pattern are relabelled with a group
       name and everything else becomes ``"grouped"``.
    2. Keep-list grouping: only an explicit list of values is kept.
    3. Top-N grouping: only the N categories with the most non-zero revenue
       rows (see ``get_nth_rev``) are kept.

    ``data`` is modified in place (the helpers assign into its columns) and is
    also returned, so ``data = reduce_cat(data)`` works.
    """
    # Category grouping: pick out values by regex, then group everything else.
    # NOTE(review): '.' in patterns such as 'store.html' is a regex wildcard,
    # not a literal dot; left as is to keep the resulting categories unchanged.
    regex_groups = [
        ('trafficSource_referralPath',
         [r'^\/$', 'yt', 'mail'],
         ['/', 'yt', 'mail']),
        ('hits_referer',
         ['google', 'youtube', 'facebook'],
         ['google', 'youtube', 'facebook']),
        ('hits_appInfo.exitScreenName',
         ['store.html', 'signin.html', 'basket', 'vieworderdetail', 'ordercompleted.html', '/home'],
         ['store', 'signin', 'basket', 'vieworder', 'complete', 'home']),
        ('hits_appInfo.screenName',
         ['apparel', 'store.html', 'signin.html', 'basket', 'vieworderdetail', 'ordercompleted.html', '/home'],
         ['apparel', 'store', 'signin', 'basket', 'vieworder', 'complete', 'home']),
    ]
    for col, regs, group_name in regex_groups:
        data = aggregate_opposite(data, col, regs, group_name)
        data = aggregate(data, col, group_name)

    # Category grouping: keep only the listed values, group the rest.
    data = aggregate(data, 'trafficSource_source',
                     leave=['(direct)', 'google', 'youtube.com', 'facebook.com'])
    data = aggregate(data, 'trafficSource_campaign', leave=['(not set)'])

    # Assign the filled column back: an in-place fillna on a column selection
    # may act on a temporary copy and leave the frame unchanged.
    data['trafficSource_adContent'] = data['trafficSource_adContent'].fillna('isnull')
    data = aggregate(data, 'trafficSource_adContent', leave=['isnull'])

    # Category grouping: keep the top-N categories by non-zero revenue count.
    # The frame is passed explicitly so the ranking uses the data being
    # processed rather than whatever global ``data`` happens to exist.
    top_n_by_revenue = [
        ('geoNetwork_city', 30),
        ('geoNetwork_country', 30),
        ('geoNetwork_metro', 15),
        ('geoNetwork_region', 20),
        ('device_browser', 5),
        ('geoNetwork_networkDomain', 10),
    ]
    for col, nth in top_n_by_revenue:
        ghl = get_nth_rev(col, nth, data)
        data = aggregate(data, col, leave=ghl)

    return data

In [11]:
# Names of six fields, labelled as JSON variables.
# NOTE(review): presumably the nested-JSON columns of the raw export that get
# flattened elsewhere; the flattening code is not visible here - confirm.
json_vars = ['device', 'geoNetwork', 'totals', 'trafficSource', 'hits', 'customDimensions']

# Flat column names, most of them prefixed with one of the json_vars entries
# plus '_' (for example 'device_browser', 'hits_page.pagePath').
# NOTE(review): presumably the column set expected after flattening; the code
# that consumes this list is not visible in this part of the notebook - confirm.
final_vars = ['channelGrouping','customDimensions_index','customDimensions_value','date',
'device_browser','device_deviceCategory','device_isMobile','device_operatingSystem',
'fullVisitorId','geoNetwork_city','geoNetwork_continent','geoNetwork_country',
'geoNetwork_metro','geoNetwork_networkDomain','geoNetwork_region','geoNetwork_subContinent',
'hits_appInfo.exitScreenName','hits_appInfo.landingScreenName','hits_appInfo.screenDepth',
'hits_appInfo.screenName','hits_contentGroup.contentGroup1','hits_contentGroup.contentGroup2',
'hits_contentGroup.contentGroup3','hits_contentGroup.contentGroup4','hits_contentGroup.contentGroup5',
'hits_contentGroup.contentGroupUniqueViews1','hits_contentGroup.contentGroupUniqueViews2',
'hits_contentGroup.contentGroupUniqueViews3','hits_contentGroup.previousContentGroup1',
'hits_contentGroup.previousContentGroup2','hits_contentGroup.previousContentGroup3',
'hits_contentGroup.previousContentGroup4','hits_contentGroup.previousContentGroup5',
'hits_customDimensions','hits_customMetrics','hits_customVariables','hits_dataSource',
'hits_eCommerceAction.action_type','hits_eCommerceAction.option','hits_eCommerceAction.step',
'hits_eventInfo.eventAction','hits_eventInfo.eventCategory','hits_eventInfo.eventLabel',
'hits_exceptionInfo.isFatal','hits_experiment','hits_hitNumber','hits_hour','hits_isEntrance',
'hits_isExit','hits_isInteraction','hits_item.currencyCode','hits_item.transactionId',
'hits_latencyTracking.domContentLoadedTime','hits_latencyTracking.domInteractiveTime',
'hits_latencyTracking.domLatencyMetricsSample','hits_latencyTracking.domainLookupTime',
'hits_latencyTracking.pageDownloadTime','hits_latencyTracking.pageLoadSample',
'hits_latencyTracking.pageLoadTime','hits_latencyTracking.redirectionTime',
'hits_latencyTracking.serverConnectionTime','hits_latencyTracking.serverResponseTime',
'hits_latencyTracking.speedMetricsSample','hits_minute','hits_page.hostname','hits_page.pagePath',
'hits_page.pagePathLevel1','hits_page.pagePathLevel2','hits_page.pagePathLevel3',
'hits_page.pagePathLevel4','hits_page.pageTitle','hits_page.searchCategory','hits_page.searchKeyword',
'hits_promotionActionInfo.promoIsClick','hits_promotionActionInfo.promoIsView','hits_publisher_infos',
'hits_referer','hits_social.hasSocialSourceReferral','hits_social.socialInteractionNetworkAction',
'hits_social.socialNetwork','hits_time','hits_transaction.affiliation','hits_transaction.currencyCode',
'hits_transaction.localTransactionRevenue','hits_transaction.localTransactionShipping',
'hits_transaction.localTransactionTax','hits_transaction.transactionId',
'hits_transaction.transactionRevenue','hits_transaction.transactionShipping',
'hits_transaction.transactionTax','hits_type','totals_bounces','totals_hits','totals_newVisits',
'totals_pageviews','totals_sessionQualityDim','totals_timeOnSite','totals_totalTransactionRevenue',
'totals_transactionRevenue','totals_transactions','trafficSource_adContent',
'trafficSource_adwordsClickInfo.adNetworkType','trafficSource_adwordsClickInfo.gclId',
'trafficSource_adwordsClickInfo.isVideoAd','trafficSource_adwordsClickInfo.page',
'trafficSource_adwordsClickInfo.slot','trafficSource_campaign','trafficSource_isTrueDirect',
'trafficSource_keyword','trafficSource_medium','trafficSource_referralPath','trafficSource_source',
'visitId','visitNumber','visitStartTime']

In [12]:
# Directory holding the pickled frames. The default is the original hard-coded
# external-drive location; set GA_DATA_DIR to run the notebook elsewhere.
DATA_DIR = os.environ.get('GA_DATA_DIR', '/Volumes/My Passport')


def _read_pickled_frame(filename):
    """Load ``filename`` from ``DATA_DIR`` with ``pandas.read_pickle``.

    SECURITY: unpickling executes code embedded in the file; only load pickles
    that were produced by a trusted source.

    Raises
    ------
    ValueError
        If the file uses a pickle protocol this interpreter cannot read (for
        example protocol 5, which requires Python 3.8 or newer). The message
        says how to recover.
    """
    path = os.path.join(DATA_DIR, filename)
    try:
        return pd.read_pickle(path)
    except ValueError as err:
        if 'unsupported pickle protocol' in str(err):
            raise ValueError(
                f"{err} while reading {path!r}: the file was written by a newer "
                "Python than this kernel. Run the notebook on Python >= 3.8, or "
                "re-export the frame with DataFrame.to_pickle(path, protocol=4)."
            ) from err
        raise


train_v2 = _read_pickled_frame('train_02_fin.pkl')
test_v2 = _read_pickled_frame('test_v2_0.pkl')


ValueError: unsupported pickle protocol: 5