In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read Training Set
df = pd.read_csv('train.csv',dtype={
                             'ip': np.uint8,
                             'app': np.uint8,
                             'device' : np.uint8,
                             'os' : np.uint8,
                             'channel' : np.uint8,
                             'is_attributed' : np.uint8    
                        },parse_dates=['click_time'])

In [3]:
df['day'] = df['click_time'].dt.day.astype('uint8')
# Sample Data
df = df[df['day']==7]

In [4]:
df['hour'] = df['click_time'].dt.hour.astype('uint8')
df['minute'] = df['click_time'].dt.minute.astype('uint8')
df['second'] = df['click_time'].dt.second.astype('uint8')

#Edit minute variable to two-digit string
df['minute'] = df['minute'].apply(lambda x: '{0:0>2}'.format(x))

In [5]:
ranges = [-1, 14, 29, 44, np.inf]  # np.inf for infinity
labels = ['1', '2', '3', '4']
df['seconds_grp'] = pd.cut(df['second'], 
                                  bins=ranges, 
                                  labels=labels)
df['interval'] = df['hour'].astype(str) + df['minute'].astype(str) + df['seconds_grp'].astype(str)
#df['interval'] = df['interval'].astype('uint32')

df.sample(5)

MemoryError: Unable to allocate 455. MiB for an array with shape (59633310,) and data type object

In [None]:
df_interval = df.groupby(
    'interval'
).aggregate(
    {
        'is_attributed':sum,
        'click_time':"count"
    }
).rename(
    columns = {
        'is_attributed': 'total_dl', 
        'click_time': 'total_clicks'
    }, 
    inplace = False)
df_interval.sample(5)

In [None]:
df_interval['conversion_rate'] = df_interval['total_dl']/df_interval['total_clicks']
df_interval['ratio'] = df_interval['total_clicks']/df_interval['total_dl']
df_interval.reset_index(level=0, inplace=True)
df_interval.head()

In [None]:
df_interval.dtypes

In [None]:
df_interval.plot(x ='interval', y='conversion_rate', kind = 'line')
plt.show()

In [None]:
df_interval.plot(x ='interval', y='ratio', kind = 'line')
plt.show()

In [None]:
df_interval.plot(x ='interval', y='total_clicks', kind = 'line')
plt.show()

In [None]:
#threshold for fraud total clicks
co = df_interval[df_interval['conversion_rate']==0]
Q1 = co.total_clicks.quantile(0.25)
Q3 = co.total_clicks.quantile(0.75)
IQR = Q3 - Q1
click_outlier = Q3 + 1.5 * IQR
print(click_outlier)

In [None]:
#threshold for fraud total clicks
ro = df_interval[df_interval['conversion_rate']!=0]
Q1 = ro.ratio.quantile(0.25)
Q3 = ro.ratio.quantile(0.75)
IQR = Q3 - Q1
ratio_outlier = Q3 + 1.5 * IQR
print(ratio_outlier)

In [None]:
# apply to df_interval
df_interval.loc[(df_interval['total_dl'] == 0) & (df_interval['total_clicks'] > click_outlier), 'isFraud'] = 1
df_interval.loc[(df_interval['total_dl'] != 0) & (df_interval['ratio'] > ratio_outlier), 'isFraud'] = 1
df_interval.loc[(df_interval['isFraud'].isnull()), 'isFraud'] = 0

# event rate on df_interval
df_interval.isFraud.sum()/len(df_interval)

In [None]:
# apply to df
df = pd.merge(df, df_interval[['interval', 'isFraud']], on='interval', how='left')

# event rate on df
df.isFraud.sum()/len(df)