In [None]:
!pip install scikit-learn==0.21.3

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime
import os

import time
import seaborn as sns

In [None]:
# Default figure size applied through seaborn's rc settings.
sns.set(rc={'figure.figsize':(12,5)});
# NOTE(review): this opens a new 12x5 figure that nothing in this cell draws on.
plt.figure(figsize=(12,5));

In [None]:
# Read only the first 10,000,000 rows of the training file; the test file is read in full.
train = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/train.csv', nrows=10000000)
test = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/test.csv')

In [None]:
train.head()

In [None]:
test.head()

In [None]:
# Cast the five encoded identifier columns to categoricals in both frames.
variables = ['ip', 'app', 'device', 'os', 'channel']
for frame in (train, test):
    for v in variables:
        frame[v] = frame[v].astype('category')

In [None]:
# Parse the timestamp columns into datetimes (only click_time is converted for test).
for time_col in ('click_time', 'attributed_time'):
    train[time_col] = pd.to_datetime(train[time_col])
test['click_time'] = pd.to_datetime(test['click_time'])

# Treat the is_attributed target in train as a categorical.
train['is_attributed'] = train['is_attributed'].astype('category')

In [None]:
train.describe()

In [None]:
# Count the distinct values of each encoded feature and draw them as bars on a
# log-scaled y axis, with the exact count written above each bar.
plt.figure(figsize=(10, 6))
cols = ['ip', 'app', 'device', 'os', 'channel']
uniques = [len(train[col].unique()) for col in cols]
sns.set(font_scale=1.2)
# x/y are passed by keyword: current seaborn releases reject positional data arguments.
ax = sns.barplot(x=cols, y=uniques, log=True)
ax.set(xlabel='Feature', ylabel='log(unique count)', title='Number of unique values per feature (from 10,000,000 samples)')
for p, uniq in zip(ax.patches, uniques):
    height = p.get_height()
    # Centre the label horizontally on the bar, slightly above its top.
    ax.text(p.get_x()+p.get_width()/2.,
            height + 10,
            uniq,
            ha="center")


In [None]:
# Sanity check: summarise 'attributed_time' for the rows that resulted in a
# download (is_attributed == 1) to see whether any of them is missing a timestamp.
attributed_rows = train['is_attributed'] == 1
train.loc[attributed_rows, ['attributed_time', 'is_attributed']].describe()

In [None]:
# Cast click_id to a categorical so describe() reports it as an identifier
# (count/unique/top/freq) rather than with numeric statistics.
test['click_id']=test['click_id'].astype('category')
test.describe()

Quick Notes/Observations :

* There are only 18717 attributed_time values. This means only 18,717 out of 10,000,000 clicks resulted in a download. That's less than 0.2%!
* There are ip addresses that trigger a click over 50 thousand times. It seems strange that one ip address would click that often in a span of just 4 days. Does that mean that the encoded ip address is not a device id, but a network id? (explored below)
* The first click in the train set is on 2017-11-06 14:32:21. Test clicks start on 2017-11-10. Based on the data specifications, train covers a 4-day period. This means that the train and test data do not overlap, but the test data is taken the day after the train data ends.
* Train data is ordered by timestamp (therefore batches pulled in order cover a limited time span).
* 2017-11-06 was a Monday. 2017-11-10 was a Friday, i.e. train is Mon-Thu and test is Friday.
* There is no missing data in test. Missing values in train appear to be only for attributed_time, where there isn't any value due to no app download.

Only a small proportion of clicks were followed by a download:

In [None]:
# Share of clicks that were / were not followed by a download, as a two-bar chart
# with the percentage written above each bar.
plt.figure(figsize=(6,6))
mean = (train.is_attributed.values == 1).mean()
labels = ['App Downloaded (1)', 'Not Downloaded (0)']
shares = [mean, 1-mean]
# x/y are passed by keyword: current seaborn releases reject positional data arguments.
ax = sns.barplot(x=labels, y=shares)
ax.set(ylabel='Proportion', title='App Downloaded vs Not Downloaded')
for p, uniq in zip(ax.patches, shares):
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height+0.01,
            '{}%'.format(round(uniq * 100, 2)),
            ha="center")


**Explore ip counts. Check if multiple ips have any downloads.**

At this point I was trying to figure out what 'ip' were actually encoding. My original understanding that ips were user specific did not hold up to scrutiny. If ip repeated too many times, was it a bot? This does not appear to be true, as repeated ips do convert. See below:

In [None]:
# Temporary table pairing each ip with the number of clicks it produced,
# most frequent first; show the ten busiest ips.
ip_frequency = train['ip'].value_counts()
temp = ip_frequency.reset_index(name='counts')
temp.columns = ['ip', 'counts']
temp.head(10)

In [None]:
#add temporary counts of ip feature ('counts') to the train table, to see if IPs with high counts have conversions
# NOTE(review): re-running this cell merges again on an already-extended frame, so the
# 'counts' column would be duplicated with suffixes; run it once per fresh `train`.
train= train.merge(temp, on='ip', how='left')

In [None]:
# Ten converted clicks (is_attributed == 1) coming from the most frequent ips.
converted_clicks = train[train['is_attributed']==1]
converted_clicks.sort_values('counts', ascending=False).head(10)

In [None]:
train[train['is_attributed']==1].ip.describe()

So high frequency ip counts do get conversions. Up to 56 downloads for one ip. Each IP must be for some network with many devices.

In [None]:
#convert 'is_attributed' back to numeric for proportion calculations
train['is_attributed']=train['is_attributed'].astype(int)

**Conversion rates over Counts of 300 most popular IPs**

In [None]:
# Per-ip click count and download proportion, ordered from most to least clicked.
by_ip = train[['ip', 'is_attributed']].groupby('ip', as_index=False)
proportion = by_ip.mean().sort_values('is_attributed', ascending=False)
counts = by_ip.count().sort_values('is_attributed', ascending=False)
merge = counts.merge(proportion, on='ip', how='left')
merge.columns = ['ip', 'click_count', 'prop_downloaded']

# Click counts on the primary axis, download proportion on the secondary axis.
ax = merge.head(300).plot(secondary_y='prop_downloaded')
ax.set(ylabel='Count of clicks')
plt.title('Conversion Rates over Counts of 300 Most Popular IPs')
plt.ylabel('Proportion Downloaded')
plt.show()

print('Counversion Rates over Counts of Most Popular IPs')
print(merge.head(20))

Conversions are noisy and do not appear to correlate with how popular an IP is.

**Conversions by App**

Check 100 most popular apps by click count:

In [None]:
# Per-app click count and download proportion, ordered from most to least clicked.
by_app = train[['app', 'is_attributed']].groupby('app', as_index=False)
proportion = by_app.mean().sort_values('is_attributed', ascending=False)
counts = by_app.count().sort_values('is_attributed', ascending=False)
merge = counts.merge(proportion, on='app', how='left')
merge.columns = ['app', 'click_count', 'prop_downloaded']

# Click counts on the primary axis, download proportion on the secondary axis.
ax = merge.head(100).plot(secondary_y='prop_downloaded')
ax.set(ylabel='Count of clicks')
plt.title('Conversion Rates over Counts of 100 Most Popular Apps')
plt.ylabel('Proportion Downloaded')
plt.show()

print('Counversion Rates over Counts of Most Popular Apps')
print(merge.head(20))


There is again a huge difference in clicks per app, with a minimum of one click on an app and a max at almost 13 million. The proportion fluctuates more as the counts go down, since each additional click has a larger impact on the proportion value. In general, for apps with counts in the thousands the ratio stays within the 0.0001 - 0.0015 boundary. For less popular apps it fluctuates more widely.

**Conversions by OS**

Look at top 100 operating systems by click count

In [None]:
# Per-os click count and download proportion, ordered from most to least clicked.
by_os = train[['os', 'is_attributed']].groupby('os', as_index=False)
proportion = by_os.mean().sort_values('is_attributed', ascending=False)
counts = by_os.count().sort_values('is_attributed', ascending=False)
merge = counts.merge(proportion, on='os', how='left')
merge.columns = ['os', 'click_count', 'prop_downloaded']

# Click counts on the primary axis, download proportion on the secondary axis.
ax = merge.head(100).plot(secondary_y='prop_downloaded')
ax.set(ylabel='Count of clicks')
plt.title('Conversion Rates over Counts of 100 Most Popular Operating Systems')
plt.ylabel('Proportion Downloaded')
plt.show()

print('Counversion Rates over Counts of Most Popular Operating Systems')
print(merge.head(20))

Same story. For values in the thousands the boundary on the ratio is very low, roughly between 0.0006 and 0.003, but as counts per OS become lower, the ratio starts fluctuating more wildly.

**Conversions by Device**

Devices are extremely disproportionately distributed, with number one device used almost 94% of time. For that device proportion download was 0.001326. (0.13%)

In [None]:
# Per-device click count and download proportion, ordered from most to least clicked.
by_device = train[['device', 'is_attributed']].groupby('device', as_index=False)
proportion = by_device.mean().sort_values('is_attributed', ascending=False)
counts = by_device.count().sort_values('is_attributed', ascending=False)
merge = counts.merge(proportion, on='device', how='left')
merge.columns = ['device', 'click_count', 'prop_downloaded']

# No chart here: the table itself is printed.
print('Count of clicks and proportion of downloads by device:')
print(merge)

**Conversions by Channel**

In [None]:
# Per-channel click count and download proportion, ordered from most to least clicked.
proportion = train[['channel', 'is_attributed']].groupby('channel', as_index=False).mean().sort_values('is_attributed', ascending=False)
counts = train[['channel', 'is_attributed']].groupby('channel', as_index=False).count().sort_values('is_attributed', ascending=False)
merge = counts.merge(proportion, on='channel', how='left')
merge.columns = ['channel', 'click_count', 'prop_downloaded']

# Click counts on the primary axis, download proportion on the secondary axis.
ax = merge[:100].plot(secondary_y='prop_downloaded')
# The title previously said "Apps" (copied from the app cell); this chart shows channels.
plt.title('Conversion Rates over Counts of 100 Most Popular Channels')
ax.set(ylabel='Count of clicks')
plt.ylabel('Proportion Downloaded')
plt.show()

print('Conversion Rates over Counts of Most Popular Channels')
print(merge[:20])

There appear to be a few peaks for channels at reasonable click quantity, but overall the pattern holds same as for categories above.

# Checking for time patterns

Round the click time down to an hour of the day to see if there are any hourly patterns.

For this part we cannot use the first n rows from the train data, as it is ordered by time. To get a general idea of the pattern, we will use the randomly sampled 100,000-row train set provided by the organizers.

In [None]:
train_smp = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/train_sample.csv')

In [None]:
train_smp.head(7)

In [None]:
# Parse both timestamp columns of the sample into datetimes.
for time_col in ('click_time', 'attributed_time'):
    train_smp[time_col] = pd.to_datetime(train_smp[time_col])

In [None]:
# Round each click time to the nearest hour.
train_smp['click_rnd'] = train_smp['click_time'].dt.round('H')

# One grouping per rounded hour, reused for the click count and the conversion mean.
per_rounded_hour = train_smp[['click_rnd', 'is_attributed']].groupby(['click_rnd'], as_index=True)

per_rounded_hour.count().plot()
plt.title('HOURLY CLICK FREQUENCY');
plt.ylabel('Number of Clicks');

per_rounded_hour.mean().plot()
plt.title('HOURLY CONVERSION RATIO');
plt.ylabel('Converted Ratio');

There is no clear hourly time pattern in the ratios; however, there is a definite pattern in the frequency of clicks based on the time of day.

Lets extract the hour of day from each day as a separate feature, and see combined trend (merge the 4 days together by hour).

In [None]:
#extract hour as a feature
train_smp['click_hour']=train_smp['click_time'].dt.hour

In [None]:
train_smp.head(7)

Let's check number of clicks by hour:

In [None]:
# Number of clicks per hour of day, drawn once as bars and once as a line.
clicks_per_hour = train_smp[['click_hour', 'is_attributed']].groupby(['click_hour'], as_index=True).count()

clicks_per_hour.plot(kind='bar', color='#a675a1')
plt.title('HOURLY CLICK FREQUENCY Barplot');
plt.ylabel('Number of Clicks');

clicks_per_hour.plot(color='#a675a1')
plt.title('HOURLY CLICK FREQUENCY Lineplot');
plt.ylabel('Number of Clicks');

And number of conversions by hours:

In [None]:
# Mean of is_attributed per hour of day, drawn once as bars and once as a line.
ratio_per_hour = train_smp[['click_hour', 'is_attributed']].groupby(['click_hour'], as_index=True).mean()

ratio_per_hour.plot(kind='bar', color='#75a1a6')
plt.title('HOURLY CONVERSION RATIO Barplot');
plt.ylabel('Converted Ratio');

ratio_per_hour.plot(color='#75a1a6')
plt.title('HOURLY CONVERSION RATIO Lineplot');
plt.ylabel('Converted Ratio');

overlay the two graphs to see if patterns correlate in any way

In [None]:
# Hourly mean (conversion proportion) and count (click volume) of is_attributed.
by_hour = train_smp[['click_hour','is_attributed']].groupby(['click_hour'], as_index=False)
group = by_hour.mean()
x = group['click_hour']
ymean = group['is_attributed']
group = by_hour.count()
ycount = group['is_attributed']

# Two y axes sharing one x axis: proportion on the host axis, click count on its twin.
fig = plt.figure()
host = fig.add_subplot(111)
par1 = host.twinx()

color1 = '#75a1a6'
color2 = '#a675a1'

p1, = host.plot(x, ymean, color=color1,label="Proportion Converted")
p2, = par1.plot(x, ycount, color=color2, label="Click Count")

host.set_xlabel("Time")
host.set_ylabel("Proportion Converted")
par1.set_ylabel("Click Count")

# Colour each axis label like the line it belongs to.
host.yaxis.label.set_color(p1.get_color())
par1.yaxis.label.set_color(p2.get_color())

lns = [p1, p2]
host.legend(handles=lns, loc='best')

plt.savefig("pyplot_multiple_y-axis.png", bbox_inches='tight')

In [None]:
# Mean of is_attributed per click hour, as a seaborn bar plot.
# x/y are passed by keyword: current seaborn releases reject positional data arguments.
sns.barplot(x='click_hour', y='is_attributed', data=train_smp)
plt.title('HOURLY CONVERSION RATIO');
plt.ylabel('Converted Ratio');

**Look into attributed_time**

It could be useful to learn more about conversions that did take place. Let's see how much time passed from clicking on the ad to downloading it.

In [None]:
# Elapsed time between the click and the attributed download.
train_smp['timePass'] = train_smp['attributed_time'] - train_smp['click_time']
# Check: show the first 15 converted clicks.
converted = train_smp['is_attributed'] == 1
train_smp[converted].head(15)

In [None]:
train_smp['timePass'].describe()

It takes as long as (almost) 20 hours to go from click to purchase and as little as 4 seconds.

Four seconds seems too short to make a decision. This person would have either seen the ad before, or already been aware of the product some other way.

Does that mean the ad was clicked on multiple times, but only one click was counted as conversion? Or did the person click on the ad specifically with the intent to download? (eg, if channel is something like google search, the ad could be clicked during search results view and app downloaded immediately because that's what the person intended to do right away)

Raises questions to explore:

How accurately are conversions tracked? How are clicks and downloads linked? What happens if download after multiple clicks? Is there a way to identify likely same users (same IP, Device, etc...)

In [None]:
import os
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import plot_importance

In [None]:
# Rebind `train` to the sample file and `test` to the full test file,
# parsing the timestamp columns while reading.
train = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/train_sample.csv', parse_dates=['click_time', 'attributed_time'])
test = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/test.csv', parse_dates=['click_time'])



In [None]:
train.head()

In [None]:
test.head()

In [None]:
test.describe()

In [None]:
train.dtypes

In [None]:
# Every int64 column of train except the target is treated as an encoded identifier.
categories = train.select_dtypes('int64').columns.tolist()
categories.remove('is_attributed')

# Cast those columns to categoricals in both frames.
for frame in (train, test):
    for col in categories:
        frame[col] = frame[col].astype('category')

# The target and the test row id become categoricals as well.
test.click_id = test.click_id.astype('category')
train.is_attributed = train.is_attributed.astype('category')

In [None]:
# Share of each target value, as a two-bar chart with the percentage above each bar.
plt.figure(figsize=(8,8))
mean = (train.is_attributed.values == 1).mean()
labels = ['is_attributed (1)', 'is_attributed (0)']
shares = [mean, 1-mean]
# x/y are passed by keyword: current seaborn releases reject positional data arguments.
ax = sns.barplot(x=labels, y=shares, palette='deep')
ax.set(xlabel='Target', ylabel='Probability', title='Distribution of is_attributed')
for p, uniq in zip(ax.patches, shares):
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height+0.01,
            '{}%'.format(round(uniq * 100, 2)),
            ha="center")


In [None]:
# Distinct-value count of each column listed in `categories`, drawn as bars on a
# log-scaled y axis with the exact count written above each bar.
plt.figure(figsize=(10, 6))
uniques = [len(train[col].unique()) for col in categories]
sns.set(font_scale=1.2)
# x/y are passed by keyword: current seaborn releases reject positional data arguments.
ax = sns.barplot(x=categories, y=uniques, log=True)
ax.set(xlabel='Feature', ylabel='Count', title='Number of unique values per feature')
for p, uniq in zip(ax.patches, uniques):
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height + 10,
            uniq,
            ha="center")

**Check that attributed_time is recorded for every row where is_attributed is 1.**

In [None]:
# Number of downloaded clicks (is_attributed == 1) whose attributed_time is missing.
downloaded = train.is_attributed == 1
print(train.loc[downloaded, 'attributed_time'].isnull().sum())

**Check ip and device count relationship.**

In [None]:
# Number of non-null `device` entries per ip, summarised with describe().
# NOTE(review): this counts rows per ip, not distinct devices per ip; confirm
# whether nunique() was the intended aggregation.
train.groupby('ip')['device'].count().describe()

In [None]:
import xgboost as xgb
from sklearn import model_selection, metrics
from sklearn.model_selection import GridSearchCV

In [None]:
# Rebind `train` to (at most) the first 100,000 rows of the sample file.
train = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/train_sample.csv', nrows=100000, parse_dates=['click_time', 'attributed_time'])

# Features: the five raw encoded columns; target: is_attributed.
X_train = train[['ip', 'app', 'device', 'os', 'channel']]
y_train = train['is_attributed']

**Search for best parameters in terms of the area under the ROC curve metric as per the competition evaluation metrics.**

In [None]:
# Hyper-parameter grid for the XGBoost classifier.
# The float grids are written out explicitly: np.arange with a fractional step can
# include an extra element past the stop value (np.arange(0.5, 1.1, 0.1) ends at
# about 1.1, which is outside the valid (0, 1] range of colsample_bytree).
# NOTE(review): this grid has 81,000 combinations, i.e. 405,000 fits with cv=5;
# consider shrinking it or using a randomized search before running it in full.
params = {
    'max_depth': list(range(5, 11)),
    'learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25],
    'gamma': [0.01, 0.02, 0.03, 0.04, 0.05],
    'min_child_weight': list(range(1, 6)),
    'max_delta_step': list(range(10, 22, 2)),
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_lambda': [1000, 2000, 3000],

    # fixed params
    'scale_pos_weight': [99], # Because 99 percent of data is negative
    'n_jobs': [4],
    'objective': ['binary:logistic'],
    'random_state': [42]
}

model = xgb.XGBClassifier(tree_method='hist')
# Candidates are ranked by cross-validated ROC-AUC over 5 folds.
cv = GridSearchCV(model, params, cv=5, n_jobs=4, scoring='roc_auc')

cv.fit(X_train, y_train)

In [None]:
print('Best estimator:')
print(cv.best_estimator_)

# best_score_ is the mean cross-validated ROC-AUC of the winning parameter set,
# i.e. the figure the search actually optimised.
print('Best ROC-AUC: {:.4f}'.format(cv.best_score_))

# Scoring the refitted estimator on the rows it was trained on is optimistic,
# so it is reported separately and labelled as a training-set figure.
score = cv.best_estimator_.predict_proba(X_train)
print('Training-set ROC-AUC: {:.4f}'.format(metrics.roc_auc_score(y_train, score[:, 1])))

# Feature Engineering
**Extract clicking time into day, hour, minute:**

In [None]:
def processTimeFeatures(df):
    """Split ``click_time`` into day / hour / minute columns, in place.

    Adds ``click_day``, ``click_hour`` and ``click_minute`` (read through the
    ``.dt`` accessor of ``df.click_time``) and then drops ``click_time``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like ``click_time`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so both ``processTimeFeatures(df)`` and
        ``df = processTimeFeatures(df)`` leave the caller with the processed frame.
    """
    df['click_day'] = df.click_time.dt.day

    df['click_hour'] = df.click_time.dt.hour

    df['click_minute'] = df.click_time.dt.minute

    df.drop(['click_time'], axis=1, inplace=True)

    # Returning the frame keeps assignment-style callers from ending up with None.
    return df

In [None]:
processTimeFeatures(train)
train.head()

In [None]:
# Re-read the test file and add the day/hour/minute columns
# (processTimeFeatures modifies the frame in place).
test = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/test.csv', parse_dates=['click_time'])
processTimeFeatures(test)
test.head()

# Return rate:
**Is this a one-off or an actually interesting app to this user that he/she continues to look at it?**

In [None]:
# Start this section from freshly loaded frames (click_time is still present).
train = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/train_sample.csv', parse_dates=['click_time', 'attributed_time'])
test = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/test.csv', parse_dates=['click_time'])


In [None]:
def getAvgAppClickPerIp(df):
    """Add ``avg_app_click_by_ip``: clicks per distinct ip for each row's app.

    For every app the value is (number of rows for that app) divided by
    (number of distinct ``ip`` values among those rows); each row receives the
    value of its own app.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``app`` and ``ip`` columns. It is modified in place.
    """
    # transform() broadcasts the per-app ratio back onto every row of that app.
    # A plain agg() result is indexed by app id, and assigning it to a column
    # would align those app ids against the row index instead.
    per_row_ratio = df.groupby('app')['ip'].transform(
        lambda x: float(len(x)) / len(x.unique()))
    df['avg_app_click_by_ip'] = per_row_ratio.astype(float)

In [None]:
getAvgAppClickPerIp(train)

In [None]:
train.head()

In [None]:
train.avg_app_click_by_ip.describe()

# Time until next click:
**Is the user in a download spree?**

In [None]:
# Start this section from freshly loaded frames (click_time is still present).
train = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/train_sample.csv', parse_dates=['click_time', 'attributed_time'])
test = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/test.csv', parse_dates=['click_time'])


In [None]:
def getTimeToNextClick(df):
    """Add ``time_to_next_click``: seconds until the next click of the same group.

    Rows are grouped by (ip, os, device, channel). Within each group the clicks
    are taken in chronological order and every row receives the number of
    seconds until the following click; the last click of a group gets NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``ip``, ``os``, ``device``, ``channel`` and a datetime
        ``click_time`` column. It is modified in place. The result is written
        back by index label, so the index is assumed to be unique.
    """
    keys = ['ip', 'os', 'device', 'channel']
    # Order by time first so "next" means chronologically next even when the input
    # is not time-sorted; the stable sort keeps tied timestamps in input order.
    ordered = df.sort_values('click_time', kind='mergesort')
    following = ordered.groupby(keys)['click_time'].shift(-1)
    # total_seconds() keeps the day component; .dt.seconds would discard it.
    gap_seconds = (following - ordered['click_time']).dt.total_seconds()
    # Column assignment aligns on the index, restoring the original row order.
    df['time_to_next_click'] = gap_seconds

In [None]:
getTimeToNextClick(train)
train.head()

In [None]:
train.time_to_next_click.describe()

**Preprocessing data and write to files for future reuse**

In [None]:
# Build the engineered training file from a fresh read of the sample data.
# NOTE(review): the output name says "60mil", but the input read here is
# train_sample.csv; confirm which source file is intended.
train = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/train_sample.csv',parse_dates=['click_time', 'attributed_time'])

getAvgAppClickPerIp(train)
getTimeToNextClick(train)

# Written with the default index column.
train.to_csv('train_60mil_with_avgClick_timeToNext.csv')

# Drop the frame once it has been written.
del train

In [None]:
# Build the engineered test file from a fresh read of the test data.
test = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/test.csv', parse_dates=['click_time'])

# Apply the feature functions to `test`. The previous code passed `train`, which
# no longer exists after the `del train` in the preceding cell and would not have
# added the features to the frame being saved anyway.
getAvgAppClickPerIp(test)
getTimeToNextClick(test)

test.to_csv('test_with_avgClick_timeToNext.csv')

# Drop the frame once it has been written.
del test

# Train and test model
**Use xgboost.train instead of xgboost.XGBClassifier to enable incremental training.**

In [None]:
import time
import datetime
import xgboost as xgb

# Total rows to train on, first data row to use, and rows per batch.
num_rows = 60000000
start = 0
batch_size = 20000000

params = {
    'max_depth': 6,
    'learning_rate': 0.15,
    'objective': 'binary:logistic',
    'n_jobs': 3,
    'random_state': 42,
    'gamma': 0.03,
    'min_child_weight': 4,
    'max_delta_step': 20,
    'colsample_bytree': 0.7,
    'reg_lambda': 1000,
    'scale_pos_weight': 99, # Because 99 percent of data is negative
    'tree_method': 'gpu_hist',
    'predictor':'cpu_predictor' # To avoid Windows Error
}

feats = [ # features to train
    'ip',
    'app',
    'device',
    'os',
    'channel',
    'click_day',
    'click_hour',
    'click_minute',
    'avg_app_click_by_ip'
]

# Train batch by batch, continuing from the booster of the previous batch.
model = None
cur_done = start
while cur_done < num_rows:

    start_time = time.time()

    # Row 0 (the header) is kept; the data rows already consumed are skipped.
    train = pd.read_csv('train_60mil_with_avgClick_timeToNext.csv', skiprows=range(start+1, cur_done+1), nrows=batch_size, parse_dates=['click_time', 'attributed_time'])

    # Stop cleanly when the file holds fewer data rows than num_rows.
    if train.empty:
        break

    # processTimeFeatures edits the frame in place; its return value is not used
    # here, so `train` can never be rebound to None.
    processTimeFeatures(train)

    dmatrix = xgb.DMatrix(train[feats], train['is_attributed'], feature_names=feats)
    model = xgb.train(params, dmatrix, xgb_model=model, num_boost_round=100) # num_boost_round equivalent to n_estimators in XGBClassifier

    # Free the batch before the next one is read.
    del train

    elapsed = time.time() - start_time

    cur_done += batch_size
    left = num_rows - cur_done
    if left == 0:
        break
    # The final batch may be smaller than the configured batch size.
    batch_size = min(left, batch_size)

    print("\rProgress = {:9d}, batch size = {:8d}, left = {:9d}, elapsed = {:s}".format(
        cur_done, batch_size, left,str(datetime.timedelta(seconds=elapsed))))

In [None]:
# Load the engineered test file and add the day/hour/minute columns in place.
test = pd.read_csv('test_with_avgClick_timeToNext.csv', parse_dates=['click_time'])
processTimeFeatures(test)

# Predict with the booster, using the same feature columns as in training.
pred = model.predict(xgb.DMatrix(test[feats], feature_names=feats))

In [None]:
# Submission frame: predictions in 'is_attributed', indexed by the test click_id values.
sub = pd.DataFrame(pred, np.array(test['click_id']), columns=['is_attributed'])

In [None]:
# Write the submission; the index is saved under the 'click_id' header.
sub.to_csv('xgb_with_xgb_params_acpi_ttnc_60mil.csv', index_label=['click_id'])

In [None]:
import os
import pickle

# Persist the trained booster.
# NOTE: only unpickle files you produced yourself; pickle.load can run arbitrary code.
model_path = 'models/xgb_with_xgb_params_acpi_ttnc_60mil_1806190601.pickle.dat'
# Create the target directory if needed so the dump does not fail on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# The context manager closes (and flushes) the file even if dump() raises.
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

# Feature importance

In [None]:
import xgboost as xgb

In [None]:
train = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/train.csv', nrows=10000000, parse_dates=['click_time', 'attributed_time'])

In [None]:

import time
import datetime

# Column combinations for which a "time to next click" feature is built.
groups = [    
    {'groupby': ['ip']},
    {'groupby': ['ip', 'app']},
    {'groupby': ['ip', 'channel']},
    {'groupby': ['ip', 'os']},
    {'groupby': ['ip', 'app', 'device', 'os', 'channel']},
    {'groupby': ['ip', 'os', 'device']},
    {'groupby': ['ip', 'os', 'device', 'app']}
]

# Calculate the time to next click for each group
for gr in groups:
    start_time = time.time()

    feature_name = '{}_time_to_next_click'.format('_'.join(gr['groupby']))

    # Next click time of the same group, taken in the frame's row order
    # (assumes `train` is already ordered by click_time; sort first if it is not).
    next_click = train.groupby(gr['groupby'])['click_time'].shift(-1)
    # total_seconds() keeps the day component; .dt.seconds would discard it.
    train[feature_name] = (next_click - train['click_time']).dt.total_seconds()
    elapsed = time.time() - start_time

    print('Done {:s}, in {:s}'.format(feature_name, str(datetime.timedelta(seconds=elapsed))))
train.head()

In [None]:
# Save the frame with the new time features.
# NOTE(review): this writes into the input data directory, unlike the other
# outputs in this notebook; confirm that directory is writable in the target environment.
train.to_csv('../input/talkingdata-adtracking-fraud-detection/train_time_features_10mil.csv')

In [None]:
# Raw encoded columns plus the seven time-to-next-click features built above.
features = [
    'ip',
    'app',
    'device',
    'os',
    'channel',
    'ip_time_to_next_click',
    'ip_app_time_to_next_click',
    'ip_channel_time_to_next_click',
    'ip_os_time_to_next_click',
    'ip_app_device_os_channel_time_to_next_click',
    'ip_os_device_time_to_next_click',
    'ip_os_device_app_time_to_next_click'
]

# Fit a classifier with default settings, only to rank the features.
model = xgb.XGBClassifier()
model.fit(train[features], train['is_attributed'])

# Plot the fitted model's feature importances.
xgb.plot_importance(model)
plt.show()

# Convolutional Neural Network Implementation

In [None]:
# Load the sample file and drop the two timestamp columns before modelling.
Dataset=pd.read_csv('../input/talkingdata-adtracking-fraud-detection/train_sample.csv')
#print(Dataset.describe())
Dataset = Dataset.drop(['click_time','attributed_time'],axis=1)
print(Dataset.head())


In [None]:
# Features: every column except the last one, as a NumPy array.
x=Dataset.iloc[:,:-1].values
x1=pd.DataFrame(x)
# Target: the column at position 5 (presumably is_attributed once the two
# timestamp columns are dropped; confirm the column order).
y=Dataset.iloc[:,5].values
y1=pd.DataFrame(y)

In [None]:
# `y` comes from position 5 of Dataset (is_attributed after the two time columns
# are dropped), so it already holds the 0/1 target and only needs an integer dtype.
# The previous loop compared each value with the string 'anom'; that comparison is
# never true for this numeric column, so every label was overwritten with 1.
y = y.astype('int')

In [None]:
#data Preporcessing
#Missing Data Removal
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imputer = imputer.fit(x[:,:])
x[:,:]=imputer.transform(x[:,:])
Missing_Data_Removed=imputer.transform(x[:,:])

In [None]:
#write in file
np.savetxt('Missing_values.txt',Missing_Data_Removed)

In [None]:
#train and test
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 0) 
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
X_train = X_train.reshape( 80000,5, 1)   #Reshape for CNN -  should work!!
X_test = X_test.reshape(20000,5,1)

**CNN**

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
import keras
from keras.models import Sequential, Model
# Input, BatchNormalization and LeakyReLU are imported from keras.layers, which
# exposes them in old and new Keras alike (the former sub-module paths are legacy).
from keras.layers import Input, Dense, Dropout, Flatten
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import BatchNormalization, LeakyReLU
batch_size = 64
epochs = 10
num_classes = 2

# Three Conv1D + LeakyReLU + MaxPooling1D stages over the (5, 1) input,
# followed by a dense layer and a 2-way softmax output.
model = Sequential()
model.add(Conv1D(32, kernel_size=5,activation='linear',input_shape=(5,1),padding='same'))
model.add(LeakyReLU(alpha=0.1))
model.add(MaxPooling1D((5),padding='same'))
model.add(Conv1D(64, (5), activation='linear',padding='same'))
model.add(LeakyReLU(alpha=0.1))
model.add(MaxPooling1D(pool_size=(5),padding='same'))
model.add(Conv1D(128, (5), activation='linear',padding='same'))
model.add(LeakyReLU(alpha=0.1))
model.add(MaxPooling1D(pool_size=(5),padding='same'))
model.add(Flatten())
model.add(Dense(128, activation='linear'))
model.add(LeakyReLU(alpha=0.1))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss=keras.losses.sparse_categorical_crossentropy, optimizer=keras.optimizers.Adam(),metrics=['accuracy'])
model.summary()

# NOTE(review): `train` is rebound here from the DataFrame to the fit history object.
train = model.fit(X_train, y_train, batch_size=batch_size,epochs=epochs,verbose=1,validation_data=(X_test, y_test))
test_eval = model.evaluate(X_test, y_test, verbose=0)

# Turn the softmax output into class labels by taking the most probable class.
# Casting the class-1 probability with astype(int) truncated every value below
# 1.0 to 0, so the later rounding had no effect.
y_prob = model.predict(X_test)
y_pred = np.argmax(y_prob, axis=1)
print('Test loss:', test_eval[0])
print("Accuracy:",accuracy_score(y_test,y_pred)*100)

In [None]:
# roc graph
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
X, y = make_classification(n_samples=100000, n_classes=2, random_state=1)
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2)
ns_probs = [0 for _ in range(len(testy))]
model = LogisticRegression(solver='lbfgs')
model.fit(trainX, trainy)
lr_probs = model.predict_proba(testX)
lr_probs = lr_probs[:, 1]
ns_auc = roc_auc_score(testy, ns_probs)
lr_auc = roc_auc_score(testy, lr_probs)
print('False Values: ROC AUC=%.3f' % (ns_auc))
print('True Values: ROC AUC=%.3f' % (lr_auc))
ns_fpr, ns_tpr, _ = roc_curve(testy, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(testy, lr_probs)
pyplot.plot(ns_fpr, ns_tpr, linestyle='--')
pyplot.plot(lr_fpr, lr_tpr, marker='.')
pyplot.xlabel('False Positive Rate in the samples')
pyplot.ylabel('True Positive Rate in the samples')
pyplot.legend()
pyplot.show()