In [1]:
import sys
print(sys.version)

# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from datetime import datetime, timedelta
from sklearn.utils import shuffle

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn import tree
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier 


# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

3.6.0 |Anaconda custom (64-bit)| (default, Dec 23 2016, 11:57:41) [MSC v.1900 64 bit (AMD64)]




In [2]:
# load datasets

train = pd.read_csv('train.csv', nrows = 1000000)
train_sample = pd.read_csv('train_sample.csv')

In [3]:
# Check the data
train_sample.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,87540,12,1,13,497,2017-11-07 09:30:38,,0
1,105560,25,1,17,259,2017-11-07 13:40:27,,0
2,101424,12,1,19,212,2017-11-07 18:05:24,,0
3,94584,13,1,13,477,2017-11-07 04:58:08,,0
4,68413,12,1,1,178,2017-11-09 09:00:09,,0


In [4]:
# have a quick summary on the data
train_sample.describe()

Unnamed: 0,ip,app,device,os,channel,is_attributed
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,91255.87967,12.04788,21.77125,22.81828,268.83246,0.00227
std,69835.553661,14.9415,259.667767,55.943136,129.724248,0.047591
min,9.0,1.0,0.0,0.0,3.0,0.0
25%,40552.0,3.0,1.0,13.0,145.0,0.0
50%,79827.0,12.0,1.0,18.0,258.0,0.0
75%,118252.0,15.0,1.0,19.0,379.0,0.0
max,364757.0,551.0,3867.0,866.0,498.0,1.0


In [5]:
# Check the types of the data

print(type(train_sample['ip'][0]))
print(type(train_sample['app'][0]))
print(type(train_sample['device'][0]))
print(type(train_sample['os'][0]))
print(type(train_sample['channel'][0]))
print(type(train_sample['is_attributed'][0]))
print(type(train_sample['click_time'][0]))

<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'str'>


# Data processing

In [6]:
# create time attributes from the time stamp

train_sample['click_time'] = pd.to_datetime(train_sample['click_time'])
train_sample['click_date'] = train_sample['click_time'].dt.date
train_sample['click_day'] = train_sample['click_time'].dt.weekday
train_sample['click_hour'] = train_sample['click_time'].dt.hour
train_sample['click_minute'] = train_sample['click_time'].dt.minute

In [7]:
# Shuffle with random_state = 1
train_sample = shuffle(train_sample, random_state = 1)

In [8]:
# total clicks for ip address, more clicks are potentially a flag for fradulent click

ip_counts = {}
for click in train_sample['ip']:
    if str(click) in ip_counts:
        ip_counts[str(click)] += 1
    else:
        ip_counts[str(click)] = 1
        click_count = []
cur_ip = []
i = 0
for i in range(len(train_sample)):
    cur_ip = train_sample['ip'][i]
    click_count.append(ip_counts[str(cur_ip)])
    i += 1
train_sample['click_count'] = np.asarray(click_count)

In [13]:
# prior clicks in last hour, a high number of prior_clicks in the past hour is also a potential indicator of fraud

click_time_counts = {}
click_date = []
# click_hour = []
i = 0
for click in train_sample['ip']:
    if str(click) not in click_time_counts:
        click_time_counts[str(click)] = {train_sample['click_time'][i]:1}
    elif str(click) in click_time_counts and train_sample['click_time'][i] not in click_time_counts[str(click)]:
        click_time_counts[str(click)][train_sample['click_time'][i]] = 1
    elif str(click) in click_time_counts and train_sample['click_time'][i] in click_time_counts[str(click)]: 
        click_time_counts[str(click)][train_sample['click_time'][i]] += 1
    i += 1
    
# print(click_time_counts)
print(len(click_time_counts))
print(len(train_sample['ip'].unique()))

prior_clicks = []
i = 0
for click in train_sample['ip']:
    prior_clicks.append(0)
    for key in click_time_counts[str(click)]:
        if key < train_sample['click_time'][i] and key > (train_sample['click_time'][i] - timedelta(hours = 1)):
            prior_clicks[i] += 1
    i += 1
train_sample['prior_clicks'] = np.asarray(prior_clicks)
print(train_sample.head())

34857
34857


NameError: name 'timedelta' is not defined

In [10]:
train_sample.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,click_date,click_day,click_hour,click_minute,click_count
43660,49293,1,1,19,178,2017-11-07 04:06:44,,0,2017-11-07,1,4,6,8
87278,123994,12,1,19,245,2017-11-07 14:04:21,,0,2017-11-07,1,14,4,149
14317,55920,14,1,13,442,2017-11-08 23:39:27,,0,2017-11-08,2,23,39,2
81932,62937,9,1,20,134,2017-11-08 00:53:46,,0,2017-11-08,2,0,53,3
95321,70361,15,1,13,412,2017-11-06 16:30:18,,0,2017-11-06,0,16,30,4


In [11]:
# Convert the features into a single np array for faster processing

ip = list(train_sample['ip'])
app = list(train_sample['app'])
device = list(train_sample['device'])
os = list(train_sample['os'])
channel = list(train_sample['channel'])
click_day = list(train_sample['click_day'])
click_count = list(train_sample['click_count'])


X = []
i = 0
for x in range(100000):
    # comment out the applicable
    X.append([ip[i], app[i], device[i], os[i], channel[i], click_day[i], click_count[i]])
    #X.append([ip[i], app[i], device[i], os[i], channel[i], click_hour[i]])
    i += 1
    
Y = train_sample.is_attributed

In [12]:
# Split into train and test.

train_data, train_labels = X[:90000], Y[:90000]
test_data, test_labels = X[90000:], Y[90000:]

# Model Training and classification

In [16]:
# DecisionTree

dt = DecisionTreeClassifier(criterion="entropy", splitter="best", random_state=0)
dt.fit(train_data, train_labels)

print('Accuracy (a decision tree):', dt.score(test_data, test_labels))
print("Decision Tree Performance")
print(classification_report(test_labels, dt.predict(test_data)))



scores = cross_val_score(dt, train_data, train_labels, cv=10)
print("mean: {:.3f} (std: {:.3f})".format(scores.mean(),
                                          scores.std()),
                                          end="\n\n" )

Accuracy (a decision tree): 0.9967
Decision Tree Performance
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      9974
          1       0.37      0.38      0.38        26

avg / total       1.00      1.00      1.00     10000

mean: 0.997 (std: 0.001)



In [17]:
#Ensemble Methods
rfc = RandomForestClassifier(n_estimators=10, max_features=None)
rfc.fit(train_data, train_labels)

print('Accuracy (a random forest):', rfc.score(test_data, test_labels))
print(classification_report(test_labels, rfc.predict(test_data)))

abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=0.1)
abc.fit(train_data, train_labels)
print('Accuracy (adaboost with decision trees):', abc.score(test_data, test_labels))


Accuracy (a random forest): 0.9976
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      9974
          1       0.60      0.23      0.33        26

avg / total       1.00      1.00      1.00     10000

Accuracy (adaboost with decision trees): 0.9974


In [18]:
# logstic and logsitcCV

logreg = LogisticRegression()
logreg.fit(train_data, train_labels)
logreg.score(test_data, test_labels)
print(classification_report(test_labels, logreg.predict(test_data)))


logregCV = LogisticRegressionCV(cv =10) # 10-fold
logregCV.fit(train_data, train_labels)
logregCV.score(test_data, test_labels)
print(classification_report(test_labels, logreg.predict(test_data)))

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      1.00      1.00      9974
          1       0.00      0.00      0.00        26

avg / total       0.99      1.00      1.00     10000

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      9974
          1       0.00      0.00      0.00        26

avg / total       0.99      1.00      1.00     10000



# Feature Engineering - Binarizing Device

In [97]:
# BINARIZING the device type

train_sample_with_device_type_binarized = train_sample

# Convert the features in to a single np array for faster processing

ip = list(train_sample_with_device_type_binarized['ip'])
app = list(train_sample_with_device_type_binarized['app'])
device = list(train_sample_with_device_type_binarized['device'])
os = list(train_sample_with_device_type_binarized['os'])
channel = list(train_sample_with_device_type_binarized['channel'])
click_day = list(train_sample_with_device_type_binarized['click_day'])
click_count = list(train_sample_with_device_type_binarized['click_count'])

# convert device to 1 or 0 - Recall in train_sample summary, 
# at least 75% of the device type is 1, probably are some budget phone which is likely to be a red flag

for i in device:
    if i == 1:
        next
    else:
        i = 0


X = []
i = 0

for x in range(100000):
    # comment out the applicable
    X.append([ip[i], app[i], device[i], os[i], channel[i], click_day[i], click_count[i]])
    #X.append([ip[i], app[i], device[i], os[i], channel[i], click_hour[i]])
    i += 1
    
    

Y = train_sample.is_attributed

In [98]:
# Split into train and test.

train_data, train_labels = X[:90000], Y[:90000]
test_data, test_labels = X[90000:], Y[90000:]

In [99]:
# DecisionTree

dt = DecisionTreeClassifier(criterion="entropy", splitter="best", random_state=0)
dt.fit(train_data, train_labels)

print('Accuracy (a decision tree):', dt.score(test_data, test_labels))
print("Decision Tree Performance")
print(classification_report(test_labels, dt.predict(test_data)))



scores = cross_val_score(dt, train_data, train_labels, cv=10)
print("mean: {:.3f} (std: {:.3f})".format(scores.mean(),
                                          scores.std()),
                                          end="\n\n" )

Accuracy (a decision tree): 0.9967
Decision Tree Performance
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      9974
          1       0.37      0.38      0.38        26

avg / total       1.00      1.00      1.00     10000

mean: 0.997 (std: 0.001)



In [100]:
#Ensemble Methods

rfc = RandomForestClassifier(n_estimators=10, max_features=None)
rfc.fit(train_data, train_labels)
print('Accuracy (a random forest):', rfc.score(test_data, test_labels))
print(classification_report(test_labels, rfc.predict(test_data)))

abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=0.1)
abc.fit(train_data, train_labels)
print('Accuracy (adaboost with decision trees):', abc.score(test_data, test_labels))


Accuracy (a random forest): 0.9975
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      9974
          1       0.54      0.27      0.36        26

avg / total       1.00      1.00      1.00     10000

Accuracy (adaboost with decision trees): 0.9974


# Resampling 

In [102]:
# resampling - train with a new data set - 50% is_attributed = 1

new_train_sample = pd.read_csv('50%_attributed_train.csv')

print(new_train_sample.groupby('is_attributed').count())

# time processing
# create time attributes from the time stamp

new_train_sample['click_time'] = pd.to_datetime(new_train_sample['click_time'])
new_train_sample['click_date'] = new_train_sample['click_time'].dt.date
new_train_sample['click_day'] = new_train_sample['click_time'].dt.weekday
new_train_sample['click_hour'] = new_train_sample['click_time'].dt.hour
new_train_sample['click_minute'] = new_train_sample['click_time'].dt.minute

# Convert the features in to a single np array for faster processing

ip = list(new_train_sample['ip'])
app = list(new_train_sample['app'])
device = list(new_train_sample['device'])
os = list(new_train_sample['os'])
channel = list(new_train_sample['channel'])
click_hour = list(new_train_sample['click_hour'])


X = []
i = 0
for x in range(len(new_train_sample)):
    # comment out the applicable
    X.append([ip[i], app[i], device[i], os[i], channel[i], click_day[i], click_count[i]])
    #X.append([ip[i], app[i], device[i], os[i], channel[i], click_hour[i]])
    i += 1
    
Y = new_train_sample.is_attributed

# Define new training data set
train_data, train_labels = X, Y

# Use the same test_data, test_labels
#test_data, test_labels = X[90000:], Y[90000:]

# DecisionTree

dt = DecisionTreeClassifier(criterion="entropy", splitter="best", random_state=0)
dt.fit(train_data, train_labels)

print('Accuracy (a decision tree):', dt.score(test_data, test_labels))
print("Decision Tree Performance")
print(classification_report(test_labels, dt.predict(test_data)))



scores = cross_val_score(dt, train_data, train_labels, cv=10)
print("mean: {:.3f} (std: {:.3f})".format(scores.mean(),
                                          scores.std()),
                                          end="\n\n" )

rfc = RandomForestClassifier(n_estimators=10, max_features=None)
rfc.fit(train_data, train_labels)

print('Accuracy (a random forest):', rfc.score(test_data, test_labels))
print(classification_report(test_labels, rfc.predict(test_data)))


abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=0.1)
abc.fit(train_data, train_labels)
print('Accuracy (adaboost with decision trees):', abc.score(test_data, test_labels))


               Unnamed: 0     ip    app  device     os  channel  click_time  \
is_attributed                                                                 
0                   49464  49464  49464   49464  49464    49464       49464   
1                   49464  49464  49464   49464  49464    49464       49464   

               attributed_time  
is_attributed                   
0                            0  
1                        49464  
Accuracy (a decision tree): 0.8458
Decision Tree Performance
             precision    recall  f1-score   support

          0       1.00      0.85      0.92      9974
          1       0.02      0.96      0.03        26

avg / total       1.00      0.85      0.91     10000

mean: 0.867 (std: 0.011)

Accuracy (a random forest): 0.9199
             precision    recall  f1-score   support

          0       1.00      0.92      0.96      9974
          1       0.03      0.96      0.06        26

avg / total       1.00      0.92      0.96     10000


In [103]:
# resampling - train with a new data set - 30% is_attributed = 1

new_train_sample = pd.read_csv('30%_attributed_train.csv')

print(new_train_sample.groupby('is_attributed').count())

# time processing
# create time attributes from the time stamp

new_train_sample['click_time'] = pd.to_datetime(new_train_sample['click_time'])
new_train_sample['click_date'] = new_train_sample['click_time'].dt.date
new_train_sample['click_day'] = new_train_sample['click_time'].dt.weekday
new_train_sample['click_hour'] = new_train_sample['click_time'].dt.hour
new_train_sample['click_minute'] = new_train_sample['click_time'].dt.minute

# Convert the features in to a single np array for faster processing

ip = list(new_train_sample['ip'])
app = list(new_train_sample['app'])
device = list(new_train_sample['device'])
os = list(new_train_sample['os'])
channel = list(new_train_sample['channel'])
click_hour = list(new_train_sample['click_hour'])


X = []
i = 0
for x in range(len(new_train_sample)):
    # comment out the applicable
    X.append([ip[i], app[i], device[i], os[i], channel[i], click_day[i], click_count[i]])
    #X.append([ip[i], app[i], device[i], os[i], channel[i], click_hour[i]])
    i += 1
    
Y = new_train_sample.is_attributed

# Define new training data set
train_data, train_labels = X, Y

# Use the same test_data, test_labels
#test_data, test_labels = X[90000:], Y[90000:]

# DecisionTree

dt = DecisionTreeClassifier(criterion="entropy", splitter="best", random_state=0)
dt.fit(train_data, train_labels)

print('Accuracy (a decision tree):', dt.score(test_data, test_labels))
print("Decision Tree Performance")
print(classification_report(test_labels, dt.predict(test_data)))



scores = cross_val_score(dt, train_data, train_labels, cv=10)
print("mean: {:.3f} (std: {:.3f})".format(scores.mean(),
                                          scores.std()),
                                          end="\n\n" )

rfc = RandomForestClassifier(n_estimators=10, max_features=None)
rfc.fit(train_data, train_labels)

print('Accuracy (a random forest):', rfc.score(test_data, test_labels))
print(classification_report(test_labels, rfc.predict(test_data)))


abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=0.1)
abc.fit(train_data, train_labels)
print('Accuracy (adaboost with decision trees):', abc.score(test_data, test_labels))


  interactivity=interactivity, compiler=compiler, result=result)


               Unnamed: 0     ip    app  device     os  channel  click_time  \
is_attributed                                                                 
0                   69250  69250  69250   69250  69250    69250       69250   
1                   29678  29678  29678   29678  29678    29678       29678   

               attributed_time  
is_attributed                   
0                            0  
1                        29678  
Accuracy (a decision tree): 0.9042
Decision Tree Performance
             precision    recall  f1-score   support

          0       1.00      0.90      0.95      9974
          1       0.03      1.00      0.05        26

avg / total       1.00      0.90      0.95     10000

mean: 0.890 (std: 0.006)

Accuracy (a random forest): 0.9539
             precision    recall  f1-score   support

          0       1.00      0.95      0.98      9974
          1       0.05      1.00      0.10        26

avg / total       1.00      0.95      0.97     10000


In [104]:
# resampling - train with a new data set - 10% is_attributed = 1

new_train_sample = pd.read_csv('10%_attributed_train.csv')

print(new_train_sample.groupby('is_attributed').count())

# time processing
# create time attributes from the time stamp

new_train_sample['click_time'] = pd.to_datetime(new_train_sample['click_time'])
new_train_sample['click_date'] = new_train_sample['click_time'].dt.date
new_train_sample['click_day'] = new_train_sample['click_time'].dt.weekday
new_train_sample['click_hour'] = new_train_sample['click_time'].dt.hour
new_train_sample['click_minute'] = new_train_sample['click_time'].dt.minute

# Convert the features in to a single np array for faster processing

ip = list(new_train_sample['ip'])
app = list(new_train_sample['app'])
device = list(new_train_sample['device'])
os = list(new_train_sample['os'])
channel = list(new_train_sample['channel'])
click_hour = list(new_train_sample['click_hour'])


X = []
i = 0
for x in range(len(new_train_sample)):
    # comment out the applicable
    X.append([ip[i], app[i], device[i], os[i], channel[i], click_day[i], click_count[i]])
    #X.append([ip[i], app[i], device[i], os[i], channel[i], click_hour[i]])
    i += 1
    
Y = new_train_sample.is_attributed

# Define new training data set
train_data, train_labels = X, Y

# Use the same test_data, test_labels
#test_data, test_labels = X[90000:], Y[90000:]

# DecisionTree

dt = DecisionTreeClassifier(criterion="entropy", splitter="best", random_state=0)
dt.fit(train_data, train_labels)

print('Accuracy (a decision tree):', dt.score(test_data, test_labels))
print("Decision Tree Performance")
print(classification_report(test_labels, dt.predict(test_data)))



scores = cross_val_score(dt, train_data, train_labels, cv=10)
print("mean: {:.3f} (std: {:.3f})".format(scores.mean(),
                                          scores.std()),
                                          end="\n\n" )

rfc = RandomForestClassifier(n_estimators=10, max_features=None)
rfc.fit(train_data, train_labels)

print('Accuracy (a random forest):', rfc.score(test_data, test_labels))
print(classification_report(test_labels, rfc.predict(test_data)))


abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=0.1)
abc.fit(train_data, train_labels)
print('Accuracy (adaboost with decision trees):', abc.score(test_data, test_labels))


  interactivity=interactivity, compiler=compiler, result=result)


               Unnamed: 0     ip    app  device     os  channel  click_time  \
is_attributed                                                                 
0                   89035  89035  89035   89035  89035    89035       89035   
1                    9893   9893   9893    9893   9893     9893        9893   

               attributed_time  
is_attributed                   
0                            0  
1                         9893  
Accuracy (a decision tree): 0.9601
Decision Tree Performance
             precision    recall  f1-score   support

          0       1.00      0.96      0.98      9974
          1       0.05      0.85      0.10        26

avg / total       1.00      0.96      0.98     10000

mean: 0.947 (std: 0.002)

Accuracy (a random forest): 0.9789
             precision    recall  f1-score   support

          0       1.00      0.98      0.99      9974
          1       0.10      0.88      0.18        26

avg / total       1.00      0.98      0.99     10000


# Feature Engingeering - Adding Dummy variables for top IP addresses

In [42]:
# check the top 5 most common IP

df_top_freq = train_sample.groupby(['ip'])['click_time'].agg(
    {"code_count": len}).sort_values(
    "code_count", ascending=False).head(5).reset_index()

top5_ip = list(df_top_freq['ip'])
print(top5_ip)

is deprecated and will be removed in a future version
  after removing the cwd from sys.path.


[5348, 5314, 73487, 73516, 53454]


In [68]:
train_sample.shape

(100000, 8)

In [74]:
# Redeining X, Y

is_5348 = [1 if x == 5348 else 0 for x in list(train_sample['ip'])]
is_5314 = [1 if x == 5314 else 0 for x in list(train_sample['ip'])]
is_73487 = [1 if x == 73487 else 0 for x in list(train_sample['ip'])]
is_73516 = [1 if x == 73516 else 0 for x in list(train_sample['ip'])]
is_53454 = [1 if x == 53454 else 0 for x in list(train_sample['ip'])]
not_a_frequent_ip = [1 if x not in top5_ip else 0 for x in list(train_sample['ip'])]

ip = list(train_sample['ip'])
app = list(train_sample['app'])
device = list(train_sample['device'])
os = list(train_sample['os'])
channel = list(train_sample['channel'])

X = []
i = 0

for x in range(100000):
    # comment out the applicable
    X.append([is_5348[i], is_5314[i], is_73487[i], is_73516[i], is_53454[i], not_a_frequent_ip[i], 
              app[i], device[i], os[i], channel[i], click_day[i], click_count[i]])
    #X.append([ip[i], app[i], device[i], os[i], channel[i], click_hour[i]])
    i += 1
    

Y = train_sample.is_attributed

In [75]:
# DecisionTree

dt = DecisionTreeClassifier(criterion="entropy", splitter="best", random_state=0)
dt.fit(train_data, train_labels)

print('Accuracy (a decision tree):', dt.score(test_data, test_labels))
print("Decision Tree Performance")
print(classification_report(test_labels, dt.predict(test_data)))



scores = cross_val_score(dt, train_data, train_labels, cv=10)
print("mean: {:.3f} (std: {:.3f})".format(scores.mean(),
                                          scores.std()),
                                          end="\n\n" )

Accuracy (a decision tree): 0.9609
Decision Tree Performance
             precision    recall  f1-score   support

          0       1.00      0.96      0.98      9974
          1       0.05      0.81      0.10        26

avg / total       1.00      0.96      0.98     10000

mean: 0.950 (std: 0.002)



In [76]:
#Ensemble Methods

rfc = RandomForestClassifier(n_estimators=10, max_features=None)
rfc.fit(train_data, train_labels)
print('Accuracy (a random forest):', rfc.score(test_data, test_labels))
print(classification_report(test_labels, rfc.predict(test_data)))

abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=0.1)
abc.fit(train_data, train_labels)
print('Accuracy (adaboost with decision trees):', abc.score(test_data, test_labels))


Accuracy (a random forest): 0.9803
             precision    recall  f1-score   support

          0       1.00      0.98      0.99      9974
          1       0.10      0.85      0.18        26

avg / total       1.00      0.98      0.99     10000

Accuracy (adaboost with decision trees): 0.988
