In [36]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import numpy.ma as ma
import pandas as pd
import scipy
%matplotlib inline  

from sklearn import linear_model as lm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier as RFC, GradientBoostingClassifier as GBC
from sklearn.metrics import precision_recall_curve

In [2]:
# Load the tab-delimited dataset. Later cells treat each row as one object
# (user) and each column as one feature.
data3_rows = np.loadtxt('/home/dante/SHAD/dataset3', delimiter='\t')
# Transposed layout of the same table: data3_cols[j] is column j of data3_rows.
data3_cols = np.transpose(data3_rows)


### Define abnormality as how often an object falls into the tails of the per-feature distributions

In [6]:
# calculate abnormality
#
# For every feature (column) take the 1st/5th/95th/99th percentiles, then score
# each object (row) by how many of its values land in those tails.
N, K = data3_rows.shape  # N rows (objects), K columns (features)

# One threshold per feature: each of p1/p5/p95/p99 has K entries.
p1, p5, p95, p99 = np.percentile(data3_rows, [1, 5, 95, 99], axis=0)

# Scoring, per feature value:
#   1 point of abnormality for getting into the top or bottom 5%
#   +2 extra points of abnormality for getting into the top or bottom 1%
# (so a value in the outer 1% contributes 3 points in total).
# The comparisons broadcast the K thresholds across all N rows, replacing the
# N*K Python-level loop with a few array operations. The leading `2 * (...)`
# makes the whole sum integer arithmetic rather than boolean OR.
abn_points = (
    2 * (data3_rows < p1) + (data3_rows < p5)
    + (data3_rows > p95) + 2 * (data3_rows > p99)
).sum(axis=1)

# let's make binary abnormality: mark everybody with >= ABN_THRESHOLD points
ABN_THRESHOLD = 6
# Kept as a plain list of 0/1 integers because later cells slice and print it
# as a list.
abnormality = list((abn_points >= ABN_THRESHOLD).astype(int))

# A single pre-formatted argument prints the same text on Python 2 and Python 3.
print('\nWe have %s abnormal users' % np.sum(abnormality))




We have 191 abnormal users


### Try to fit a classifier to the abnormality labels

In [32]:
# methods
# Candidate classifiers. Every estimator gets a fixed random_state so that
# re-running the notebook reproduces the same fits.
Logit = lm.LogisticRegression(
    C=1.0, 
    class_weight=None, 
    dual=False, 
    fit_intercept=True, 
    intercept_scaling=1, 
    penalty='l2', 
    random_state=1, 
    tol=0.001
)
RF = RFC(random_state=1)
# The previous `GBC = GBC()` rebound the imported class alias to an instance, so
# executing this cell a second time failed with "object is not callable".
# Constructing from the unaliased class keeps the cell re-runnable, while `GBC`
# still names the estimator instance for any later cell that uses it.
GBC = GradientBoostingClassifier(random_state=1)


In [41]:
features = data3_rows
labels = abnormality

# Hold-out split by position: the first N_learn rows are used for fitting and
# the remaining rows for scoring. No shuffling is applied.
N_learn = 3000
learn_features, test_features = features[:N_learn], features[N_learn:]
learn_labels, test_labels = labels[:N_learn], labels[N_learn:]

cur_method = Logit
cur_method.fit(learn_features, learn_labels)

# Predictions for ALL rows (the fitting rows included). The metrics cell
# compares `predict` with `labels` element-wise, so both keep the full length.
predict = cur_method.predict(features)

# Score on each part of the split. Each print() takes one pre-formatted
# argument, so the output is the same on Python 2 and Python 3.
print('%s score: test\t%s' % (cur_method.__class__, cur_method.score(test_features, test_labels)))
print('%s score: learn\t%s' % (cur_method.__class__, cur_method.score(learn_features, learn_labels)))


# Two-bin histograms: counts of 0-labels and 1-labels, true vs. predicted.
print("\nDistribution of labels")
print(np.histogram(labels, 2))
print("\nDistribution of predicted labels")
print(np.histogram(predict, 2))

<class 'sklearn.linear_model.logistic.LogisticRegression'> score: test	0.987090367428
<class 'sklearn.linear_model.logistic.LogisticRegression'> score: learn	0.983

Distribution of labels
(array([3816,  191]), array([ 0. ,  0.5,  1. ]))

Distribution of predicted labels
(array([3836,  171]), array([ 0. ,  0.5,  1. ]))


### Calculate precision, recall, etc.

In [42]:
# estimate whether our fit is good or not
# NOTE(review): `predict` and `labels` cover every row, including the rows the
# model was fitted on, so the figures below are not pure hold-out metrics.
# Every print() takes a single pre-formatted argument so the output is the same
# on Python 2 and Python 3.
print(list(predict)[:20])
print(labels[:20])
print(np.corrcoef(predict, labels))

n_actual = np.sum(labels)            # objects labelled abnormal
n_predicted = np.sum(predict)        # objects predicted abnormal
n_hits = np.sum(predict * labels)    # both are 0/1, so the product marks true positives

print('\nAbnormalities %s' % n_actual)
print('Predicted abnormalities %s' % n_predicted)
print('Correct predictions for abnormalities %s' % n_hits)

# `+ .0` forces float division on Python 2.
prec = n_hits / (n_predicted + .0)
recall = n_hits / (n_actual + .0)
print('\nPrecision of abnormality detection  %s' % prec)
print('Recall of abnormality detection  %s' % recall)
print("\nF1-measure %s" % (2 * prec * recall / (prec + recall)))


print('\nOverall stats:')
print('Right verdicts %s' % np.sum(predict == labels))
print('Wrong verdicts %s' % np.sum(predict != labels))


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0]
[[ 1.          0.81623447]
 [ 0.81623447  1.        ]]

Abnormalities 191
Predicted abnormalities 171
Correct predictions for abnormalities 149

Precision of abnormality detection  0.87134502924
Recall of abnormality detection  0.780104712042

F1-measure 0.82320441989

Overall stats:
Right verdicts 3943
Wrong verdicts 64
