# X-class Day 3: Prototyping Source Classification with Random Forest

Using RandomForestClassifier from scikit-learn to train a classifier on photon event metadata.

Input files: 
 - event list 
 - region files for training sets (background and source)
 - region files for testing sets (background and source)

In [1]:
# %load class2.py
import pandas as pd
import numpy as np
import copy
#import seaborn as sns
import matplotlib as mpl
import matplotlib.cm as cm
from astropy.table import Table, vstack
from sklearn.ensemble import RandomForestClassifier

In [2]:
def parse_region_file(reg_file):
    """Read the three numeric arguments of every shape line in a region file.

    Lines whose first character is '#', a space, or a newline are skipped,
    as are lines that contain no '(' at all (for example header keywords).
    On every remaining line the text between the first '(' and the
    following ')' is split on commas into exactly three floats.

    Parameters
    ----------
    reg_file : str
        Path of the region file to read.

    Returns
    -------
    x, y, r : numpy.ndarray
        Three arrays with one entry per parsed line, in file order.

    Raises
    ------
    ValueError
        If a shape line does not hold exactly three numeric fields.
    """
    x, y, r = [], [], []
    # the context manager closes the file even when a line fails to parse
    with open(reg_file, 'r') as f:
        for line in f:
            if line[0] in ('#', '\n', ' ') or '(' not in line:
                continue
            xt, yt, rt = line.split('(')[1].split(')')[0].split(',')
            x.append(float(xt))
            y.append(float(yt))
            r.append(float(rt))
    return np.asarray(x), np.asarray(y), np.asarray(r)

def get_events(evt,x,y,r=6.0):
    """Select the events within radius r of (x, y) and store their offsets.

    The returned object is a copy of the selected rows in which the 'x' and
    'y' columns have been replaced by 'xoff' and 'yoff', the displacement of
    each event from (x, y).
    """
    dist = np.sqrt((evt['x'] - x) ** 2 + (evt['y'] - y) ** 2)
    inside = dist <= r
    selected = copy.copy(evt[inside])
    # add both offset columns first, then drop the absolute positions
    for src, dst, centre in (('x', 'xoff', x), ('y', 'yoff', y)):
        selected[dst] = selected[src] - centre
    for src in ('x', 'y'):
        del selected[src]
    return selected

def merge_pos(evt,x,y,lab,r=6.0):
    """Gather the events around several positions into one labelled table.

    For each position (x[i], y[i]) the events within radius r are extracted
    with get_events and every one of them is tagged with lab[i].

    Parameters
    ----------
    evt : table of events, passed straight through to get_events
    x, y : sequences of position coordinates (same length)
    lab : sequence holding one label per position
    r : float, selection radius handed to get_events

    Returns
    -------
    mevt : the per-position event tables stacked in position order
    levt : numpy.ndarray of float labels, one per row of mevt

    Raises
    ------
    ValueError
        If no positions are given.
    """
    if len(x) == 0:
        raise ValueError("merge_pos needs at least one position")

    tables, labels = [], []
    for i, xi in enumerate(x):
        tmpevt = get_events(evt, xi, y[i], r)
        tables.append(tmpevt)
        labels.append(np.full(len(tmpevt), lab[i], dtype=float))

    # stack once at the end instead of re-copying the growing table each pass;
    # a single position is returned as-is, without going through vstack
    mevt = tables[0] if len(tables) == 1 else vstack(tables)
    levt = np.hstack(labels)
    return mevt, levt

def build_rfc(evt,lab,rfc = None):
    """Fit a classifier to a labelled event table.

    evt is converted with to_pandas() and its values are used as the feature
    matrix; lab supplies the labels.  When rfc is None a
    RandomForestClassifier with 200 estimators and oob_score=True is created.
    Returns the fitted classifier, the feature matrix and the copied labels.
    """
    model = rfc
    if model is None:
        model = RandomForestClassifier(n_estimators=200, oob_score=True)
    frame = copy.copy(evt.to_pandas())
    targets = copy.copy(lab)
    model.fit(frame.values, targets)
    return model, frame.values, targets

def do_rfc(evt,rfc):
    """Classify every event in evt and print the share of each class.

    evt is converted with to_pandas() and its values are passed to
    rfc.predict.  One line is printed in the form
    "<percent predicted 0> <percent predicted 1> (<number of events>)",
    each percentage with one decimal place.

    Returns the array of predicted labels.
    """
    X = evt.to_pandas()
    Y = rfc.predict(X.values)
    total = len(Y)
    pct_zero = 100. * float(np.count_nonzero(Y == 0)) / total
    pct_one = 100. * float(np.count_nonzero(Y == 1)) / total
    # single-argument print(...) gives the same output on Python 2 and is
    # also valid on Python 3, unlike the print statement
    print("{0:0.1f} {1:0.1f} ({2})".format(pct_zero, pct_one, total))
    return Y

In [3]:
# load the event list from HDU 1 of the FITS file
e = Table.read('Data/evt_1229.fits',hdu=1)
# ec is a copy of the table with the columns listed in badcols removed
ec = copy.copy(e)
badcols = ['status','ccd_id','expno','node_id','chipx','chipy','tdetx','tdety','detx','dety','pi','pha']
ec.remove_columns(badcols)

In [4]:
# read the region positions (x, y, r arrays) from the four region files:
# b1/b2 and src1/src2
region_files = ('Data/b1_1229.reg', 'Data/b2_1229.reg',
                'Data/src1_1229.reg', 'Data/src2_1229.reg')
((b1x, b1y, b1r), (b2x, b2y, b2r),
 (s1x, s1y, s1r), (s2x, s2y, s2r)) = [parse_region_file(p) for p in region_files]

In [5]:
# using b1,s1 as training, b2,s2 as testing
trnx,trny = np.hstack((s1x,b1x)),np.hstack((s1y,b1y))
# one label per training position, in the same order: 1 for s1, 0 for b1
trnl = np.hstack((np.ones(len(s1x)),np.zeros(len(b1x))))
trne,trnlab = merge_pos(ec,trnx,trny,trnl)

rfc,X,Y = build_rfc(trne,trnlab)

# single-argument print(...) gives the same output on Python 2 and also runs on Python 3
print("OOB Score: {0}".format(rfc.oob_score_))
# (column name, importance) pairs, largest importance first
print(sorted(zip(trne.colnames,rfc.feature_importances_),key=lambda q: q[1],reverse=True))

OOB Score: 0.780078895464
[('pha_ro', 0.26338217587841684), ('energy', 0.21946754363531804), ('yoff', 0.14281213490135053), ('xoff', 0.14014502815812066), ('time', 0.11569791403222794), ('fltgrade', 0.068761186965070137), ('grade', 0.049734016429495993)]


In [6]:
# sources: classify the events around each s2 position
# (single-argument print(...) gives the same output on Python 2 and also runs on Python 3)
print("Source Tests")
print("BG%  S%   (N)")
for xi,yi in zip(s2x,s2y):
    lp = do_rfc(get_events(ec,xi,yi),rfc)

Source Tests
BG%  S%   (N)
13.0 87.0 (77)
21.3 78.7 (89)
5.7 94.3 (122)
18.9 81.1 (111)
18.8 81.2 (64)
8.2 91.8 (110)


In [7]:
# bg: classify the events around each b2 position
# (single-argument print(...) gives the same output on Python 2 and also runs on Python 3)
print("Background Tests")
print("BG%  S%   (N)")
for xi,yi in zip(b2x,b2y):
    lp = do_rfc(get_events(ec,xi,yi),rfc)

Background Tests
BG%  S%   (N)
38.9 61.1 (18)
61.9 38.1 (21)
67.6 32.4 (34)
53.3 46.7 (15)
47.2 52.8 (36)
65.0 35.0 (20)
56.0 44.0 (25)
50.0 50.0 (20)


In [8]:
# classify the events around one explicitly given position (default radius)
probe_evt = get_events(ec, 4100.333, 4130.125)
lp = do_rfc(probe_evt, rfc)

40.9 59.1 (44)
