In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score

In [2]:
col = ('signal, lepton pT, lepton eta, lepton phi, missing energy magnitude, '+
        'missing energy phi, jet 1 pt, jet 1 eta, jet 1 phi, jet 1 b-tag, '+
        'jet 2 pt, jet 2 eta, jet 2 phi, jet 2 b-tag, jet 3 pt, jet 3 eta, '+
        'jet 3 phi, jet 3 b-tag, jet 4 pt, jet 4 eta, jet 4 phi, jet 4 b-tag, '+
        'm_jj, m_jjj, m_lv, m_jlv, m_bb, m_wbb, m_wwbb').split(', ')

In [3]:
csv_file = './HIGGS_reduced.csv'

In [4]:
df = pd.read_csv(csv_file,header=None,dtype='float',names=col)

Last two 10000 samples for test and valid

using default SVM setting

In [5]:
tsI = 10000
vlI = 10000
tr_df = df[:-(tsI+vlI)]
vl_df = df[-(tsI+vlI):-tsI]
ts_df = df[-tsI:]

In [7]:
feat = col[1:]
label = 'signal'

In [15]:
svm = SVC()
svm.fit(tr_df[feat],tr_df[label])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
y = svm.predict(ts_df[feat])

In [144]:
yy = ts_df[label]
mse = ((yy-y)*(yy-y)).sum()/len(y)
acc = 1.0*(yy==y).sum()/len(y)
mse,acc,roc_auc_score(yy,y)

(0.33, 0.67000000000000004, 0.66436096326023386)

In [43]:
feat7 = feat[-7:]
svm7 = SVC()
svm7.fit(tr_df[feat7],tr_df[label])

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [143]:
y7 = svm7.predict(ts_df[feat7])
mse = ((yy-y7)*(yy-y7)).sum()/len(y7)
acc = 1.0*(yy==y7).sum()/len(y7)
mse,acc,roc_auc_score(yy,y7)

(0.3424, 0.65759999999999996, 0.64916334804567244)

In [142]:
feat25 = feat[1:26]
svm25 = SVC()
svm25.fit(tr_df[feat25],tr_df[label])
y25 = svm25.predict(ts_df[feat25])
mse = ((yy-y25)*(yy-y25)).sum()/len(y25)
acc = 1.0*(yy==y25).sum()/len(y25)
mse,acc,roc_auc_score(yy,y25)

(0.3546, 0.64539999999999997, 0.63829117336681473)

In [52]:
pca = PCA(n_components=10)
Xpca = pca.fit_transform(tr_df[feat])

In [141]:
svmpca = SVC()
svmpca.fit(Xpca,tr_df[label])
yp = svmpca.predict(pca.transform(ts_df[feat]))
mse = ((yy-yp)*(yy-yp)).sum()/len(yp)
acc = 1.0*(yy==yp).sum()/len(yp)
mse,acc,roc_auc_score(yy,yp)

(0.4244, 0.5756, 0.56666897750527001)

In [113]:
corabs = df.corr().abs()[['signal']]

In [115]:
cortop = corabs.sort_values('signal',ascending=False)[:17]
cortop

Unnamed: 0,signal
signal,1.0
m_bb,0.149082
m_wwbb,0.121644
missing energy magnitude,0.102268
m_wbb,0.062888
jet 1 pt,0.059355
lepton pT,0.048636
jet 2 b-tag,0.044518
jet 4 pt,0.036307
jet 3 b-tag,0.031054


In [107]:
'm_jj, m_jjj, m_lv, m_jlv, m_bb, m_wbb, m_wwbb'.split(', ')

['m_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']

In [121]:
featcor = cortop.index[1:6]
featcor

Index([u'm_bb', u'm_wwbb', u'missing energy magnitude', u'm_wbb', u'jet 1 pt'], dtype='object')

In [140]:
svmcor = SVC()
svmcor.fit(tr_df[featcor],tr_df[label])
ycor = svmcor.predict(ts_df[featcor])
mse = ((yy-ycor)*(yy-ycor)).sum()/len(ycor)
acc = 1.0*(yy==ycor).sum()/len(ycor)
mse,acc,roc_auc_score(yy,ycor)

(0.3332, 0.66679999999999995, 0.66231908059434763)

In [145]:
pcacor = PCA()
svmpc = SVC()
Xpc = pcacor.fit_transform(tr_df[featcor])
svmpc.fit(Xpc,tr_df[label])
ypc = svmpc.predict(pcacor.transform(ts_df[featcor]))
mse = ((yy-ypc)*(yy-ypc)).sum()/len(ypc)
acc = 1.0*(yy==ypc).sum()/len(ypc)
mse,acc,roc_auc_score(yy,ypc)

(0.3332, 0.66679999999999995, 0.66231908059434763)

In [154]:
pcmax = Xpc.max(axis=0)
pcmin = Xpc.min(axis=0)
Xnorm = 2.0*(Xpc-pcmin)/(pcmax-pcmin)-1

In [175]:
svmnorm = SVC()
svmnorm.fit(Xnorm,tr_df[label])
Xtest = pcacor.transform(ts_df[featcor])
Xtest = 2.0*(Xtest-pcmin)/(pcmax-pcmin)-1
ynorm = svmnorm.predict(Xtest)
mse = ((yy-ynorm)*(yy-ynorm)).sum()/len(ynorm)
acc = 1.0*(yy==ynorm).sum()/len(ynorm)
mse,acc,roc_auc_score(yy,ynorm)

(0.3613, 0.63870000000000005, 0.62841217786267978)

In [176]:
X = tr_featcoratcor].as_matrix()
pcmax = X.max(axis=0)
pcmin = X.min(axis=0)
X = 2.0*(X-pcmin)/(pcmax-pcmin)-1
svmn = SVC()
svmn.fit(X,tr_dlabelbel])
X = ts_df[featcor].as_matrix()
X = 2.0*(X-pcmin)/(pcmax-pcmin)-1
yn = svmn.predict(X)
mse = ((yy-yn)*(yy-yn)).sum()/len(yn)
acc = 1.0*(yy==yn).sum()/len(yn)
mse,acc,roc_auc_score(yy,yn)

(0.3598, 0.64019999999999999, 0.63050043777553544)

In [178]:
from sklearn.ensemble import GradientBoostingClassifier as GBC
gbc = GBC()
gbc.fit(tr_df[featcor],tr_df[label])
ygbc = gbc.predict(ts_df[featcor])
mse = ((yy-ygbc)*(yy-ygbc)).sum()/len(ygbc)
acc = 1.0*(yy==ygbc).sum()/len(ygbc)
mse,acc,roc_auc_score(yy,ygbc)

(0.3167, 0.68330000000000002, 0.68339120058335834)

In [179]:
gbc = GBC()
gbc.fit(tr_df[feat],tr_df[label])
ygbc = gbc.predict(ts_df[feat])
mse = ((yy-ygbc)*(yy-ygbc)).sum()/len(ygbc)
acc = 1.0*(yy==ygbc).sum()/len(ygbc)
mse,acc,roc_auc_score(yy,ygbc)

(0.2863, 0.7137, 0.7129647995090751)

In [180]:
gbc = GBC()
gbc.fit(tr_df[feat7],tr_df[label])
ygbc = gbc.predict(ts_df[feat7])
mse = ((yy-ygbc)*(yy-ygbc)).sum()/len(ygbc)
acc = 1.0*(yy==ygbc).sum()/len(ygbc)
mse,acc,roc_auc_score(yy,ygbc)

(0.3069, 0.69310000000000005, 0.69171439190282147)

In [181]:
gbc = GBC()
gbc.fit(tr_df[feat25],tr_df[label])
ygbc = gbc.predict(ts_df[feat25])
mse = ((yy-ygbc)*(yy-ygbc)).sum()/len(ygbc)
acc = 1.0*(yy==ygbc).sum()/len(ygbc)
mse,acc,roc_auc_score(yy,ygbc)

(0.3197, 0.68030000000000002, 0.67811253912378133)

In [192]:
gbc = GBC()
gbc.fit(df[:20000][feat],df[:20000][label])
ygbc = gbc.predict(ts_df[feat])
mse = ((yy-ygbc)*(yy-ygbc)).sum()/len(ygbc)
acc = 1.0*(yy==ygbc).sum()/len(ygbc)
mse,acc,roc_auc_score(yy,ygbc)

(0.2915, 0.70850000000000002, 0.70725831195807631)

In [193]:
gbc = GBC()
gbc.fit(df[:30000][feat],df[:30000][label])
ygbc = gbc.predict(ts_df[feat])
mse = ((yy-ygbc)*(yy-ygbc)).sum()/len(ygbc)
acc = 1.0*(yy==ygbc).sum()/len(ygbc)
mse,acc,roc_auc_score(yy,ygbc)

(0.291, 0.70899999999999996, 0.70816900703265218)

In [210]:
from sklearn.ensemble import GradientBoostingRegressor as GBR
gbr = GBR()
gbr.fit(tr_df[feat7],tr_df[label])
ygbr = gbr.predict(ts_df[feat7]).round()
mse = ((yy-ygbr)*(yy-ygbr)).sum()/len(ygbr)
acc = 1.0*(yy==ygbr).sum()/len(ygbr)
mse,acc,roc_auc_score(yy,ygbr)

(0.3067, 0.69330000000000003, 0.69196954699860569)

In [211]:
gbr = GBR()
gbr.fit(tr_df[feat],tr_df[label])
ygbr = gbr.predict(ts_df[feat]).round()
mse = ((yy-ygbr)*(yy-ygbr)).sum()/len(ygbr)
acc = 1.0*(yy==ygbr).sum()/len(ygbr)
mse,acc,roc_auc_score(yy,ygbr)

(0.2869, 0.71309999999999996, 0.7122866325689593)

In [212]:
gbr = GBR()
gbr.fit(tr_df[feat25],tr_df[label])
ygbr = gbr.predict(ts_df[feat25]).round()
mse = ((yy-yg
        br)*(yy-ygbr)).sum()/len(ygbr)
acc = 1.0*(yy==ygbr).sum()/len(ygbr)
mse,acc,roc_auc_score(yy,ygbr)

(0.3206, 0.6794, 0.67724806082127209)

In [215]:
from sklearn.linear_model.logistic import LogisticRegression as LR

In [220]:
lr = LR()
lr.fit(tr_df[feat7],tr_df[label])
y = lr.predict(ts_df[feat7])
acc = 1.0*(yy==y).sum()/len(y)
acc,roc_auc_score(yy,y)

(0.62660000000000005, 0.61727473816914824)

In [223]:
lr = LR()
lr.fit(tr_df[feat],tr_df[label])
y = lr.predict(ts_df[feat])
acc = 1.0*(yy==y).sum()/len(y)
acc,roc_auc_score(yy,y)

(0.64480000000000004, 0.63948992089229173)

In [224]:
lr = LR()
lr.fit(tr_df[feat25],tr_df[label])
y = lr.predict(ts_df[feat25])
acc = 1.0*(yy==y).sum()/len(y)
acc,roc_auc_score(yy,y)

(0.6109, 0.60583303805951183)

In [225]:
lr = LR()
lr.fit(tr_df[featcor],tr_df[label])
y = lr.predict(ts_df[featcor])
acc = 1.0*(yy==y).sum()/len(y)
acc,roc_auc_score(yy,y)

(0.63519999999999999, 0.62920668910747712)

In [227]:
from sklearn.linear_model import LinearRegression as LNR

In [233]:
lr = LNR()
lr.fit(tr_df[feat7],tr_df[label])
y = lr.predict(ts_df[feat7]).round()
acc = 1.0*(yy==y).sum()/len(y)
acc,roc_auc_score(yy,y)

(0.62629999999999997, 0.61603167951107785)

In [234]:
lr = LNR()
lr.fit(tr_df[feat],tr_df[label])
y = lr.predict(ts_df[feat]).round()
acc = 1.0*(yy==y).sum()/len(y)
acc,roc_auc_score(yy,y)

(0.64180000000000004, 0.63585911603362522)

In [235]:
lr = LNR()
lr.fit(tr_df[feat25],tr_df[label])
y = lr.predict(ts_df[feat25]).round()
acc = 1.0*(yy==y).sum()/len(y)
acc,roc_auc_score(yy,y)

(0.6109, 0.6057191811158269)

In [236]:
lr = LNR()
lr.fit(tr_df[featcor],tr_df[label])
y = lr.predict(ts_df[featcor]).round()
acc = 1.0*(yy==y).sum()/len(y)
acc,roc_auc_score(yy,y)

(0.63600000000000001, 0.62930297404928248)

In [241]:
pca = PCA()
X = pca.fit_transform(tr_df[feat])
Xt = pca.transform(ts_df[feat])
lr = LR()
lr.fit(X,tr_df[label])
y = lr.predict(Xt).round()
acc = 1.0*(yy==y).sum()/len(y)
acc,roc_auc_score(yy,y)

(0.64480000000000004, 0.63948992089229173)

In [242]:
pca = PCA()
X = pca.fit_transform(tr_df[feat])
Xt = pca.transform(ts_df[feat])
lr = LNR()
lr.fit(X,tr_df[label])
y = lr.predict(Xt).round()
acc = 1.0*(yy==y).sum()/len(y)
acc,roc_auc_score(yy,y)

(0.64180000000000004, 0.63585911603362522)

In [243]:
gbc = GBC()
gbc.fit(X,tr_df[label])
y = gbc.predict(Xt)
acc = 1.0*(yy==y).sum()/len(y)
acc,roc_auc_score(yy,y)

(0.68559999999999999, 0.6809457235364047)