In [30]:
import pickle
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler

In [33]:
def load_data(field='absolute'):
    infile = '/userdata/pdaly/supereeg/results/samp100_2h_54pid_pos-1.18.power.pkl'
    data = pickle.load(open(infile, 'rb'))
    labels = pd.read_csv('/home/kscangos/Sandbox/full_patient_list_pd_feb.csv', index_col=0)
    labels['pid'] = labels.index.map(lambda pid: pid[2:])
    df = data[field]
    df = df.reset_index().merge(labels[['pid', 'dep']], left_on='index',
                           right_on='pid').set_index('index').drop('pid', axis=1)
    df.rename(columns={'Dep': 'dep'}, inplace=True)
    return df.drop('dep', axis=1), df.dep

X, y = load_data(field='relative')
boundary = ['92','111','119','131','135','27','115','130', '144','158','162','170','183']

zscore = StandardScaler()
zscore.fit(X.drop(boundary))

Xb = pd.DataFrame(zscore.transform(X.loc[boundary]), columns=X.columns, index=boundary)
yb = y[boundary]

Xnob = pd.DataFrame(zscore.transform(X.drop(boundary)), columns=X.columns, index=X.drop(boundary).index)
ynob = y[Xnob.index]

In [34]:
pipe = Pipeline([('pca', PCA()),
                 ('clf', LogisticRegression(C=0.75, penalty='l1', class_weight='balanced', solver='liblinear',
                                            max_iter=100))])

In [35]:
pipe.fit(Xnob, ynob)

Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('clf',
                 LogisticRegression(C=0.75, class_weight='balanced', dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l1', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [36]:
accuracy_score(ynob, pipe.predict(Xnob))

0.8048780487804879

In [37]:
yb_hat = pipe.predict(Xb)

In [38]:
accuracy_score(yb, yb_hat)

0.5384615384615384

In [39]:
pd.DataFrame(list(zip(yb, yb_hat)), columns=['actual', 'pred'])

Unnamed: 0,actual,pred
0,0,1
1,0,1
2,0,0
3,0,1
4,0,0
5,0,0
6,0,0
7,0,1
8,0,1
9,0,0


In [89]:
# components x features
pca = pipe.named_steps['pca']
b = pipe.named_steps['clf'].coef_

In [90]:
U = pca.components_
U = pd.DataFrame(U, index=['pc_' + str(i) for i in range(U.shape[0])], columns=X.columns)
U[U.abs() < 0.2] = 0

In [91]:
Xb_pca = pca.transform(Xb)
Ub = U.T.dot(b.squeeze())
Xb_log_odds = Xb.multiply(Ub)
Xb_log_odds['label'] = yb

In [92]:
Xb_log_odds

Unnamed: 0,alpha_0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,beta_0,beta_1,beta_2,beta_3,...,gammaL_3,gammaL_4,gammaL_5,theta_0,theta_1,theta_2,theta_3,theta_4,theta_5,label
92,-0.051176,0.072562,-0.086846,0.00602,0.098823,0.029639,-0.392592,0.432602,-0.017354,-0.016721,...,-0.010039,-0.03577,0.026597,0.473993,0.133634,0.421442,0.142818,0.397189,0.130266,0
111,0.419771,-0.426474,0.405766,-0.035155,-0.508529,-0.338999,0.435281,-0.535083,0.022638,0.021215,...,0.018694,0.050701,-0.045805,0.131356,0.020216,0.141765,0.024666,0.135778,0.025501,0
119,0.228955,-0.368656,0.253872,-0.020661,-0.352534,-0.311843,0.073986,-0.15434,0.005573,0.009694,...,0.004608,0.008739,-0.0286,0.069764,0.056305,0.179466,0.031766,0.131725,0.045208,0
131,0.11225,-0.084395,0.074328,-0.018682,-0.143263,-0.105771,-0.019959,0.073394,-0.001764,0.002111,...,0.002966,0.00228,-0.012963,0.297343,0.071476,0.366567,0.114791,0.2939,0.072855,0
135,0.232445,-0.337757,0.364945,-0.035589,-0.472627,-0.31528,0.096889,-0.2285,0.011724,0.011211,...,0.005964,0.021085,-0.018732,-0.08196,0.026044,0.066804,0.010997,0.089686,0.035005,0
27,0.071557,0.059259,0.161604,-0.010009,-0.201962,-0.010861,0.076367,-0.080572,0.007921,0.008052,...,0.007645,0.016406,-0.011807,-0.240918,-0.044681,-0.311188,-0.084876,-0.313163,-0.065227,0
115,0.473539,-0.514645,0.423108,-0.046545,-0.623196,-0.42474,0.408168,-0.625838,0.020274,0.021024,...,0.013807,0.039845,-0.042023,-0.308293,-0.050408,-0.139388,-0.071585,-0.150049,-0.068685,0
130,0.035205,-0.013749,-0.001445,-0.000157,-0.153997,-0.104989,-0.156763,0.225272,-0.010067,-0.008503,...,-0.008819,-0.030068,0.025421,0.126284,0.033256,0.14042,0.030271,0.139374,0.038021,0
144,0.447243,-0.255267,0.345121,-0.032919,-0.263891,-0.263597,0.46535,-0.462343,0.023216,0.022721,...,0.016899,0.036921,-0.03666,-0.100894,-0.039395,-0.14567,-0.034792,-0.052926,-0.008084,0
158,0.096077,0.07291,0.105251,-0.023774,-0.210056,-0.088274,0.073437,-0.007834,0.005462,0.007386,...,0.003401,0.002276,-0.002234,-0.107336,-0.069893,-0.082565,0.005789,-0.052914,-0.054757,0


In [93]:
Xb_log_odds.to_csv('/home/pdaly/temp/boundary_pca_l1_gt-02.csv')

In [94]:
Xb_log_odds

Unnamed: 0,alpha_0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,beta_0,beta_1,beta_2,beta_3,...,gammaL_3,gammaL_4,gammaL_5,theta_0,theta_1,theta_2,theta_3,theta_4,theta_5,label
92,-0.051176,0.072562,-0.086846,0.00602,0.098823,0.029639,-0.392592,0.432602,-0.017354,-0.016721,...,-0.010039,-0.03577,0.026597,0.473993,0.133634,0.421442,0.142818,0.397189,0.130266,0
111,0.419771,-0.426474,0.405766,-0.035155,-0.508529,-0.338999,0.435281,-0.535083,0.022638,0.021215,...,0.018694,0.050701,-0.045805,0.131356,0.020216,0.141765,0.024666,0.135778,0.025501,0
119,0.228955,-0.368656,0.253872,-0.020661,-0.352534,-0.311843,0.073986,-0.15434,0.005573,0.009694,...,0.004608,0.008739,-0.0286,0.069764,0.056305,0.179466,0.031766,0.131725,0.045208,0
131,0.11225,-0.084395,0.074328,-0.018682,-0.143263,-0.105771,-0.019959,0.073394,-0.001764,0.002111,...,0.002966,0.00228,-0.012963,0.297343,0.071476,0.366567,0.114791,0.2939,0.072855,0
135,0.232445,-0.337757,0.364945,-0.035589,-0.472627,-0.31528,0.096889,-0.2285,0.011724,0.011211,...,0.005964,0.021085,-0.018732,-0.08196,0.026044,0.066804,0.010997,0.089686,0.035005,0
27,0.071557,0.059259,0.161604,-0.010009,-0.201962,-0.010861,0.076367,-0.080572,0.007921,0.008052,...,0.007645,0.016406,-0.011807,-0.240918,-0.044681,-0.311188,-0.084876,-0.313163,-0.065227,0
115,0.473539,-0.514645,0.423108,-0.046545,-0.623196,-0.42474,0.408168,-0.625838,0.020274,0.021024,...,0.013807,0.039845,-0.042023,-0.308293,-0.050408,-0.139388,-0.071585,-0.150049,-0.068685,0
130,0.035205,-0.013749,-0.001445,-0.000157,-0.153997,-0.104989,-0.156763,0.225272,-0.010067,-0.008503,...,-0.008819,-0.030068,0.025421,0.126284,0.033256,0.14042,0.030271,0.139374,0.038021,0
144,0.447243,-0.255267,0.345121,-0.032919,-0.263891,-0.263597,0.46535,-0.462343,0.023216,0.022721,...,0.016899,0.036921,-0.03666,-0.100894,-0.039395,-0.14567,-0.034792,-0.052926,-0.008084,0
158,0.096077,0.07291,0.105251,-0.023774,-0.210056,-0.088274,0.073437,-0.007834,0.005462,0.007386,...,0.003401,0.002276,-0.002234,-0.107336,-0.069893,-0.082565,0.005789,-0.052914,-0.054757,0


In [100]:
probas = pd.DataFrame(list(zip(pipe.predict_proba(Xb)[:, 1], yb, pipe.predict(Xb))), 
                      index=Xb.index,
                      columns=['proba', 'actual', 'pred'])
probas['match'] = probas.actual == probas.pred
probas

Unnamed: 0,proba,actual,pred,match
92,0.855867,0,1,False
111,0.637771,0,1,False
119,0.24423,0,0,True
131,0.398464,0,0,True
135,0.403001,0,0,True
27,0.315845,0,0,True
115,0.197028,0,0,True
130,0.517533,0,1,False
144,0.59243,0,1,False
158,0.492899,0,0,True


In [101]:
probas.to_csv('/home/pdaly/temp/boundary_pca_l1_gt-02_probas.csv')

In [103]:
720/85

8.470588235294118