In [None]:
import pandas as pd
import numpy as np
import operator
from itertools import combinations
import matplotlib.pyplot as plt
from collections import Counter

import time

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import DRSA

# Seed NumPy's global RNG so that np.random.shuffle in the data-split cell
# produces the same train/validation/test partition on every fresh run.
np.random.seed(314)

In [None]:
# Cutoff applied to the "IC50_nM" column: rows strictly below it become class 1.
threshold = 1000
# Forwarded unchanged to DRSA.DRSA as its `credibility` keyword argument.
credibility = 0.9

## Data loading

In [None]:
# Read the feature table. The first CSV column is consumed as the index and then
# discarded, leaving a plain 0..n-1 index; the split cell relies on that when it
# selects rows with df.loc[<shuffled integer positions>].
df = pd.read_csv("data/data.csv", index_col=0)
df = df.reset_index(drop=True)
df

In [None]:
# Load the raw "IC50_nM" measurements (same index handling as the feature table)
# and binarise them: 1 where the value is strictly below `threshold`, else 0.
target = (
    pd.read_csv("data/target.csv", index_col=0)
    .reset_index(drop=True)["IC50_nM"]
    .lt(threshold)
    .astype(int)
)
target

## Exploratory data analysis

In [None]:
# Standardise the features, then project them onto the first two principal
# components. The scaled matrix is computed once and reused for both fitting
# the PCA and transforming the data.
ss = StandardScaler()
scaled = ss.fit_transform(df)

pca = PCA(n_components=2)
pca.fit(scaled)

# Keep df's index so the projected rows line up with `target`.
df2 = pd.DataFrame(pca.transform(scaled), index=df.index)
df2

In [None]:
# Per-component share of variance captured by the 2-D projection, and their total.
varianceRatio = pca.explained_variance_ratio_
varianceRatio, varianceRatio.sum()

In [None]:
# Samples in the plane of the first two principal components, coloured by the
# binarised target (1 = IC50_nM below `threshold`, 0 = otherwise).
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(df2[0], df2[1], alpha=.25, c=target.values)
ax.set(xlabel="PC 1", ylabel="PC 2", title="PCA projection coloured by target class")
plt.show()

In [None]:
q = 120  # moving-average window, in samples, for the smoothed target curves
maxLevels = 40  # features with more distinct values than this are treated as continuous

# One figure per feature showing how the mean of the 0/1 target varies with it.
for x in df:
    values = df[x]
    if len(set(values)) > maxLevels:
        # Many distinct values: order the samples by feature value and smooth
        # the target with a q-sample moving average along that ordering.
        order = values.argsort().to_numpy()
        smoothed = np.convolve(target.iloc[order], np.ones(q) / q, mode='valid')
        # mode='valid' yields one point per full window; pair each point with
        # the feature value q//2 samples into its window. The slice length is
        # taken from `smoothed` so x and y always match; a fixed slice
        # [q//2:-(q//2-1)] only has the right length for even q greater than 2.
        start = q // 2
        plt.plot(values.iloc[order[start:start + len(smoothed)]], smoothed)
    else:
        # Few distinct values: one bar per value, height = mean target at that value.
        levels = list(set(values))
        rates = [target[values == level].mean() for level in levels]
        plt.bar(levels, rates)
    plt.grid()
    plt.title(x)
    plt.show()

In [None]:
# Append a copy of every feature under the name "<feature>-", placed after all
# the original columns (the model cell passes one direction flag per column:
# 1s for the first half, -1s for the second half).
# Columns that are themselves copies from an earlier run of this cell are
# filtered out first, so re-running the cell does not keep doubling the frame.
baseColumns = [c for c in df.columns if not (c.endswith("-") and c[:-1] in df.columns)]
df = pd.concat([df[baseColumns], df[baseColumns].add_suffix("-")], axis=1)
df

## Data split

In [None]:
# Shares of the shuffled rows assigned to each partition.
trainFraction = 0.7
valFraction = 0.15
# NOTE: the split cell does not read testFraction; the test set is whatever
# remains after the train and validation slices.
testFraction = 0.15

In [None]:
# Shuffle the row positions once (seeded global RNG), then cut the permutation
# into consecutive train / validation / test slices.
size = df.shape[0]
idx = np.arange(size)
np.random.shuffle(idx)

trainEnd = int(trainFraction*size)
valEnd = int((trainFraction+valFraction)*size)
trainIdx, valIdx, testIdx = idx[:trainEnd], idx[trainEnd:valEnd], idx[valEnd:]

# Feature frames get a fresh 0..k-1 index; the label Series keep the shuffled
# original labels, so they are aligned with the features by position only.
trainX, trainY = df.loc[trainIdx].reset_index(drop=True), target[trainIdx]
valX, valY = df.loc[valIdx].reset_index(drop=True), target[valIdx]
testX, testY = df.loc[testIdx].reset_index(drop=True), target[testIdx]


## Model fit

In [None]:
# One flag per column of trainX: eight 1s followed by eight -1s.
# NOTE(review): presumably these are preference directions (gain / cost) for the
# original features and their "-" copies respectively; confirm against DRSA.
nFlags = 8
criteria = [1] * nFlags + [-1] * nFlags
drsa = DRSA.DRSA(criteria, credibility=credibility)
# Labels are re-indexed to 0..k-1 so they share trainX's index.
drsa.fit(trainX, trainY.reset_index(drop=True))

## Prediction

In [None]:
# Predict labels for the held-out test features with the fitted DRSA model.
pred2 = drsa.predict(testX)

In [None]:
# Test accuracy: fraction of predictions equal to the true label. `.values`
# strips testY's shuffled index so the comparison is purely positional.
hits = pred2 == testY.values
hits.mean()