# Understand the Combined Data
Let's understand what we have to work with.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the combined dataset produced by the preprocessing step.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
data = pd.read_pickle("../processed_data/combined_dataset.pkl")
# Preview up to five random rows. Sampling the rows directly avoids
# shuffling the entire frame just to keep the first five.
data.sample(n=min(5, len(data)))

## Histograms

### Bulk Data

In [None]:
# Histogram of every activity value in the dataset, using 50 log-spaced bin
# edges between the smallest and largest value, drawn on a log-scaled x axis.
activity = data.acvalue_target
log_lo = np.log10(activity.min())
log_hi = np.log10(activity.max())
plt.hist(activity, bins=np.logspace(log_lo, log_hi, 50), label='All Labels')
plt.xscale("log")
plt.xlabel('activity on target')
plt.show()

### Discriminated by target

In [None]:
# Number of rows available for each target.
data.groupby(by='target').size()

In [None]:
# Draw one log-binned histogram of the raw activity values per target,
# stacked vertically with shared axes so the panels are directly comparable.
targets = data.groupby('target').acvalue_target.apply(list).to_dict()

# The bin edges are identical for every panel, so compute them once:
# 50 log-spaced edges spanning the full range of the activity column.
bins = np.logspace(np.log10(data.acvalue_target.min()),
                   np.log10(data.acvalue_target.max()), 50)

# squeeze=False always yields a 2-D array of Axes, so this also works when
# there is a single target (plt.subplots would otherwise return a bare Axes,
# which cannot be iterated).
fig, axes = plt.subplots(len(targets), 1, sharey=True, sharex=True, squeeze=False)
axes = axes[:, 0]
fig.set_figheight(10)

# Index the colour cycle modulo its length so that having more targets than
# colours wraps around instead of exhausting an iterator.
colors = plt.rcParams["axes.prop_cycle"].by_key()["color"]
for i, (ax, (name, acvalues)) in enumerate(zip(axes, targets.items())):
    ax.hist(acvalues, bins=bins, color=colors[i % len(colors)])
    ax.set_xscale('log')
    ax.set_ylabel(name)

# The x axis is shared, so only the bottom panel carries the label.
axes[-1].set_xlabel('activity')
plt.suptitle('Activity Values by Target')
plt.show()

ST14, TMPRSS11D, and TMPRSS2 appear to have similar activity distributions.

KLKB1 is somewhat flat and skewed right, but also similar.

TMPRSS6 is notably skewed right.

### Scaled Activity Values

In [None]:
# Draw one log-binned histogram per target of the activity values scaled to
# TMPRSS2, stacked vertically with shared axes, and mark each panel's median
# and mean with vertical lines.
targets = data.groupby('target').acvalue_scaled_to_tmprss2.apply(list).to_dict()

# The bin edges are identical for every panel, so compute them once.
# NOTE(review): the edges are derived from the raw `acvalue_target` range, not
# from the scaled column being plotted; scaled values falling outside that
# range are not counted by hist. Behaviour is kept as-is (presumably so the x
# range matches the raw-activity figure) -- confirm this is intended.
bins = np.logspace(np.log10(data.acvalue_target.min()),
                   np.log10(data.acvalue_target.max()), 50)

# squeeze=False always yields a 2-D array of Axes, so this also works when
# there is a single target (plt.subplots would otherwise return a bare Axes,
# which cannot be iterated).
fig, axes = plt.subplots(len(targets), 1, sharey=True, sharex=True, squeeze=False)
axes = axes[:, 0]
fig.set_figheight(10)

# Index the colour cycle modulo its length so that having more targets than
# colours wraps around instead of exhausting an iterator.
colors = plt.rcParams["axes.prop_cycle"].by_key()["color"]
for i, (ax, (name, acvalues)) in enumerate(zip(axes, targets.items())):
    ax.hist(acvalues, bins=bins, color=colors[i % len(colors)])
    ax.set_xscale('log')
    ax.set_ylabel(name)
    ax.axvline(np.median(acvalues), color='b', label='median')
    ax.axvline(np.mean(acvalues), color='g', label='mean')

# The x axis is shared, so only the bottom panel carries the label; the
# legend is likewise drawn once, on the bottom panel.
axes[-1].set_xlabel('activity')
axes[-1].legend()
plt.suptitle('Activity values scaled to TMPRSS2')
plt.show()

It appears that the linear regression and scaling are just a fancy way of bringing the distribution medians in line with one another.  The approach does have more scientific merit, though, since the scaling only takes into account values from compounds which appear in both datasets.  Even so, it suggests that if we would like to utilize all the data, including KLKB1 and ST14, it might be smart to scale the activities so that their medians match.  Definitely don't mean-scale the data, though: it appears to be mainly log-normally distributed, so the mean is pulled toward the long tail and is not a good measure of the distribution's center.