In [1]:
import sys

In [2]:
# Add the parent directory to the module search path; the project-local
# imports in the cells below (helpers, train, hmm_helpers) are presumably
# resolved from there -- confirm against the repository layout.
sys.path.append("../")

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from helpers import read_configuration_file
from train import main
from train import load_regions
from hmm_helpers import build_hmm
from helpers import WindowType

In [5]:
# let's plot the clusters and see what we have
def plot_cluster(filename, nbins=35, kde=False, rug=True):
    """Plot the distribution of the numbers stored in a cluster file.

    The file is expected to contain one comma-separated list of numbers,
    optionally wrapped in square brackets, e.g. ``[1.0, 2.5, 3.0]``.
    Leading/trailing whitespace (such as a final newline) is ignored.

    Parameters
    ----------
    filename : str
        Path of the cluster file to read.
    nbins : int
        Number of histogram bins; forwarded to ``sns.distplot`` as ``bins``.
    kde : bool
        Forwarded unchanged to ``sns.distplot``.
    rug : bool
        Forwarded unchanged to ``sns.distplot``.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    ValueError
        If the file holds no values or an item cannot be converted to float.
    """
    # Read everything first so the file handle is released before plotting.
    with open(filename) as file:
        context = file.read()

    # Strip whitespace before the brackets: slicing off a fixed first/last
    # character would leave the closing bracket behind whenever the file
    # ends with a newline.
    arraystr = context.strip().strip('[]')
    cluster_0_counts = [float(item) for item in arraystr.split(',') if item.strip()]

    if not cluster_0_counts:
        raise ValueError("No values found in cluster file: {0}".format(filename))

    sns.distplot(cluster_0_counts, bins=nbins, kde=kde, rug=rug)
    plt.show()

In [6]:
def print_hmm_info(hmm_file):
    """Print a readable summary of an HMM that was serialized to JSON.

    The start and end state distributions are printed first, followed by
    one section per state with its name, weight and (if present) its
    distribution. Mixture models and independent-components distributions
    are expanded so that every component and the weights are shown.

    Parameters
    ----------
    hmm_file : str
        Path of the JSON file holding the model. The file may contain the
        model object directly, or a JSON string that itself contains the
        model JSON (double-encoded).

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    KeyError
        If an expected key is missing from the model description.
    """
    # Imported locally because none of the notebook's import cells bring
    # ``json`` into scope; without this the function fails with a NameError.
    import json

    def _print_components(components):
        # Print the name and parameters of each component distribution.
        for component in components:
            print("Distribution name: ", component["name"])
            print("Distribution params: ", component["parameters"])

    with open(hmm_file) as json_file:
        hmm_json_map = json.load(json_file)

    # A double-encoded file decodes to a string first; decode once more.
    # A plainly encoded file already yields the mapping and is used as is.
    if isinstance(hmm_json_map, str):
        hmm_json_map = json.loads(hmm_json_map)

    print("Start state: ")
    print("Distribution: ", hmm_json_map["start"]["distribution"])
    print("End state:")
    print("Distribution: ", hmm_json_map["end"]["distribution"])

    for state in hmm_json_map["states"]:
        print("====================================================")
        print("State: ", state["name"])
        print("State weight: ", state["weight"])
        distribution = state["distribution"]

        # States without a distribution have nothing more to report.
        if distribution is None:
            continue

        print("Distribution: ", distribution["class"])

        if distribution["class"] == "Distribution":
            print("Distribution name: ", distribution["name"])

            if distribution["name"] == "IndependentComponentsDistribution":
                # parameters[0] holds the component distributions,
                # parameters[1] is printed as the weights of plain components.
                for dist in distribution["parameters"][0]:
                    if dist["class"] == "GeneralMixtureModel":
                        _print_components(dist["distributions"])
                        print("Weights: ", dist["weights"])
                    else:
                        _print_components([dist])
                        print("Weights: ", distribution["parameters"][1])
        elif distribution["class"] == "GeneralMixtureModel":
            _print_components(distribution["distributions"])
            print("Weights: ", distribution["weights"])

In [7]:
# Apply seaborn's default plot styling; color_codes=True lets matplotlib's
# single-letter color codes ("b", "g", ...) map to the seaborn palette.
sns.set(color_codes=True)

In [8]:
# Directory prefix for every data/model file used in this notebook.
ffile = "../"

# One (WGA, no-WGA) pair of window-mean files per cluster, clusters 0..2.
cluster_files = [
    (ffile + "cluster_{0}_wga_w_mean.txt".format(cluster_id),
     ffile + "cluster_{0}_no_wga_w_mean.txt".format(cluster_id))
    for cluster_id in range(3)
]

In [9]:
# For every cluster plot the first file of the pair, then the second one.
for i, files in enumerate(cluster_files):
    print("Cluster: ", i)
    for filename in files[:2]:
        plot_cluster(filename=filename)

Cluster:  0


FileNotFoundError: [Errno 2] No such file or directory: '../cluster_0_wga_w_mean.txt'

In [10]:
# Load the run configuration from ../config.json; the path is relative,
# so it depends on the directory the kernel was started in.
configuration=read_configuration_file("../config.json")

In [11]:
# Per-state cluster settings used for the HMM training.
# NOTE(review): this is presumably a reference into `configuration` (plain
# nested dicts), so the edits in the following cells change `configuration`
# itself -- confirm against read_configuration_file.
clusters_config = configuration["clusters"]

In [12]:
# Show the cluster settings exactly as they were loaded, before any edits.
print(clusters_config)

{'normal': {'filename': '/home/a/ag568/cluster_0.txt', 'state': 'normal', 'distributions': {'type': 'distribution', 'name': 'normal'}}, 'tuf': {'filename': '/home/a/ag568/cluster_1.txt', 'state': 'tuf', 'distributions': {'type': 'gmm', 'uniform': {'params': [2.0, 8.0]}, 'names': ['normal', 'uniform'], 'weights': None}}}


In [13]:
# Settings of the cluster registered under the 'normal' key.
normal_state_cluster = clusters_config['normal']


In [14]:
# Point the 'normal' cluster at its data file and label its state ...
normal_state_cluster.update(filename="cluster_0.txt", state="normal")
# ... and model it with a single normal distribution.
normal_state_cluster['distributions'].update(type='distribution', name='normal')


In [15]:
# Settings of the cluster registered under the 'tuf' key.
tuf_state_cluster = clusters_config['tuf']

In [16]:
# Point the 'tuf' cluster at its data file and label its state ...
tuf_state_cluster.update(filename="cluster_1.txt", state="tuf")
# ... model it with a mixture ('gmm') ...
tuf_state_cluster['distributions'].update(type='gmm')
# ... and set the parameters of the mixture's uniform component.
tuf_state_cluster['distributions']['uniform'].update(params=[2.0, 15.0])

In [17]:
# Definition of one additional cluster, for a 'delete' state that is
# modelled with a single Poisson distribution.
# NOTE(review): the filename is an absolute path into a user's home
# directory and names cluster_0.txt, the same file name the 'normal'
# cluster was loaded with -- confirm this is the intended data file.
extra_cluster = dict(
    filename='/home/a/ag568/cluster_0.txt',
    state='delete',
    distributions=dict(type='distribution', name='poisson'),
)



In [18]:
# Register the additional cluster under the 'delete' key.
clusters_config['delete'] = extra_cluster


In [19]:
# Show the cluster settings again, now including the edits made above.
print(clusters_config)


{'normal': {'filename': 'cluster_0.txt', 'state': 'normal', 'distributions': {'type': 'distribution', 'name': 'normal'}}, 'tuf': {'filename': 'cluster_1.txt', 'state': 'tuf', 'distributions': {'type': 'gmm', 'uniform': {'params': [2.0, 15.0]}, 'names': ['normal', 'uniform'], 'weights': None}}, 'delete': {'filename': '/home/a/ag568/cluster_0.txt', 'state': 'delete', 'distributions': {'type': 'distribution', 'name': 'poisson'}}}


In [20]:
# State entries of the HMM section of the configuration.
hmm_states = configuration["HMM"]["states"]

In [21]:
# Show the configured HMM states before adding the new one.
print(hmm_states)

{'tuf': {'name': 'tuf', 'start_prob': 0.48}, 'normal': {'name': 'normal', 'start_prob': 0.48}, 'gap_state': {'name': 'gap_state', 'start_prob': 0.04}}


In [22]:
# Add an entry for the new 'delete' state.
# NOTE(review): with this entry the start probabilities add up to 1.04
# (0.48 + 0.48 + 0.04 + 0.04) -- confirm that the model builder normalizes
# them, or rebalance the values.
hmm_states['delete'] = { "name":"delete", "start_prob":0.04}

In [23]:
# Show the HMM states again, now including 'delete'.
print(hmm_states)

{'tuf': {'name': 'tuf', 'start_prob': 0.48}, 'normal': {'name': 'normal', 'start_prob': 0.48}, 'gap_state': {'name': 'gap_state', 'start_prob': 0.04}, 'delete': {'name': 'delete', 'start_prob': 0.04}}


In [24]:
# The new state also needs transition probabilities. Fetch the transition
# table; its keys look like '<from state>-<to state>'.
hmm_transitions =  configuration["HMM"]["transitions"]

In [25]:

# Show the configured transitions before adding the new ones.
print(hmm_transitions)

{'tuf-tuf': 0.95, 'normal-normal': 0.95, 'tuf-normal': 0.05, 'normal-tuf': 0.05, 'gap_state-gap_state': 0.95, 'gap_state-tuf': 0.05, 'tuf-gap_state': 0.05, 'normal-gap_state': 0.05, 'gap_state-normal': 0.05}


In [26]:
# Transitions into, out of and within the new 'delete' state.
# NOTE(review): after this update the outgoing probabilities of a state add
# up to more than 1 (e.g. 'normal': 0.95 + 0.05 + 0.05 + 0.05) -- confirm
# that the model builder normalizes them.
hmm_transitions.update({
    "delete-delete": 0.95,
    "delete-normal": 0.05,
    "normal-delete": 0.05,
    "tuf-delete": 0.05,
    "delete-tuf": 0.05,
    "gap_state-delete": 0.05,
    "delete-gap_state": 0.05,
})


In [27]:
# Show the transitions again, now including those of the 'delete' state.
print(hmm_transitions)

{'tuf-tuf': 0.95, 'normal-normal': 0.95, 'tuf-normal': 0.05, 'normal-tuf': 0.05, 'gap_state-gap_state': 0.95, 'gap_state-tuf': 0.05, 'tuf-gap_state': 0.05, 'normal-gap_state': 0.05, 'gap_state-normal': 0.05, 'delete-delete': 0.95, 'delete-normal': 0.05, 'normal-delete': 0.05, 'tuf-delete': 0.05, 'delete-tuf': 0.05, 'gap_state-delete': 0.05, 'delete-gap_state': 0.05}


In [28]:
# Run the training entry point (train.main) with the edited configuration.
main(configuration=configuration)

INFO: Set up logger
INFO: Done...
INFO: Load regions...


FileNotFoundError: [Errno 2] No such file or directory: '/home/a/ag568/region_0.txt'

In [29]:
# Path of the serialized model file; nothing is loaded here, the file is
# only read by the cells below.
# NOTE(review): presumably this file is written by the training run above
# -- confirm the output name and location.
hmm_file = ffile + "HMM_Model_0.json"

In [45]:
# Print a summary of the states and distributions in the stored model.
print_hmm_info(hmm_file=hmm_file)

FileNotFoundError: [Errno 2] No such file or directory: '../HMM_Model_0.json'

In [46]:
# Build the model object from the stored model file (presumably the model
# trained above -- confirm what hmm_helpers.build_hmm reads and returns).
hmm = build_hmm(hmm_file)

NameError: name 'build_hmm' is not defined

In [None]:
# Visualize the model on a large (20 x 18 inch) figure so that the drawing
# stays readable.
plt.figure( figsize=(20,18) )
hmm.plot()
plt.show()

Freeze the model so that it is not updated any further. If we do want to allow updates to the model, comment this out.

In [47]:

# Freeze the model so that it is not updated any further; remove this call
# to keep the model trainable.
hmm.freeze()


NameError: name 'hmm' is not defined

The regions are stored in the ```regions_files``` entry of the configuration. We can edit this list and add or remove files as needed.
Here we load all the regions used in fitting the HMM.

In [50]:
# Load the regions referenced by the configuration (train.load_regions).
regions = load_regions(configuration=configuration)

FileNotFoundError: [Errno 2] No such file or directory: '/home/a/ag568/region_0.txt'

In [51]:
# Report how many regions were loaded.
print("Number or regions: {0}".format(len(regions)))

NameError: name 'regions' is not defined

Construct sequences of size ```size``` from the first region. The ```window_type``` indicates which of the two samples to use. Valid options are ```WindowType.WGA```, ```WindowType.NO_WGA``` and ```WindowType.BOTH```.
```n_seqs``` indicates how many sequences to extract. If set to ```None``` the whole region is partitioned into sequences with size ```size```.  

In [None]:
# None means: do not cap the number of sequences; per the description
# above, the whole region is then partitioned.
n_seqs=None
# Cut the first loaded region into sequences of 100 items, using both
# samples (WindowType.BOTH).
sequences = regions[0].get_region_as_sequences(size=100, window_type=WindowType.BOTH, n_seqs=n_seqs)

For each of the sequences, run the Viterbi algorithm on the sequence given the model. This finds the maximum-likelihood (ML) path of hidden states given the sequence. The call returns a tuple of the log probability of the ML path and the path itself, or (-inf, None) if the sequence is impossible under the model. 
If a path is returned, it is a list of tuples of the form (sequence index, state object).

In [52]:
# Decode every sequence with the Viterbi algorithm and print, item by item,
# the hidden state assigned to it.
for seq in sequences:

    # viterbi_path[0] is the log-probability of the best path,
    # viterbi_path[1] the path itself (None if the sequence is impossible).
    # The loop variable is `seq`; no name `sequence` is defined anywhere.
    viterbi_path = hmm.viterbi(seq)

    print("Log-probability of ML Viterbi path: ", viterbi_path[0])

    if viterbi_path[1] is not None:
        print("Viterbi path length: ", len(viterbi_path[1]))

        # NOTE(review): the path may contain more entries than the sequence
        # (e.g. a leading start state); compare the printed path length with
        # len(seq) and confirm the index alignment used below.
        for item in range(len(seq)):
            print("sequnce item: {0} state {1}".format(seq[item], viterbi_path[1][item][1].name))

NameError: name 'sequences' is not defined

Calculate the probability of a given sequence coming from the model we trained. This uses the forward algorithm internally

In [53]:
# Log-probability of the first sequence under the model. The printed label
# says P(D|M), but the value is the logarithm of that probability.
p_d_given_m = hmm.log_probability(sequence=sequences[0])
print("P(D|M): ", p_d_given_m)

NameError: name 'hmm' is not defined

The raw normalized probability matrices can be obtained using

In [None]:
# Probability matrix for the first sequence; as the last expression of the
# cell, the result is displayed as the cell output.
hmm.predict_proba(sequences[0])