# Sliding windows
This notebook extracts a sliding window from the various features for each input mutation.

In [7]:
import pandas as pd
import numpy as np

# Text file with the dataset identifiers, read line by line further down.
input_list_file = '../processing/input_list.txt'

# Training-set arrays that get_sliding_window() walks in lockstep:
# element 0 of each position_vec entry is used as a row index, and element 0
# of each dataset_vec entry as the key of the matching feature array.
position_vec = np.load('../processing/training_set/position_vec.npy')
dataset_vec = np.load('../processing/training_set/dataset_vec.npy')

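Execute this cell to use the Swiss-Prot-derived profiles. Run either this cell or the nr cell below, not both: they assign the same variables, so whichever runs last takes effect.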

In [58]:
# Configuration for the swissprot database.
# This cell assigns the same names as the nr cell, so whichever runs last wins.

# --- inputs ---
in_profiles_path = '../processing/psiblast_processing/swissprot_db/profiles/'
in_pssm_path = '../processing/psiblast_processing/swissprot_db/pssm_array/'
# Some rows of the profiles for this db are 0, so the shannon values had to be
# calculated with a small pseudocount; that is the variant read here.
in_shannon_path = '../processing/profiles_processing/swissprot_db/shannon_10e-10_pseudocount/'

# --- outputs ---
out_path = '../processing/training_set/swissprot_db/'
out_pssm_name = 'pssm_sliding_windows.npz'
out_shannon_name = 'shannon_10e-10_pseudocount_sliding_windows.npz'

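Execute this cell to use the nr-derived profiles. Run either this cell or the Swiss-Prot cell above, not both: they assign the same variables, so whichever runs last takes effect.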

In [2]:
# Configuration for the nr database.
# This cell assigns the same names as the swissprot cell, so whichever runs
# last wins.

# Input directories holding the per-dataset feature arrays.
in_profiles_path = '../processing/psiblast_processing/nr_db/profiles/'
in_shannon_path = '../processing/profiles_processing/nr_db/shannon_10e-10_pseudocount/'
in_pssm_path = '../processing/psiblast_processing/nr_db/pssm_array/'

# Output directory and file names for the extracted sliding windows.
out_path = '../processing/training_set/nr_db/'
out_shannon_name = 'shannon_10e-10_pseudocount_sliding_windows.npz'
out_pssm_name = 'pssm_sliding_windows.npz'

I first create a series of dictionaries for the features that I want to process.

In [3]:
# Read the dataset identifiers, one per line. Blank lines (for example an
# extra newline at the end of the file) are skipped, so they cannot become an
# empty identifier that makes the loads below look for '<dir>.profile.npy'.
with open(input_list_file) as handle:
    dataset_list = [line.rstrip() for line in handle if line.strip()]

# Map every dataset identifier to its feature array. get_sliding_window()
# slices these arrays along axis 0 using the positions in position_vec.
profile_vec_dict = {
    dataset: np.load(in_profiles_path + dataset + '.profile.npy')
    for dataset in dataset_list
}
shannon_vec_dict = {
    dataset: np.load(in_shannon_path + dataset + '.shannon.npy')
    for dataset in dataset_list
}

In [4]:
def get_sliding_window(feature_vec_dict, window_size = 1, position_vec=position_vec, dataset_vec=dataset_vec):
    """Extract a zero-padded window of feature rows centred on each position.

    For every (position, dataset) pair, the feature array of that dataset is
    sliced along axis 0 so that the row at the given position sits in the
    middle of a window of ``window_size`` rows. Where the window runs past
    the start or the end of the array, the missing rows are filled with zeros.

    Parameters
    ----------
    feature_vec_dict : dict
        Maps a dataset key to a NumPy array whose axis 0 is indexed by
        position. Arrays of any dimensionality >= 1 are accepted.
    window_size : int
        Number of rows per window. Must be a positive odd number so that the
        window has a single central row.
    position_vec, dataset_vec : sequences
        Walked in lockstep; element 0 of each ``position_vec`` entry is the
        0-based row index and element 0 of each ``dataset_vec`` entry is the
        key into ``feature_vec_dict``. The defaults are the notebook-level
        arrays as they were when this cell was executed (default values are
        bound at definition time), so re-run this cell after reloading them.

    Returns
    -------
    numpy.ndarray
        The windows stacked along a new leading axis, in input order.
        Presumably all feature arrays share their trailing dimensions, giving
        shape (n_entries, window_size, ...) -- TODO confirm.

    Raises
    ------
    AssertionError
        If ``window_size`` is not a positive odd number, or if an internal
        consistency check on the extracted window fails.
    IndexError
        If a position lies outside its feature array. Negative positions are
        rejected explicitly, because NumPy would otherwise wrap them around
        to the end of the array.
    KeyError
        If a dataset key is missing from ``feature_vec_dict``.
    """
    assert window_size >= 1 and window_size % 2 == 1, \
        'window_size must be a positive odd number'
    # Rows on each side of the centre; also the index of the central row.
    half_width = (window_size - 1) // 2
    sliding_windows_list = []
    for position, dataset in zip(position_vec, dataset_vec):
        feature_vec = feature_vec_dict[dataset[0]]
        n_rows = len(feature_vec)
        center = position[0]
        if not 0 <= center < n_rows:
            raise IndexError(
                'position {} is out of range for dataset {!r} with {} rows'.format(
                    center, dataset[0], n_rows))
        lower_bound = center - half_width
        upper_bound = center + half_width + 1
        # Rows of the window that fall outside the array on either side.
        pad_before = max(0, -lower_bound)
        pad_after = max(0, upper_bound - n_rows)
        # Clip the slice to the valid range, then restore the full window
        # length by zero-padding along axis 0 only.
        sliding_window = feature_vec[max(lower_bound, 0):min(upper_bound, n_rows)]
        if pad_before or pad_after:
            pad_width = [(pad_before, pad_after)] + [(0, 0)] * (feature_vec.ndim - 1)
            sliding_window = np.pad(sliding_window, pad_width, 'constant', constant_values=0)
        # Sanity checks: full window length, and the requested row in the centre.
        assert sliding_window.shape == (window_size,) + feature_vec.shape[1:]
        assert np.array_equal(sliding_window[half_width], feature_vec[center])
        sliding_windows_list.append(sliding_window)
    sliding_windows_vec = np.array(sliding_windows_list)
    return sliding_windows_vec

In [5]:
# Windows of 17 rows over the profiles: the central row plus 8 rows on each side.
profile_sliding_windows = get_sliding_window(
    feature_vec_dict=profile_vec_dict,
    window_size=17,
)

# The array is passed positionally on purpose: that is what determines the key
# it is stored under inside the .npz archive.
np.savez_compressed(
    out_path + 'profiles_sliding_windows.npz',
    profile_sliding_windows,
)

In [6]:
# Windows of 17 rows over the shannon arrays: the central row plus 8 rows on
# each side.
shannon_sliding_windows = get_sliding_window(shannon_vec_dict, 17)
# The output file name comes from out_shannon_name, which the database
# configuration cells assign. The name `shannon_name` used here before is not
# assigned by those cells and raised NameError on a fresh kernel.
np.savez_compressed(out_path + out_shannon_name, shannon_sliding_windows)