In [None]:
import pandas as pd, numpy as np
from collections import Counter
import math, json, gc, random, os, sys
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [None]:
# Load the competition files: train/test are read as line-delimited JSON,
# the sample submission as CSV.
train, test = (
    pd.read_json(json_path, lines=True)
    for json_path in ('/kaggle/input/stanford-covid-vaccine/train.json',
                      '/kaggle/input/stanford-covid-vaccine/test.json')
)
sample_sub = pd.read_csv("/kaggle/input/stanford-covid-vaccine/sample_submission.csv")


In [None]:
# The five target columns explored throughout this notebook.
target_cols = [
    'reactivity',
    'deg_Mg_pH10',
    'deg_pH10',
    'deg_Mg_50C',
    'deg_50C',
]


The data is flattened following the approach in [artgor's notebook](https://www.kaggle.com/artgor/openvaccine-eda-feature-engineering-and-modelling/#data).

In [None]:
# Flatten `train` into one row per (molecule, scored position).

# Columns whose entries are indexed by position within a molecule.
per_position_cols = ['sequence', 'structure', 'predicted_loop_type',
                     'reactivity', 'reactivity_error', 'deg_Mg_pH10', 'deg_error_Mg_pH10',
                     'deg_pH10', 'deg_error_pH10', 'deg_Mg_50C', 'deg_error_Mg_50C',
                     'deg_50C', 'deg_error_50C']

train_data = []
# Walk the frame once instead of re-filtering `train` by id for every molecule
# (the per-id boolean mask made the loop quadratic in the number of rows).
# drop_duplicates keeps the first row per id, i.e. the same row the previous
# `.values[0]` lookups selected, in the same order of first appearance.
for sample in train.drop_duplicates(subset='id').to_dict('records'):
    # Only the first `seq_scored` positions of each molecule are emitted.
    for seq_order in range(sample['seq_scored']):
        train_data.append(
            (sample['id'], seq_order, sample['signal_to_noise'], sample['SN_filter'])
            + tuple(sample[col][seq_order] for col in per_position_cols)
        )

train_data = pd.DataFrame(
    train_data,
    columns=['mol_id', 'seq_order', 'signal_to_noise', 'SN_filter'] + per_position_cols,
)


In [None]:
# Preview the first rows of the flattened frame (one row per molecule position).
train_data.head()

# 1. SN_filter vs Targets

In [None]:
feature = 'SN_filter'
feature_values = [[0,1], [0], [1]]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid (one row per target, one column per SN_filter group) once and
# draw on each Axes explicitly. Mixing plt.subplots() with later plt.subplot() calls
# leaves an extra full-figure Axes behind the grid on recent matplotlib versions.
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(15, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        # Target values of the rows whose SN_filter belongs to this column's group.
        values = train_data.loc[train_data[feature].isin(feature_value), target_]
        sns.distplot(values, color=color, ax=ax)
        ax.set_title(f'{target_} distribution when {feature} in {feature_value}')

Let's plot again without the outliers (the lowest and highest 1% of each target are dropped).

In [None]:
feature = 'SN_filter'
feature_values = [[0,1], [0], [1]]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid (one row per target, one column per SN_filter group) once and
# draw on each Axes explicitly. Mixing plt.subplots() with later plt.subplot() calls
# leaves an extra full-figure Axes behind the grid on recent matplotlib versions.
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(15, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        values = train_data.loc[train_data[feature].isin(feature_value), target_]
        # Trim the extreme 1% on each side so the bulk of the distribution stays visible.
        low, high = values.quantile(0.01), values.quantile(0.99)
        values = values.loc[(values > low) & (values < high)]
        sns.distplot(values, color=color, ax=ax)
        ax.set_title(f'{target_} distribution when {feature} in {feature_value}')


When the data includes `SN_filter == 0`, every target has a very sharp spike at the value `0`.
Let's check the `value_counts`.

In [None]:
feature = 'SN_filter'
feature_values = [[0,1], [0], [1]]
# Build the whole grid once and draw on each Axes explicitly. The column count is taken
# from feature_values (it was hard-coded to 3), and no stray full-figure Axes is created
# as happened when plt.subplots() was mixed with plt.subplot().
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(15, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    value_col = f'{target_} value'
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        # Share (in percent) of the five most frequent target values in this group.
        top_values = (
            train_data.loc[train_data[feature].isin(feature_value), target_]
            .value_counts(normalize=True)
            .iloc[:5] * 100
        )
        df_show = pd.DataFrame(top_values).reset_index()
        df_show.columns = [value_col, 'value_pct']
        df_show = df_show.round(3)
        # Categorical y so each distinct value gets its own bar.
        df_show[value_col] = df_show[value_col].astype('category')
        sns.barplot(data=df_show, y=value_col, x='value_pct', ax=ax)
        ax.set_title(f'{target_} value pct when {feature} in {feature_value}')

The last plot shows that `0` is the most frequent value for every target.

However, `0` is far more dominant when `SN_filter = 0` (about **20%** of the rows) than when `SN_filter = 1` (only about **3%**).






Let's re-plot the distribution charts without the 0 values.

In [None]:
feature = 'SN_filter'
feature_values = [[0,1], [0], [1]]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid once and draw on each Axes explicitly. The column count is taken
# from feature_values (it was hard-coded to 3), and no stray full-figure Axes is created
# as happened when plt.subplots() was mixed with plt.subplot().
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(15, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        values = train_data.loc[train_data[feature].isin(feature_value), target_]
        # Trim the extreme 1% on each side; the quantiles are computed before the
        # zeros are dropped, as in the original analysis.
        low, high = values.quantile(0.01), values.quantile(0.99)
        values = values.loc[(values > low) & (values < high)]
        # Remove the exact-zero values.
        values = values.loc[values != 0]
        sns.distplot(values, color=color, ax=ax)
        ax.set_title(f'{target_} distribution when {feature} in {feature_value}')
        

Removing the 0 values from the targets really changes the picture for `SN_filter = 0`: the distributions that originally had two peaks now have only one.

Let's see how many 0 values occur in each target column.

In [None]:
# Per target column, count the rows whose value is exactly 0.
print("Number of row that have value == 0")
train_data[target_cols].eq(0).sum(axis=0)

Every target column has a different number of rows with value 0.

In [None]:
# Boolean row masks describing where the targets are exactly zero.

# True when any of the five targets is 0.
at_least_one_target_zero = train_data[target_cols].eq(0).any(axis=1)
print("total row of at least one target value == 0 is" ,
      at_least_one_target_zero.sum())

# True when all three targets named here (labelled "scored" in the message below) are 0.
all_scored_target_zero = train_data[['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']].eq(0).all(axis=1)
print("total row of scored target value == 0 is" ,
      all_scored_target_zero.sum())

# True when every one of the five targets is 0.
all_target_zero = train_data[target_cols].eq(0).all(axis=1)
print("total row of all target value == 0 is" ,
      all_target_zero.sum())


Let's look at the rows where all scored targets are zero; we might find an interesting pattern there.

In [None]:
# Show a reproducible random sample of 10 rows whose scored targets are all zero.
scored_zero_rows = train_data.loc[all_scored_target_zero]
scored_zero_rows.sample(n=10, random_state=1)

`SN_filter == 1` is rare among the 10 random samples, even though `SN_filter == 1` is dominant in the whole training dataset.

The ratio of `SN_filter == 1` to `SN_filter == 0` is about 2:1.

This might be explained by the previous bar plot.

In [None]:
# Share of each SN_filter value over all flattened rows.
train_data['SN_filter'].value_counts(normalize=True)

In [None]:
# Share of each SN_filter value among the rows whose scored targets are all zero.
sn_filter_when_zero = train_data.loc[all_scored_target_zero, 'SN_filter']
sn_filter_when_zero.value_counts(normalize=True)

Only a small share of these rows have `SN_filter == 1` (`4.2%`), while in the whole data it is `66.2%`.

Should we investigate the rows with target value == 0 further?

To close the `SN_filter vs targets` investigation: the first plot shows that all target distributions for `SN_filter = 1` are **positively skewed**, and `target value = 0` hardly occurs when `SN_filter = 1`.

# 2. sequence vs targets

In [None]:
feature = 'sequence'
feature_values = [[val] for val in sorted(train_data[feature].unique())]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid (one row per target, one column per sequence value) once and
# draw on each Axes explicitly. Mixing plt.subplots() with later plt.subplot() calls
# leaves an extra full-figure Axes behind the grid on recent matplotlib versions.
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(22, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        values = train_data.loc[train_data[feature].isin(feature_value), target_]
        # Trim the extreme 1% on each side so the bulk of the distribution stays visible.
        low, high = values.quantile(0.01), values.quantile(0.99)
        values = values.loc[(values > low) & (values < high)]
        sns.distplot(values, color=color, ax=ax)
        ax.set_title(f'{target_} distribution when {feature} in {feature_value}')

Let's see whether restricting to `SN_filter = 1` gives more interesting insight.

In [None]:
feature = 'sequence'
feature_values = [[val] for val in sorted(train_data[feature].unique())]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid (one row per target, one column per sequence value) once and
# draw on each Axes explicitly. Mixing plt.subplots() with later plt.subplot() calls
# leaves an extra full-figure Axes behind the grid on recent matplotlib versions.
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(22, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        # Rows with this sequence value that also have SN_filter == 1.
        keep = train_data[feature].isin(feature_value) & (train_data['SN_filter'] == 1)
        values = train_data.loc[keep, target_]
        # Trim the extreme 1% on each side so the bulk of the distribution stays visible.
        low, high = values.quantile(0.01), values.quantile(0.99)
        values = values.loc[(values > low) & (values < high)]
        sns.distplot(values, color=color, ax=ax)
        ax.set_title(f'{target_} distribution when {feature} in {feature_value}')

Well, judging by the shape of the distribution, `sequence = A` looks different from the rest.

But judging by the scale, `A` and `G` have larger values than `C` and `U`.

This may be more visible if we apply a **log transformation** before plotting.


In [None]:
feature = 'sequence'
feature_values = [[val] for val in sorted(train_data[feature].unique())]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid (one row per target, one column per sequence value) once and
# draw on each Axes explicitly. Mixing plt.subplots() with later plt.subplot() calls
# leaves an extra full-figure Axes behind the grid on recent matplotlib versions.
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(22, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        # Rows with this sequence value that also have SN_filter == 1.
        keep = train_data[feature].isin(feature_value) & (train_data['SN_filter'] == 1)
        values = train_data.loc[keep, target_]
        # Trim the extreme 1% on each side so the bulk of the distribution stays visible.
        low, high = values.quantile(0.01), values.quantile(0.99)
        values = values.loc[(values > low) & (values < high)]
        # Plain np.log is used instead of np.log1p (adding 1 was judged too large a shift),
        # so non-positive values have to be dropped first.
        values = values.loc[values > 0]
        sns.distplot(np.log(values), color=color, ax=ax)
        ax.set_title(f'{target_} distribution when {feature} in {feature_value}')

Interesting, right?

Just a wild thought: what is the impact if we only keep the positive values and work with their log?

We are going to lose 6.1% of the training rows.

In [None]:
# Percentage of SN_filter == 1 rows for each True/False pattern of "target is negative".
# NOTE(review): the comparison is strict (< 0), so zero-valued targets are not counted
# here -- confirm that is intended.
sn1_targets = train_data.loc[train_data['SN_filter'] == 1, target_cols]
negative_patterns = (sn1_targets < 0).value_counts(normalize=True)
pd.DataFrame(negative_patterns).head() * 100

# 3. Structure vs Targets

In [None]:
feature = 'structure'
feature_values = [[val] for val in sorted(train_data[feature].unique())]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid (one row per target, one column per structure value) once and
# draw on each Axes explicitly. Mixing plt.subplots() with later plt.subplot() calls
# leaves an extra full-figure Axes behind the grid on recent matplotlib versions.
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(22, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        # Rows with this structure value that also have SN_filter == 1.
        keep = train_data[feature].isin(feature_value) & (train_data['SN_filter'] == 1)
        values = train_data.loc[keep, target_]
        # Trim the extreme 1% on each side so the bulk of the distribution stays visible.
        low, high = values.quantile(0.01), values.quantile(0.99)
        values = values.loc[(values > low) & (values < high)]
        sns.distplot(values, color=color, ax=ax)
        ax.set_title(f'{target_} distribution when {feature} in {feature_value}')

Judging by the shape, `structure` strongly affects `reactivity`, `deg_Mg_50C` and `deg_50C`, while `deg_Mg_pH10` and `deg_pH10` look relatively similar across structures.

But look at the scale for `structure = '.'`: it is **bigger for all targets**.

Hence it is significant for all targets.

# 4. Predicted Loop Type vs Targets

In [None]:
feature = 'predicted_loop_type'
feature_values = [[val] for val in sorted(train_data[feature].unique())]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid (one row per target, one column per loop type) once and
# draw on each Axes explicitly. Mixing plt.subplots() with later plt.subplot() calls
# leaves an extra full-figure Axes behind the grid on recent matplotlib versions.
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(37, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        # Rows with this loop type that also have SN_filter == 1.
        keep = train_data[feature].isin(feature_value) & (train_data['SN_filter'] == 1)
        values = train_data.loc[keep, target_]
        # Trim the extreme 1% on each side so the bulk of the distribution stays visible.
        low, high = values.quantile(0.01), values.quantile(0.99)
        values = values.loc[(values > low) & (values < high)]
        sns.distplot(values, color=color, ax=ax)
        ax.set_title(f'{target_} | {feature} in {feature_value}')

# 5. Sequence Lags vs Targets

In [None]:
# Add, for each row, the base found `lag` positions earlier in the same molecule.
total_lag = 3
for lag in range(1, total_lag + 1):
    # Select the column as a Series (not a one-column DataFrame) so the value assigned
    # below is one-dimensional, like the column it fills.
    lagged = train_data.groupby('mol_id')['sequence'].shift(lag)
    # Rows with no value `lag` steps back inside their molecule get the placeholder 'Z'.
    train_data[f'sequence_lag_{lag}'] = lagged.fillna('Z')  # Z for null

## lag 1

In [None]:
feature = 'sequence_lag_1'
feature_values = [[val] for val in sorted(train_data[feature].unique())]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid (one row per target, one column per lagged value) once and
# draw on each Axes explicitly. Mixing plt.subplots() with later plt.subplot() calls
# leaves an extra full-figure Axes behind the grid on recent matplotlib versions.
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(25, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        # Rows with this lagged value that also have SN_filter == 1.
        keep = train_data[feature].isin(feature_value) & (train_data['SN_filter'] == 1)
        values = train_data.loc[keep, target_]
        # Trim the extreme 1% on each side so the bulk of the distribution stays visible.
        low, high = values.quantile(0.01), values.quantile(0.99)
        values = values.loc[(values > low) & (values < high)]
        sns.distplot(values, color=color, ax=ax)
        ax.set_title(f'{target_} | {feature} in {feature_value}')

`G` tends to have a longer right tail.

## lag 2

In [None]:
feature = 'sequence_lag_2'
feature_values = [[val] for val in sorted(train_data[feature].unique())]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid (one row per target, one column per lagged value) once and
# draw on each Axes explicitly. Mixing plt.subplots() with later plt.subplot() calls
# leaves an extra full-figure Axes behind the grid on recent matplotlib versions.
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(25, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        # Rows with this lagged value that also have SN_filter == 1.
        keep = train_data[feature].isin(feature_value) & (train_data['SN_filter'] == 1)
        values = train_data.loc[keep, target_]
        # Trim the extreme 1% on each side so the bulk of the distribution stays visible.
        low, high = values.quantile(0.01), values.quantile(0.99)
        values = values.loc[(values > low) & (values < high)]
        sns.distplot(values, color=color, ax=ax)
        ax.set_title(f'{target_} | {feature} in {feature_value}')

At lag 2 the distributions already look blended.

## lag 3

In [None]:
feature = 'sequence_lag_3'
feature_values = [[val] for val in sorted(train_data[feature].unique())]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid (one row per target, one column per lagged value) once and
# draw on each Axes explicitly. Mixing plt.subplots() with later plt.subplot() calls
# leaves an extra full-figure Axes behind the grid on recent matplotlib versions.
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(25, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        # Rows with this lagged value that also have SN_filter == 1.
        keep = train_data[feature].isin(feature_value) & (train_data['SN_filter'] == 1)
        values = train_data.loc[keep, target_]
        # Trim the extreme 1% on each side so the bulk of the distribution stays visible.
        low, high = values.quantile(0.01), values.quantile(0.99)
        values = values.loc[(values > low) & (values < high)]
        sns.distplot(values, color=color, ax=ax)
        ax.set_title(f'{target_} | {feature} in {feature_value}')

Yup, like lag 2, the distributions are already blended; no difference is visible at a glance.

# 6. Sequence Lead vs Target

In [None]:
# Add, for each row, the base found `lead` positions later in the same molecule.
total_lead = 3
for lead in range(1, total_lead + 1):
    # Select the column as a Series (not a one-column DataFrame) so the value assigned
    # below is one-dimensional, like the column it fills.
    led = train_data.groupby('mol_id')['sequence'].shift(-1 * lead)
    # Rows with no value `lead` steps ahead inside their molecule get the placeholder 'Z'.
    train_data[f'sequence_lead_{lead}'] = led.fillna('Z')  # Z for null

## lead 1

In [None]:
feature = 'sequence_lead_1'
feature_values = [[val] for val in sorted(train_data[feature].unique())]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid (one row per target, one column per lead value) once and
# draw on each Axes explicitly. Mixing plt.subplots() with later plt.subplot() calls
# leaves an extra full-figure Axes behind the grid on recent matplotlib versions.
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(25, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        # Rows with this lead value that also have SN_filter == 1.
        keep = train_data[feature].isin(feature_value) & (train_data['SN_filter'] == 1)
        values = train_data.loc[keep, target_]
        # Trim the extreme 1% on each side so the bulk of the distribution stays visible.
        low, high = values.quantile(0.01), values.quantile(0.99)
        values = values.loc[(values > low) & (values < high)]
        sns.distplot(values, color=color, ax=ax)
        ax.set_title(f'{target_} | {feature} in {feature_value}')

`Lead` is more interesting than `lag`.

Judging by the scale, for `sequence_lead_1` A is similar to U, and C is similar to G,

while for `sequence` A is similar to G, and C is similar to U.

## lead 2

In [None]:
feature = 'sequence_lead_2'
feature_values = [[val] for val in sorted(train_data[feature].unique())]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid (one row per target, one column per lead value) once and
# draw on each Axes explicitly. Mixing plt.subplots() with later plt.subplot() calls
# leaves an extra full-figure Axes behind the grid on recent matplotlib versions.
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(25, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        # Rows with this lead value that also have SN_filter == 1.
        keep = train_data[feature].isin(feature_value) & (train_data['SN_filter'] == 1)
        values = train_data.loc[keep, target_]
        # Trim the extreme 1% on each side so the bulk of the distribution stays visible.
        low, high = values.quantile(0.01), values.quantile(0.99)
        values = values.loc[(values > low) & (values < high)]
        sns.distplot(values, color=color, ax=ax)
        ax.set_title(f'{target_} | {feature} in {feature_value}')

On lead 2, `A` is very different from the others: it has a long right tail, just like `G` in lag 1.

## lead 3

In [None]:
feature = 'sequence_lead_3'
feature_values = [[val] for val in sorted(train_data[feature].unique())]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid (one row per target, one column per lead value) once and
# draw on each Axes explicitly. Mixing plt.subplots() with later plt.subplot() calls
# leaves an extra full-figure Axes behind the grid on recent matplotlib versions.
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(25, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        # Rows with this lead value that also have SN_filter == 1.
        keep = train_data[feature].isin(feature_value) & (train_data['SN_filter'] == 1)
        values = train_data.loc[keep, target_]
        # Trim the extreme 1% on each side so the bulk of the distribution stays visible.
        low, high = values.quantile(0.01), values.quantile(0.99)
        values = values.loc[(values > low) & (values < high)]
        sns.distplot(values, color=color, ax=ax)
        ax.set_title(f'{target_} | {feature} in {feature_value}')

On lead 3, `A` is still very different from the others: it has a long right tail, just like `G` in lag 1.

## See the lag and lead windows value_counts

In [None]:
# The 10 most frequent 7-base windows (3 lags, the base itself, 3 leads).
window_cols = ['sequence_lag_3', 'sequence_lag_2', 'sequence_lag_1', 'sequence',
               'sequence_lead_1', 'sequence_lead_2', 'sequence_lead_3']
top_windows = train_data[window_cols].value_counts().head(10)
pd.DataFrame(top_windows)

Within the top 3 there is a window of **7 consecutive `U`**.

Overall, `A` and `G` appear to be closely related.

# 7. sequence x structure vs target

In [None]:
# Row count for every (sequence, structure) combination.
pair_counts = train_data[['sequence', 'structure']].value_counts()
pd.DataFrame(pair_counts)

## Structure of Seq A

In [None]:
sequence_ = 'A'
feature = 'structure'
feature_values = [[val] for val in sorted(train_data[feature].unique())]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid (one row per target, one column per structure value) once and
# draw on each Axes explicitly. Mixing plt.subplots() with later plt.subplot() calls
# leaves an extra full-figure Axes behind the grid on recent matplotlib versions.
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(22, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        # Rows with this structure value, the chosen base, and SN_filter == 1.
        keep = (
            train_data[feature].isin(feature_value)
            & (train_data['sequence'] == sequence_)
            & (train_data['SN_filter'] == 1)
        )
        values = train_data.loc[keep, target_]
        # Trim the extreme 1% on each side so the bulk of the distribution stays visible.
        low, high = values.quantile(0.01), values.quantile(0.99)
        values = values.loc[(values > low) & (values < high)]
        sns.distplot(values, color=color, ax=ax)
        ax.set_title(f'{target_} distribution when {feature} {sequence_} in {feature_value}')

## Structure of Seq C

In [None]:
sequence_ = 'C'
feature = 'structure'
feature_values = [[val] for val in sorted(train_data[feature].unique())]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid (one row per target, one column per structure value) once and
# draw on each Axes explicitly. Mixing plt.subplots() with later plt.subplot() calls
# leaves an extra full-figure Axes behind the grid on recent matplotlib versions.
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(22, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        # Rows with this structure value, the chosen base, and SN_filter == 1.
        keep = (
            train_data[feature].isin(feature_value)
            & (train_data['sequence'] == sequence_)
            & (train_data['SN_filter'] == 1)
        )
        values = train_data.loc[keep, target_]
        # Trim the extreme 1% on each side so the bulk of the distribution stays visible.
        low, high = values.quantile(0.01), values.quantile(0.99)
        values = values.loc[(values > low) & (values < high)]
        sns.distplot(values, color=color, ax=ax)
        ax.set_title(f'{target_} distribution when {feature} {sequence_} in {feature_value}')

## Structure of Seq G

In [None]:
sequence_ = 'G'
feature = 'structure'
feature_values = [[val] for val in sorted(train_data[feature].unique())]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid (one row per target, one column per structure value) once and
# draw on each Axes explicitly. Mixing plt.subplots() with later plt.subplot() calls
# leaves an extra full-figure Axes behind the grid on recent matplotlib versions.
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(22, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        # Rows with this structure value, the chosen base, and SN_filter == 1.
        keep = (
            train_data[feature].isin(feature_value)
            & (train_data['sequence'] == sequence_)
            & (train_data['SN_filter'] == 1)
        )
        values = train_data.loc[keep, target_]
        # Trim the extreme 1% on each side so the bulk of the distribution stays visible.
        low, high = values.quantile(0.01), values.quantile(0.99)
        values = values.loc[(values > low) & (values < high)]
        sns.distplot(values, color=color, ax=ax)
        ax.set_title(f'{target_} distribution when {feature} {sequence_} in {feature_value}')

## Structure of Seq U

In [None]:
sequence_ = 'U'
feature = 'structure'
feature_values = [[val] for val in sorted(train_data[feature].unique())]
colors = ['b', 'g', 'r', 'c' , 'm']
# Build the whole grid (one row per target, one column per structure value) once and
# draw on each Axes explicitly. Mixing plt.subplots() with later plt.subplot() calls
# leaves an extra full-figure Axes behind the grid on recent matplotlib versions.
fig, axes = plt.subplots(len(target_cols), len(feature_values), figsize=(22, 27), squeeze=False)
for target_idx, target_ in enumerate(target_cols):
    color = colors[target_idx]
    for feature_idx, feature_value in enumerate(feature_values):
        ax = axes[target_idx, feature_idx]
        # Rows with this structure value, the chosen base, and SN_filter == 1.
        keep = (
            train_data[feature].isin(feature_value)
            & (train_data['sequence'] == sequence_)
            & (train_data['SN_filter'] == 1)
        )
        values = train_data.loc[keep, target_]
        # Trim the extreme 1% on each side so the bulk of the distribution stays visible.
        low, high = values.quantile(0.01), values.quantile(0.99)
        values = values.loc[(values > low) & (values < high)]
        sns.distplot(values, color=color, ax=ax)
        ax.set_title(f'{target_} distribution when {feature} {sequence_} in {feature_value}')

# TO BE CONTINUED

There are a lot more features we can play with:
* `seq_order`

or a mix of them:
* `structure__predicted_loop_type`
* `sequence__structure__predicted_loop_type`

or building n-gram-like features:
* `sequence_2_before` -> ((G,A), (A,A))
* `sequence_2_after`
* `structure_2_before`

and many more.

Please upvote if you like it or find it useful :D
# CHEERS!!!