# One-group-out baseline runs

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from collections import defaultdict
from sklearn.model_selection import LeaveOneGroupOut
import concurrent
from sklearn.base import clone
import bin.baseline_models as bm

In [2]:
# edit this value
INPUT_CLUSTER_FILE = f'../../data/csv/clustered_splits/sim80.csv'
OUTPUT_FILE = f'../../data/csv/leave_cluster_out/sim80.csv'
SIMILARITY = 80

In [3]:
# Parameters
INPUT_CLUSTER_FILE = (
    "/SFS/user/wp/benor/test/proto-moto/data/csv/clustered_splits/sim100.csv"
)
OUTPUT_FILE = "/SFS/user/wp/benor/test/proto-moto/data/csv/leave_cluster_out/sim100.csv"
SIMILARITY = 100


In [4]:
DISTANCE = 100 - SIMILARITY
EXPERIMENT_NAME = f'sim{SIMILARITY}'

In [5]:
clusters_df = pd.read_csv(INPUT_CLUSTER_FILE, index_col=0)
clusters_df.head()

Unnamed: 0,sequence_id,cluster
0,12E8:L,4236
1,15C8:L,4108
2,1A0Q:L,2618
3,1A14:L,2791
4,1A2Y:L,3657


In [6]:
models = [
    ('knn_for_position', lambda: bm.KNNWholeSequence(n_neighbors=3)),
    ('average_for_position', lambda: bm.AverageForResidueAtPosition()),
    ('mean_for_sameres_position', lambda: bm.StatisticForSameResidueAtPosition(statistic='mean')),
    ('median_for_sameres_position', lambda: bm.StatisticForSameResidueAtPosition(statistic='median')),  
]
X_light = pd.read_csv('../../data/csv/fasta_aho_L.csv')
Y_light = pd.read_csv('../../data/csv/sasa_aligned_L.csv').rename(columns={'Unnamed: 0':'Id'})  

In [7]:
# remove those sequences that do not have any cluster assigned
keys_to_remove = X_light[~X_light['Id'].isin(clusters_df['sequence_id'])].index.to_list()
X_light.drop(index=keys_to_remove, inplace=True)
Y_light.drop(index=keys_to_remove, inplace=True)
print(f'keys removed {keys_to_remove}')

keys removed [4236]


In [8]:
X_light.drop(columns='Id', inplace=True, errors='ignore')
Y_light.drop(columns='Id', inplace=True, errors='ignore')

In [9]:
X_light.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,140,141,142,143,144,145,146,147,148,149
0,D,I,V,M,T,Q,S,Q,K,F,...,G,A,G,T,K,L,E,L,K,R
1,D,I,V,L,T,Q,S,P,A,I,...,G,G,G,T,K,L,E,I,K,R
2,-,I,E,L,T,Q,S,P,S,S,...,G,G,G,T,K,L,E,I,K,R
3,D,I,E,L,T,Q,T,T,S,S,...,G,G,G,T,-,-,-,-,-,-
4,D,I,V,L,T,Q,S,P,A,S,...,G,G,G,T,K,L,E,I,K,-


In [10]:
Y_light.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,140,141,142,143,144,145,146,147,148,149
0,82.1,6.0,58.1,4.6,46.3,4.0,70.0,51.8,56.7,41.9,...,1.5,43.9,14.6,1.0,39.2,3.4,9.0,26.7,55.3,29.9
1,73.7,2.1,56.3,7.6,54.9,9.3,50.0,38.1,64.1,38.6,...,3.5,82.4,6.1,0.0,44.5,2.4,2.9,15.8,47.7,26.2
2,,27.3,64.2,10.8,64.3,12.8,49.7,33.6,63.7,60.7,...,3.9,58.6,2.2,0.9,50.2,6.3,19.2,0.0,41.4,28.1
3,88.6,2.2,65.7,5.5,44.0,7.8,43.5,49.8,75.7,64.3,...,4.6,80.7,8.5,1.5,,,,,,
4,74.1,22.6,58.7,8.4,67.2,11.9,47.3,40.3,76.3,52.7,...,3.1,83.5,16.2,3.8,45.1,7.1,37.3,61.3,77.0,


In [11]:
cluster_series = clusters_df['cluster'].copy()
cluster_series.index = clusters_df['sequence_id']
cluster_series

sequence_id
12E8:L    4236
15C8:L    4108
1A0Q:L    2618
1A14:L    2791
1A2Y:L    3657
          ... 
7RNJ:L       6
7RSN:L       2
7RSO:L       7
7RTH:L       3
7RW2:L       1
Name: cluster, Length: 4236, dtype: int64

In [12]:
def avg_deviations(actual, predictions):
    actual = actual.fillna(0)
    predictions = predictions.fillna(0) 
    minus = actual.subtract(predictions)
    total = actual.count(axis=1)
    deviations = minus.abs().sum(axis=1) / total
    return deviations

In [13]:
X_light.shape, Y_light.shape, cluster_series.shape

((4236, 155), (4236, 155), (4236,))

In [14]:
def predict(x_train, y_train, x_valid, y_valid, model):
    model = model()
    #print('fitting...')
    model.fit(x_train, y_train)
    #print('predicting...')
    predicts = model.predict(x_valid)
    #print('validating...')
    return avg_deviations(y_valid, predicts).mean()

results = []
splitter = LeaveOneGroupOut()
split = splitter.split(X_light, Y_light, groups=cluster_series)

with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    print('submitting...')
    futures = []
    for train_is, valid_is in split:
        x_train, y_train = X_light.iloc[train_is, :], Y_light.iloc[train_is, :]
        x_valid, y_valid = X_light.iloc[valid_is, :], Y_light.iloc[valid_is, :]

        for model_name, model in models:
            future = executor.submit(
                predict, 
                x_train=x_train,
                y_train=y_train,
                x_valid=x_valid,
                y_valid=y_valid,
                model=model
            )
            futures.append(future)
            
        if len(futures) > 10:
            # FIXME
            # FIXME
            # FIXME
            # FIXME
            # FIXME
            # FIXME
            # FIXME
            # FIXME
            # FIXME
            break
            
    print('waiting for results...')     
    scores = []
    for future in concurrent.futures.as_completed(futures):
        scores.append(future.result())
    error = np.mean(scores)

    for score in scores:
        results.append((DISTANCE, error, score, model_name))

submitting...


waiting for results...


In [15]:
res_dict = defaultdict(list)
clusters_max = cluster_series.max()
for result in results:
    res_dict['number_of_clusters'].append(clusters_max)
    res_dict['distance'].append(result[0] / 100)
    res_dict['similarity'].append(1-(result[0] / 100))
    res_dict['model'].append(result[3])
    res_dict['mean_error'].append(result[1])
    res_dict['errors'].append(result[2])
df = pd.DataFrame(res_dict)
df

Unnamed: 0,number_of_clusters,distance,similarity,model,mean_error,errors
0,4236,0.0,1.0,median_for_sameres_position,5.284958,5.083074
1,4236,0.0,1.0,median_for_sameres_position,5.284958,5.333874
2,4236,0.0,1.0,median_for_sameres_position,5.284958,3.634194
3,4236,0.0,1.0,median_for_sameres_position,5.284958,5.660215
4,4236,0.0,1.0,median_for_sameres_position,5.284958,5.252903
5,4236,0.0,1.0,median_for_sameres_position,5.284958,6.906496
6,4236,0.0,1.0,median_for_sameres_position,5.284958,4.563177
7,4236,0.0,1.0,median_for_sameres_position,5.284958,6.366571
8,4236,0.0,1.0,median_for_sameres_position,5.284958,4.729633
9,4236,0.0,1.0,median_for_sameres_position,5.284958,4.906129


In [16]:
df.to_csv(OUTPUT_FILE)

In [17]:
# viz_df = dict(model=[], error=[])
# for i, r in df.iterrows():
#     for error in r['errors']:
#         viz_df['model'].append(r['model'])
#         viz_df['error'].append(error)
# viz_df = pd.DataFrame(viz_df)
# sns.set(rc={'figure.figsize': (8, 4)})
# sns.violinplot(data=viz_df, x='model', y='error')