# CatBoost

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import catboost as cb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

## Args

In [2]:
# Tab-separated input table read by the loading cell below.
INPUT_DATA = 'test.csv'
# True: fit a new model and save it; False: load the saved model and rank INPUT_DATA.
IS_TRAIN = False
# File the CatBoost model is saved to (training) or loaded from (inference).
MODEL_SAVE_PATH = 'model.cb'
# CSV written by the final cell when IS_TRAIN is False.
RESULT_SAVE_PATH = 'result.csv'

## Examine data

In [None]:
%%capture
# Load the tab-separated table, skipping malformed rows instead of aborting.
# `on_bad_lines='skip'` (pandas >= 1.3) replaces `error_bad_lines=False`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv(INPUT_DATA, sep='\t', on_bad_lines='skip', header=0)

In [None]:
# (rows, columns) of the raw table.
df.shape

In [None]:
# Preview the first rows of the raw table.
df.head()

## Preprocessing

### Filters

In [None]:
# Columns retained by `pre_robust`: the fixed annotation fields below, then every
# input column whose name contains the lowercase substring 'score', then
# 'MIM_disease'. 'rsId' and 'rsClinicalSignificance' are currently left out.
RELEVANT_FEATURES = [
    'chrom', 'inputPos', 'inputRef', 'inputAlt',
    'transcript', 'codingEffect', 'varLocation', 'alt_pNomen',
    'wtSSFScore', 'wtMaxEntScore', 'varSSFScore', 'varMaxEntScore',
    'rsMAF', '1000g_AF', 'gnomadAltFreq_all', 'espAllMAF', 'espAllAAF',
    'clinVarMethods', 'clinVarClinSignifs',
    'nOrthos', 'conservedOrthos',
    *(name for name in df.columns if 'score' in name),
    'MIM_disease',
]


# Row filters. Each callable takes a DataFrame and returns a boolean mask that
# is True for the rows to keep.
RULES = [
    lambda df: df['codingEffect'].ne('synonymous'),
    lambda df: df['varLocation'].ne('intron'),
    # Negating `> 0.01` (rather than testing `<= 0.01`) keeps rows whose
    # frequency is missing, because NaN > 0.01 evaluates to False.
    lambda df: ~df['1000g_AF'].gt(0.01),
    lambda df: ~df['gnomadAltFreq_all'].gt(0.01),
]


def rules_filter(df):
    """Apply every mask in RULES in turn and return the rows that survive all of them.

    Each rule is evaluated on the frame left over by the previous rules.
    """
    remaining = df
    for keep_rows in RULES:
        mask = keep_rows(remaining)
        remaining = remaining[mask]
    return remaining

In [None]:
def pre_robust(df):
    """Keep only the RELEVANT_FEATURES columns, then drop rows rejected by `rules_filter`."""
    selected = df[RELEVANT_FEATURES]
    return rules_filter(selected)

### `inputPos` as regression parameter

In [None]:
def pre_pos(df):
    """Return a copy of `df` whose `inputPos` column is cast to float.

    The input frame is left unchanged.
    """
    return df.assign(inputPos=df.inputPos.astype(float))

### `clinVarMethods` split

In [None]:
# Maps each normalised `clinVarMethods` name to its column position in the
# count matrix built by `pre_cvm`. 'nan' is the placeholder used for rows with
# no method recorded.
CVM_METHODS = {
    method: position
    for position, method in enumerate((
        'case-control',
        'clinical_testing',
        'curation',
        'in_vitro',
        'in_vivo',
        'literature_only',
        'not_provided',
        'phenotyping_only',
        'provider_interpretation',
        'reference_population',
        'research',
        'nan',
    ))
}


def fetch_names(row, delimeters='|,'):
    """Split `row` on every delimiter character and normalise each piece.

    Args:
        row: String to split.
        delimeters: Iterable of separators; `row` is split on each one in turn.

    Returns:
        A list with one entry per piece, in order of appearance. Each piece is
        stripped, lower-cased, and has its internal whitespace runs replaced by
        a single underscore. Empty pieces are kept as ''.
    """
    def unify(name):
        return '_'.join(name.strip().lower().split())

    names = [row]
    for delimeter in delimeters:
        names = [part for name in names for part in name.split(delimeter)]
    return [unify(name) for name in names]


def collect_names(column, delimeters='|,'):
    """Return the set of distinct normalised names found across `column`.

    Args:
        column: Iterable of strings, each handled by `fetch_names`.
        delimeters: Separators forwarded to `fetch_names`, so a custom value
            is honoured rather than silently replaced by the default.

    Returns:
        A set of normalised names; empty when `column` is empty.
    """
    all_names = set()
    for row in column:
        all_names.update(fetch_names(row, delimeters))
    return all_names

In [None]:
def pre_cvm(df):
    """Replace `clinVarMethods` with one count column per known method.

    Missing values are treated as the method name 'nan'. Each value is split
    with `fetch_names`, and every resulting name increments the matching
    `cvm_<method>` column for that row.

    Args:
        df: Frame containing a `clinVarMethods` column. It is not modified.

    Returns:
        A new frame without `clinVarMethods` and with the float `cvm_*`
        columns appended, ordered by their index in CVM_METHODS.

    Raises:
        KeyError: If a row contains a method name that is not in CVM_METHODS.
    """
    df = df.copy()

    methods = df['clinVarMethods'].fillna('nan')
    # Preallocate the whole matrix so an empty frame yields a (0, n_methods)
    # block instead of failing when stacking an empty list of rows.
    counts = np.zeros((len(df), len(CVM_METHODS)))
    for position, row in enumerate(methods):
        for name in fetch_names(row):
            counts[position, CVM_METHODS[name]] += 1

    cvm_columns = ['cvm_' + method for method in sorted(CVM_METHODS, key=CVM_METHODS.get)]
    cvm_features = pd.DataFrame(counts, columns=cvm_columns, index=df.index)
    df = pd.concat([df, cvm_features], axis=1)
    del df['clinVarMethods']
    return df

### All

In [None]:
# Preprocessing steps, in the order `pre_all` applies them.
PRES = [pre_robust, pre_pos, pre_cvm]


def pre_all(df):
    """Feed `df` through every step in PRES, in order, and return the final frame."""
    result = df
    for step in PRES:
        result = step(result)
    return result

In [None]:
# Run the full preprocessing pipeline; `df` now holds the preprocessed table.
df = pre_all(df)
# (rows, columns) after preprocessing.
df.shape

## Making target

In [None]:
# Integer grade assigned to each normalised clinical-significance name.
# 'not_provided' and 'vus' share grade 2.
RELEVANCES = dict(
    benign=0,
    likely_benign=1,
    not_provided=2,
    vus=2,
    likely_pathogenic=3,
    pathogenic=4,
)


def labelize_row(row):
    """Return the highest RELEVANCES grade among the names in `row`.

    Names are extracted with `fetch_names`; names missing from RELEVANCES are
    ignored. If no name is recognised the grade is 2.
    """
    grades = [RELEVANCES[name] for name in fetch_names(row) if name in RELEVANCES]
    return max(grades, default=2)


def labelize_target(y, bad_word='pathogenic'):
    """Map every entry of `y` to its integer grade using `labelize_row`.

    `bad_word` is accepted for interface compatibility but is not used.
    """
    labels = y.apply(labelize_row)
    return labels

## NDCG

In [None]:
def split_relevance(relevance):
    """Return the lower-cased terms of `relevance` that are keys of RELEVANCES.

    The string is split on '|' and each part on ', '. Terms are lower-cased
    only (no stripping or underscore joining), and unknown terms are dropped.
    """
    known = []
    for group in relevance.split('|'):
        for term in group.split(', '):
            lowered = term.lower()
            if lowered in RELEVANCES:
                known.append(lowered)
    return known


def get_relevances(significances):
    """Apply `split_relevance` to each value of `significances` and return the results as a list."""
    return list(map(split_relevance, significances.values))


def dcg(relevances):
    """Discounted cumulative gain of a NumPy array of grades, taken in ranking order.

    The item at 0-based position i contributes 2 ** grade / log2(i + 2).
    """
    gains = 2 ** relevances
    discounts = np.log2(np.arange(2, relevances.size + 2))
    return np.sum(gains / discounts)


def evaluate_serp(df, sign, score, n=30):
    """Return the NDCG of one random sample of `n` rows of `df` ranked by `score`.

    Args:
        df: Frame to sample rows from.
        sign: Indexed by row label; each entry is a sequence of RELEVANCES keys.
        score: Indexed by row label; the predicted ranking score of each row.
        n: Number of rows to sample, without replacement.

    Returns:
        dcg(ranking by score) / dcg(ranking by true grade).

    The true grade of each row is chosen with `np.random.choice`, so the result
    depends on NumPy's global random state.
    """
    serp = df.sample(n=n, replace=False)
    # One significance name is picked at random per sampled row and mapped to its grade.
    rel_true = np.array([RELEVANCES[np.random.choice(sign[index], size=1)[0]] for index, row in serp.iterrows()])

    # Ideal ordering: rows sorted by true grade, highest first.
    order_true = np.argsort(rel_true)[::-1]
    serp = serp.iloc[order_true]
    rel_true = rel_true[order_true]

    # Predicted ordering: the same rows sorted by score, highest first.
    order_pred = np.argsort(score[serp.index])[::-1]
    rel_pred = rel_true[order_pred]
    
    return dcg(rel_pred) / dcg(rel_true)


def evaluate(df, sign, score, k=1000, n=30):
    """Return the mean of `evaluate_serp` over `k` random samples of `n` rows.

    Reseeds NumPy's global random generator with 42 before sampling.
    """
    np.random.seed(42)
    ndcgs = [evaluate_serp(df, sign, score, n) for _ in range(k)]
    return np.mean(ndcgs)

## TT split

In [None]:
def tt_split(df, is_train):
    """Prepare model inputs from the preprocessed frame.

    `clinVarClinSignifs` is used as the target `y`, `MIM_disease` is dropped,
    and missing values in object-dtype columns become the string 'NaN'.
    NumPy's global random generator is reseeded with 42 on every call; the
    input frame is not modified.

    Args:
        df: Preprocessed frame containing `clinVarClinSignifs` and `MIM_disease`.
        is_train: Selects the return shape, see below.

    Returns:
        If `is_train` is true: `(data, cat_features, sign)` where `data` is
        `((X_train, y_train), (X_validate, y_validate), (X_test, y_test))`
        (two successive 80/20 splits) and `sign` holds, per kept row, the
        RELEVANCES names found in its target. Rows with no such name are dropped.
        Otherwise: `(X, cat_features)` with the rows of `X` shuffled.
        `cat_features` is the array of positions of the non-float columns of `X`.
    """
    np.random.seed(42)

    df = df.rename(columns={'clinVarClinSignifs': 'y'}).drop(columns='MIM_disease')
    df = df.apply(lambda c: c.fillna('NaN') if (c.dtype == object) else c)

    if not is_train:
        X = df.drop(columns='y')
        # Builtin `float` instead of `np.float`: the alias was removed in
        # NumPy 1.24 and always referred to the builtin.
        cat_features = np.where(X.dtypes != float)[0]
        return X.sample(frac=1), cat_features

    df = df.loc[df.y.notna()]
    sign = pd.Series(data=get_relevances(df.y), index=df.index)
    # Keep only rows whose target mentions at least one known significance.
    not_empty = [it != [] for it in sign]
    df = df[not_empty]
    sign = sign[not_empty]
    y = labelize_target(df.y)
    X = df.drop(columns='y')

    X_, X_test, y_, y_test = train_test_split(X, y, test_size=0.2)
    X_train, X_validate, y_train, y_validate = train_test_split(X_, y_, test_size=0.2)
    data = (X_train, y_train), (X_validate, y_validate), (X_test, y_test)
    cat_features = np.where(X.dtypes != float)[0]
    return data, cat_features, sign

In [None]:
# Build the model inputs for the selected mode and report their shapes.
if IS_TRAIN:
    # Training: train / validation / test splits plus per-row significance names.
    data, cat_features, sign = tt_split(df, is_train=True)
    (X_train, y_train), (X_validate, y_validate), (X_test, y_test) = data
    print('TRAIN', X_train.shape, X_validate.shape, X_test.shape)
else:
    # Inference: a single shuffled feature frame.
    X, cat_features = tt_split(df, is_train=False)
    print('TEST', X.shape)

## Train

In [None]:
# CatBoost classifier configured with the MultiClass loss and 300 iterations.
model = cb.CatBoostClassifier(iterations=300, loss_function='MultiClass')
if IS_TRAIN:
    # Fit on the training split, using the validation split as eval_set,
    # then save the fitted model to MODEL_SAVE_PATH.
    model.fit(X_train, y_train, cat_features=cat_features, 
              use_best_model=True, eval_set=(X_validate, y_validate),
              plot=False);
    model.save_model(MODEL_SAVE_PATH)
else:
    # Inference: load the previously saved model instead of fitting.
    model.load_model(MODEL_SAVE_PATH)

## Test

In [None]:
def calc_score(model, df):
    """Return, per row of `df`, the predicted label plus the largest class probability.

    The label is the first column of `model.predict(df)`; the probability is the
    row-wise maximum of `model.predict_proba(df)`.
    """
    labels = model.predict(df)[:, 0]
    top_probability = model.predict_proba(df).max(axis=1)
    return labels + top_probability

In [None]:
# Evaluate on the held-out test split (training mode only).
if IS_TRAIN:
    # Fraction of test rows whose predicted label equals the true label
    # (reported under the name 'precision').
    precision = (model.predict(X_test)[:, 0] == y_test).sum() / len(y_test)
    # Ranking score per test row, indexed by row label as `evaluate` expects.
    score = pd.Series(data=calc_score(model, X_test), index=X_test.index)
    ndcg = evaluate(X_test, sign, score)
    print(f'precision={precision} ndcg={ndcg}')

## Analysis

In [None]:
# Show the parameters of the model.
model.get_params()

In [None]:
def draw_importance(columns, importances, top_k=20, save_path=None):
    """Draw a horizontal bar chart of the `top_k` largest feature importances.

    Args:
        columns: Feature names; must support indexing with an integer array.
        importances: One importance value per entry of `columns`.
        top_k: Number of features to show, taken from the end of the ascending sort.
        save_path: If truthy, the figure is also saved to this path.
    """
    top = np.argsort(importances)[-top_k:]
    labels = columns[top]
    values = np.array(importances)[top]
    positions = np.arange(len(labels))

    plt.figure(figsize=(20, 10))
    plt.barh(positions, values, align='center', alpha=0.5)
    plt.yticks(positions, labels)
    plt.xlabel('Importance')
    plt.title('Feature')

    if save_path:
        plt.savefig(save_path)
    plt.show()

In [None]:
# Plot the fitted model's feature importances (training mode only).
if IS_TRAIN:
    draw_importance(X_train.columns, model.feature_importances_, save_path=None)

In [None]:
# Overlay histograms (no KDE) of the test and train labels (training mode only).
if IS_TRAIN:
    sns.distplot(y_test, kde=False);
    sns.distplot(y_train, kde=False);

## Final flow

1. Filter out all rows with predicted `y` $\in \{0, 1\}$.
2. Sort the rows whose `codingEffect` is not missense according to their score values.
3. Sort all missense rows by score and append them after the non-missense rows.

In [None]:
def final_order(model, df):
    """Rank the rows of `df` that the model does not label as class 0 or 1.

    Args:
        model: Fitted classifier exposing `predict` (and whatever `calc_score` needs).
        df: Feature frame; must contain a `codingEffect` column. It is not modified.

    Returns:
        A new frame with the kept rows, ordered with non-missense rows first and
        missense rows last, each group by descending `calc_score`. Two columns
        are appended: `y`, the predicted label as int, and `score`, the
        `calc_score` value minus `y`.
    """
    labels = model.predict(df)[:, 0].astype(int)
    keep = ~np.isin(labels, (0, 1))

    # Score the kept rows on the feature columns only, before any derived
    # column is attached, so the model never receives `y` as an input.
    features = df.loc[keep]
    # Explicit copy: the new columns are written to an independent frame
    # rather than to a filtered slice of the input.
    ranked = features.copy()
    ranked['y'] = labels[keep]
    # Negated so that an ascending sort puts the highest scores first.
    ranked['score'] = -calc_score(model, features)
    ranked['isCodingEffect'] = (ranked.codingEffect == 'missense').astype(int)
    ranked = ranked.sort_values(by=['isCodingEffect', 'score'])
    # Undo the negation and remove the label offset from the score.
    ranked['score'] = -ranked['score'] - ranked['y']
    del ranked['isCodingEffect']

    return ranked

In [None]:
# Rank the held-out test split in training mode, otherwise the whole inference frame.
X_test = X_test if IS_TRAIN else X
of = final_order(model, X_test)
X_test.shape, of.shape  # `of` gains the `y` and `score` columns and loses the rows predicted as 0 or 1

## FF analysis

In [None]:
# Top of the final ranking.
of.head()

In [None]:
# Number of ranked rows per predicted label.
of.y.value_counts()

In [None]:
# Histogram (no KDE) of the predicted labels in the ranking.
sns.distplot(of.y, kde=False);

In [None]:
# Positions within the ranking of the rows predicted as label 4.
np.where(of.y == 4)[0]

In [None]:
# `codingEffect` breakdown of the rows predicted as label 4.
of[of.y == 4].codingEffect.value_counts()

In [None]:
# `MIM_disease` values (missing included) of the non-missense rows predicted as
# label 4, looked up by row label in the preprocessed `df`.
df.loc[of[(of.y == 4) & (of.codingEffect != 'missense')].index].MIM_disease.value_counts(dropna=False)

## Save result

In [None]:
# Inference mode only: write the rows predicted as label 4 to RESULT_SAVE_PATH.
if not IS_TRAIN:
    # Renumber rows 0..n-1 in ranking order; this new index is written to the CSV.
    of_ = of.reset_index(drop=True)
    of_[of_.y == 4].to_csv(RESULT_SAVE_PATH)