In [1]:
import sys
sys.path.insert(1, '../code/scripts/')

In [2]:
# data processing
import os.path
from os import path
from scipy import stats
import numpy as np
import pandas as pd
import seaborn as sns
import mdtraj as md
import parseaf as pa

# logistic regression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# figure making
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
states = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
          'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

## Functions

In [5]:
def get_aa_freq(seq, aa):
    cnt = 0
    for i in seq:
        if i == aa:
            cnt += 1
    aa_freq = cnt / len(seq)
    return aa_freq

In [6]:
def append_percent_aa(row, aa):
    seq = row['region_seq']
    if len(seq) == 0:
        print(row['uni_id'])
    return get_aa_freq(seq, aa)

## Training logistic regression model using AlphaFold data

In [7]:
df = pd.read_csv('../data/af_regions/sc_af_regions_all.csv')

In [8]:
for aa in states:
    df['freq_'+aa] = df.apply(lambda row: append_percent_aa(row, aa), axis=1)

In [9]:
featurecol = []
for aa in states:
    featurecol.append('freq_'+aa)
X = df[featurecol]
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
cnf_all = metrics.confusion_matrix(y_test, y_pred)
metrics.accuracy_score(y_test, y_pred)

0.9544642857142858

In [10]:
df_lr = pd.DataFrame(columns=['aa','coef'])
for i in range(20):
    df_lr = df_lr.append({'aa': states[i], 'coef': logreg.coef_[0][i]}, ignore_index=True)

## Using only P or using only P and G

In [11]:
def get_logreg_accuracy(df, aa_list):
    featurecol = []
    for aa in aa_list:
        featurecol.append('freq_'+aa)
    X = df[featurecol]
    y = df['label']
    X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.25, random_state=0)
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    return cnf_matrix, accuracy

In [12]:
d_acc = {}

In [13]:
states_ranked = df_lr['aa'].tolist()
d_acc['sc_top5'] = get_logreg_accuracy(df, ['P', 'L', 'S', 'I', 'N'])[1]

In [14]:
d_acc['sc_onlyP'] = get_logreg_accuracy(df, ['P'])[1]

In [15]:
d_acc['sc_onlyG'] = get_logreg_accuracy(df, ['G'])[1]

In [16]:
d_acc['sc_onlyPG'] = get_logreg_accuracy(df, ['P', 'G'])[1]

In [17]:
d_acc

{'sc_top5': 0.9282738095238096,
 'sc_onlyP': 0.7401785714285715,
 'sc_onlyG': 0.5273809523809524,
 'sc_onlyPG': 0.7446428571428572}

## Using model trained on all AF output to classify random non-highly-charged regions

In [18]:
df_random = pd.read_csv('../data/af_regions/random_af_regions_low_thresh.csv')

In [19]:
def append_percent_aa(row, aa):
    seq = row['seq']
    return get_aa_freq(seq, aa)

In [20]:
for aa in states:
    df_random['freq_'+aa] = df_random.apply(lambda row: append_percent_aa(row, aa), axis=1)

In [21]:
featurecol = []
for aa in states:
    featurecol.append('freq_'+aa)
X_rd = df_random[featurecol]
y_rd = df_random['label']
y_pred_rd = logreg.predict(X_rd)
cnf_rd = metrics.confusion_matrix(y_rd, y_pred_rd)
accuracy = metrics.accuracy_score(y_rd, y_pred_rd)
accuracy

0.9104258443465492

## Using model trained on all AF output to classify highly charged regions

In [22]:
def append_percent_aa(row, aa):
    seq = row['region.seq']
    return get_aa_freq(seq, aa)

In [23]:
df_hc = pd.read_csv('../data/charged_regions/cr_trimmed_filtered_aflabel.csv')

In [24]:
for aa in states:
    df_hc['freq_'+aa] = df_hc.apply(lambda row: append_percent_aa(row, aa), axis=1)

In [25]:
featurecol = []
for aa in states:
    featurecol.append('freq_'+aa)
X_hc = df_hc[featurecol]
y_hc = df_hc['label']
y_pred_hc = logreg.predict(X_hc)
cnf_hc = metrics.confusion_matrix(y_hc, y_pred_hc)
accuracy = metrics.accuracy_score(y_hc, y_pred_hc)
accuracy

0.920704845814978

In [26]:
df_hc['lr_label'] = y_pred_hc

In [27]:
df_hc.to_csv('../data/charged_regions/cr_trimmed_filtered_lrlabel.csv')

## Confusion matrices summary figure

In [28]:
uv_all = pd.read_csv('../data/uversky/uversky_all.csv')
uv_rd = pd.read_csv('../data/uversky/uversky_random.csv')
uv_hc = pd.read_csv('../data/uversky/uversky_hc_trimmed.csv')
df_uv = [uv_all, uv_rd, uv_hc]

In [29]:
cnf_uv_all = np.array([[0, 0], [0, 0]])
cnf_uv_rd = np.array([[0, 0], [0, 0]])
cnf_uv_hc = np.array([[0, 0], [0, 0]])
cnf_uv = [cnf_uv_all, cnf_uv_rd, cnf_uv_hc]

In [30]:
for i in range(3):
    df = df_uv[i]
    cnf = cnf_uv[i]
    cnf[0, 0] = len(df[(df.label == 'disordered') & (df.uversky_pred == 'disordered')])
    cnf[0, 1] = len(df[(df.label == 'disordered') & (df.uversky_pred == 'helix')])
    cnf[1, 0] = len(df[(df.label == 'helix') & (df.uversky_pred == 'disordered')])
    cnf[1, 1] = len(df[(df.label == 'helix') & (df.uversky_pred == 'helix')])

In [31]:
sample_size = []
for df in df_uv:
    sample_size.append(len(df))

In [32]:
sample_size

[3360, 3405, 681]

In [33]:
def get_cnf_freq(cnaf_mat):
    cnf_freq = cnf_mat / cnf_mat.sum(axis=1)[:, np.newaxis]
    return cnf_freq

In [34]:
cnf_matrices = [cnf_all, cnf_rd, cnf_hc] + cnf_uv

In [35]:
cnf_df = pd.DataFrame(columns={'TN', 'FP', 'FN', 'TP', 'n'})
i = 0
for mat in cnf_matrices:
    rv = {}
    rv['TN'] = mat[0][0]
    rv['FP'] = mat[0][1]
    rv['FN'] = mat[1][0]
    rv['TP'] = mat[1][1]
    rv['n'] = sample_size[i]
    cnf_df = cnf_df.append(rv, ignore_index=True)
    i += 1
    if i == 3:
        i = 0

In [36]:
cnf_df.to_csv('../misc/cp_cnf_matrices_posttrim.csv', index=False)

## Using model trained on all AF output to classify regions in other proteomes

**S. Pombe**

In [37]:
df = pd.read_csv('../data/af_regions/pombe_af_regions.csv')

In [38]:
for aa in states:
    df['freq_'+aa] = df.apply(lambda row: append_percent_aa(row, aa), axis=1)

KeyError: 'region.seq'

In [None]:
featurecol = []
for aa in states:
    featurecol.append('freq_'+aa)
X = df[featurecol]
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
cnf_all = metrics.confusion_matrix(y_test, y_pred)
metrics.accuracy_score(y_test, y_pred)

0.9320502749410841

**Human**

In [None]:
df = pd.read_csv('../data/af_regions/hsapiens_af_regions.csv')

In [None]:
for aa in states:
    df['freq_'+aa] = df.apply(lambda row: append_percent_aa(row, aa), axis=1)

In [None]:
featurecol = []
for aa in states:
    featurecol.append('freq_'+aa)
X = df[featurecol]
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
cnf_all = metrics.confusion_matrix(y_test, y_pred)
metrics.accuracy_score(y_test, y_pred)

0.9699797453703703

**C. elegans**

In [None]:
df = pd.read_csv('../data/af_regions/celegans_af_regions.csv')

In [None]:
for aa in states:
    df['freq_'+aa] = df.apply(lambda row: append_percent_aa(row, aa), axis=1)

In [None]:
featurecol = []
for aa in states:
    featurecol.append('freq_'+aa)
X = df[featurecol]
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
cnf_all = metrics.confusion_matrix(y_test, y_pred)
metrics.accuracy_score(y_test, y_pred)

0.9530573847601129