# Imports

In [1]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from ipywidgets import interact
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Read Data and Plot

In [2]:
# df = pd.read_csv("./data/covid19_sera_merged.csv")
# df

In [5]:
df_variants = pd.read_csv("../data/MAESTRO-d6178bdd-identified_variants_merged_protein_regions-main.tsv", sep="\t", low_memory=False)

In [6]:
df_variants_keep = df_variants[['Peptide'] + [c for c in df_variants.columns if 'intensity_for_peptide_variant' in c]]
df_variants_keep.replace(0.0, np.nan, inplace=True)
df_variants_keep = df_variants_keep.set_index("Peptide")
df_variants_keep = df_variants_keep.T
df_variants_keep.index = df_variants_keep.index.map(lambda x: ".".join(x.split(".")[:2]))
df_variants_keep["Condition"] = df_variants_keep.index.map(lambda x: x.split(".")[0])
df_variants_keep = df_variants_keep[(df_variants_keep.Condition == "_dyn_#Severe-COVID-19") | (df_variants_keep.Condition == "_dyn_#Non-severe-COVID-19")]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [7]:
df_variants_keep

Peptide,"K.[304.207]GARLIPEMDQIFTEVEMTTLE(K,304.207).V","I.[304.207]FTEVEMTTLE(K,304.207).V",K.[304.207]LYQPEYQEVSTEEQR.E,"K.[304.207]AANSLEAFIFETQD(K,304.207).L","R.[304.207]YSHDF(N,-56.985)FH.I","R.[304.207](P,143.096)SV(C,57.021)REAGPQAHMQQVTSSL(K,304.207).G",K.[304.207]QGSTGEEFHFQTGGR.D,"K.[304.207]HGTDDGVVW(M,15.995)NW(K,304.207).G","K.[304.207](H,100.027)GTDDGVVWMNW(K,304.207).G","K.[304.207]H(G,304.213)TDDGVVWMNW(K,304.207).G",...,"K.[304.207]YLGEE(Y,-57.005)V(K,304.207).A","K.[304.207]YLGEE(Y,-58.064)V(K,304.207).A","K.[304.207]YLGEE(Y,-60.599)V(K,304.207).A","K.[304.207]YLGEE(Y,-63.608)V(K,304.207).A","K.[304.207]YLGE(E,-68.078)YV(K,304.207).A","K.[304.207]YL(G,55.921)EEYV(K,304.207).A","K.{187.018}[304.207]YLGEEYV(K,304.207).A","R.[304.207]NTYE(K,361.237)YLGEEYV(K,304.207).A","K.[304.207]YLGE(E,125.898)YV(K,304.207).A",Condition
_dyn_#Non-severe-COVID-19.Patient-group-PT,,,0.678696,2.127578,,,,,,0.334896,...,1.033104,1.915291,2.343374,0.833948,,,,0.96477,,_dyn_#Non-severe-COVID-19
_dyn_#Non-severe-COVID-19.XG1,,,0.743318,,,,,,,0.278907,...,1.182886,,,0.589243,,,,0.163054,,_dyn_#Non-severe-COVID-19
_dyn_#Non-severe-COVID-19.XG10,2.696391,,,,,,,,,0.021225,...,1.21123,,,,,,,,,_dyn_#Non-severe-COVID-19
_dyn_#Non-severe-COVID-19.XG11,2.208365,,,,,,,,,0.017387,...,0.984754,,,,,,,,,_dyn_#Non-severe-COVID-19
_dyn_#Non-severe-COVID-19.XG13,3.721901,,,,,,,,,0.019735,...,1.148707,,,,,,,,,_dyn_#Non-severe-COVID-19
_dyn_#Non-severe-COVID-19.XG14,,,0.448676,1.034965,,,,,,,...,,0.295989,0.889887,,,,,0.2891,,_dyn_#Non-severe-COVID-19
_dyn_#Non-severe-COVID-19.XG15,,,0.259212,0.730918,,,,,,,...,0.885065,0.286611,0.836148,,,,,0.243726,,_dyn_#Non-severe-COVID-19
_dyn_#Non-severe-COVID-19.XG16,,,0.412793,1.000764,,,,,,,...,,0.712272,0.96528,,,,,0.367987,,_dyn_#Non-severe-COVID-19
_dyn_#Non-severe-COVID-19.XG17,,,0.125162,0.523169,0.118659,0.623897,,,,,...,,0.321736,,,,,,0.524086,,_dyn_#Non-severe-COVID-19
_dyn_#Non-severe-COVID-19.XG18,,,0.172893,0.444792,0.116186,0.447639,,,,,...,,0.340495,,,,,,0.46704,,_dyn_#Non-severe-COVID-19


In [5]:
# for col in df_variants_keep.columns:
#     print(col)

In [6]:
print(len(df_variants_keep.Condition.unique()))
df_variants_keep.Condition.unique()

2


array(['_dyn_#Non-severe-COVID-19', '_dyn_#Severe-COVID-19'], dtype=object)

In [7]:
df_variants_keep.groupby(["Condition"]).size()

Condition
_dyn_#Non-severe-COVID-19    25
_dyn_#Severe-COVID-19        18
dtype: int64

In [8]:
def plot_hist(ID):
    sns.displot(df_variants_keep, x=ID, hue="Condition", binwidth=0.05)
    
id_list = [x for x in df_variants_keep.columns if x != "Condition"][:20]

In [9]:
interact(plot_hist, ID=id_list)

interactive(children=(Dropdown(description='ID', options=('K.[304.207]GARLIPEMDQIFTEVEMTTLE(K,304.207).V', 'I.…

<function __main__.plot_hist(ID)>

# Fill NA and Process PCA

In [10]:
df_variants_keep["Condition"] = df_variants_keep["Condition"].map({"_dyn_#Non-severe-COVID-19":0, "_dyn_#Severe-COVID-19":1})

In [11]:
# df_variants_keep = df_variants_keep.fillna(value=0.0)
# df_variants_keep_pca = df_variants_keep.drop(["Condition"], axis=1)
# df_scaled = pd.DataFrame(preprocessing.scale(df_variants_keep_pca), columns=df_variants_keep_pca.columns)

# pca = PCA(n_components=40)
# df_PCA = pd.DataFrame(pca.fit_transform(df_scaled))
# new_col = df_variants_keep["Condition"].copy()
# new_col.reset_index(drop=True, inplace=True)
# df_PCA["Condition"] = new_col

In [12]:
df_variants_keep = df_variants_keep.fillna(value=0.0)
df_variants_keep_pca = df_variants_keep.drop(["Condition"], axis=1)
df_scaled = pd.DataFrame(preprocessing.scale(df_variants_keep_pca), columns=df_variants_keep_pca.columns)
new_col = df_variants_keep["Condition"].copy()
new_col.reset_index(drop=True, inplace=True)
df_scaled["Condition"] = new_col
df_PCA = df_scaled
df_PCA = df_PCA.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
df_PCA = df_PCA.loc[:, ~df_PCA.columns.duplicated()]

In [13]:
df_PCA.groupby(["Condition"]).size()

Condition
0    25
1    18
dtype: int64

In [14]:
df_PCA_0 = df_PCA[df_PCA["Condition"] == 0]
df_PCA_1 = df_PCA[df_PCA["Condition"] == 1]

In [15]:
X_0, y_0 = df_PCA_0.drop(["Condition"], axis=1), df_PCA_0["Condition"]
X_1, y_1 = df_PCA_1.drop(["Condition"], axis=1), df_PCA_1["Condition"]

In [16]:
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(X_0, y_0, test_size=0.2, random_state=291)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, test_size=0.2, random_state=291)

In [17]:
train_0 = X_train_0.copy()
train_0["Condition"] = y_train_0
train_1 = X_train_1.copy()
train_1["Condition"] = y_train_1
train_data = train_0.append(train_1)
train_data = train_data.sample(frac=1)

In [18]:
test_0 = X_test_0.copy()
test_0["Condition"] = y_test_0
test_1 = X_test_1.copy()
test_1["Condition"] = y_test_1
test_data = test_0.append(test_1)
test_data = test_data.sample(frac=1)

In [19]:
X_train, y_train = train_data.drop(["Condition"], axis=1), train_data["Condition"]
X_test, y_test = test_data.drop(["Condition"], axis=1), test_data["Condition"]
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [20]:
clf = LogisticRegression()
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [21]:
y_pred = clf.predict(X_test)

In [22]:
y_pred

array([1, 0, 1, 1, 0, 1, 0, 0, 1], dtype=int64)

In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.40      0.44         5
           1       0.40      0.50      0.44         4

    accuracy                           0.44         9
   macro avg       0.45      0.45      0.44         9
weighted avg       0.46      0.44      0.44         9

