In [1]:
#TODO only standard scale the amide bonds and only one hot encode the condensation domains

In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

In [3]:
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

In [4]:
clean_data = pd.read_csv("clean_data.csv")

In [5]:
clean_data.drop("Unnamed: 0", axis = 1, inplace = True)

In [6]:
clean_data

Unnamed: 0,# Amide bonds,AMINO ACID STARTER CONDENSATION DOMAIN,"RzmA H139, H140, D144",VibH His126,VibH Gly131,CdaPS1 M165,RzmA Q36,RzmA Y138,RzmA R148,HolA A149,RzmA M143,RzmA Q136,Chain Length
0,12,AELPLTAAQREIWIAEQRLGKRNRVFRVGEYLEIHGRVDLGLFDRA...,1,1,0,0,0,1,0,1,1,1,0
1,6,QYSLTGAQTGIWFAQQLDPDNPIYNTAEYIEINGPVNIALFEEALR...,1,0,1,0,0,0,0,0,0,1,0
2,13,RHGLTSAQHEVWLAQQLDPRGAHYRTGSCLEIDGPLDHAVLSRALR...,1,0,0,0,0,1,1,1,0,0,1
3,7,TFYPLTHAQRRIWYTEKFYPGTSVSNLSGFGKLKSASGIDSGLLTE...,1,0,0,0,0,0,0,1,0,0,0
4,10,RGLPLTAGQRDIWLDQLSRGDSPLYNIGGYAVLKGPFRPELMQRTV...,1,0,1,0,0,0,0,0,0,0,1
5,11,VTAVQEAMWLQQELAPDRPNNVVTLWDVDGDLDIPLITEALRTAVS...,0,1,0,0,0,0,0,1,0,0,0
6,7,FYPLTNAQKRIWYTEKFYPNTSISNLAGFGKLISEDGVQAHYVEKA...,1,1,0,0,0,1,0,1,0,1,1
7,13,LLPLASVQQGIWLDQLAYPHLPYYNIGMTLEIKGDIDIALFEKAIQ...,1,0,1,0,0,0,0,1,0,0,1
8,9,THELASVQQGIWLDQIAHPDLPYYNIGMSLEIKGEIDIPLFEKAIE...,1,1,1,0,0,1,0,0,0,0,1
9,13,PLSTAQSEMWFAQELDADNWLYHSCGYLDIGGPLDLARFEQALRRF...,1,0,1,0,0,1,0,1,0,0,1


In [7]:
clean_data.columns

Index(['# Amide bonds', 'AMINO ACID STARTER CONDENSATION DOMAIN',
       'RzmA H139, H140, D144', 'VibH His126', 'VibH Gly131', 'CdaPS1 M165',
       'RzmA Q36', 'RzmA Y138', 'RzmA R148', 'HolA A149', 'RzmA M143',
       'RzmA Q136', 'Chain Length'],
      dtype='object')

In [8]:
categorical_columns = ["AMINO ACID STARTER CONDENSATION DOMAIN"]
numerical_columns = ["# Amide bonds"]
# numerical_columns = ["# Amide bonds", "RzmA H139, H140, D144", "VibH His126", "VibH Gly131", "CdaPS1 M165", "RzmA Q36", "RzmA Y138", "RzmA R148", "HolA A149", "RzmA M143", "RzmA Q136"]

In [9]:
labels = clean_data.pop("Chain Length")

In [10]:
# Logistic Regression
preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)])

In [11]:
model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))
model

In [12]:
data_train, data_test, target_train, target_test = train_test_split(
    clean_data, labels, random_state = 42)

In [13]:
model.fit(data_train, target_train)

In [14]:
model.predict(data_test)

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], dtype=int64)

In [15]:
model.score(data_test, target_test)

0.36363636363636365

In [16]:
cv_results = cross_validate(model, clean_data, labels, cv=5)
cv_results

{'fit_time': array([0.00999308, 0.00700259, 0.00600505, 0.00699997, 0.00699735]),
 'score_time': array([0.00399709, 0.00299788, 0.00299454, 0.00300336, 0.00300717]),
 'test_score': array([0.66666667, 0.66666667, 0.44444444, 0.5       , 0.375     ])}

In [17]:
scores = cv_results["test_score"]
print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} ± {scores.std():.3f}")

The mean cross-validation accuracy is: 0.531 ± 0.118


In [18]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

In [19]:
tree_clf =  make_pipeline(preprocessor, DecisionTreeClassifier(criterion = "entropy"))
tree_clf.fit(data_train, target_train)

In [20]:
tree_clf.score(data_test, target_test)

0.36363636363636365

In [21]:
tree_cv_results = cross_validate(tree_clf, clean_data, labels, cv=5)
tree_cv_results

{'fit_time': array([0.00799775, 0.00600863, 0.00400114, 0.00399852, 0.00499773]),
 'score_time': array([0.00499916, 0.00299692, 0.00299978, 0.00300241, 0.00299931]),
 'test_score': array([0.55555556, 0.77777778, 0.44444444, 0.625     , 0.375     ])}

In [22]:
tree_scores = tree_cv_results["test_score"]
print("The mean cross-validation accuracy is: "
      f"{tree_scores.mean():.3f} ± {tree_scores.std():.3f}")

The mean cross-validation accuracy is: 0.556 ± 0.141


In [23]:
# Support Vector Machine (SVM)
from sklearn.svm import SVC

In [24]:
svm_model = make_pipeline(preprocessor, SVC(kernel = 'linear',gamma = 'scale', shrinking = False,))
svm_model.fit(data_train, target_train)

In [25]:
svm_model.score(data_test, target_test)

0.36363636363636365

In [26]:
svm_cv_results = cross_validate(svm_model, clean_data, labels, cv=5)
svm_cv_results

{'fit_time': array([0.006001  , 0.00600004, 0.00500154, 0.00598955, 0.00499964]),
 'score_time': array([0.00499916, 0.00399876, 0.00300288, 0.00400424, 0.00400209]),
 'test_score': array([0.66666667, 0.66666667, 0.44444444, 0.625     , 0.375     ])}

In [27]:
svm_scores = svm_cv_results["test_score"]
print("The mean cross-validation accuracy is: "
      f"{svm_scores.mean():.3f} ± {svm_scores.std():.3f}")

The mean cross-validation accuracy is: 0.556 ± 0.122
