Predicting protein secondary structure

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [4]:
def load_data(file):
    """Read the CSV given by *file* and return it as a pandas DataFrame.

    *file* is handed straight to ``pd.read_csv``, so anything that function
    accepts (a path, a URL, an open file-like object) works here too.
    """
    return pd.read_csv(file)

In [5]:
def extract_features(sequence, window_size=5):
    """Encode each position of *sequence* as a window of character codes.

    The sequence is padded on both sides with ``"-"`` so that every position,
    including the first and last, gets a full window centred on it. Each
    character in a window is encoded with ``ord``.

    Args:
        sequence: The string to encode, one character per position.
        window_size: Nominal window width. Only ``window_size // 2`` neighbours
            are taken on each side, so the real width is
            ``2 * (window_size // 2) + 1`` (an even ``window_size`` yields one
            extra column).

    Returns:
        An integer array of shape ``(len(sequence), width)``. An empty
        sequence gives shape ``(0, width)`` so the result can still be
        stacked with the feature matrices of other sequences.

    Raises:
        ValueError: If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    half_window = window_size // 2
    width = 2 * half_window + 1
    pad = "-" * half_window

    # Encode once, then slice: window k covers padded positions k .. k+width-1,
    # which is the window centred on original position k.
    codes = [ord(aa) for aa in pad + sequence + pad]
    rows = [codes[start:start + width] for start in range(len(sequence))]

    # Explicit reshape keeps the result 2-D even when there are no rows.
    return np.array(rows, dtype=int).reshape(len(rows), width)

In [6]:
# train model
def train_model(x, y, random_state=None):
    """Fit a random-forest classifier on ``x`` / ``y`` and return it.

    Args:
        x: Feature matrix passed to ``RandomForestClassifier.fit``.
        y: Target labels passed to ``RandomForestClassifier.fit``.
        random_state: Optional seed forwarded to the classifier. The default
            ``None`` keeps the previous unseeded behaviour; pass an integer to
            make training reproducible from run to run.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    model = RandomForestClassifier(random_state=random_state)
    model.fit(x, y)
    return model

In [7]:
# predict and evaluate
def evaluate_model(model, x_test, y_test):
    """Print a classification report and show a labelled confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_test: Features to predict on.
        y_test: True labels for ``x_test``.

    Returns:
        None. The report is printed and the figure is displayed via
        ``plt.show()``.
    """
    y_pred = model.predict(x_test)
    print(classification_report(y_test, y_pred))

    # Fix the class order explicitly (sorted union of true and predicted
    # labels) so the tick labels below are guaranteed to match the matrix
    # rows and columns.
    class_names = np.unique(
        np.concatenate([np.asarray(y_test).ravel(), np.asarray(y_pred).ravel()])
    )
    cm = confusion_matrix(y_test, y_pred, labels=class_names)

    plt.imshow(cm, cmap='coolwarm', interpolation='nearest')
    plt.title('Confusion Matrix')
    plt.colorbar()

    # Rows of the matrix are true classes, columns are predicted classes.
    ticks = np.arange(len(class_names))
    plt.xticks(ticks, class_names)
    plt.yticks(ticks, class_names)
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.show()

In [8]:
# Load the labelled dataset and pull out the two columns used further down.
# NOTE(review): `sequence` and `labels` are read by later top-level statements
# that sit outside this guard, so they only exist when the file runs as the
# main program (as in a notebook), not when it is imported as a module.
if __name__ == "__main__":
    data = load_data('/content/another.csv')  # replace with the actual CSV path
    sequence = data['sequence']  # column holding the input sequences
    labels = data['Secondary Structure']  # column holding the label strings

In [9]:
# Build the training arrays: one windowed feature row per sequence position in
# x, and the matching single-character label for that position in y.
x = np.vstack(list(map(extract_features, sequence)))
y = np.concatenate(list(map(list, labels)))

In [None]:
# Hold out 20% of the rows for evaluation (fixed seed for the split), fit the
# model on the remainder, then report metrics on the held-out rows.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
model = train_model(x_train, y_train)
evaluate_model(model, x_test, y_test)

In [None]:
# Predict one label per position for an unseen sequence and print the labels
# joined into a single string.
new_sequence = "ACDEFGHIKLMNPQRSTVWY"
new_features = extract_features(new_sequence)
prediction = model.predict(new_features)
print("Predicted Secondary Structure : ", "".join(prediction))
