In [30]:
# This is a random forest classifier for endolysin prediction
from multiprocessing import cpu_count
from pathlib import Path 
from random import shuffle
from typing import Callable, Union, List, Dict, Tuple, Any, Hashable
import joblib
import numpy as np
import pandas as pd
from numpy import mean
from numpy import std
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# ML model architectures available for training
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process.kernels import RBF, DotProduct, PairwiseKernel, Matern, RationalQuadratic, ExpSineSquared, WhiteKernel
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC

class ENDOLYSORFOREST():
    """Train and evaluate a scikit-learn style binary classifier for endolysin prediction.

    The intended call order is ``split_data`` -> ``match_labels`` (for the test
    split) -> ``fit_model`` -> ``test_model`` / ``draw_plot``.

    Feature files are tab-separated tables whose first column is the sample id.
    Label files are tab-separated, header-less, two-column tables: sample id and
    class label (reported as 'no' / 'yes' by ``test_model``).
    """

    # Constructor keyword presets used when the caller passes no explicit
    # estimator parameters. Keyed by estimator class name.
    _MLP_PARAMS = {'hidden_layer_sizes': 200, 'random_state': 1964, 'max_iter': 250}
    _DEFAULT_PARAMS = {
        'MLPClassifier': _MLP_PARAMS,
        'RandomForestClassifier': {'n_jobs': 1, 'random_state': 1964},
        'GaussianProcessClassifier': {'n_jobs': 1, 'random_state': 1964},
        'SVC': {'cache_size': 3000, 'probability': True},
    }

    def __init__(self, classifier, featurefile, labelfile, **classifier_params):
        """Instantiate the wrapped estimator.

        Args:
            classifier: Estimator class (or any callable returning an estimator).
            featurefile: Feature file path(s); stored on the instance, not read here.
            labelfile: Label file path; stored on the instance, not read here.
            **classifier_params: Keyword arguments for ``classifier``. When given
                they are used as-is. When omitted, a preset is chosen from the
                callable's ``__name__``; unknown names fall back to the MLP
                preset, which is what this constructor always used before.
        """
        if not classifier_params:
            preset_name = getattr(classifier, '__name__', None)
            classifier_params = self._DEFAULT_PARAMS.get(preset_name, self._MLP_PARAMS)
        self.classifier = classifier(**classifier_params)
        self.featurefile = featurefile
        self.labelfile = labelfile

    @staticmethod
    def _read_labels(labelfile) -> pd.DataFrame:
        """Read a header-less, tab-separated label file into columns ``id`` and ``yesno``."""
        labeldf = pd.read_csv(labelfile, header=None, sep='\t')
        labeldf.columns = ['id', 'yesno']
        return labeldf

    def _positive_scores(self, testdata, predictions):
        """Return a continuous score for the positive class of each test sample.

        Uses column 1 of ``predict_proba`` when the estimator offers a
        two-column probability output, otherwise ``decision_function``, and
        falls back to the hard ``predictions`` when neither is available.
        """
        predict_proba = getattr(self.classifier, 'predict_proba', None)
        if predict_proba is not None:
            proba = np.asarray(predict_proba(testdata))
            if proba.ndim == 2 and proba.shape[1] == 2:
                return proba[:, 1]
        decision_function = getattr(self.classifier, 'decision_function', None)
        if decision_function is not None:
            return np.asarray(decision_function(testdata))
        return np.asarray(predictions)

    def _predict(self, testfile, mydf):
        """Return ``(labels, predictions, scores)`` for the test split.

        Raises:
            ValueError: If ``mydf`` and ``testfile`` have different row counts.
        """
        labels1 = mydf['yesno'].to_numpy()
        testdata = testfile.to_numpy()
        if len(labels1) != len(testdata):
            raise ValueError(
                'Number of labels (%d) does not match number of test samples (%d)'
                % (len(labels1), len(testdata))
            )
        predictions = self.classifier.predict(testdata)
        return labels1, predictions, self._positive_scores(testdata, predictions)

    def split_data(self, featurefile, test_size=0.2, random_state=25):
        """Load the feature table(s) and split them into train and test sets.

        Args:
            featurefile: A list of tab-separated feature files, or a single
                path. Several files are joined column-wise on their shared
                sample ids (inner join), which combines features produced by
                different methods.
            test_size: Fraction of samples held out for testing.
            random_state: Seed passed to ``train_test_split``.

        Returns:
            Tuple ``(trainfile, testfile, getmetricslist)``: the two DataFrames
            and the list of feature column names.

        Raises:
            ValueError: If ``featurefile`` is empty.
        """
        if isinstance(featurefile, (str, Path)):
            featurefile = [featurefile]
        paths = list(featurefile)
        if not paths:
            # Previously this only printed and then failed with UnboundLocalError.
            raise ValueError('Empty file list')

        tables = [pd.read_csv(path, index_col=0, sep='\t') for path in paths]
        if len(tables) == 1:
            endofeatures = tables[0]
        else:
            endofeatures = pd.concat(tables, axis=1, join='inner')

        trainfile, testfile = train_test_split(
            endofeatures, test_size=test_size, random_state=random_state
        )
        return trainfile, testfile, trainfile.columns.tolist()

    def match_labels(self, testfile, labelfile):
        """Collect the labels of the test samples, in test-set row order.

        Test samples without an entry in the label file are skipped; an id
        listed several times in the label file yields one output row per entry.

        Returns:
            DataFrame with columns ``id`` and ``yesno`` (empty if nothing matched).
        """
        labeldf = self._read_labels(labelfile)
        # Group the label rows by id once so each test sample is a dict lookup
        # rather than a scan over the whole label table.
        rows_by_id = {}
        for ident, yesno in zip(labeldf['id'], labeldf['yesno']):
            rows_by_id.setdefault(ident, []).append([ident, yesno])
        mylist = [pair for index in testfile.index for pair in rows_by_id.get(index, ())]
        return pd.DataFrame(mylist, columns=['id', 'yesno'])

    def fit_model(self, trainfile, labelfile):
        """Fit the wrapped classifier on the training features and their labels.

        Args:
            trainfile: DataFrame of feature vectors indexed by sample id.
            labelfile: Label file; every training id must be listed in it.

        Raises:
            KeyError: If a training sample has no label.
        """
        labeldf = self._read_labels(labelfile)
        label_by_id = pd.Series(labeldf.yesno.values, index=labeldf.id).to_dict()

        missing = [index for index in trainfile.index if index not in label_by_id]
        if missing:
            raise KeyError(
                'No label found for %d training sample(s), e.g. %r'
                % (len(missing), missing[0])
            )

        # NOTE: the real data used to be replaced by make_classification()
        # output here, so the model was trained on synthetic samples. Fit on a
        # proper 2-D feature matrix built from the actual training set instead.
        vectors = trainfile.to_numpy()
        labels = np.asarray([label_by_id[index] for index in trainfile.index])

        # shuffle the data to prevent clumping of positives, which might confuse the model
        order = list(range(len(labels)))
        shuffle(order)
        self.classifier.fit(vectors[order], labels[order])

    def test_model(self, testfile, mydf):
        """Evaluate the fitted classifier on the test split and print the metrics.

        Accuracy, sensitivity and the classification report use the hard class
        predictions; the precision-recall and ROC curves use continuous scores
        (see ``_positive_scores``) so they cover more than one operating point.

        Args:
            testfile: DataFrame of test feature vectors.
            mydf: Labels as returned by ``match_labels``, in the same row order.

        Raises:
            ValueError: If ``mydf`` and ``testfile`` have different row counts.
        """
        print(len(testfile))
        print(len(mydf))
        labels1, predictions, scores = self._predict(testfile, mydf)

        # Keep the two threshold arrays apart: the ROC thresholds used to
        # overwrite the precision-recall ones before they were printed.
        precision, recall, pr_thresholds = metrics.precision_recall_curve(labels1, scores)
        fpr, tpr, _roc_thresholds = metrics.roc_curve(labels1, scores)
        aucval = metrics.auc(fpr, tpr)
        print("Accuracy:", metrics.accuracy_score(labels1, predictions))
        print("Sensitivity:", metrics.recall_score(labels1, predictions))
        print("Roc_AUC_score:", metrics.roc_auc_score(labels1, scores))
        print("Precision:", precision)
        print("Recall:", recall)
        print("Thresholds:", pr_thresholds)
        print("ROC_curve:", fpr, tpr)
        print("AUC:", aucval)
        print("Report:", classification_report(labels1, predictions, target_names= ['no', 'yes'], digits = 4, zero_division = 1))

        # Plot the precision recall curve (public API instead of metrics._plot)
        display = metrics.PrecisionRecallDisplay.from_predictions(labels1, scores)
        display.ax_.set_title("2-class Precision-Recall curve")

    def draw_plot(self, testfile, mydf):
        """Plot the ROC curve of the fitted classifier on the test split.

        Args:
            testfile: DataFrame of test feature vectors.
            mydf: Labels as returned by ``match_labels``, in the same row order.

        Raises:
            ValueError: If ``mydf`` and ``testfile`` have different row counts.
        """
        labels1, _predictions, scores = self._predict(testfile, mydf)

        # Binary problem: a single ROC curve describes it completely.
        fpr, tpr, _ = metrics.roc_curve(labels1, scores)
        roc_auc = metrics.auc(fpr, tpr)

        plt.figure()
        lw = 2
        plt.plot(
            fpr,
            tpr,
            color="darkorange",
            lw=lw,
            label="ROC curve (area = %0.2f)" % roc_auc,
        )
        plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("Receiver operating characteristic example")
        plt.legend(loc="lower right")
        plt.show()

In [31]:
# Build the wrapper around MLPClassifier; the remaining arguments are the
# feature-file list and the label file.
a = ENDOLYSORFOREST(
    classifier=MLPClassifier,
    featurefile=['a'],
    labelfile='b',
)

# To train the model, I called the class methods one after another