In [None]:
"""!pip install -q transformers
!pip install keras-tuner -q
!pip install visualkeras
!pip install pydot
!pip install graphviz"""



import re
import numpy as np
import pandas as pd
import random
import tensorflow as tf
import torch
import matplotlib.pyplot as plt
import sys
from google.colab import files
from transformers import TFBertModel, BertTokenizer, BertConfig
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, classification_report
from keras.optimizers import Adam
from keras import utils
from keras.utils import to_categorical
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Flatten, Conv2D, MaxPooling2D, Concatenate, AveragePooling2D, Dropout
from keras.regularizers import l2
from keras.callbacks import EarlyStopping


pd.set_option('display.max_colwidth', None)
np.set_printoptions(threshold=sys.maxsize)

In [None]:
# Ask the user to upload the input files through google.colab's `files.upload()`.
# Later cells treat `uploaded` as a mapping of filename -> raw bytes
# (they index it by key and call `.decode('utf-8')` on the values).
uploaded = files.upload()

In [None]:
# Load the ProtBERT-BFD tokenizer and encoder by their hub identifier.
# do_lower_case=False keeps the residue letters in upper case; from_pt=True asks
# transformers to build the TensorFlow model from PyTorch weights.
# NOTE(review): later cells rebind the name `model` to Keras classifiers, so the
# embedding cell cannot be re-run after them without re-running this cell first.
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert_bfd", do_lower_case=False )
model = TFBertModel.from_pretrained("Rostlab/prot_bert_bfd", from_pt=True)

In [None]:
# Select the training FASTA by its position among the uploaded filenames.
# NOTE(review): index 2 here (and index 1 below) depends on how many files were
# uploaded and on their key order - check the printed names are the intended files.
uploaded_fasta_train = list(uploaded.keys())[2]
fasta_content_train = uploaded[uploaded_fasta_train].decode('utf-8')  # Decode the bytes to a string
print(uploaded_fasta_train)
print(len(fasta_content_train))
#print(fasta_content_train)

# Same for the test FASTA, taken from the second uploaded key.
uploaded_fasta_test = list(uploaded.keys())[1]
fasta_content_test = uploaded[uploaded_fasta_test].decode('utf-8')  # Decode the bytes to a string
print(uploaded_fasta_test)
print(len(fasta_content_test))
#print(fasta_content_test)

## Data Preprocessing

In [None]:
# Read the train and test datasets from FASTA text and derive boolean labels
# from the record headers.

def parse_fasta(fasta_text):
    """Parse FASTA-formatted text into a ``{header: sequence}`` dict.

    Args:
        fasta_text: whole FASTA file content as one string.

    Returns:
        dict mapping each header (without the leading '>') to its sequence,
        with multi-line sequences joined. Insertion order follows the file.
        Records sharing the same header overwrite each other (last one wins),
        and any text before the first header is ignored.
    """
    records = {}
    header = None
    chunks = []
    for raw_line in fasta_text.split('\n'):
        line = raw_line.strip()
        if line.startswith(">"):  # Header line
            if header is not None:
                records[header] = ''.join(chunks)
            header = line[1:]  # Remove the '>' character
            chunks = []
        else:
            chunks.append(line)
    # The last record has no following header to flush it.
    if header is not None:
        records[header] = ''.join(chunks)
    return records


def labels_from_headers(headers):
    """Return "Positive"/"Negative" for every header, in order.

    Raises:
        ValueError: if a header contains neither word. Skipping such a header
            would leave the label list shorter than the sequence list and
            silently pair sequences with the wrong labels downstream.
    """
    labels = []
    for header in headers:
        if "Positive" in header:
            labels.append("Positive")
        elif "Negative" in header:
            labels.append("Negative")
        else:
            raise ValueError(f"FASTA header has no Positive/Negative label: {header!r}")
    return labels


# Train set
sequences_train = parse_fasta(fasta_content_train)
labels_train = labels_from_headers(sequences_train.keys())

y_train = [label == 'Positive' for label in labels_train]
print("Toplam pozitif train örnek sayısı:", sum(y_train))
print("Toplam negatif train örnek sayısı:", len(y_train) - sum(y_train))
print("length of y_train:", len(y_train))
#print(y_train)

#----------------------------------------
print("--------------------------")

# Test set
sequences_test = parse_fasta(fasta_content_test)
# Labels in the same order as the sequences
labels_test = labels_from_headers(sequences_test.keys())

y_test = [label == 'Positive' for label in labels_test]
# Report the test split here (the train counts were already printed above).
print("Toplam pozitif test örnek sayısı:", sum(y_test))
print("Toplam negatif test örnek sayısı:", len(y_test) - sum(y_test))
print("length of the y_test: ", len(y_test))
#print(y_test)


In [None]:
# Put a space between residues (the form handed to the tokenizer in a later cell)
# and record the residue count of every sequence.
sequences_Example_train = [' '.join(sequence) for sequence in sequences_train.values()]
length_train = [len(sequence) for sequence in sequences_train.values()]

#print(length_train)
print("Min Train Sequence Length:", min(length_train))
print("Max Train Sequence Length:", max(length_train))
print("Train sequences:", sequences_Example_train)

# Same preparation for the test split.
sequences_Example_test = [' '.join(sequence) for sequence in sequences_test.values()]
length_test = [len(sequence) for sequence in sequences_test.values()]


#print(length_test)
print("Min Test Sequence Length:", min(length_test))
print("Max Test Sequence Length:",max(length_test))
print("Test sequences:", sequences_Example_test)

In [None]:
#Combining X and y by zipping them together
# The seed is set immediately before shuffling so a fresh run always yields the same order.
# NOTE(review): this cell overwrites its own inputs, so running it a second time
# shuffles the already-shuffled data and gives a different order than a single run.
combined_data = list(zip(sequences_Example_train,y_train))
random.seed(42)
random.shuffle(combined_data)

#Separate mixed X and y pairs
# zip(*...) yields tuples, so both names are tuples (not lists) from here on.
sequences_Example_train, y_train = zip(*combined_data)

print("Mixed X_train Sequences:", sequences_Example_train)
print("Mixed y_train Values:", y_train)


In [None]:
# Extract per-residue ProtBERT embeddings for the train and test sequences.

def strip_special_tokens(embeddings, masks):
    """Cut every padded embedding down to its real residues.

    Args:
        embeddings: array indexed as [sequence, token, feature].
        masks: matching attention masks (1 = real token, anything else = padding).

    Returns:
        list with one 2-D array per sequence. The first and the last attended
        token are dropped; with add_special_tokens=True these are the special
        tokens the BERT tokenizer wraps around each sequence.
    """
    per_sequence = []
    for seq_embedding, mask in zip(embeddings, masks):
        seq_len = (mask == 1).sum()
        per_sequence.append(seq_embedding[1:seq_len - 1])
    return per_sequence


# Map the rare residues U, Z, O, B to X before tokenizing.
sequences_Example_train = [re.sub(r"[UZOB]", "X", sequence) for sequence in sequences_Example_train]
ids_train = tokenizer.batch_encode_plus(sequences_Example_train, add_special_tokens=True, padding=True, return_tensors="tf")
input_ids = ids_train['input_ids']
attention_mask = ids_train['attention_mask']
# The batch is padded to a common length, so the mask must be passed along;
# without it the padding tokens take part in attention and alter the
# embeddings of every sequence shorter than the longest one.
embedding = model(input_ids, attention_mask=attention_mask)[0]
embedding = np.asarray(embedding)
attention_mask = np.asarray(attention_mask)
X_train = strip_special_tokens(embedding, attention_mask)


#print(X_train)
print("Length of X_train:", len(X_train))


sequences_Example_test = [re.sub(r"[UZOB]", "X", sequence) for sequence in sequences_Example_test]
ids_test = tokenizer.batch_encode_plus(sequences_Example_test, add_special_tokens=True, padding=True, return_tensors="tf")

input_ids_test = ids_test['input_ids']
attention_mask_test = ids_test['attention_mask']
embedding_test = model(input_ids_test, attention_mask=attention_mask_test)[0]
embedding_test = np.asarray(embedding_test)
attention_mask_test = np.asarray(attention_mask_test)
X_test = strip_special_tokens(embedding_test, attention_mask_test)

#print(X_test)
print("Length of X_test:", len(X_test))




In [None]:
# Summarise each variable-length embedding as one fixed-length row: the
# per-feature mean followed by the per-feature standard deviation taken over
# the residues (1024 + 1024 = 2048 features per sequence, per the original note).

result_matrix_train = []
for seq_embedding in X_train:
    mean_matrix_train = np.mean(seq_embedding, axis=0, keepdims=True)
    std_matrix_train = np.std(seq_embedding, axis=0, keepdims=True)
    # keepdims leaves both as single-row matrices, so join them side by side
    result_matrix_train.append(np.concatenate((mean_matrix_train, std_matrix_train), axis=1))

X_train_final = np.concatenate(result_matrix_train, axis=0)

print("Shape of new train features:", X_train_final.shape)


result_matrix_test = []
for seq_embedding in X_test:
    mean_matrix_test = np.mean(seq_embedding, axis=0, keepdims=True)
    std_matrix_test = np.std(seq_embedding, axis=0, keepdims=True)
    result_matrix_test.append(np.concatenate((mean_matrix_test, std_matrix_test), axis=1))

X_test_final = np.concatenate(result_matrix_test, axis=0)


print("Shape of new test features:", X_test_final.shape)



In [None]:
# Sanity check: shape of the mean row left over from the last training sequence
# processed by the loop in the previous cell.
mean_matrix_train.shape

## iTTCA-RF (Chemical) Feature Extraction

In [None]:
#ITTCA-RF- Feature extraction
#!/usr/bin/env python
#_*_coding:utf-8_*_

from collections import Counter
import numpy as np
import re
import math

# The chemical descriptors work on plain residue strings, so remove the spaces
# that were inserted between residues for the tokenizer. Both names of each
# pair end up bound to the same list.
fastas_train = [spaced.replace(" ", "") for spaced in sequences_Example_train]
sequences_Example_train = fastas_train

fastas_test = [spaced.replace(" ", "") for spaced in sequences_Example_test]
sequences_Example_test = fastas_test

def Count_1(seq1, seq2):
    """Return the total number of occurrences in ``seq2`` of the characters of ``seq1``.

    Each character of ``seq1`` is counted separately, so a character that is
    repeated in ``seq1`` contributes once per repetition.
    """
    return sum(seq2.count(aa) for aa in seq1)
def Count_2(aaSet, sequence):
    """Locate the 1st, 25%, 50%, 75% and last residue of ``aaSet`` within ``sequence``.

    Args:
        aaSet: characters that make up the residue class.
        sequence: residue string to scan.

    Returns:
        list of five values. Each is the 1-based position of the selected
        class member expressed as a percentage of the sequence length, or 0
        when the corresponding rank is below 1 or the class does not occur.
    """
    # 0-based positions of every residue that belongs to the class
    hit_positions = [pos for pos, residue in enumerate(sequence) if residue in aaSet]
    total = len(hit_positions)
    ranks = [1] + [math.floor(frac * total) for frac in (0.25, 0.50, 0.75)] + [total]

    code = []
    for rank in ranks:
        if rank < 1 or not hit_positions:
            code.append(0)
        else:
            code.append((hit_positions[rank - 1] + 1) / len(sequence) * 100)
    return code

# The 20 residue letters used for the amino-acid composition (AAC) columns.
AA = 'ACDEFGHIKLMNPQRSTVWY'
# Residue classes used by the grouped descriptors (GAAC, GDPC, GTPC).
group = {
        'alphaticr': 'GAVLMI',
        'aromatic': 'FYW',
        'postivecharger': 'KRH',
        'negativecharger': 'DE',
        'uncharger': 'STCPNQ'}

# group1 / group2 / group3 give, for every property, the residues of the first,
# second and third class used by the CTDC / CTDT / CTDD descriptors.
group1 = {
        'hydrophobicity': 'RKEDQN',  # hydrophobicity
        'normwaalsvolume': 'GASCTPD',  # van der Waals volume
        'polarity': 'LIFWCMVY',  # polarity
        'polarizability': 'GASDT',  # polarizability
        'charge': 'KR',  # charge
        'surfacetension': 'GQDNAHR',  # surface tension
        'secondarystruct': 'EALMQKRH',  # secondary structure
        'solventaccess': 'ALFCGIVW'}  # solvent accessibility

group2 = {
        'hydrophobicity': 'GASTPHY',
        'normwaalsvolume': 'NVEQIL',
        'polarity': 'PATGS',
        'polarizability': 'CPNVEQIL',
        'charge': 'ANCQGHILMFPSTWYV',
        'surfacetension': 'KTSEC',
        'secondarystruct': 'VIYCWFT',
        'solventaccess': 'RKQEND'}
group3 = {
        'hydrophobicity': 'CVLIMFW',
        'normwaalsvolume': 'MHKFRYW',
        'polarity': 'HQRKNED',
        'polarizability': 'KMHFRYW',
        'charge': 'DE',
        'surfacetension': 'ILMFPWYV',
        'secondarystruct': 'GNPSD',
        'solventaccess': 'MPSTHY'}
# Property keys in the order their descriptor columns are generated.
# NOTE: this name shadows Python's built-in `property` for the rest of the notebook.
property = ('hydrophobicity', 'normwaalsvolume',
                'polarity', 'polarizability', 'charge', 'surfacetension', 'secondarystruct', 'solventaccess')

def get_feature_names():
    """Return the chemical feature names as one flat list.

    The names are produced block by block: AAC, then CTDC / CTDT / CTDD for
    every entry of ``property``, then GAAC, GDPC, GTPC and PAAC. The PAAC
    residue letters are read from the first line of '/content/PAAC.txt'.
    """
    names = [f"AAC_{aa}" for aa in 'ACDEFGHIKLMNPQRSTVWY']

    # Three composition, three transition and fifteen distribution names per property
    for p in property:
        names += [f"CTDC_{p}_c1", f"CTDC_{p}_c2", f"CTDC_{p}_c3"]
        names += [f"CTDT_{p}_c1221", f"CTDT_{p}_c1331", f"CTDT_{p}_c2332"]
        names += [f"CTDD_{p}_cutoff{i}" for i in range(1, 16)]

    # Grouped composition, then every ordered pair and triple of residue classes
    groupKey = group.keys()
    names += [f"GAAC_{g}" for g in groupKey]

    dipeptide = [f"{g1}.{g2}" for g1 in groupKey for g2 in groupKey]
    names += [f"GDPC_{d}" for d in dipeptide]

    triple = [f"{g1}.{g2}.{g3}" for g1 in groupKey for g2 in groupKey for g3 in groupKey]
    names += [f"GTPC_{t}" for t in triple]

    # PAAC: one name per residue letter of the table header, then the theta terms
    with open('/content/PAAC.txt') as f:
        records = f.readlines()
    paac_letters = ''.join(records[0].rstrip().split()[1:])
    names += [f"PAAC_{aa}" for aa in paac_letters]
    names += [f"PAAC_theta_{n}" for n in range(1, min(3, len(paac_letters)))]

    return names



def get_features(fastas):
    """Build the hand-crafted chemical descriptor matrix for a list of sequences.

    The descriptor blocks are stacked column-wise in this order: AAC, then
    CTDC / CTDT / CTDD for every entry of ``property``, then GAAC, GDPC, GTPC
    and PAAC.

    Args:
        fastas: list of residue strings without separators.

    Returns:
        2-D numpy array with one row per sequence.

    Raises:
        ZeroDivisionError: for an empty sequence (CTDC divides by its length).
        KeyError: in GDPC / GTPC when a residue is not listed in ``group``
            (for example 'X'), and in PAAC when a residue is missing from the
            header line of PAAC.txt.
        FileNotFoundError: if '/content/PAAC.txt' does not exist.
    """
    def AAC():
        # Percentage of every residue letter of AA in each sequence.
        encoding = []
        for sequence in fastas:
            count = Counter(sequence)
            for key in count:
                count[key] = count[key]/len(sequence) * 100
            # Counter returns 0 for letters that never occur
            encoding.append([count[aa] for aa in AA])
        return encoding

    def CTDC(p):
        # Percentage of residues in class 1, class 2 and the remainder for property p.
        encodings = []
        for sequence in fastas:
            c1 = Count_1(group1[p], sequence)/len(sequence)*100
            c2 = Count_1(group2[p], sequence)/len(sequence)*100
            c3 = 100 - c1 - c2
            encodings.append([c1, c2, c3])
        return encodings

    def CTDT(p):
        # Percentage of adjacent residue pairs that switch between two classes
        # (1<->2, 1<->3, 2<->3) for property p.
        encodings = []
        for sequence in fastas:
            aaPair = [sequence[j:j + 2] for j in range(len(sequence) - 1)]
            if not aaPair:
                # Sequences shorter than two residues have no transitions
                code = [0, 0, 0]
            else:
                c1221, c1331, c2332 = 0, 0, 0
                for pair in aaPair:
                    if (pair[0] in group1[p] and pair[1] in group2[p]) or (
                            pair[0] in group2[p] and pair[1] in group1[p]):
                        c1221 = c1221 + 1
                        continue
                    if (pair[0] in group1[p] and pair[1] in group3[p]) or (
                            pair[0] in group3[p] and pair[1] in group1[p]):
                        c1331 = c1331 + 1
                        continue
                    if (pair[0] in group2[p] and pair[1] in group3[p]) or (
                            pair[0] in group3[p] and pair[1] in group2[p]):
                        c2332 = c2332 + 1
                code = [c1221/len(aaPair)*100, c1331/len(aaPair)*100, c2332/len(aaPair)*100]
            encodings.append(code)
        return encodings

    def CTDD(p):
        # Five position markers (see Count_2) for each of the three classes of property p.
        encodings = []
        for sequence in fastas:
            code = Count_2(group1[p], sequence) + Count_2(group2[p], sequence) + Count_2(group3[p], sequence)
            encodings.append(code)
        return encodings

    def GAAC():
        # Fraction (not percentage) of residues falling in each class of `group`.
        encoding = []
        groupKey = group.keys()
        for sequence in fastas:
            count = Counter(sequence)
            code = []
            for key in groupKey:
                members = sum(count[aa] for aa in group[key])
                code.append(members / len(sequence))
            encoding.append(code)
        return encoding

    def GDPC():
        # Fraction of adjacent residue pairs for every ordered pair of classes.
        groupKey = group.keys()
        dipeptide = [g1 + '.' + g2 for g1 in groupKey for g2 in groupKey]
        # residue letter -> name of the class it belongs to
        index = {}
        for key in groupKey:
            for aa in group[key]:
                index[aa] = key
        encodings = []
        for sequence in fastas:
            myDict = {t: 0 for t in dipeptide}

            total = 0
            for j in range(len(sequence) - 2 + 1):
                pair_key = index[sequence[j]] + '.' + index[sequence[j + 1]]
                myDict[pair_key] = myDict[pair_key] + 1
                total = total + 1

            if total == 0:
                code = [0 for t in dipeptide]
            else:
                code = [myDict[t] / total for t in dipeptide]
            encodings.append(code)
        return encodings

    def GTPC():
        # Fraction of consecutive residue triples for every ordered triple of classes.
        groupKey = group.keys()
        triple = [g1 + '.' + g2 + '.' + g3 for g1 in groupKey for g2 in groupKey for g3 in groupKey]
        # residue letter -> name of the class it belongs to
        index = {}
        for key in groupKey:
            for aa in group[key]:
                index[aa] = key
        encodings = []

        for sequence in fastas:
            myDict = {t: 0 for t in triple}

            total = 0
            for j in range(len(sequence) - 3 + 1):
                triple_key = index[sequence[j]] + '.' + index[sequence[j + 1]] + '.' + index[sequence[j + 2]]
                myDict[triple_key] = myDict[triple_key] + 1
                total = total + 1

            if total == 0:
                code = [0 for t in triple]
            else:
                code = [myDict[t] / total for t in triple]
            encodings.append(code)
        return encodings

    def Rvalue(aa1, aa2, AADict, Matrix):
        # Mean squared difference between two residues over all property rows of Matrix.
        return sum([(Matrix[i][AADict[aa1]] - Matrix[i][AADict[aa2]]) ** 2 for i in range(len(Matrix))]) / len(Matrix)

    def PAAC():
        # Pseudo amino-acid composition driven by the property table in PAAC.txt:
        # the first line lists the residue letters, every later line is one
        # property (name followed by one value per residue).
        dataFile = '/content/PAAC.txt' #r'PAAC.TXT'
        with open(dataFile) as f:
            records = f.readlines()
        AA = ''.join(records[0].rstrip().split()[1:])
        # AA = "ARNDCQEGHILKMFPSTWYV"
        AADict = {aa: i for i, aa in enumerate(AA)}
        AAProperty = []
        AAPropertyNames = []
        for record in records[1:]:
            array = record.split()
            if not array:
                # Blank (or whitespace-only) line, e.g. trailing newline at the
                # end of the file: nothing to parse.
                continue
            AAProperty.append([float(j) for j in array[1:]])
            AAPropertyNames.append(array[0])

        # Standardise every property row to zero mean and unit (population) variance.
        AAProperty1 = []
        for i in AAProperty:
            meanI = sum(i) / len(i)
            fenmu = math.sqrt(sum([(j - meanI) ** 2 for j in i]) / len(i))
            AAProperty1.append([(j - meanI) / fenmu for j in i])

        encodings = []

        for sequence in fastas:
            code = []
            theta = []
            # Correlation terms for residue distances 1 and 2 (fewer when the
            # sequence is too short to contain such a pair).
            for n in range(1, min(3, len(sequence))):
                theta.append(
                    sum([Rvalue(sequence[j], sequence[j + n], AADict, AAProperty1) for j in
                         range(len(sequence) - n)]) / (
                            len(sequence) - n))
            myDict = {}
            for aa in AA:
                myDict[aa] = sequence.count(aa)
            code = code + [myDict[aa] / (1 + 0.05 * sum(theta)) for aa in AA]
            code = code + [(0.05 * j) / (1 + 0.05 * sum(theta)) for j in theta]
            encodings.append(code)
        return encodings

    print('Feature extraction...')
    encoding = []
    encoding.append(AAC())
    for p in property:
        encoding.append(CTDC(p))
        encoding.append(CTDT(p))
        encoding.append(CTDD(p))
    encoding.append(GAAC())
    encoding.append(GDPC())
    encoding.append(GTPC())
    encoding.append(PAAC())
    return np.column_stack(encoding)

# Compute the chemical descriptors for both splits and the matching column names.
ittca_features_train = get_features(fastas_train)
ittca_features_test = get_features(fastas_test)
chemical_feature_names = get_feature_names()

# NOTE(review): the count in the message below is hard-coded; compare it with
# the second dimension of the shapes printed right after it.
print("There are 365 features for every sequence as you see below:")
print("Shape of chemical features in train set:", ittca_features_train.shape)
print("Shape of chemical features in test set:", ittca_features_test.shape)
print("Feature names:", chemical_feature_names)



In [None]:
# Turn the per-sequence embedding matrices into fixed-size 3-D arrays.

def pad_rows(matrix, n_rows=50, n_cols=1024):
    """Copy ``matrix`` into the top-left corner of an (n_rows, n_cols) zero matrix.

    Rows or columns beyond the target size are cut off instead of raising a
    broadcasting error, so sequences longer than ``n_rows`` are handled too.
    """
    padded = np.zeros((n_rows, n_cols))
    rows = min(matrix.shape[0], n_rows)
    cols = min(matrix.shape[1], n_cols)
    padded[:rows, :cols] = matrix[:rows, :cols]
    return padded


# Train: zero-pad every sequence to 50 rows and stack into one 3-D array
X_train_expanded = [pad_rows(matrix) for matrix in X_train]

X_train_expanded_3d = np.stack(X_train_expanded, axis = 0)
print(X_train_expanded_3d.shape)

# Keep only the first 11 rows (residue positions) of every matrix
X_train_expanded_11 = X_train_expanded_3d[:, :11, :].copy()
print(X_train_expanded_11.shape)

# Test: same padding and truncation
X_test_expanded = [pad_rows(matrix) for matrix in X_test]

X_test_expanded_3d = np.stack(X_test_expanded, axis = 0)
print(X_test_expanded_3d.shape)

X_test_expanded_11 = X_test_expanded_3d[:, :11, :].copy()
print(X_test_expanded_11.shape)

#print(X_train_expanded_11)

#print(X_train_expanded_11)


### Data Scaling

In [None]:
#Data Scaling
# Only the chemical (iTTCA-RF) features are standardised; the 3-D embedding
# arrays are used unscaled.
from sklearn.preprocessing import StandardScaler

print("Before")
print("Min of chemical train features Before Scaling:", ittca_features_train.min())
print("Max of chemical train features Before Scaling:", ittca_features_train.max())
print("Min of chemical test features Before Scaling:", ittca_features_test.min())
print("Max of chemical test features Before Scaling:", ittca_features_test.max())

scaler = StandardScaler()
ittca_features_train_scaled = scaler.fit_transform(ittca_features_train)
# Reuse the mean/variance learned on the training set. Re-fitting on the test
# set would put the two splits on different scales and leak test statistics
# into the evaluation.
ittca_features_test_scaled = scaler.transform(ittca_features_test)

print("After")
print("Min of chemical train features After Scaling:", ittca_features_train_scaled.min())
print("Max of chemical train features After Scaling:", ittca_features_train_scaled.max())
print("Min of chemical test features After Scaling:", ittca_features_test_scaled.min())
print("Max of chemical test features After Scaling:", ittca_features_test_scaled.max())



In [None]:

# Random forest on our own 2048 features (ProtBERT means + standard deviations)
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import accuracy_score, classification_report


# random_state pins the forest's internal sampling so the reported scores are
# reproducible across re-runs, like the other forests in this notebook.
rf_classifier = RandomForestClassifier(max_depth = 15, min_samples_split = 3, n_estimators = 150, random_state=0)
rf_classifier.fit(X_train_final, y_train)
y_pred = rf_classifier.predict(X_test_final)



accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

# Our own features + standard deviations (2048 features), random forest without hyperparameter tuning


In [None]:
"""
#kendi featurelarımız + stdev ler 2048 features , xgboost
from xgboost import XGBClassifier
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

xgb_classifier = XGBClassifier(learning_rate= 0.1, max_depth= 3, n_estimators= 200, random_state=0)
xgb_classifier.fit(X_train_final, y_train)
y_pred_xgb = xgb_classifier.predict(X_test_final)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
classification_rep_xgb = classification_report(y_test, y_pred_xgb)

print("XGBoost Accuracy:", accuracy_xgb)
print("XGBoost Classification Report:\n", classification_rep_xgb)
"""


## DeepT-i (Inception Module Based 2D CNN with deep features from ProtBERT)

In [None]:
#0.8729 accuracy

def create_inception_11(input_layer):
    """Apply the inception-style block of the DeepT-i network to ``input_layer``.

    A 1x1 convolution (32 filters) feeds a 3x3 convolution (16 filters), whose
    output feeds a 3x3, stride-1 max-pooling layer. The three outputs are then
    concatenated and returned.
    """
    conv_1x1 = Conv2D(filters=32, kernel_size=(1, 1), padding='same', activation='relu')(input_layer)
    conv_3x3 = Conv2D(filters=16, kernel_size=(3, 3), padding='same', activation='relu')(conv_1x1)
    pooled = MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='same')(conv_3x3)
    # 'same' padding keeps all three outputs stackable
    return Concatenate()([conv_1x1, conv_3x3, pooled])

def inception_11(input_shape):
    """Build the (uncompiled) DeepT-i classifier for inputs of ``input_shape``.

    The inception block output is flattened and mapped to a two-unit softmax
    layer (one unit per class).
    """
    inputs = Input(shape=input_shape)
    block_out = create_inception_11(inputs)
    flat = Flatten()(block_out)
    class_probs = Dense(2, activation='softmax')(flat)
    return Model(inputs=inputs, outputs=class_probs)

# Define model
# NOTE: this rebinds `model`, which held the ProtBERT encoder in the earlier
# embedding cells.
input_shape = (11, 1024, 1)  # 11 residue rows x 1024 embedding features x 1 channel
model = inception_11(input_shape)
model.summary()



adam_optimizer = Adam(learning_rate=0.00005)
model.compile(optimizer=adam_optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
# NOTE(review): this callback is created but never passed to fit(), so training
# always runs for the full number of epochs. Confirm whether that is intended.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.01, verbose=1)


# One-hot encode the boolean labels for the two-unit softmax output
y_train_categorical = to_categorical(y_train, num_classes=2)
y_test_categorical = to_categorical(y_test, num_classes=2)

history_11 = model.fit(X_train_expanded_11, y_train_categorical, epochs=95, batch_size=948, validation_split=0.1)
model.evaluate(X_test_expanded_11, y_test_categorical)

# Class probabilities on the test set; column 1 is the positive class
y_prob11 = model.predict(X_test_expanded_11)

fpr1, tpr1, thresholds = roc_curve(y_test, y_prob11[:, 1])
auc1 = roc_auc_score(y_test, y_prob11[:, 1])
print('AUC: %0.2f' % auc1)

from sklearn.metrics import classification_report

# Reuse the probabilities computed above instead of running inference a second time
y_pred = y_prob11
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert probabilities to class labels

# y_test_categorical is always one-hot with two columns here, so decode it directly
y_test_classes = np.argmax(y_test_categorical, axis=1)

# Print the classification report
report = classification_report(y_test_classes, y_pred_classes)
print(report)


## DeepT-Hybrid (Inception Module Based 2D CNN with hybrid features)

In [None]:
def create_inception_11_final(input_layer):
    """Apply the inception-style block of the hybrid network to ``input_layer``.

    A 1x1 convolution (32 filters) feeds a 3x3 convolution (16 filters), whose
    output feeds a 2x2, stride-1 max-pooling layer. The three outputs are then
    concatenated and returned.
    """
    conv_1x1 = Conv2D(filters=32, kernel_size=(1, 1), padding='same', activation='relu')(input_layer)
    conv_3x3 = Conv2D(filters=16, kernel_size=(3, 3), padding='same', activation='relu')(conv_1x1)
    pooled = MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding='same')(conv_3x3)
    # 'same' padding keeps all three outputs stackable
    return Concatenate()([conv_1x1, conv_3x3, pooled])

def inception_11_final(input_shape):
    """Build and compile the two-input hybrid classifier.

    The first input (``input_shape``) goes through the inception block and is
    flattened. The second input is a vector of 365 extra features. Both are
    concatenated, passed through a 64-unit ReLU layer and a two-unit softmax
    layer. The model is compiled with Adam (learning rate 0.00005) and
    categorical cross-entropy.
    """
    image_input = Input(shape=input_shape)
    block_out = create_inception_11_final(image_input)
    cnn_out = Flatten()(block_out)

    # Second branch: the extra feature vector, used as-is
    extra_features_input = Input(shape=(365,))

    merged = Concatenate()([cnn_out, extra_features_input])
    hidden = Dense(64, activation='relu')(merged)
    class_probs = Dense(2, activation='softmax')(hidden)

    optimizer = Adam(learning_rate=0.00005)
    model = Model(inputs=[image_input, extra_features_input], outputs=class_probs)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Define model
input_shape = (11, 1024, 1)  # 11 residue rows x 1024 embedding features x 1 channel
model = inception_11_final(input_shape)
model.summary()

# One-hot encode the boolean labels for the two-unit softmax output
y_train_categorical = to_categorical(y_train, num_classes=2)

# NOTE(review): this callback is created but never passed to fit(), so training
# always runs for the full number of epochs. Confirm whether that is intended.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.01, verbose=1)

history_11_final = model.fit([X_train_expanded_11, ittca_features_train_scaled], y_train_categorical, epochs=60, batch_size=948, validation_split=0.1)

y_test_categorical = to_categorical(y_test, num_classes=2)
model.evaluate([X_test_expanded_11, ittca_features_test_scaled], y_test_categorical)

# Class probabilities on the test set; column 1 is the positive class
y_prob11_final = model.predict([X_test_expanded_11, ittca_features_test_scaled])

fpr2, tpr2, thresholds = roc_curve(y_test, y_prob11_final[:, 1])
auc2 = roc_auc_score(y_test, y_prob11_final[:, 1])
print('AUC: %0.2f' % auc2)


from sklearn.metrics import classification_report

# Reuse the probabilities computed above instead of running inference a second time
y_pred_final = y_prob11_final
y_pred_final_classes = np.argmax(y_pred_final, axis=1)  # Convert probabilities to class labels

# y_test_categorical is always one-hot with two columns here, so decode it directly
y_test_classes = np.argmax(y_test_categorical, axis=1)

# Print the classification report
report_final = classification_report(y_test_classes, y_pred_final_classes)
print(report_final)

# Persist the trained hybrid model
model.save('model.h5')


In [None]:
"""import matplotlib.pyplot as plt
import networkx as nx
from keras.models import load_model

# Modeli dosyadan yükleyin
model = load_model('model.h5')

def visualize_model_networkx(model):
    G = nx.DiGraph()
    for layer in model.layers:
        G.add_node(layer.name)
        # Katmanın inbound_nodes özelliğini kontrol edin
        for node in layer._inbound_nodes:
            # inbound_layers tek bir katman veya bir liste olabilir
            inbound_layers = node.inbound_layers
            if isinstance(inbound_layers, list):
                # Eğer liste ise, listedeki her katman için kenar ekleyin
                for inbound_layer in inbound_layers:
                    if inbound_layer is not None:
                        G.add_edge(inbound_layer.name, layer.name)
            else:
                # Liste değilse, doğrudan bir kenar ekleyin
                if inbound_layers is not None:
                    G.add_edge(inbound_layers.name, layer.name)

    # Grafiği çizdir
    pos = nx.spring_layout(G)
    nx.draw(G, pos, with_labels=True, node_color='skyblue', node_size=2500, edge_color='k', linewidths=1, font_size=10)
    plt.show()

# Modelin NetworkX ile görsel temsilini çizdirin
visualize_model_networkx(model)
"""

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Concatenate, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from IPython.display import Image


def create_inception_11_final(input_layer):
    """Apply the inception-style block of the hybrid network to ``input_layer``.

    This repeats the definition given in the DeepT-Hybrid cell: a 1x1
    convolution (32 filters) feeds a 3x3 convolution (16 filters), whose output
    feeds a 2x2, stride-1 max-pooling layer; the three outputs are concatenated.
    """
    conv_1x1 = Conv2D(filters=32, kernel_size=(1, 1), padding='same', activation='relu')(input_layer)
    conv_3x3 = Conv2D(filters=16, kernel_size=(3, 3), padding='same', activation='relu')(conv_1x1)
    pooled = MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding='same')(conv_3x3)
    return Concatenate()([conv_1x1, conv_3x3, pooled])

def inception_11_final(input_shape):
    """Build and compile the two-input hybrid classifier.

    This repeats the definition given in the DeepT-Hybrid cell: the inception
    block output is flattened, concatenated with a 365-value feature input, and
    passed through a 64-unit ReLU layer and a two-unit softmax layer. The model
    is compiled with Adam (learning rate 0.00005) and categorical cross-entropy.
    """
    image_input = Input(shape=input_shape)
    block_out = create_inception_11_final(image_input)
    cnn_out = Flatten()(block_out)

    extra_features_input = Input(shape=(365,))

    merged = Concatenate()([cnn_out, extra_features_input])
    hidden = Dense(64, activation='relu')(merged)
    class_probs = Dense(2, activation='softmax')(hidden)

    optimizer = Adam(learning_rate=0.00005)
    model = Model(inputs=[image_input, extra_features_input], outputs=class_probs)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# Define model
# NOTE(review): this builds a fresh, untrained network and rebinds `model`, so
# the hybrid model trained in the previous section is no longer reachable under
# that name after this cell (it was saved to 'model.h5' there).
input_shape = (11, 1024, 1)
model = inception_11_final(input_shape)

# Plot the model and save to file
plot_model(model, to_file='model_diagram.png', show_shapes=True, show_layer_names=True)

# Display the model diagram
Image('model_diagram.png')


## iTTCA-RF (Random forest with chemical features)

In [None]:
# Random forest trained on the unscaled chemical (iTTCA) features
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import accuracy_score, classification_report


# NOTE: rebinds rf_classifier, y_pred, accuracy and classification_rep from the
# earlier random-forest cell.
rf_classifier = RandomForestClassifier(max_depth= 20, min_samples_leaf= 1, min_samples_split= 3, n_estimators= 200, random_state=0)
rf_classifier.fit(ittca_features_train, y_train)
y_pred = rf_classifier.predict(ittca_features_test)
# Second column of predict_proba, used as the score for the ROC curve below
y_prob = rf_classifier.predict_proba(ittca_features_test)[:, 1]


accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)



fpr3, tpr3, thresholds = roc_curve(y_test, y_prob)
auc3 = roc_auc_score(y_test, y_prob)
print('AUC: %0.2f' % auc3)


In [None]:
# Sanity check: (number of test sequences, number of chemical features)
ittca_features_test.shape

# Scaled Chemical Features Random Forest

In [None]:
"""
#ittca features random forest with scaled features kullanılmayacak
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import accuracy_score, classification_report


rf_classifier = RandomForestClassifier(random_state=0)
rf_classifier.fit(ittca_features_train_scaled, y_train)
y_pred = rf_classifier.predict(ittca_features_test_scaled)



accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

fpr4, tpr4, thresholds = roc_curve(y_test, y_pred[:, 1])
auc4 = roc_auc_score(y_test, y_prob11[:, 1])
print('AUC: %0.2f' % auc4)
"""

### X_flatten

In [None]:

# Longest embedding (in rows) over both splits, so that train and test can be
# padded to one common length.
lengths_train = list(map(len, X_train))
max_length_train = max(lengths_train)

lengths_test = list(map(len, X_test))
max_length_test = max(lengths_test)

max_length = max(max_length_train, max_length_test)

def pad_and_flatten(sequences, max_length):
    """Zero-pad every sequence to ``max_length`` rows and flatten it to 1-D.

    Args:
        sequences: iterable of 2-D arrays; ``seq.shape[1]`` is read for each
            one and the padding rows are appended along axis 0.
        max_length: number of rows each sequence has after padding.

    Returns:
        ``np.array`` built from the flattened, padded sequences (one entry per
        input sequence, in input order).

    Raises:
        ValueError: if a sequence has more than ``max_length`` rows, because
            the zero block would need a negative number of rows.
    """
    def _pad_then_flatten(seq):
        # Rows of zeros still needed to reach max_length.
        missing_rows = max_length - len(seq)
        zero_block = np.zeros((missing_rows, seq.shape[1]))
        return np.concatenate([seq, zero_block], axis=0).flatten()

    return np.array([_pad_then_flatten(seq) for seq in sequences])

# Pad and flatten both splits with the same max_length.
X_train_flat = pad_and_flatten(X_train, max_length)
X_test_flat = pad_and_flatten(X_test, max_length)

# Sanity check: number of training sequences and shape of the flattened training matrix.
print(len(X_train))
print(X_train_flat.shape)


# RF + DeepFeatures Stdevs

In [None]:
# Random forest on the ProtBERT features + stdevs (2048 features), fitted through a grid search.
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Single-point grid: each hyperparameter has exactly one candidate value, so the
# search below evaluates one configuration with 5-fold cross-validation.
param_grid = {
    'n_estimators': [150],
    'max_depth': [10],
    'min_samples_split': [3]
}
# The string literal below is a disabled, wider grid kept for reference; it is not executed.
"""
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3]
}"""

# Cross-validated search on the training data, scored by accuracy.
rf = RandomForestClassifier(random_state=0)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_final, y_train)

# Evaluate the best estimator found by the search on the test split.
print("Best Params: ", grid_search.best_params_)
best_rf_model = grid_search.best_estimator_
y_pred1 = best_rf_model.predict(X_test_final)
# Probability column at index 1, used as the score for the ROC curve.
y_prob1 = best_rf_model.predict_proba(X_test_final)[:, 1]
classification_rep = classification_report(y_test, y_pred1)
print(classification_rep)

# ROC points and AUC for this model, stored under the "6" suffix.
fpr6, tpr6, thresholds = roc_curve(y_test, y_prob1)
auc6 = roc_auc_score(y_test, y_prob1)
print('AUC: %0.2f' % auc6)
# Our own features + stdevs (2048 features), random forest hyperparameter tuned

In [None]:
"""
#ittca random forest hyperparameter tuned
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report



param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3]
}


rf = RandomForestClassifier(random_state=0)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(ittca_features_train_scaled, y_train)

print("En iyi parametreler: ", grid_search.best_params_)
#print("En iyi doğruluk skoru: ", grid_search.best_score_)


# En iyi modeli alın
best_rf_model = grid_search.best_estimator_



y_pred = best_rf_model.predict(ittca_features_test_scaled)
classification_rep = classification_report(y_test, y_pred)

print(classification_rep)

fpr7, tpr7, thresholds = roc_curve(y_test, y_prob11[:, 1])
auc7 = roc_auc_score(y_test, y_prob11[:, 1])
print('AUC: %0.2f' % auc7)
#ittca random forest hyperparameter tuned
"""

In [None]:
"""
#ittca xgboost hyperparameter tuned
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 6, 9],
    'colsample_bytree': [0.5, 0.7, 1.0]
}


xgb = XGBClassifier(random_state=42)
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=5, scoring='accuracy')
grid_search.fit(ittca_features_train, y_train)

print("En iyi parametreler: ", grid_search.best_params_)

best_xgb_model = grid_search.best_estimator_

y_pred = best_xgb_model.predict(ittca_features_test)
classification_rep = classification_report(y_test, y_pred)

print(classification_rep)

"""

## DeepT-RF (Random Forest with deep features from ProtBERT)

In [None]:
# DeepT-RF: random forest fitted on the flattened ProtBERT feature matrix.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

rf_classifier = RandomForestClassifier(n_estimators=150, max_depth=15, min_samples_split=3, random_state=56)
rf_classifier.fit(X_train_flat, y_train)

# Hard predictions for the threshold metrics, index-1 probabilities for the ROC curve.
y_pred = rf_classifier.predict(X_test_flat)
y_prob_rf = rf_classifier.predict_proba(X_test_flat)[:, 1]

# Accuracy and per-class report on the test split.
accuracy = accuracy_score(y_test, y_pred)
print("Güncellenmiş Random Forest sınıflandırıcısının doğruluğu:", accuracy)
print(classification_report(y_test, y_pred))

# ROC points and AUC under the "5" suffix; the DeepT-HybridRF cell reuses these names.
fpr5, tpr5, thresholds = roc_curve(y_test, y_prob_rf)
auc5 = roc_auc_score(y_test, y_prob_rf)
print('AUC: %0.2f' % auc5)


In [None]:
"""
#ProtBERT features + stdevs -> 2048 feature , xgboost hyperparameter tuned
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

param_grid_xgb = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 150],
    'max_depth': [3, 6]
}

param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 6, 9],
    'colsample_bytree': [0.5, 0.7, 1.0]
}

xgb = XGBClassifier(random_state=42)
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=5, scoring='accuracy')
grid_search.fit(X_train_final, y_train)

print("En iyi parametreler: ", grid_search.best_params_)


best_xgb_model = grid_search.best_estimator_

y_pred = best_xgb_model.predict(X_test_final)
classification_rep = classification_report(y_test, y_pred)

print(classification_rep)
"""

## DeepT-XGB (XGBoost with deep features from ProtBERT)

In [None]:
# DeepT-XGB: XGBoost fitted on the flattened ProtBERT feature matrix, followed by a SHAP analysis.
import xgboost as xgb
# Shell install of shap, run every time this cell executes.
!pip install shap
import shap


# Fixed hyperparameters; random_state pins the booster for reproducibility.
xgb_classifier = xgb.XGBClassifier(random_state=42, learning_rate = 0.1, n_estimators = 150, max_depth = 9 )

xgb_classifier.fit(X_train_flat, y_train)

# Hard class predictions on the test split.
y_pred_xgb = xgb_classifier.predict(X_test_flat)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("XGBoost sınıflandırıcısının doğruluğu:", accuracy_xgb)
# Probability column at index 1, used as the score for the ROC curve.
y_prob_xgb = xgb_classifier.predict_proba(X_test_flat)[:, 1]

print(classification_report(y_test, y_pred_xgb))

# ROC points and AUC under the "8" suffix; the hybrid XGBoost cells assign the same names.
fpr8, tpr8, thresholds = roc_curve(y_test, y_prob_xgb)
auc8 = roc_auc_score(y_test, y_prob_xgb)
print('AUC: %0.2f' % auc8)


# SHAP analysis
explainer = shap.TreeExplainer(xgb_classifier)
shap_values = explainer.shap_values(X_test_flat)

# Adjust plot size and generate summary plot
# NOTE(review): shap.summary_plot has its own plot_size argument (default "auto");
# confirm that the figsize requested here actually takes effect.
plt.figure(figsize=(12, 8))  # Adjust the size as needed
shap.summary_plot(shap_values, X_test_flat, max_display=30)  # Show 30 features

# Adjust plot size and generate feature importance bar plot
plt.figure(figsize=(12, 8))  # Adjust the size as needed
shap.summary_plot(shap_values, X_test_flat, plot_type="bar", max_display=30)  # Show 30 features


## XGB HYBRID Features

In [None]:
#Hybrid features xgboost
# Hybrid design matrices: the flattened ProtBERT features followed, column-wise,
# by the scaled chemical (iTTCA) features.
x_hybrid_train = np.concatenate((X_train_flat, ittca_features_train_scaled), axis=1)
x_hybrid_test = np.concatenate((X_test_flat, ittca_features_test_scaled), axis=1)

# A bare `.shape` expression in the middle of a cell is never displayed, so print it.
print(x_hybrid_train.shape)

import xgboost as xgb
import shap


# Leftover alternative setting from the original cell: colsample_bytree=1.0,
# learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8
xgb_classifier = xgb.XGBClassifier(
    random_state=42,
    colsample_bytree=0.8,
    learning_rate=0.01,
    max_depth=6,
    n_estimators=200,
    subsample=0.9,
)

xgb_classifier.fit(x_hybrid_train, y_train)

# Hard class predictions on the test split.
y_pred_xgb = xgb_classifier.predict(x_hybrid_test)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("XGBoost sınıflandırıcısının doğruluğu:", accuracy_xgb)
# Probability column at index 1, used as the score for the ROC curve.
y_prob_xgb = xgb_classifier.predict_proba(x_hybrid_test)[:, 1]

print(classification_report(y_test, y_pred_xgb))

# ROC points and AUC under the "8" suffix read by the ROC comparison plot.
fpr8, tpr8, thresholds = roc_curve(y_test, y_prob_xgb)
auc8 = roc_auc_score(y_test, y_prob_xgb)
print('AUC: %0.2f' % auc8)

# SHAP analysis
explainer = shap.TreeExplainer(xgb_classifier)
shap_values = explainer.shap_values(x_hybrid_test)

# Create feature names. The number of deep features is read from the data instead
# of being hard-coded, so the names stay aligned if the padded length changes.
n_deep_features = X_test_flat.shape[1]
deep_feature_names = [f'ProtBERT Feature {i}' for i in range(n_deep_features)]
chemical_feature_names = get_feature_names()
hybrid_feature_names = deep_feature_names + chemical_feature_names

# Fail early with a readable message if names and columns do not line up.
if len(hybrid_feature_names) != x_hybrid_test.shape[1]:
    raise ValueError(
        f"{len(hybrid_feature_names)} feature names for "
        f"{x_hybrid_test.shape[1]} hybrid feature columns"
    )

# Adjust plot size and generate summary plot
plt.figure(figsize=(12, 8))  # Adjust the size as needed
shap.summary_plot(shap_values, x_hybrid_test, feature_names=hybrid_feature_names)

# Adjust plot size and generate feature importance bar plot
plt.figure(figsize=(12, 8))  # Adjust the size as needed
shap.summary_plot(shap_values, x_hybrid_test, feature_names=hybrid_feature_names, plot_type="bar")



In [None]:
!pip install graphviz pydotplus


In [None]:
# Render the first tree of the trained xgb_classifier to a high-DPI PNG via Graphviz.
import graphviz
import pydotplus
import xgboost as xgb  # imported here so the cell does not rely on another cell's import
from xgboost import plot_tree
from IPython.display import Image, display
from subprocess import run

# Assuming xgb_classifier is already trained

# Generate the first decision tree visualization
dot_data = xgb.to_graphviz(xgb_classifier, num_trees=0)

# Convert the dot data to string and adjust line width and font size
dot_string = dot_data.source.replace('fontsize=14', 'fontsize=24').replace('penwidth=1', 'penwidth=5')

# Ensure that penwidth is set properly if not already present
dot_string = dot_string.replace('edge [', 'edge [penwidth=5,')
dot_string = dot_string.replace('node [', 'node [penwidth=5,')

# Create a graph from the modified dot string
graph = pydotplus.graph_from_dot_data(dot_string)

# Save the dot file
graph.write('xgb_tree_high_dpi.dot')

# Use Graphviz to convert the dot file to a high DPI PNG.
# check=True raises CalledProcessError if `dot` exits with an error, instead of
# letting the failure surface later as a missing or stale PNG.
run(['dot', '-Tpng', 'xgb_tree_high_dpi.dot', '-o', 'xgb_tree_high_dpi.png', '-Gdpi=500'], check=True)

# Load and display the image
img = Image(filename='xgb_tree_high_dpi.png')
display(img)


In [None]:
# Render the first tree of the trained xgb_classifier to an SVG of fixed size via Graphviz.
import xgboost as xgb
import graphviz
import pydotplus
from subprocess import run

# Assuming xgb_classifier is already trained (see the XGBoost cells above).

# Generate the first decision tree visualization
dot_data = xgb.to_graphviz(xgb_classifier, num_trees=0)

# Convert the dot data to string and adjust line width and font size
dot_string = dot_data.source.replace('fontsize=16', 'fontsize=40').replace('penwidth=2', 'penwidth=10')

# Ensure that penwidth is set properly if not already present
dot_string = dot_string.replace('edge [', 'edge [penwidth=10,')
dot_string = dot_string.replace('node [', 'node [penwidth=10,')

# Create a graph from the modified dot string
graph = pydotplus.graph_from_dot_data(dot_string)

# Save the dot file
graph.write('xgb_tree_high_dpi.dot')

# Use Graphviz to convert the dot file to an SVG with specified dimensions.
# check=True raises CalledProcessError if `dot` exits with an error, so the
# success message below is only printed when the conversion actually worked.
run(['dot', '-Tsvg', 'xgb_tree_high_dpi.dot', '-o', 'xgb_tree_high_dpi.svg', '-Gsize=7.52,1.00!'], check=True)

print("SVG file saved as 'xgb_tree_high_dpi.svg'")


In [None]:
# Draw the first tree of the trained xgb_classifier with matplotlib and save it as a PNG.
import xgboost as xgb
import matplotlib.pyplot as plt

# Assuming xgb_classifier is already trained
plt.figure(figsize=(100, 30))  # Large figure size for clarity
ax = plt.subplot(1, 1, 1)
ax.axis('off')  # Turn off the axis

# Plot the tree with specific Graphviz options
# NOTE(review): show_node_labels, fontsize, graph_attrs and edge_attrs are passed as
# extra keyword arguments; confirm the installed xgboost version honours them.
xgb.plot_tree(xgb_classifier, num_trees=0, rankdir='LR', ax=ax,
              show_node_labels=True, fontsize=10,
              graph_attrs={'nodesep': '0.1', 'ranksep': '0.1', 'bgcolor': 'transparent'},
              edge_attrs={'color': '#000000', 'penwidth': '2'})  # Thicker lines and black color

plt.subplots_adjust(left=0, right=1, top=1, bottom=0)  # Adjust the plot margins

# Save the figure with high DPI
plt.savefig('xgboost_tree_optimized.png', dpi=600, bbox_inches='tight', pad_inches=0)
plt.close()  # Close the plot to free up memory


In [None]:
# Hybrid XGBoost model plus a SHAP analysis restricted to the chemical feature columns.
import numpy as np
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score

# Assuming X_train_flat / X_test_flat, the scaled chemical features and y_train, y_test are already defined
x_hybrid_train = np.concatenate((X_train_flat, ittca_features_train_scaled), axis=1)
x_hybrid_test = np.concatenate((X_test_flat, ittca_features_test_scaled), axis=1)

# A bare `.shape` expression in the middle of a cell is never displayed, so print it.
print(x_hybrid_train.shape)

# Define and train the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(random_state=42, colsample_bytree=0.8, learning_rate=0.01, max_depth=6, n_estimators=200, subsample=0.9)
xgb_classifier.fit(x_hybrid_train, y_train)

# Make predictions
y_pred_xgb = xgb_classifier.predict(x_hybrid_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("XGBoost classifier accuracy:", accuracy_xgb)

# Print classification report
print(classification_report(y_test, y_pred_xgb))

# Compute ROC curve and AUC score
y_prob_xgb = xgb_classifier.predict_proba(x_hybrid_test)[:, 1]
fpr8, tpr8, thresholds = roc_curve(y_test, y_prob_xgb)
auc8 = roc_auc_score(y_test, y_prob_xgb)
print('AUC: %0.2f' % auc8)

# Initialize the SHAP explainer
explainer = shap.TreeExplainer(xgb_classifier)
shap_values = explainer.shap_values(x_hybrid_test)

# Extract the chemical features (20480 to 20845)
chemical_features = x_hybrid_test[:, 20480:20845]
chemical_shap_values = shap_values[:, 20480:20845]

# Label every chemical column with its index in the hybrid matrix. This list has
# to be built here: it was previously only created in a later cell, so this cell
# raised NameError when the notebook was run top to bottom on a fresh kernel.
feature_names = [f'Feature {i}' for i in range(20480, 20845)]

# Adjust plot size and generate summary plot for chemical features
plt.figure(figsize=(12, 8))
shap.summary_plot(chemical_shap_values, chemical_features, feature_names=feature_names)

# Adjust plot size and generate feature importance bar plot for chemical features
plt.figure(figsize=(12, 8))
shap.summary_plot(chemical_shap_values, chemical_features, feature_names=feature_names, plot_type="bar")


In [None]:
# SHAP values of the current xgb_classifier on the hybrid test matrix.
explainer = shap.TreeExplainer(xgb_classifier)
shap_values = explainer.shap_values(x_hybrid_test)

# Column window holding the chemical features (20480 to 20845).
chem_cols = slice(20480, 20845)
chemical_features = x_hybrid_test[:, chem_cols]
chemical_shap_values = shap_values[:, chem_cols]

# One label per chemical column, numbered with its index in the hybrid matrix.
feature_names = [f'Feature {i}' for i in range(chem_cols.start, chem_cols.stop)]

# First the default summary plot, then the feature-importance bar plot, each on a
# fresh 12x8 figure. plot_type=None is summary_plot's default.
for plot_type in (None, "bar"):
    plt.figure(figsize=(12, 8))
    shap.summary_plot(chemical_shap_values, chemical_features, feature_names=feature_names, plot_type=plot_type)

In [None]:
"""!pip install shap

import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve
import time



# Define the best parameters found
best_params = {
    'n_estimators': 300,
    'max_depth': 10,
    'min_samples_split': 10,
    'min_samples_leaf': 2,
    'bootstrap': False
}

# Train the classifier with the best parameters
best_rf_classifier = RandomForestClassifier(**best_params, random_state=56)
best_rf_classifier.fit(x_hybrid_train, y_train)

# Predict using the test data
y_pred = best_rf_classifier.predict(x_hybrid_test)
accuracy = accuracy_score(y_test, y_pred)
print("Updated Random Forest classifier accuracy:", accuracy)

# Print classification report
print(classification_report(y_test, y_pred))

# Calculate AUC
y_prob_rf = best_rf_classifier.predict_proba(x_hybrid_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob_rf)
auc = roc_auc_score(y_test, y_prob_rf)
print('AUC: %0.2f' % auc)

# SHAP analysis
explainer = shap.Explainer(best_rf_classifier, x_hybrid_train)
shap_values = explainer(x_hybrid_test)

# Summary plot
shap.summary_plot(shap_values, x_hybrid_test)

# Feature importance bar plot
shap.summary_plot(shap_values, x_hybrid_test, plot_type="bar")
"""

## DeepT-HybridRF (RF HYBRID FEATURES)

In [None]:
# DeepT-HybridRF: random forest fitted on the hybrid feature matrix.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

rf_classifier = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=2,
    bootstrap=False,
    random_state=56,
)
rf_classifier.fit(x_hybrid_train, y_train)

# Hard predictions for the threshold metrics, index-1 probabilities for the ROC curve.
y_pred = rf_classifier.predict(x_hybrid_test)
y_prob_rf = rf_classifier.predict_proba(x_hybrid_test)[:, 1]

# Accuracy and per-class report on the test split.
accuracy = accuracy_score(y_test, y_pred)
print("Güncellenmiş Random Forest sınıflandırıcısının doğruluğu:", accuracy)
print(classification_report(y_test, y_pred))

# ROC points and AUC under the "5" suffix read by the ROC comparison plot.
fpr5, tpr5, thresholds = roc_curve(y_test, y_prob_rf)
auc5 = roc_auc_score(y_test, y_prob_rf)
print('AUC: %0.2f' % auc5)

## DeepT-HybridRF hyperparameter tuning

In [None]:
# Hyperparameter search for the hybrid-feature random forest.
#
# Candidates are ranked by 5-fold cross-validated accuracy computed on the
# training data only. Ranking them by accuracy on (x_hybrid_test, y_test), as
# this cell used to do, tunes the model to the test set and makes the final test
# metrics optimistically biased.
# NOTE(review): the "%90.6 accuracy" remark that headed this cell was presumably
# recorded under the old test-set selection -- re-measure before quoting it.

import time
import numpy as np
from sklearn.model_selection import ParameterGrid, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve

# Start the timer
start_time = time.time()

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

best_params = None
best_score = float("-inf")  # guarantees the first scored candidate is accepted
iteration = 0

# Try every parameter combination
for params in ParameterGrid(param_grid):
    iteration += 1
    print(f"Iteration {iteration}: Testing parameters: {params}")

    # The candidate gets its own name so the fitted `rf_classifier` from the
    # DeepT-HybridRF cell is not overwritten by an arbitrary grid point.
    candidate_rf = RandomForestClassifier(**params, random_state=56)
    # Mean accuracy over 5 folds of the training data; the test split is not touched.
    score = cross_val_score(candidate_rf, x_hybrid_train, y_train, cv=5, scoring='accuracy').mean()

    print(f"Iteration {iteration}: Accuracy: {score}")

    if score > best_score:
        best_score = score
        best_params = params

# Refit on the full training split with the best parameters
print("En iyi hyperparameterlar:", best_params)
best_rf_classifier = RandomForestClassifier(**best_params, random_state=56)
best_rf_classifier.fit(x_hybrid_train, y_train)

# Predictions and results: the test split is used exactly once, for the final report
y_pred = best_rf_classifier.predict(x_hybrid_test)
accuracy = accuracy_score(y_test, y_pred)
y_prob_rf = best_rf_classifier.predict_proba(x_hybrid_test)[:, 1]

print("Güncellenmiş Random Forest sınıflandırıcısının doğruluğu:", accuracy)
print(classification_report(y_test, y_pred))

# AUC computation
fpr, tpr, thresholds = roc_curve(y_test, y_prob_rf)
auc = roc_auc_score(y_test, y_prob_rf)
print('AUC: %0.2f' % auc)

# Stop the timer and print the elapsed time
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Model eğitimi ve tuning süresi: {elapsed_time:.2f} saniye")


## DeepT-HybridXGB hyperparameter tuning

In [None]:
# Hyperparameter search for the hybrid-feature XGBoost classifier.
#
# Candidates are ranked by 5-fold cross-validated accuracy computed on the
# training data only. Ranking them by accuracy on (x_hybrid_test, y_test), as
# this cell used to do, tunes the model to the test set and makes the final test
# metrics optimistically biased.
import numpy as np
import xgboost as xgb
from sklearn.model_selection import ParameterGrid, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
import time

# Start the timer
start_time = time.time()

# Combine the data: flattened ProtBERT features followed by the scaled chemical features
x_hybrid_train = np.concatenate((X_train_flat, ittca_features_train_scaled), axis=1)
x_hybrid_test = np.concatenate((X_test_flat, ittca_features_test_scaled), axis=1)

# Parameter grids for the XGBoost hyperparameter tuning.
#
# Disabled grid, annotated in the original cell as having given the best results:
# param_grid = {
#     'max_depth': [4, 5, 6, 7],
#     'learning_rate': [0.005, 0.01, 0.02],
#     'n_estimators': [160, 180, 200, 220],
#     'colsample_bytree': [0.6, 0.7, 0.8, 0.85],
#     'subsample': [0.75, 0.8, 0.85, 0.9]
# }

param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}


best_params = None
best_score = float("-inf")  # guarantees the first scored candidate is accepted
iteration = 0

# Try every parameter combination
for params in ParameterGrid(param_grid):
    iteration += 1
    print(f"Iteration {iteration}: Testing parameters: {params}")

    # The candidate gets its own name so the fitted `xgb_classifier` used by the
    # SHAP and tree-plot cells is not overwritten by an arbitrary grid point.
    candidate_xgb = xgb.XGBClassifier(**params, random_state=42)
    # Mean accuracy over 5 folds of the training data; the test split is not touched.
    score = cross_val_score(candidate_xgb, x_hybrid_train, y_train, cv=5, scoring='accuracy').mean()

    print(f"Iteration {iteration}: Accuracy: {score}")

    if score > best_score:
        best_score = score
        best_params = params

# Refit on the full training split with the best parameters
print("En iyi hyperparameterlar:", best_params)
xgb_classifier_best = xgb.XGBClassifier(**best_params, random_state=42)
xgb_classifier_best.fit(x_hybrid_train, y_train)

# Predictions and results: the test split is used exactly once, for the final report
y_pred_xgb = xgb_classifier_best.predict(x_hybrid_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
y_prob_xgb = xgb_classifier_best.predict_proba(x_hybrid_test)[:, 1]

print("XGBoost sınıflandırıcısının doğruluğu:", accuracy_xgb)
print(classification_report(y_test, y_pred_xgb))

# Stop the timer and print the elapsed time
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Model eğitimi ve tuning süresi: {elapsed_time:.2f} saniye")


In [None]:
# ROC comparison figure: one curve per model, using the fprN/tprN/aucN values
# left behind by the model cells above.
plt.figure(figsize=(8, 6), dpi=600)
#plt.plot(fpr1, tpr1, label='DeepT-CNN (AUC = %0.2f)' % auc1, linewidth=2)

plt.plot(fpr3, tpr3, label='iTTCA-RF (AUC = %0.2f)' % auc3, linewidth=2)
plt.plot(fpr2, tpr2, label='DeepT-Hybrid (AUC = %0.2f)' % auc2, linewidth=2)
plt.plot(fpr5, tpr5, label='DeepT-HybridRF (AUC = %0.2f)' % auc5, linewidth=2)
plt.plot(fpr8, tpr8, label='DeepT-HybridXGB (AUC = %0.2f)' % auc8, linewidth=2)

plt.plot([0, 1], [0, 1], 'k--', linewidth=2)  # Diagonal reference line (random-chance classifier)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=14, fontweight='bold')
plt.ylabel('True Positive Rate', fontsize=14, fontweight='bold')
plt.title('Receiver Operating Characteristic', fontsize=16, fontweight='bold')
# The size goes inside `prop`: matplotlib only uses the `fontsize` argument when
# `prop` is not given, so the former fontsize=12 was silently ignored.
plt.legend(loc="lower right", prop={'weight': 'bold', 'size': 12})
plt.show()

