In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import glob
from itertools import product
from sklearn.preprocessing import StandardScaler

def transform_sequence(sequence):
    length = len(sequence)
    total1 = 0
    for i in range(length - 1):
        sequence2 = sequence[i + 1:]
        length2 = len(sequence2)
        count1 = sum(1 for j in range(length2) if sequence[j] == sequence2[j])
        total1 += (count1 % length2) * 100
    
    total1 = (total1 / (length - 1)) * 100 if length > 1 else 0
    return total1

def generate_trimers():
    bases = ['A', 'C', 'G', 'T']
    return [''.join(p) for p in product(bases, repeat=3)]

def transform_sequence_kmer(sequence):
    return [transform_sequence(sequence[i:i + 6]) for i in range(len(sequence) - 6)]

def load_and_transform_data(file_path, pos_range, neg_range):
    with open(file_path, 'r') as file:
        sequences = file.read().split('\n')
    pos_sequences = [seq[pos_range[0]-1:pos_range[1]] for seq in sequences if len(seq) >= pos_range[1]]
    neg_sequences = [seq[neg_range[0]-1:neg_range[1]] for seq in sequences if len(seq) >= neg_range[1]]
    pos_data = pd.DataFrame([transform_sequence_kmer(seq) for seq in pos_sequences])
    neg_data = pd.DataFrame([transform_sequence_kmer(seq) for seq in neg_sequences])
    
    pos_data['label'] = 1
    neg_data['label'] = 0
    data = pd.concat([pos_data, neg_data], ignore_index=True)
    
    # Ensure the dataframe contains only numerical values
    X = np.array(data.drop('label', axis=1).values.tolist())
    y = data['label'].values
    
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    return X, y

def evaluate_classifiers(file_path, combinations, classifiers):
    results = []
    
    for pos_range, neg_range in combinations:
        X, y = load_and_transform_data(file_path, pos_range, neg_range)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
        
        for name, clf in classifiers.items():
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            
            results.append({
                'Classifier': name,
                'Pos_Range': f"{pos_range[0]}-{pos_range[1]}",
                'Neg_Range': f"{neg_range[0]}-{neg_range[1]}",
                'Accuracy': accuracy_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred, zero_division=0),
                'Recall': recall_score(y_test, y_pred, zero_division=0),
                'F1 Score': f1_score(y_test, y_pred, zero_division=0),
                'Name': 'Synechocystis sp. PCC 6803'
            })
    
    return pd.DataFrame(results)

# Classifier definitions
classifiers = {
    'SVM': SVC(probability=True, random_state=101),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=101, n_jobs=-1),
    'Logistic Regression': LogisticRegression(random_state=101, max_iter=500),
    'Naive Bayes': GaussianNB(),
    'K-NN': KNeighborsClassifier(n_neighbors=5),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=101),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=101),
    'Decision Tree': DecisionTreeClassifier(random_state=101),
    'Perceptron': Perceptron(random_state=101),
    'SGD': SGDClassifier(random_state=101),
    'Bagging': BaggingClassifier(n_estimators=100, random_state=101),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=101, n_jobs=-1),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=101),
    'LightGBM': LGBMClassifier(verbose=-1, random_state=101, n_jobs=-1),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=101, n_jobs=-1)
}

# Define combinations
combinations = [
    ((350, 500), (600, 750)),   
]

# Initialize results dataframe
df1 = pd.DataFrame()

# Evaluate and collect results for the specific file
file_path = "C:/Users/saich/Downloads/TSS zip/TSS/Mycobacterium tuberculosis H37Rv.txt"
results_df = evaluate_classifiers(file_path, combinations, classifiers)
print(f"Results for file: {file_path}")
print(results_df.to_string(index=False))

df1 = pd.concat([df1, results_df], ignore_index=True)



Results for file: C:/Users/saich/Downloads/TSS zip/TSS/Mycobacterium tuberculosis H37Rv.txt
         Classifier Pos_Range Neg_Range  Accuracy  Precision   Recall  F1 Score                       Name
                SVM   350-500   600-750  0.623413   0.614610 0.681564  0.646358 Synechocystis sp. PCC 6803
      Random Forest   350-500   600-750  0.638928   0.646552 0.628492  0.637394 Synechocystis sp. PCC 6803
Logistic Regression   350-500   600-750  0.612130   0.613699 0.625698  0.619640 Synechocystis sp. PCC 6803
        Naive Bayes   350-500   600-750  0.599436   0.586854 0.698324  0.637755 Synechocystis sp. PCC 6803
               K-NN   350-500   600-750  0.504937   0.509915 0.502793  0.506329 Synechocystis sp. PCC 6803
  Gradient Boosting   350-500   600-750  0.638928   0.644886 0.634078  0.639437 Synechocystis sp. PCC 6803
           AdaBoost   350-500   600-750  0.622003   0.625698 0.625698  0.625698 Synechocystis sp. PCC 6803
      Decision Tree   350-500   600-750  0.578279   

In [None]:
import os
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from itertools import product
from sklearn.preprocessing import StandardScaler

def transform_sequence(sequence):
    length = len(sequence)
    total1 = 0
    for i in range(length - 1):
        sequence2 = sequence[i + 1:]
        length2 = len(sequence2)
        count1 = sum(1 for j in range(length2) if sequence[j] == sequence2[j])
        total1 += (count1 % length2) * 100
    
    total1 = (total1 / (length - 1)) * 100 if length > 1 else 0
    return total1

def generate_trimers():
    bases = ['A', 'C', 'G', 'T']
    return [''.join(p) for p in product(bases, repeat=3)]

def transform_sequence_kmer(sequence):
    return [transform_sequence(sequence[i:i + 6]) for i in range(len(sequence) - 6)]

def load_and_transform_data(file_path, pos_range, neg_range):
    with open(file_path, 'r') as file:
        sequences = file.read().split('\n')
    pos_sequences = [seq[pos_range[0]-1:pos_range[1]] for seq in sequences if len(seq) >= pos_range[1]]
    neg_sequences = [seq[neg_range[0]-1:neg_range[1]] for seq in sequences if len(seq) >= neg_range[1]]
    pos_data = pd.DataFrame([transform_sequence_kmer(seq) for seq in pos_sequences])
    neg_data = pd.DataFrame([transform_sequence_kmer(seq) for seq in neg_sequences])
    
    pos_data['label'] = 1
    neg_data['label'] = 0
    data = pd.concat([pos_data, neg_data], ignore_index=True)
    
    X = np.array(data.drop('label', axis=1).values.tolist())
    y = data['label'].values
    
    return X, y

def train_and_save_models(file_path, combinations, classifiers):
    results = []
    organism_name = os.path.splitext(os.path.basename(file_path))[0]  # Extract organism name from file path
    output_dir = f"models/{organism_name}"
    os.makedirs(output_dir, exist_ok=True)
    
    for pos_range, neg_range in combinations:
        X, y = load_and_transform_data(file_path, pos_range, neg_range)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
        
        for name, clf in classifiers.items():
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            
            model_path = os.path.join(output_dir, f"{name}.pkl")
            with open(model_path, 'wb') as model_file:
                pickle.dump(clf, model_file)
            
            results.append({
                'Classifier': name,
                'Pos_Range': f"{pos_range[0]}-{pos_range[1]}",
                'Neg_Range': f"{neg_range[0]}-{neg_range[1]}",
                'Accuracy': accuracy_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred, zero_division=0),
                'Recall': recall_score(y_test, y_pred, zero_division=0),
                'F1 Score': f1_score(y_test, y_pred, zero_division=0),
                'Organism': organism_name
            })
    
    return pd.DataFrame(results)

# Classifier definitions
classifiers = {
    'SVM': SVC(probability=True, random_state=101),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=101, n_jobs=-1),
    'Logistic Regression': LogisticRegression(random_state=101, max_iter=500),
    'Naive Bayes': GaussianNB(),
    'K-NN': KNeighborsClassifier(n_neighbors=5),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=101),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=101),
    'Decision Tree': DecisionTreeClassifier(random_state=101),
    'Perceptron': Perceptron(random_state=101),
    'SGD': SGDClassifier(random_state=101),
    'Bagging': BaggingClassifier(n_estimators=100, random_state=101),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=101, n_jobs=-1),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=101),
    'LightGBM': LGBMClassifier(verbose=-1, random_state=101, n_jobs=-1),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=101, n_jobs=-1)
}

# Define combinations
combinations = [
    ((350, 500), (600, 750)),   
]

# Example usage
file_path = "C:/Users/saich/Downloads/TSS zip/TSS/Escherichia coli str. K-12 substr. MG1655.txt"
results_df = train_and_save_models(file_path, combinations, classifiers)
print(f"Results for file: {file_path}")
print(results_df.to_string(index=False))




Results for file: C:/Users/saich/Downloads/TSS zip/TSS/Escherichia coli str. K-12 substr. MG1655.txt
         Classifier Pos_Range Neg_Range  Accuracy  Precision   Recall  F1 Score                                  Organism
                SVM   350-500   600-750  0.676030   0.686047 0.657993  0.671727 Escherichia coli str. K-12 substr. MG1655
      Random Forest   350-500   600-750  0.653558   0.664062 0.631970  0.647619 Escherichia coli str. K-12 substr. MG1655
Logistic Regression   350-500   600-750  0.634831   0.642308 0.620818  0.631380 Escherichia coli str. K-12 substr. MG1655
        Naive Bayes   350-500   600-750  0.705993   0.727642 0.665428  0.695146 Escherichia coli str. K-12 substr. MG1655
               K-NN   350-500   600-750  0.513109   0.527273 0.323420  0.400922 Escherichia coli str. K-12 substr. MG1655
  Gradient Boosting   350-500   600-750  0.644195   0.657371 0.613383  0.634615 Escherichia coli str. K-12 substr. MG1655
           AdaBoost   350-500   600-750  0.61

In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from itertools import product
from sklearn.preprocessing import StandardScaler

# Ensure Matplotlib does not require GUI
import matplotlib
matplotlib.use('Agg')

def transform_sequence(sequence):
    length = len(sequence)
    total1 = 0
    for i in range(length - 1):
        sequence2 = sequence[i + 1:]
        length2 = len(sequence2)
        count1 = sum(1 for j in range(length2) if sequence[j] == sequence2[j])
        total1 += (count1 % length2) * 100
    
    total1 = (total1 / (length - 1)) * 100 if length > 1 else 0
    return total1

def generate_trimers():
    bases = ['A', 'C', 'G', 'T']
    return [''.join(p) for p in product(bases, repeat=3)]

def transform_sequence_kmer(sequence):
    return [transform_sequence(sequence[i:i + 6]) for i in range(len(sequence) - 6)]

def load_and_transform_data(file_path, pos_range, neg_range):
    with open(file_path, 'r') as file:
        sequences = file.read().split('\n')
    pos_sequences = [seq[pos_range[0]-1:pos_range[1]] for seq in sequences if len(seq) >= pos_range[1]]
    neg_sequences = [seq[neg_range[0]-1:neg_range[1]] for seq in sequences if len(seq) >= neg_range[1]]
    pos_data = pd.DataFrame([transform_sequence_kmer(seq) for seq in pos_sequences])
    neg_data = pd.DataFrame([transform_sequence_kmer(seq) for seq in neg_sequences])
    
    pos_data['label'] = 1
    neg_data['label'] = 0
    data = pd.concat([pos_data, neg_data], ignore_index=True)
    
    X = np.array(data.drop('label', axis=1).values.tolist())
    y = data['label'].values
    
    return X, y

def save_bar_chart(results_df, metric, output_dir):
    """Creates and saves a bar chart for the given metric"""
    plt.figure(figsize=(10, 5))
    plt.bar(results_df['Classifier'], results_df[metric], color='skyblue')
    plt.xlabel('Classifier')
    plt.ylabel(metric)
    plt.title(f'{metric} Comparison')
    plt.xticks(rotation=45, ha="right")
    
    os.makedirs(output_dir, exist_ok=True)
    chart_path = os.path.join(output_dir, f"{metric}.png")
    plt.savefig(chart_path, bbox_inches='tight')
    plt.close()  # Close the figure to prevent overlapping plots

def train_and_save_models(file_path, combinations, classifiers):
    results = []
    organism_name = os.path.splitext(os.path.basename(file_path))[0]  # Extract organism name from file path
    
    # Paths for models and visualization
    model_dir = f"models/{organism_name}"
    vis_dir = f"visualization/{organism_name}"
    
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(vis_dir, exist_ok=True)
    
    for pos_range, neg_range in combinations:
        X, y = load_and_transform_data(file_path, pos_range, neg_range)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
        
        for name, clf in classifiers.items():
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            
            model_path = os.path.join(model_dir, f"{name}.pkl")
            with open(model_path, 'wb') as model_file:
                pickle.dump(clf, model_file)
            
            results.append({
                'Classifier': name,
                'Accuracy': accuracy_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred, zero_division=0),
                'Recall': recall_score(y_test, y_pred, zero_division=0),
                'F1 Score': f1_score(y_test, y_pred, zero_division=0),
                'Organism': organism_name
            })
    
    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    # Save visualizations
    for metric in ["Accuracy", "Precision", "Recall", "F1 Score"]:
        save_bar_chart(results_df, metric, vis_dir)

    return results_df

# Classifier definitions
classifiers = {
    'SVM': SVC(probability=True, random_state=101),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=101, n_jobs=-1),
    'Logistic Regression': LogisticRegression(random_state=101, max_iter=500),
    'Naive Bayes': GaussianNB(),
    'K-NN': KNeighborsClassifier(n_neighbors=5),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=101),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=101),
    'Decision Tree': DecisionTreeClassifier(random_state=101),
    'Perceptron': Perceptron(random_state=101),
    'SGD': SGDClassifier(random_state=101),
    'Bagging': BaggingClassifier(n_estimators=100, random_state=101),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=101, n_jobs=-1),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=101),
    'LightGBM': LGBMClassifier(verbose=-1, random_state=101, n_jobs=-1),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=101, n_jobs=-1)
}

# Define combinations
combinations = [
    ((350, 500), (600, 750)),   
]

# Example usage
file_path = "C:/Users/saich/Downloads/TSS zip/TSS/Escherichia coli str. K-12 substr. MG1655.txt"
results_df = train_and_save_models(file_path, combinations, classifiers)

# Display results
print(f"Results for file: {file_path}")
print(results_df.to_string(index=False))




Results for file: C:/Users/saich/Downloads/TSS zip/TSS/Escherichia coli str. K-12 substr. MG1655.txt
         Classifier  Accuracy  Precision   Recall  F1 Score                                  Organism
                SVM  0.676030   0.687500 0.654275  0.670476 Escherichia coli str. K-12 substr. MG1655
      Random Forest  0.659176   0.670588 0.635688  0.652672 Escherichia coli str. K-12 substr. MG1655
Logistic Regression  0.632959   0.639847 0.620818  0.630189 Escherichia coli str. K-12 substr. MG1655
        Naive Bayes  0.705993   0.727642 0.665428  0.695146 Escherichia coli str. K-12 substr. MG1655
               K-NN  0.524345   0.543860 0.345725  0.422727 Escherichia coli str. K-12 substr. MG1655
  Gradient Boosting  0.646067   0.660000 0.613383  0.635838 Escherichia coli str. K-12 substr. MG1655
           AdaBoost  0.616105   0.625000 0.594796  0.609524 Escherichia coli str. K-12 substr. MG1655
      Decision Tree  0.573034   0.577358 0.568773  0.573034 Escherichia coli str. K

In [35]:
import pandas as pd
with open("C:/Users/saich/Downloads/output (2).txt") as file:
    text=file.read()
text=text.replace('\t',' ')
text
list1=text.split('\n')
list1
list2=[]
for string in list1:
    list3=string.split(' ')
    list2.append(list3)
print(list3)
df = pd.DataFrame(list2[1:], columns=list2[0])


['']


ValueError: 19 columns passed, passed data had 25 columns

In [47]:
import pandas as pd
text=text.replace('\t',' ')
text
list1=text.split('\n')
list1
list2=[]
for string in list1:
    list3=string.split(' ')
    list2.append(list3)
list2=list2[:126]
list2.pop(9)
for i in list2:
    print(len(i),i[0])
pd.DataFrame(data=list2[1:],columns=list2[0])


19 ID
19 1
19 2
19 3
19 4
19 5
19 6
19 7
19 8
19 10
19 11
19 12
19 13
19 14
19 15
19 16
19 17
19 18
19 19
19 20
19 21
19 22
19 23
19 24
19 25
19 26
19 27
19 28
19 29
19 30
19 31
19 32
19 33
19 34
19 35
19 36
19 37
19 38
19 39
19 40
19 41
19 42
19 43
19 44
19 45
19 46
19 47
19 48
19 49
19 50
19 51
19 52
19 53
19 54
19 55
19 56
19 57
19 58
19 59
19 60
19 61
19 62
19 63
19 64
19 65
19 66
19 67
19 68
19 69
19 70
19 71
19 72
19 73
19 74
19 75
19 76
19 77
19 78
19 79
19 80
19 81
19 82
19 83
19 84
19 85
19 86
19 87
19 88
19 89
19 90
19 91
19 92
19 93
19 94
19 95
19 96
19 97
19 98
19 99
19 100
19 101
19 102
19 103
19 104
19 105
19 106
19 107
19 108
19 109
19 110
19 111
19 112
19 113
19 114
19 115
19 116
19 117
19 118
19 119
19 120
19 121
19 122
19 123
19 124
19 125


Unnamed: 0,ID,PropertyName,AA,AC,AG,AT,CA,CC,CG,CT,GA,GC,GG,GT,TA,TC,TG,TT,Unnamed: 19
0,1,Twist,38.9,31.12,32.15,33.81,41.41,34.96,32.91,32.15,41.31,38.5,34.96,31.12,33.28,41.31,41.41,38.9,
1,2,Stackingenergy,-12,-11.8,-11.5,-10.6,-12.3,-9.5,-13.1,-11.5,-11.4,-13.2,-9.5,-11.8,-11.2,-11.4,-12.3,-12,
2,3,Rise,3.16,3.41,3.63,3.89,3.23,4.08,3.6,3.63,3.47,3.81,4.08,3.41,3.21,3.47,3.23,3.16,
3,4,Bend,3.07,2.97,2.31,2.6,3.58,2.16,2.81,2.31,2.51,3.06,2.16,2.97,6.74,2.51,3.58,3.07,
4,5,Tip,1.76,2,0.9,1.87,-1.64,0.71,0.22,0.9,1.35,2.5,0.71,2,6.7,1.35,-1.64,1.76,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,121,Flexibility_slide,13.72,9.57,7.58,11.69,1.35,7.36,4.02,7.58,10.28,4.34,7.36,9.57,7.13,10.28,1.35,13.72,
120,122,Flexibility_shift,5.35,9.73,8.98,1.13,4.61,5.51,12.13,8.98,5.44,1.98,5.51,9.73,4.28,5.44,4.61,5.35,
121,123,Enthalpy,-7.6,-8.4,-7.8,-7.2,-8.5,-8,-10.6,-7.8,-8.2,-9.8,-8,-8.4,-7.2,-8.2,-8.5,-7.6,
122,124,Entropy,-21.3,-22.4,-21,-20.4,-22.7,-19.9,-27.2,-21,-22.2,-24.4,-19.9,-22.4,-21.3,-22.2,-22.7,-21.3,


In [17]:
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,
    BaggingClassifier, ExtraTreesClassifier
)
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from itertools import product
from sklearn.preprocessing import StandardScaler

# Ensure Matplotlib does not require GUI
import matplotlib
matplotlib.use('Agg')

def transform_sequence(sequence):
    """Transforms a sequence into a numerical representation."""
    length = len(sequence)
    total1 = 0
    for i in range(length - 1):
        sequence2 = sequence[i + 1:]
        length2 = len(sequence2)
        count1 = sum(1 for j in range(length2) if sequence[j] == sequence2[j])
        total1 += (count1 % length2) * 100
    
    total1 = (total1 / (length - 1)) * 100 if length > 1 else 0
    return total1

def generate_trimers():
    """Generates all possible trimers of DNA bases."""
    bases = ['A', 'C', 'G', 'T']
    return [''.join(p) for p in product(bases, repeat=3)]

def transform_sequence_kmer(sequence):
    """Applies transformation to k-mer slices of the sequence."""
    return [transform_sequence(sequence[i:i + 6]) for i in range(len(sequence) - 6 + 1)]

def load_and_transform_data(file_path, pos_range, neg_range):
    with open(file_path, 'r') as file:
        sequences = file.read().split('\n')
    pos_sequences = [seq[pos_range[0]-1:pos_range[1]] for seq in sequences if len(seq) >= pos_range[1]]
    neg_sequences = [seq[neg_range[0]-1:neg_range[1]] for seq in sequences if len(seq) >= neg_range[1]]
    pos_data = pd.DataFrame([transform_sequence_kmer(seq) for seq in pos_sequences])
    neg_data = pd.DataFrame([transform_sequence_kmer(seq) for seq in neg_sequences])
    
    pos_data['label'] = 1
    neg_data['label'] = 0
    data = pd.concat([pos_data, neg_data], ignore_index=True)
    
    X = np.array(data.drop('label', axis=1).values.tolist())
    y = data['label'].values
    
    return X, y

def save_model(clf, model_path, model_type):
    """Saves models in the appropriate format based on type."""
    if model_type == 'XGBoost':
        clf.save_model(model_path + ".json")
    elif model_type == 'CatBoost':
        clf.save_model(model_path + ".cbm")
    else:
        with open(model_path + ".pkl", 'wb') as model_file:
            pickle.dump(clf, model_file)

def save_bar_chart(results_df, metric, output_dir):
    """Creates and saves a bar chart for a given metric."""
    plt.figure(figsize=(10, 5))
    plt.bar(results_df['Classifier'], results_df[metric], color='skyblue')
    plt.xlabel('Classifier')
    plt.ylabel(metric)
    plt.title(f'{metric} Comparison')
    plt.xticks(rotation=45, ha="right")
    
    os.makedirs(output_dir, exist_ok=True)
    chart_path = os.path.join(output_dir, f"{metric}.png")
    plt.savefig(chart_path, bbox_inches='tight')
    plt.close()

def train_and_save_models(file_path, combinations, classifiers):
    """Trains models, saves them in compatible formats, and generates evaluation results."""
    results = []
    organism_name = os.path.splitext(os.path.basename(file_path))[0]
    
    # Paths for models and visualization
    model_dir = f"models/{organism_name}"
    vis_dir = f"visualization/{organism_name}"
    
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(vis_dir, exist_ok=True)
    
    for pos_range, neg_range in combinations:
        X, y = load_and_transform_data(file_path, pos_range, neg_range)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
        
        for name, clf in classifiers.items():
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            
            model_path = os.path.join(model_dir, name)
            save_model(clf, model_path, name)
            
            results.append({
                'Classifier': name,
                'Accuracy': accuracy_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred, zero_division=0),
                'Recall': recall_score(y_test, y_pred, zero_division=0),
                'F1 Score': f1_score(y_test, y_pred, zero_division=0),
                'Organism': organism_name
            })
    
    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    # Save visualizations
    for metric in ["Accuracy", "Precision", "Recall", "F1 Score"]:
        save_bar_chart(results_df, metric, vis_dir)

    return results_df

# Classifier definitions
classifiers = {
    'SVM': SVC(probability=True, random_state=101),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=101, n_jobs=-1),
    'Logistic Regression': LogisticRegression(random_state=101, max_iter=500),
    'Naive Bayes': GaussianNB(),
    'K-NN': KNeighborsClassifier(n_neighbors=5),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=101),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=101),
    'Decision Tree': DecisionTreeClassifier(random_state=101),
    'Perceptron': Perceptron(random_state=101),
    'SGD': SGDClassifier(random_state=101),
    'Bagging': BaggingClassifier(n_estimators=100, random_state=101),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=101, n_jobs=-1),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=101),
    'LightGBM': LGBMClassifier(verbose=-1, random_state=101, n_jobs=-1),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=101, n_jobs=-1)
}

# Define combinations
combinations = [
    ((350, 499), (600, 749)),  # 150 bases each
]

# Example usage
file_path ="C:/Users/saich/Downloads/TSS zip/TSS/Haloferax_volcanii_DS2.txt"
results_df = train_and_save_models(file_path, combinations, classifiers)

# Display results
print(f"Results for file: {file_path}")
print(results_df.to_string(index=False))




Results for file: C:/Users/saich/Downloads/TSS zip/TSS/Haloferax_volcanii_DS2.txt
         Classifier  Accuracy  Precision   Recall  F1 Score               Organism
                SVM  0.684701   0.700000 0.666667  0.682927 Haloferax_volcanii_DS2
      Random Forest  0.710821   0.739837 0.666667  0.701349 Haloferax_volcanii_DS2
Logistic Regression  0.680970   0.690299 0.677656  0.683919 Haloferax_volcanii_DS2
        Naive Bayes  0.686567   0.727273 0.615385  0.666667 Haloferax_volcanii_DS2
               K-NN  0.555970   0.631579 0.307692  0.413793 Haloferax_volcanii_DS2
  Gradient Boosting  0.722015   0.744094 0.692308  0.717268 Haloferax_volcanii_DS2
           AdaBoost  0.682836   0.710204 0.637363  0.671815 Haloferax_volcanii_DS2
      Decision Tree  0.617537   0.615646 0.663004  0.638448 Haloferax_volcanii_DS2
         Perceptron  0.520522   0.515152 0.996337  0.679151 Haloferax_volcanii_DS2
                SGD  0.604478   0.587896 0.747253  0.658065 Haloferax_volcanii_DS2
     

In [None]:
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,
    BaggingClassifier, ExtraTreesClassifier
)
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from itertools import product
from sklearn.preprocessing import StandardScaler

# Ensure Matplotlib does not require GUI
import matplotlib
matplotlib.use('Agg')

def transform_sequence(sequence):
    """Transforms a sequence into a numerical representation."""
    length = len(sequence)
    total1 = 0
    for i in range(length - 1):
        sequence2 = sequence[i + 1:]
        length2 = len(sequence2)
        count1 = sum(1 for j in range(length2) if sequence[j] == sequence2[j])
        total1 += (count1 % length2) * 100
    
    total1 = (total1 / (length - 1)) * 100 if length > 1 else 0
    return total1

def generate_trimers():
    """Generates all possible trimers of DNA bases."""
    bases = ['A', 'C', 'G', 'T']
    return [''.join(p) for p in product(bases, repeat=3)]

def transform_sequence_kmer(sequence):
    """Applies transformation to k-mer slices of the sequence."""
    return [transform_sequence(sequence[i:i + 6]) for i in range(len(sequence) - 6 + 1)]

def load_and_transform_data(file_path, pos_range, neg_range):
    with open(file_path, 'r') as file:
        sequences = file.read().split('\n')
    pos_sequences = [seq[pos_range[0]-1:pos_range[1]] for seq in sequences if len(seq) >= pos_range[1]]
    neg_sequences = [seq[neg_range[0]-1:neg_range[1]] for seq in sequences if len(seq) >= neg_range[1]]
    pos_data = pd.DataFrame([transform_sequence_kmer(seq) for seq in pos_sequences])
    neg_data = pd.DataFrame([transform_sequence_kmer(seq) for seq in neg_sequences])
    
    pos_data['label'] = 1
    neg_data['label'] = 0
    data = pd.concat([pos_data, neg_data], ignore_index=True)
    
    X = np.array(data.drop('label', axis=1).values.tolist())
    y = data['label'].values
    
    return X, y

def save_model(clf, model_path, model_type):
    """Saves models in the appropriate format based on type."""
    if model_type == 'XGBoost':
        clf.save_model(model_path + ".json")
    elif model_type == 'CatBoost':
        clf.save_model(model_path + ".cbm")
    else:
        with open(model_path + ".pkl", 'wb') as model_file:
            pickle.dump(clf, model_file)

def save_bar_chart(results_df, metric, output_dir):
    """Creates and saves a bar chart for a given metric."""
    plt.figure(figsize=(10, 5))
    plt.bar(results_df['Classifier'], results_df[metric], color='skyblue')
    plt.xlabel('Classifier')
    plt.ylabel(metric)
    plt.title(f'{metric} Comparison')
    plt.xticks(rotation=45, ha="right")
    
    os.makedirs(output_dir, exist_ok=True)
    chart_path = os.path.join(output_dir, f"{metric}.png")
    plt.savefig(chart_path, bbox_inches='tight')
    plt.close()

def train_and_save_models(file_path, combinations, classifiers):
    """Trains models, saves them in compatible formats, and generates evaluation results."""
    results = []
    organism_name = os.path.splitext(os.path.basename(file_path))[0]
    
    # Paths for models and visualization
    model_dir = f"models/{organism_name}"
    vis_dir = f"visualization/{organism_name}"
    
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(vis_dir, exist_ok=True)
    
    for pos_range, neg_range in combinations:
        X, y = load_and_transform_data(file_path, pos_range, neg_range)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
        
        for name, clf in classifiers.items():
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            
            model_path = os.path.join(model_dir, name)
            save_model(clf, model_path, name)
            
            results.append({
                'Classifier': name,
                'Accuracy': accuracy_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred, zero_division=0),
                'Recall': recall_score(y_test, y_pred, zero_division=0),
                'F1 Score': f1_score(y_test, y_pred, zero_division=0),
                'Organism': organism_name
            })
    
    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    # Save visualizations
    for metric in ["Accuracy", "Precision", "Recall", "F1 Score"]:
        save_bar_chart(results_df, metric, vis_dir)

    return results_df

# Classifier definitions
classifiers = {
    'SVM': SVC(probability=True, random_state=101),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=101, n_jobs=-1),
    'Logistic Regression': LogisticRegression(random_state=101, max_iter=500),
    'Naive Bayes': GaussianNB(),
    'K-NN': KNeighborsClassifier(n_neighbors=5),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=101),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=101),
    'Decision Tree': DecisionTreeClassifier(random_state=101),
    'Perceptron': Perceptron(random_state=101),
    'SGD': SGDClassifier(random_state=101),
    'Bagging': BaggingClassifier(n_estimators=100, random_state=101),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=101, n_jobs=-1),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=101),
    'LightGBM': LGBMClassifier(verbose=-1, random_state=101, n_jobs=-1),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=101, n_jobs=-1)
}

# Define combinations
combinations = [
    ((350, 499), (600, 749)),  # 150 bases each
]

# Example usage
file_path ="C:/Users/saich/Downloads/TSS zip/TSS/Haloferax_volcanii_DS2.txt"
text=text.replace('\t',' ')
text
list1=text.split('\n')
list1
list2=[]
for string in list1:
    list3=string.split(' ')
    list2.append(list3)
list2=list2[:126]
list2.pop(9)
for i in list2:
    print(len(i),i[0])
df1=pd.DataFrame(data=list2[1:],columns=list2[0])
colums=['AA','AC','AG','AT','CA','CC','CG','CT','GA','GC','GG','GT','TA','TC','TG','TT']
results_df = train_and_save_models(file_path, combinations, classifiers)
# Display results
print(f"Results for file: {file_path}")
print(results_df.to_string(index=False))


In [None]:
import os
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,
    BaggingClassifier, ExtraTreesClassifier
)
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

def encode_sequence(sequence, property_name, df1):
    """Encodes a DNA sequence using a 2-mer sliding window based on a property column in df1."""
    encoded_values = []
    for i in range(len(sequence) - 1):
        pair = sequence[i:i + 2]  # Extract 2-mer
        if pair in df1.columns:
            encoded_values.append(float(df1.loc[property_name, pair]))  # Fix lookup
        else:
            encoded_values.append(0)  # Default value if pair not found
    return encoded_values

def load_and_transform_data(file_path, pos_range, neg_range, property_name, df1):
    """Loads sequences, encodes them using the selected property, and prepares training data."""
    with open(file_path, 'r') as file:
        sequences = file.read().split('\n')
    
    pos_sequences = [seq[pos_range[0]-1:pos_range[1]] for seq in sequences if len(seq) >= pos_range[1]]
    neg_sequences = [seq[neg_range[0]-1:neg_range[1]] for seq in sequences if len(seq) >= neg_range[1]]
    
    pos_data = pd.DataFrame([encode_sequence(seq, property_name, df1) for seq in pos_sequences])
    neg_data = pd.DataFrame([encode_sequence(seq, property_name, df1) for seq in neg_sequences])
    
    pos_data['label'] = 1
    neg_data['label'] = 0
    data = pd.concat([pos_data, neg_data], ignore_index=True)
    
    X = np.array(data.drop('label', axis=1).values.tolist())
    y = data['label'].values
    
    return X, y

def save_model(clf, model_path):
    """Saves a trained model to a file."""
    with open(model_path, 'wb') as model_file:
        pickle.dump(clf, model_file)

def train_models(file_path, combinations, classifiers, df1):
    """Trains models for each property, selects the best one, and saves all models for that property."""
    organism_name = os.path.splitext(os.path.basename(file_path))[0]
    best_accuracy = 0
    best_property = None
    best_models = {}
    best_results = []

    # Iterate over all properties in df1
    for property_name in df1.index:  # Use index instead of columns
        for pos_range, neg_range in combinations:
            X, y = load_and_transform_data(file_path, pos_range, neg_range, property_name, df1)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
            
            models = {}
            results = []
            for name, clf in classifiers.items():
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)
                accuracy = accuracy_score(y_test, y_pred)

                # Store model and its results
                models[name] = clf
                results.append({
                    'Classifier': name,
                    'Accuracy': accuracy,
                    'Precision': precision_score(y_test, y_pred, zero_division=0),
                    'Recall': recall_score(y_test, y_pred, zero_division=0),
                    'F1 Score': f1_score(y_test, y_pred, zero_division=0),
                    'Organism': organism_name,
                    'Property': property_name
                })

                # Update best property if a higher accuracy is found
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_property = property_name
                    best_models = models  # Save all models for this property
                    best_results = results  # Save results for this property

    # Save all models for the best property
    if best_models:
        model_dir = f"models/{organism_name}/{best_property}"
        os.makedirs(model_dir, exist_ok=True)

        for model_name, clf in best_models.items():
            model_path = os.path.join(model_dir, f"{model_name}.pkl")
            save_model(clf, model_path)

        # Save the best property-organism mapping
        with open("best_properties.txt", "a") as f:
            f.write(f"{organism_name}: {best_property}\n")

        # Save the results to a CSV file
        results_df = pd.DataFrame(best_results)
        results_csv = f"models/{organism_name}/best_models_metrics.csv"
        results_df.to_csv(results_csv, index=False)

    return best_property

# Load dataset for encoding
text = text.replace('\t', ' ')
list1 = text.split('\n')
list2 = [line.split(' ') for line in list1[:126]]
list2.pop(9)  # Ensure correct data structure

df1 = pd.DataFrame(data=list2[1:], columns=list2[0])  # No index creation
df1.set_index(df1.columns[0], inplace=True)  # Set first column as index

# Define classifier models
classifiers = {
    'SVM': SVC(probability=True, random_state=101),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=101, n_jobs=-1),
    'Logistic Regression': LogisticRegression(random_state=101, max_iter=500),
    'Naive Bayes': GaussianNB(),
    'K-NN': KNeighborsClassifier(n_neighbors=5),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=101),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=101),
    'Decision Tree': DecisionTreeClassifier(random_state=101),
    'Perceptron': Perceptron(random_state=101),
    'SGD': SGDClassifier(random_state=101),
    'Bagging': BaggingClassifier(n_estimators=100, random_state=101),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=101, n_jobs=-1),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=101),
    'LightGBM': LGBMClassifier(verbose=-1, random_state=101, n_jobs=-1),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=101, n_jobs=-1)
}

# Define segment combinations
combinations = [((350, 499), (600, 749))]

# Example usage
file_path = "C:/Users/saich/Downloads/TSS zip/TSS/Haloferax_volcanii_DS2.txt"
best_property = train_models(file_path, combinations, classifiers, df1)
print(f"Best property saved for {file_path}: {best_property}")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt