In [11]:
!pip install scikit-learn



In [20]:
import sklearn
print(sklearn.__version__)


1.5.2


# Merging Data

In [1]:
import pandas as pd
import os
import random

def balance_hgt_non_hgt(df):
    # Separate HGT and non-HGT genes
    hgt_genes = df[df.iloc[:, 1] == 1]  # Assuming HGT-boolean is the second column
    non_hgt_genes = df[df.iloc[:, 1] == 0]
    
    # Get the number of HGT genes
    num_hgt = len(hgt_genes)
    
    # Randomly sample the same number of non-HGT genes
    if len(non_hgt_genes) > num_hgt:
        non_hgt_genes = non_hgt_genes.sample(num_hgt, random_state=42)  # Random sampling
    
    # Concatenate the HGT and sampled non-HGT genes
    balanced_df = pd.concat([hgt_genes, non_hgt_genes], ignore_index=True)
    
    return balanced_df

def merge_genomes(directory_path, output_file):
    merged_data = []
    
    # Loop through all genome files in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):  # Adjust extension based on your files
            file_path = os.path.join(directory_path, filename)
            
            # Read the genome file into a pandas DataFrame
            df = pd.read_csv(file_path, sep='\t', header=None)  # Adjust separator and headers
            
            # Balance HGT and non-HGT genes
            balanced_df = balance_hgt_non_hgt(df)
            
            # Append balanced data to the list
            merged_data.append(balanced_df)
    
    # Concatenate all the balanced genome data
    final_df = pd.concat(merged_data, ignore_index=True)
    
    # Save to CSV file
    final_df.to_csv(output_file, index=False)


In [2]:
# Example usage
directory_path = "/disk11/1.jisu/08.ml/nucleotide_info"  # Directory where your genome files are located
output_file = "merged_genome_data.csv"  # Output file name
merge_genomes(directory_path, output_file)

# XG Boost

In [1]:
import pandas as pd
import xgboost as xgb
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

### Train Test Split

In [None]:
# Load the merged dataset
data = pd.read_csv('merged_genome_data.csv')  # Path to your merged CSV file

# Separate features (X) and labels (y)
# Assuming the label (HGT or non-HGT) is in the second column and the rest are features
X = data.iloc[:, 2:].values  # Features (all columns except the first two)
y = data.iloc[:, 1].values   # Labels (HGT-boolean column)

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Only transform the test data (no fitting)
X_test_scaled = scaler.transform(X_test)

In [4]:
# Initialize the XGBoost classifier
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')


In [5]:
# Train the model
model.fit(X_train_scaled, y_train)

Parameters: { "use_label_encoder" } are not used.



### Evaluation

In [None]:
from sklearn.metrics import f1_score

# Make predictions
y_pred = model.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Calculate the F1 score
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.2f}")

# Display detailed classification report (which includes F1 score)
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Non-HGT', 'HGT']))


Accuracy: 57.39%
F1 Score: 0.65
Classification Report:


### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'colsample_bytree': [0.3, 0.7]
}

grid_search = GridSearchCV(estimator=xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
                           param_grid=param_grid, scoring='accuracy', cv=3, verbose=1)

grid_search.fit(X_train_scaled, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# Train final model with best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)


### Feature Importance

In [None]:
import xgboost as xgb
import matplotlib.pyplot as plt

# Plot feature importance
xgb.plot_importance(model)
plt.show()

In [None]:
# Get the feature importance scores
importance = model.feature_importances_

# Print the feature importance scores
for i, score in enumerate(importance):
    print(f"Feature {i}: {score:.4f}")


### SHAP

In [None]:
pip install shap


In [None]:
import shap

# Explain the model's predictions using SHAP
explainer = shap.Explainer(model)
shap_values = explainer(X_test)

# Visualize the feature importance
shap.summary_plot(shap_values, X_test)


# Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [None]:
# Initialize the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the Random Forest model
rf_model.fit(X_train, y_train)

### Evaluation

In [None]:
# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Calculate F1 score
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.2f}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

### Grid Search

In [None]:
# Initialize the Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid to search over
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [10, 20, 30, None],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],    # Minimum samples required to be at a leaf node
    'bootstrap': [True, False]        # Whether to use bootstrap samples
}

# Initialize GridSearchCV with the random forest model and parameter grid
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='f1')

# Fit the grid search model
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found by the grid search
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Use the best estimator from the grid search to make predictions
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)

# Evaluate the model
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Non-HGT', 'HGT']))

### Feature Importance

In [None]:
# Get the feature importance scores
importance = rf_model.feature_importances_

# Print the feature importance scores
for i, score in enumerate(importance):
    print(f"Feature {i}: {score:.4f}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Sort the feature importance scores
indices = np.argsort(importance)

# Plot the feature importance
plt.figure(figsize=(10, 8))
plt.title('Feature Importance in Random Forest')
plt.barh(range(len(indices)), importance[indices], align='center')
plt.yticks(range(len(indices)), [f"Feature {i}" for i in indices])
plt.xlabel('Relative Importance')
plt.show()


# Support Vector Machine

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pandas as pd

In [None]:
# Define the hyperparameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'gamma': [1, 0.1, 0.01, 0.001],  # Kernel coefficient
    'kernel': ['rbf', 'linear']  # Kernel type
}


In [None]:
# Initialize the SVM model
svc = SVC()

# Initialize GridSearchCV with the SVM model and the hyperparameter grid
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='f1')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found by the grid search
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Use the best estimator from the grid search to make predictions
best_svc_model = grid_search.best_estimator_
y_pred = best_svc_model.predict(X_test)

# Evaluate the model
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Non-HGT', 'HGT']))


# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [None]:
# Initialize Logistic Regression model
lr = LogisticRegression(max_iter=1000, random_state=42)

In [None]:
# Define the hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],         # Inverse of regularization strength
    'penalty': ['l1', 'l2'],              # Regularization types (L1/L2)
    'solver': ['liblinear', 'saga']       # Solvers that work with L1/L2 penalties
}

In [None]:
# Fit the model
lr.fit(X_train_scaled, y_train)

In [None]:
y_pred = best_lr_model.predict(X_test_scaled)

# Evaluate the model
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Non-HGT', 'HGT']))

In [None]:
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=lr, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='f1')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found by the grid search
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Use the best estimator from the grid search to make predictions
best_lr_model = grid_search.best_estimator_
y_pred = best_lr_model.predict(X_test)

# Evaluate the model
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Non-HGT', 'HGT']))

# Neural Network

In [None]:
pip install tensorflow keras


In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [None]:
# Define a function to create the Keras model
def create_model(optimizer='adam', activation='relu'):
    model = Sequential()
    model.add(Dense(64, input_dim=X_train.shape[1], activation=activation))  # First hidden layer
    model.add(Dense(32, activation=activation))  # Second hidden layer
    model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:

# Create a KerasClassifier for use in scikit-learn
model = KerasClassifier(build_fn=create_model, verbose=0)


In [None]:

# Define a hyperparameter grid for tuning
param_grid = {
    'optimizer': ['adam', 'rmsprop'],
    'activation': ['relu', 'tanh'],
    'batch_size': [32, 64],
    'epochs': [10, 50]
}


In [None]:

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found by the grid search
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Use the best model to make predictions
best_nn_model = grid_search.best_estimator_
y_pred = best_nn_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Non-HGT', 'HGT']))
