In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

: 

In [None]:
# Render every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Read the labelled table, keep the 'samples' column aside, and drop
# 'samples' and 'score' from the working frame.
raw_table = pd.read_csv('data//vcf_with_labels.csv')
samples = raw_table['samples']
df = raw_table.drop(columns=['samples', 'score'])

# Preview the first ten rows.
df.head(10)

: 

In [None]:
# Separate the 'label' column (target) from the remaining columns (features).
target = df.label.values
Data = df.drop(columns = 'label').values

# Hold out 30% of the rows as a test split; stratifying on the target keeps
# the class proportions the same in both splits, and the seed fixes the split.
D_train, D_test, t_train, t_test = train_test_split(Data,
                                                    target,
                                                    test_size = 0.3,
                                                    random_state=999,
                                                    stratify=target)

# Shape summary (df still contains the 'label' column at this point).
print("Original dataset shape:", df.shape)
print("Features training dataset shape after split:", D_train.shape)
print("Features test dataset shape after split:", D_test.shape)
print("Target training dataset shape after split:", t_train.shape)
print("Target test dataset shape after split:", t_test.shape)

: 

In [None]:
# Cross-validation scheme: 5 stratified folds, repeated 3 times, seeded.
cv_method = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=999)

: 

In [None]:
# Name of the metric passed as `scoring` to cross_val_score.
scoring_metric = "accuracy"

: 

In [None]:
# Baseline: a default (seeded) random forest scored with repeated stratified
# cross-validation on all features.
# The folds are drawn from the TRAINING split so that the held-out test split
# (D_test / t_test) is not used for fitting or model assessment here.
rf_classifier = RandomForestClassifier(random_state = 999)
rf_cv_results_full = cross_val_score(estimator=rf_classifier,
                                     X=D_train,
                                     y=t_train,
                                     cv=cv_method,
                                     scoring=scoring_metric)

# Mean score over all folds and repeats, rounded to three decimals.
rf_cv_results_full.mean().round(3)

: 

In [None]:
# Number of top-ranked features to keep.
num_features = 10

# Fit a forest on the training split to obtain impurity-based importances.
# The seed matches the other estimators in this notebook so the ranking is
# the same on every run.
model_rfi = RandomForestClassifier(n_estimators=100, random_state=999)
model_rfi.fit(D_train, t_train)

# Column positions sorted by descending importance, truncated to the top
# `num_features`.
fs_indices_rfi = np.argsort(model_rfi.feature_importances_)[::-1][0:num_features]

: 

In [None]:
# fs_indices_rfi are positions in the feature matrix, which was built from
# df with the 'label' column removed. Look the names up in that same column
# set so the positions line up even if 'label' is not the last column of df.
feature_names = df.drop(columns='label').columns
best_features_rfi = feature_names[fs_indices_rfi].values
print('The top 10 features are:', ", ".join([str(i) for i in best_features_rfi]))

: 

In [None]:
# Importance values of the selected features, picked out by fs_indices_rfi.
feature_importances_rfi = model_rfi.feature_importances_[fs_indices_rfi]

# Round each score to four decimals for printing.
formatted_scores = [str(round(score, 4)) for score in feature_importances_rfi]
print('The importance scores for the previous 10 features are:', ", ".join(formatted_scores))

: 

In [None]:
%matplotlib inline 
%config InlineBackend.figure_format = 'retina'
plt.style.use("seaborn")

def plot_imp(best_features, scores, method_name):
    """Draw and show a horizontal bar chart of feature importance scores.

    best_features: labels of the bars (y axis).
    scores: bar lengths (x axis), one per entry of best_features.
    method_name: text placed before ' Feature Importances' in the title.
    """
    plt.barh(best_features, scores)
    chart_title = method_name + ' Feature Importances'
    plt.xlabel("Importance")
    plt.ylabel("Features")
    plt.title(chart_title)
    plt.show()

# Chart the selected features against their random-forest importance scores.
plot_imp(
    best_features=best_features_rfi,
    scores=feature_importances_rfi,
    method_name='Random Forest',
)

: 

In [None]:
# Hyperparameter grid: split criterion, forest size and maximum tree depth
# (2 x 3 x 5 = 30 combinations).
params_RF = dict(
    criterion=['gini', 'entropy'],
    n_estimators=[100, 250, 500],
    max_depth=[2, 5, 7, 10, 12],
)

# Exhaustive search over the grid, scored by ROC AUC under the folds defined
# in cv_method; only the training split is used for fitting.
gs_RF = GridSearchCV(
    rf_classifier,
    params_RF,
    scoring='roc_auc',
    cv=cv_method,
    verbose=1,
)
gs_RF.fit(D_train, t_train)

: 

In [None]:
# Hyperparameter combination selected by the grid search.
best_rf_params = gs_RF.best_params_
print('The best parameters for Random Forest Classifier are: ', best_rf_params)

: 

In [None]:
# Best mean cross-validated score found by the grid search, three decimals.
best_rf_score = round(gs_RF.best_score_, 3)
print("The best score using area under the ROC curve metric is:", best_rf_score)

: 

In [None]:
# One row per grid point: its parameter values plus the mean CV test score.
results_RF = pd.DataFrame(gs_RF.cv_results_['params']).assign(
    test_score=gs_RF.cv_results_['mean_test_score']
)

# Show the ten highest-scoring combinations and shade the top score.
(
    results_RF
    .nlargest(n=10, columns=['test_score'])
    .style
    .highlight_max(subset=['test_score'], color='lightgreen', axis=0)
)

: 

In [None]:
# Line plot of mean CV score against max_depth: one panel per split
# criterion, one line per n_estimators value.
rel_plot = sns.relplot(data=results_RF,
                       x='max_depth',
                       y='test_score',
                       col='criterion',
                       hue='n_estimators', style="n_estimators",
                       kind='line')

# Axis labels for the two panels (criterion has two values in the grid).
rel_plot.axes[0,0].set_xlabel('Max depth')
rel_plot.axes[0,1].set_xlabel('Max depth')
rel_plot.axes[0,0].set_ylabel('Score')

# Legend title and alignment.
rel_plot._legend.set_title("Number of\nestimators")
rel_plot._legend._legend_box.align="left"

# Leave room above the panels for the figure title.
rel_plot.fig.subplots_adjust(top=0.8)
rel_plot.fig.suptitle("Figure 3. Random forest performance")

# plt.show() takes no figure/grid argument; passing the FacetGrid
# positionally is not a supported call, so show the current figure instead.
plt.show()

: 

In [None]:
# Final model: use the combination the grid search actually selected.
# results_RF is in grid order (it is never sorted in place), so its first
# row is just the first grid point, not the best one; best_params_ holds the
# winning criterion / n_estimators / max_depth.
# The seed matches the estimator used during the search for reproducibility.
rf = RandomForestClassifier(random_state=999, **gs_RF.best_params_)
rf.fit(D_train, t_train)

# Predict labels for the held-out test split.
predictions = rf.predict(D_test)

: 

In [None]:
# Table pairing each test-split label with the model's prediction.
pred_df = pd.DataFrame(
    {
        'actual_label': t_test,
        'predicted_label': predictions,
    }
)

: 

In [None]:
# Preview the first ten actual/predicted pairs.
pred_df.head(n=10)

: 

In [None]:
# Fraction of test-split rows whose predicted label equals the actual label.
accuracy_score(y_true=t_test, y_pred=predictions)

: 

In [None]:
# Per-class precision / recall / F1 summary for the test split.
print(classification_report(y_true=t_test, y_pred=predictions))

: 

In [None]:
# Data Augmentation Here
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Read the labelled table from disk.
# NOTE(review): this is an absolute, machine-specific path, while an earlier
# cell reads 'data//vcf_with_labels.csv' - confirm which location is intended.
file_path = r'D:\data-augmentation-genetics\vcf_with_labels.csv'
df = pd.read_csv(file_path)

# Labels come from the 'label' column; every column other than 'label',
# 'samples' and 'score' is used as a feature and cast to float.
target = df['label'].values
data = df.drop(columns=['label', 'samples', 'score']).values.astype(float)

# Stratified 70/30 train/test split with a fixed seed.
D_train, D_test, t_train, t_test = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=999,
    stratify=target,
)

# Data augmentation: Add random noise to numeric columns
def augment_data(data, numeric_columns, noise_scale=0.1, random_state=None):
    """Return a copy of `data` with Gaussian noise added to selected columns.

    Parameters
    ----------
    data : 2-D array-like
        Feature matrix. It is never modified; a copy is perturbed instead.
        Non-float input (e.g. integer or object arrays) is converted to float
        first, because float noise cannot be added in place to an integer
        array.
    numeric_columns : iterable of int
        Column positions that receive noise. Other columns are returned
        unchanged. An empty iterable returns a plain copy.
    noise_scale : float, default 0.1
        Standard deviation of the zero-mean normal noise.
    random_state : int or None, default None
        Seed for a dedicated generator, making the noise reproducible.
        With None the global `np.random` state is used, as before.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as `data` with floating-point dtype.
    """
    augmented_data = np.array(data, copy=True)
    if not np.issubdtype(augmented_data.dtype, np.floating):
        augmented_data = augmented_data.astype(float)

    # Both the legacy module-level API and Generator expose .normal(loc, scale, size).
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    n_rows = len(augmented_data)
    for col_idx in numeric_columns:
        # Independent noise draw for every row of this column.
        augmented_data[:, col_idx] += rng.normal(0, noise_scale, n_rows)
    return augmented_data

numeric_columns = [0, 1, 2, 3]  # Adjust this list to include the indices of your numeric columns

# Seed NumPy's global generator so the noise drawn inside augment_data - and
# therefore the trained model and the metrics below - is the same on every run.
np.random.seed(999)
augmented_D_train = augment_data(D_train, numeric_columns)

# Train a Random Forest Classifier on augmented data
# (the noisy copy replaces the original training rows; labels are unchanged)
rf_classifier = RandomForestClassifier(random_state=999)
rf_classifier.fit(augmented_D_train, t_train)

# Evaluate the model on the original (un-augmented) test data
predictions = rf_classifier.predict(D_test)

# Print evaluation metrics and reports
print("Accuracy:", accuracy_score(t_test, predictions))
print("Classification Report:\n", classification_report(t_test, predictions))


: 

# Hyperparameter tuning

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Read the labelled table from disk.
# NOTE(review): absolute, machine-specific path - confirm it is intended.
file_path = r'D:\data-augmentation-genetics\vcf_with_labels.csv'
df = pd.read_csv(file_path)

# Target vector from the 'label' column.
target = df['label'].values

# Feature matrix: all columns except 'label', 'samples' and 'score', as floats.
feature_frame = df.drop(columns=['label', 'samples', 'score'])
data = feature_frame.values.astype(float)

# Stratified 70/30 train/test split with a fixed seed.
D_train, D_test, t_train, t_test = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=999,
    stratify=target,
)

# Seeded base estimator whose hyperparameters are searched.
rf_classifier = RandomForestClassifier(random_state=999)

# Candidate values per hyperparameter; the search tries every combination.
param_grid = dict(
    n_estimators=[100, 250, 500],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated grid search scored by accuracy, using all cores.
grid_search = GridSearchCV(
    rf_classifier,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split only.
grid_search.fit(D_train, t_train)

# Hyperparameter combination selected by the search.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Predict the test split with the search's best estimator.
best_model = grid_search.best_estimator_
predictions = best_model.predict(D_test)

# Test-split accuracy and the per-class report.
test_accuracy = accuracy_score(y_true=t_test, y_pred=predictions)
report_text = classification_report(y_true=t_test, y_pred=predictions)
print("Accuracy:", test_accuracy)
print("Classification Report:\n", report_text)