<a href="https://colab.research.google.com/github/spencer-kanjera/Hybrid-Botnet-Detection-Using-GA-and-LightGBM/blob/main/testbed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only cell: `google.colab` is imported here, so this cell will not run
# outside a Colab runtime. Mounts Drive at /content/drive, which is the path
# prefix the next cell writes to.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
with open('/content/drive/My Drive/Colab_files/foo.txt', 'w') as f:
  f.write('Hello Google Drive!')
!cat /content/drive/My\ Drive/Colab_files/foo.txt

In [None]:
import numpy as np
import pandas as pd
import random
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the Breast Cancer dataset into a DataFrame: one column per feature name,
# followed by the dataset's target values in a trailing 'Target' column.
cancer = load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names).assign(Target=cancer.target)

# Numeric and Boolean Data Type Mutations
def mutate_numeric_column(value, mutation_rate=0.1):
    """Randomly perturb a numeric value.

    With probability ``mutation_rate`` the value is combined with a random
    operand drawn uniformly from [0.5, 1.5] using one of ``+``, ``-``, ``*``
    or ``/`` (chosen uniformly). Otherwise the value is returned unchanged.

    Args:
        value: The number to (possibly) mutate.
        mutation_rate: Probability of applying a mutation.

    Returns:
        The mutated value, or ``value`` itself when no mutation was applied.
    """
    # Guard clause: leave the value untouched unless the random draw falls
    # below the mutation rate.
    if not random.random() < mutation_rate:
        return value

    operation = random.choice(['+', '-', '*', '/'])
    # The operand comes from [0.5, 1.5], so it is never zero and the division
    # below needs no zero check.
    mutation_value = random.uniform(0.5, 1.5)

    apply_operation = {
        '+': lambda current, operand: current + operand,
        '-': lambda current, operand: current - operand,
        '*': lambda current, operand: current * operand,
        '/': lambda current, operand: current / operand,
    }
    return apply_operation[operation](value, mutation_value)

def mutate_boolean_column(value, mutation_rate=0.1):
    """Randomly perturb a 0/1 value with a bitwise operation.

    With probability ``mutation_rate`` the value is cast to ``int`` and
    combined with a random bit (0 or 1) using AND, OR or XOR (chosen
    uniformly). Otherwise the value is returned unchanged, without the cast.

    Args:
        value: The flag to (possibly) mutate; must be convertible with ``int``.
        mutation_rate: Probability of applying a mutation.

    Returns:
        An ``int`` result when a mutation was applied, else ``value`` as given.
    """
    if not random.random() < mutation_rate:
        return value

    bit = int(value)
    mask = random.randint(0, 1)
    operation = random.choice(['AND', 'OR', 'XOR'])

    if operation == 'AND':
        return bit & mask
    if operation == 'OR':
        return bit | mask
    # Only 'XOR' remains.
    return bit ^ mask

def mutate_row(row, mutation_rate=0.1):
    """Mutate every entry of a row in place and return the row.

    The 'Target' entry goes through ``mutate_boolean_column``; every other
    entry goes through ``mutate_numeric_column``. Each entry is mutated
    independently with probability ``mutation_rate``.

    Args:
        row: An indexable row exposing ``.index`` (e.g. a pandas Series).
            It is modified in place.
        mutation_rate: Per-entry mutation probability.

    Returns:
        The same ``row`` object, after mutation.
    """
    for col in row.index:
        # The label column gets the bitwise mutator, features the arithmetic one.
        mutator = mutate_boolean_column if col == 'Target' else mutate_numeric_column
        row[col] = mutator(row[col], mutation_rate)
    return row

def create_new_row(df):
    """Create a synthetic row by mutating a random row sampled from ``df``.

    One row is drawn with ``df.sample``, copied so ``df`` is left untouched,
    and passed through ``mutate_row`` with its default mutation rate.

    Args:
        df: Source DataFrame to sample from.

    Returns:
        The mutated row as a pandas Series.
    """
    sampled = df.sample(n=1).copy()
    candidate = sampled.reset_index(drop=True).iloc[0]
    return mutate_row(candidate)

def evaluate_model_fitness(df):
    """Train a LightGBM binary classifier on ``df`` and return its test accuracy.

    The frame is split 70/30 with a fixed seed, a model is trained for 100
    boosting rounds on the training part, and predictions on the held-out part
    are thresholded at 0.5 and scored with ``accuracy_score``.

    Args:
        df: DataFrame containing feature columns and a 'Target' label column.

    Returns:
        Accuracy on the 30% held-out split.
    """
    features = df.drop(columns=['Target'])
    labels = df['Target']

    # Fixed random_state keeps the split reproducible between calls.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_error',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'verbosity': -1,  # Turn off the print statements of the function
    }

    # The held-out split is also registered as the validation set for training.
    train_set = lgb.Dataset(X_train, label=y_train)
    holdout_set = lgb.Dataset(X_test, label=y_test, reference=train_set)

    booster = lgb.train(
        lgb_params,
        train_set,
        num_boost_round=100,
        valid_sets=[holdout_set],
    )

    # Turn predicted probabilities into hard 0/1 labels.
    probabilities = booster.predict(X_test, num_iteration=booster.best_iteration)
    predicted_labels = [int(probability > 0.5) for probability in probabilities]

    return accuracy_score(y_test, predicted_labels)

def add_rows_to_df_with_fitness(df, num_rows=50):
    """Extend ``df`` with the best-scoring half of ``num_rows`` mutated rows.

    For each candidate, a mutated row is created with ``create_new_row``,
    appended to a temporary copy of ``df``, and scored with
    ``evaluate_model_fitness``. That trains one model per candidate, so the
    running time grows linearly with ``num_rows``. The ``num_rows // 2``
    candidates with the highest scores are appended to the data.

    Args:
        df: Source DataFrame; it is not modified.
        num_rows: Number of candidate rows to generate and score.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the rows of
        ``df`` followed by the selected candidates. When no candidate is
        selected (``num_rows < 2``) the result is a re-indexed copy of ``df``.
    """
    scored_candidates = []
    for _ in range(num_rows):
        candidate = create_new_row(df)
        trial_df = pd.concat([df, pd.DataFrame([candidate])], ignore_index=True)
        scored_candidates.append((candidate, evaluate_model_fitness(trial_df)))

    # Highest fitness first; sorted() is stable, so ties keep generation order.
    ranked = sorted(scored_candidates, key=lambda pair: pair[1], reverse=True)
    best_rows = [row for row, _score in ranked[:num_rows // 2]]

    if not best_rows:
        # Nothing to append. Concatenating an empty DataFrame is deprecated in
        # pandas and may affect the result dtypes, so return an equivalent
        # re-indexed copy instead.
        return df.reset_index(drop=True)

    return pd.concat([df, pd.DataFrame(best_rows)], ignore_index=True)

# Extend the old data with new data.
# With num_rows=1000 the helper generates 1000 mutated candidate rows, scores
# each one with its own evaluate_model_fitness call (one model training per
# candidate), and appends the top num_rows // 2 = 500 of them. The result is a
# new DataFrame; `df` itself is not modified.
df_expanded = add_rows_to_df_with_fitness(df, num_rows=1000)

def evaluate_model(df, description=""):
    """Train a LightGBM classifier on ``df`` and print test/validation accuracy.

    The data is split 60/20/20 into train, test and validation parts with a
    fixed seed. The model is trained for 100 boosting rounds with the test
    part registered as LightGBM's validation set, and is then scored on both
    the test and the validation part using a 0.5 probability threshold.

    Args:
        df: DataFrame containing feature columns and a 'Target' label column.
        description: Label used in the printed report header.

    Returns:
        None. The accuracies are printed, not returned.
    """
    features = df.drop(columns=['Target'])
    labels = df['Target']

    # First carve off 40% of the rows, then halve that remainder into the
    # test and validation parts (20% of the rows each).
    X_train, X_rest, y_train, y_rest = train_test_split(
        features, labels, test_size=0.4, random_state=42
    )
    X_test, X_val, y_test, y_val = train_test_split(
        X_rest, y_rest, test_size=0.5, random_state=42
    )

    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_error',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'verbosity': -1,
    }

    train_set = lgb.Dataset(X_train, label=y_train)
    test_set = lgb.Dataset(X_test, label=y_test, reference=train_set)

    booster = lgb.train(
        lgb_params,
        train_set,
        num_boost_round=100,
        valid_sets=[test_set],
    )

    def predict_labels(samples):
        # Predict probabilities and threshold them into hard 0/1 labels.
        probabilities = booster.predict(samples, num_iteration=booster.best_iteration)
        return [int(probability > 0.5) for probability in probabilities]

    # Predict on the test and validation datasets (test first, as before).
    test_labels = predict_labels(X_test)
    val_labels = predict_labels(X_val)

    test_accuracy = accuracy_score(y_test, test_labels)
    val_accuracy = accuracy_score(y_val, val_labels)

    print(f"\n{description} Data Set")
    print(f"Test Accuracy: {test_accuracy:.2f}")
    print(f"Validation Set Accuracy: {val_accuracy:.2f}")

# Evaluate model with starting dataset (baseline on the original rows only).
# The second argument is only used as the header of the printed report.
evaluate_model(df, "Start")

# Extended dataset evaluation.
# NOTE(review): the extended frame contains mutated copies of original rows
# and evaluate_model splits all rows at random, so a synthetic row and the row
# it was derived from may land on opposite sides of the split. The two reports
# are therefore probably not directly comparable; confirm before drawing
# conclusions from the difference.
evaluate_model(df_expanded, "Extended")