In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.utils import shuffle
from sklearn.utils.class_weight import compute_class_weight
import warnings
warnings.filterwarnings("ignore")

# Load dataset
df = pd.read_csv('kc1.csv')
df.fillna(df.median(), inplace=True)
df['defects'] = df['defects'].astype(int)

X = df.drop(columns=['defects'])
y = df['defects']

# Scale the data
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Data Augmentation: Add Gaussian Noise
def add_noise(data, noise_level=0.01):
    noise = np.random.normal(0, noise_level, data.shape)
    return data + noise

X_augmented = add_noise(X_scaled)
X_scaled = pd.concat([X_scaled, X_augmented], ignore_index=True)
y = pd.concat([y, y], ignore_index=True)

# SMOTE helper
def apply_smote(X_train, y_train):
    smote = SMOTE(sampling_strategy=0.6, random_state=42)
    return smote.fit_resample(X_train, y_train)

# Evaluation metric helper
def evaluate_model(model, X_test, y_test):
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]
    return {
        "Accuracy": accuracy_score(y_test, preds),
        "Precision": precision_score(y_test, preds),
        "Recall": recall_score(y_test, preds),
        "F1": f1_score(y_test, preds),
        "ROC-AUC": roc_auc_score(y_test, probs)
    }

# Base Classifier (easy to swap)
def get_classifier(class_weight_dict):
    return RandomForestClassifier(n_estimators=200, max_depth=12, class_weight=class_weight_dict, random_state=42)
    # return LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
    # return XGBClassifier(scale_pos_weight=class_weight_dict[0]/class_weight_dict[1], use_label_encoder=False, eval_metric='logloss')

# Thresholds to try
thresholds = [0.9, 0.85, 0.8, 0.75, 0.7, 0.65]

# Split ratios
split_ratios = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
table1_results = []
table2_results = []

for ratio in split_ratios:
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)
    X_train_labeled, X_unlabeled, y_train_labeled, y_unlabeled = train_test_split(
        X_train, y_train, test_size=(1 - ratio), stratify=y_train, random_state=42)

    X_train_labeled, y_train_labeled = apply_smote(X_train_labeled, y_train_labeled)
    class_weights = compute_class_weight("balanced", classes=np.unique(y_train_labeled), y=y_train_labeled)
    class_weight_dict = {0: class_weights[0], 1: class_weights[1]}

    clf = get_classifier(class_weight_dict)
    clf.fit(X_train_labeled, y_train_labeled)

    supervised_metrics = evaluate_model(clf, X_test, y_test)
    best_f1 = 0
    best_metrics = {}
    best_threshold = None

    for threshold in thresholds:
        X_train_final = X_train_labeled.copy()
        y_train_final = y_train_labeled.copy()
        X_unlabeled_copy = X_unlabeled.copy()
        patience, max_patience = 0, 3
        prev_f1 = 0

        for _ in range(10):  # Max 10 rounds
            probs = clf.predict_proba(X_unlabeled_copy)
            max_probs = np.max(probs, axis=1)
            pseudo_labels = np.argmax(probs, axis=1)
            high_confidence_idx = np.where(max_probs >= threshold)[0]

            if len(high_confidence_idx) == 0:
                break

            # Add high confidence pseudo-labels
            X_train_final = pd.concat(
                [pd.DataFrame(X_train_final, columns=X.columns), X_unlabeled_copy.iloc[high_confidence_idx]],
                ignore_index=True
            )
            y_train_final = np.concatenate((y_train_final, pseudo_labels[high_confidence_idx]))

            # Drop used samples
            X_unlabeled_copy = X_unlabeled_copy.drop(X_unlabeled_copy.index[high_confidence_idx])

            # Shuffle and retrain
            X_train_final, y_train_final = shuffle(X_train_final, y_train_final, random_state=42)
            clf.fit(X_train_final, y_train_final)
            current_f1 = f1_score(y_test, clf.predict(X_test))

            # Early stopping if no improvement
            if current_f1 <= prev_f1:
                patience += 1
            else:
                patience = 0
                prev_f1 = current_f1

            if patience >= max_patience:
                break

        semi_metrics = evaluate_model(clf, X_test, y_test)
        table2_results.append([ratio, threshold, *supervised_metrics.values(), *semi_metrics.values()])

        if semi_metrics["F1"] > best_f1:
            best_f1 = semi_metrics["F1"]
            best_metrics = semi_metrics
            best_threshold = threshold

    table1_results.append([ratio, *supervised_metrics.values(), best_threshold, *best_metrics.values()])

# Save to DataFrames
table1_df = pd.DataFrame(table1_results, columns=[
    "Labeled-Unlabeled Ratio", "Sup_Acc", "Sup_Prec", "Sup_Recall", "Sup_F1", "Sup_ROC_AUC",
    "Best Threshold", "Best_Semi_Acc", "Best_Semi_Prec", "Best_Semi_Recall", "Best_Semi_F1", "Best_Semi_ROC_AUC"
])

table2_df = pd.DataFrame(table2_results, columns=[
    "Labeled-Unlabeled Ratio", "Threshold",
    "Supervised Acc", "Supervised Prec", "Supervised Recall", "Supervised F1", "Supervised ROC_AUC",
    "Semi Acc", "Semi Prec", "Semi Recall", "Semi F1", "Semi ROC_AUC"
])

# Display
print("\nTable I: Summary of Best Threshold per Ratio\n")
print(table1_df)

print("\nTable II: All Threshold Results\n")
print(table2_df)

# Optional: Save to CSV
table1_df.to_csv("table1_summary.csv", index=False)
table2_df.to_csv("table2_thresholds.csv", index=False)



Table I: Summary of Best Threshold per Ratio

   Labeled-Unlabeled Ratio   Sup_Acc  Sup_Prec  Sup_Recall    Sup_F1  \
0                      0.1  0.843602  0.489583    0.361538  0.415929   
1                      0.2  0.861374  0.555556    0.500000  0.526316   
2                      0.3  0.870853  0.596330    0.500000  0.543933   
3                      0.4  0.873223  0.596639    0.546154  0.570281   
4                      0.5  0.874408  0.596774    0.569231  0.582677   
5                      0.6  0.870853  0.576642    0.607692  0.591760   
6                      0.7  0.877962  0.609756    0.576923  0.592885   
7                      0.8  0.886256  0.639344    0.600000  0.619048   
8                      0.9  0.880332  0.612403    0.607692  0.610039   

   Sup_ROC_AUC  Best Threshold  Best_Semi_Acc  Best_Semi_Prec  \
0     0.806249            0.85       0.859005        0.569620   
1     0.836598            0.90       0.859005        0.549550   
2     0.853674            0.85       

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.combine import SMOTETomek
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
import random
import tensorflow as tf

# Load dataset
data = pd.read_csv('kc1.csv')
data.dropna(inplace=True)
X = data.drop(['defects'], axis=1).values
y = data['defects'].values

# Resample using SMOTE-Tomek
smt = SMOTETomek(random_state=42)
X_resampled, y_resampled = smt.fit_resample(X, y)

# Normalize features
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42)

# GA + ANN functions
def create_model(input_dim, hidden_units, learning_rate, dropout_rate):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(hidden_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(hidden_units // 2, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model

def GA_tune(X, y, population_size=6, generations=5):
    input_dim = X.shape[1]

    # Extended search space
    hidden_units_list = [32, 64, 128, 256]
    learning_rates = [0.0005, 0.001, 0.0025, 0.005, 0.01]
    batch_sizes = [16, 32, 64, 128]
    epochs_list = [20, 30, 40]
    dropout_rates = [0.1, 0.2, 0.3, 0.4]

    def random_individual():
        return {
            'hidden_units': random.choice(hidden_units_list),
            'learning_rate': random.choice(learning_rates),
            'batch_size': random.choice(batch_sizes),
            'epochs': random.choice(epochs_list),
            'dropout_rate': random.choice(dropout_rates)
        }

    def fitness(indiv):
        model = create_model(input_dim, indiv['hidden_units'], indiv['learning_rate'], indiv['dropout_rate'])
        early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
        model.fit(X_train, y_train,
                  epochs=indiv['epochs'],
                  batch_size=indiv['batch_size'],
                  verbose=0,
                  validation_split=0.2,
                  callbacks=[early_stop])
        _, acc = model.evaluate(X_test, y_test, verbose=0)
        return acc

    # Initial population
    population = [random_individual() for _ in range(population_size)]

    for gen in range(generations):
        print(f"\nGeneration {gen+1}")
        fitness_scores = [fitness(indiv) for indiv in population]
        sorted_pop = [x for _, x in sorted(zip(fitness_scores, population), key=lambda pair: pair[0], reverse=True)]

        new_population = sorted_pop[:2]  # Elitism
        while len(new_population) < population_size:
            p1, p2 = random.sample(sorted_pop[:4], 2)
            child = {
                'hidden_units': random.choice([p1['hidden_units'], p2['hidden_units']]),
                'learning_rate': random.choice([p1['learning_rate'], p2['learning_rate']]),
                'batch_size': random.choice([p1['batch_size'], p2['batch_size']]),
                'epochs': random.choice([p1['epochs'], p2['epochs']]),
                'dropout_rate': random.choice([p1['dropout_rate'], p2['dropout_rate']])
            }
            # Mutation
            if random.random() < 0.2:
                child['hidden_units'] = random.choice(hidden_units_list)
            new_population.append(child)

        population = new_population

    best_individual = population[0]
    print("\nBest Parameters from GA:", best_individual)
    return best_individual

# Run GA to find best parameters
best_params = GA_tune(X_resampled, y_resampled)

# Train final model using best params
model = create_model(
    X_train.shape[1],
    best_params['hidden_units'],
    best_params['learning_rate'],
    best_params['dropout_rate']
)

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(X_train, y_train,
                    validation_split=0.2,
                    epochs=best_params['epochs'],
                    batch_size=best_params['batch_size'],
                    verbose=1,
                    callbacks=[early_stop])

# Predict and evaluate
y_pred = (model.predict(X_test) > 0.5).astype("int32")
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
report = classification_report(y_test, y_pred, digits=4)

print("\nFinal Test Accuracy:", acc)
print("Final F1 Score:", f1)
print("\nClassification Report:\n", report)



Generation 1

Generation 2

Generation 3

Generation 4

Generation 5

Best Parameters from GA: {'hidden_units': 256, 'learning_rate': 0.001, 'batch_size': 64, 'epochs': 40, 'dropout_rate': 0.4}
Epoch 1/40
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 37ms/step - accuracy: 0.6993 - loss: 0.5949 - val_accuracy: 0.7125 - val_loss: 0.5612
Epoch 2/40
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7296 - loss: 0.5349 - val_accuracy: 0.7216 - val_loss: 0.5486
Epoch 3/40
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7320 - loss: 0.5274 - val_accuracy: 0.7234 - val_loss: 0.5418
Epoch 4/40
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7396 - loss: 0.5275 - val_accuracy: 0.7308 - val_loss: 0.5346
Epoch 5/40
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7634 - loss: 0.5029 - val_accuracy: 0.7326 - val_loss: 0.5288
Epoc

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.combine import SMOTETomek
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
import random
import tensorflow as tf

# Load dataset
data = pd.read_csv('pc1.csv')
data.dropna(inplace=True)
X = data.drop(['defects'], axis=1).values
y = data['defects'].values

# Resample using SMOTE-Tomek
smt = SMOTETomek(random_state=42)
X_resampled, y_resampled = smt.fit_resample(X, y)

# Normalize features
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42)

# GA + ANN functions
def create_model(input_dim, hidden_units, learning_rate, dropout_rate):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(hidden_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(hidden_units // 2, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model

def GA_tune(X, y, population_size=6, generations=5):
    input_dim = X.shape[1]

    # Extended search space
    hidden_units_list = [32, 64, 128, 256]
    learning_rates = [0.0005, 0.001, 0.0025, 0.005, 0.01]
    batch_sizes = [16, 32, 64, 128]
    epochs_list = [20, 30, 40]
    dropout_rates = [0.1, 0.2, 0.3, 0.4]

    def random_individual():
        return {
            'hidden_units': random.choice(hidden_units_list),
            'learning_rate': random.choice(learning_rates),
            'batch_size': random.choice(batch_sizes),
            'epochs': random.choice(epochs_list),
            'dropout_rate': random.choice(dropout_rates)
        }

    def fitness(indiv):
        model = create_model(input_dim, indiv['hidden_units'], indiv['learning_rate'], indiv['dropout_rate'])
        early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
        model.fit(X_train, y_train,
                  epochs=indiv['epochs'],
                  batch_size=indiv['batch_size'],
                  verbose=0,
                  validation_split=0.2,
                  callbacks=[early_stop])
        _, acc = model.evaluate(X_test, y_test, verbose=0)
        return acc

    # Initial population
    population = [random_individual() for _ in range(population_size)]

    for gen in range(generations):
        print(f"\nGeneration {gen+1}")
        fitness_scores = [fitness(indiv) for indiv in population]
        sorted_pop = [x for _, x in sorted(zip(fitness_scores, population), key=lambda pair: pair[0], reverse=True)]

        new_population = sorted_pop[:2]  # Elitism
        while len(new_population) < population_size:
            p1, p2 = random.sample(sorted_pop[:4], 2)
            child = {
                'hidden_units': random.choice([p1['hidden_units'], p2['hidden_units']]),
                'learning_rate': random.choice([p1['learning_rate'], p2['learning_rate']]),
                'batch_size': random.choice([p1['batch_size'], p2['batch_size']]),
                'epochs': random.choice([p1['epochs'], p2['epochs']]),
                'dropout_rate': random.choice([p1['dropout_rate'], p2['dropout_rate']])
            }
            # Mutation
            if random.random() < 0.2:
                child['hidden_units'] = random.choice(hidden_units_list)
            new_population.append(child)

        population = new_population

    best_individual = population[0]
    print("\nBest Parameters from GA:", best_individual)
    return best_individual

# Run GA to find best parameters
best_params = GA_tune(X_resampled, y_resampled)

# Train final model using best params
model = create_model(
    X_train.shape[1],
    best_params['hidden_units'],
    best_params['learning_rate'],
    best_params['dropout_rate']
)

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(X_train, y_train,
                    validation_split=0.2,
                    epochs=best_params['epochs'],
                    batch_size=best_params['batch_size'],
                    verbose=1,
                    callbacks=[early_stop])

# Predict and evaluate
y_pred = (model.predict(X_test) > 0.5).astype("int32")
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
report = classification_report(y_test, y_pred, digits=4)

print("\nFinal Test Accuracy:", acc)
print("Final F1 Score:", f1)
print("\nClassification Report:\n", report)



Generation 1

Generation 2

Generation 3

Generation 4

Generation 5

Best Parameters from GA: {'hidden_units': 256, 'learning_rate': 0.0025, 'batch_size': 64, 'epochs': 20, 'dropout_rate': 0.1}
Epoch 1/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 69ms/step - accuracy: 0.6534 - loss: 0.5951 - val_accuracy: 0.8026 - val_loss: 0.4581
Epoch 2/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7956 - loss: 0.4216 - val_accuracy: 0.8355 - val_loss: 0.3928
Epoch 3/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8297 - loss: 0.3766 - val_accuracy: 0.8947 - val_loss: 0.3099
Epoch 4/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8751 - loss: 0.3166 - val_accuracy: 0.8717 - val_loss: 0.3196
Epoch 5/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8638 - loss: 0.3158 - val_accuracy: 0.9013 - val_loss: 0.2820
Epo