MLP

In [None]:
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load data from ARFF file
data, meta = arff.loadarff('/content/drive/MyDrive/nasa/MW1.arff')  # Replace 'CM1.arff' with your ARFF file path

# Convert data to DataFrame
df = pd.DataFrame(data)

# Extract features and labels
X = df.drop(columns=['Defective']).values  # Features (all columns except the 'defective' column)
y = df['Defective'].values  # Labels (the 'class' column)

# Map 'Y' and 'N' labels to numerical values (e.g., 1 for 'Y' and 0 for 'N')
y = np.where(y == b'Y', 1, 0)

# Selecting only the desired columns from the features DataFrame
selected_columns = ['LOC_EXECUTABLE','CYCLOMATIC_COMPLEXITY','CYCLOMATIC_DENSITY', 'ESSENTIAL_COMPLEXITY', 'DESIGN_COMPLEXITY','DESIGN_COMPLEXITY','HALSTEAD_LENGTH','HALSTEAD_DIFFICULTY','HALSTEAD_LEVEL','HALSTEAD_EFFORT','HALSTEAD_ERROR_EST','HALSTEAD_CONTENT','HALSTEAD_PROG_TIME','LOC_COMMENTS','LOC_BLANK','LOC_CODE_AND_COMMENT','NUM_UNIQUE_OPERATORS','NUM_UNIQUE_OPERANDS','NUM_OPERATORS','NUM_OPERANDS','BRANCH_COUNT']
X_selected = df[selected_columns].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define batch size and number of epochs
batch_size = 7
epochs = 100

# Initialize and train a multi-layer perceptron (MLP) classifier with batch size and epochs
# Adjust parameters as needed (e.g., hidden layer sizes, activation function, solver, etc.)
clf = MLPClassifier(hidden_layer_sizes=(50,50,50,50,50,50,50,50,50,50), activation='relu', solver='adam', random_state=42, batch_size=batch_size, max_iter=epochs)
clf.fit(X_train_scaled, y_train)

# Predict labels for the test set
y_pred = clf.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Extract values from confusion matrix
tn, fp, fn, tp = conf_matrix.ravel()

# Calculate detection rate (sensitivity)
detection_rate = tp / (tp + fn)

# Calculate true negative rate (TNR)
tnr = tn / (tn + fp)

# Print detection rate, TNR, and accuracy
print("Detection Rate (Sensitivity):", detection_rate)
print("True Negative Rate (TNR):", tnr)
print("Accuracy:", accuracy)




Detection Rate (Sensitivity): 0.3
True Negative Rate (TNR): 0.926829268292683
Accuracy: 0.803921568627451


CNN

In [None]:
import numpy as np
from scipy.io import arff
import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from tqdm import tqdm  # Import tqdm for loading animation

# Load data from ARFF file
data, meta = arff.loadarff('/content/drive/MyDrive/nasa/MW1.arff')  # Replace 'CM1.arff' with your ARFF file path

# Convert data to DataFrame
df = pd.DataFrame(data)

# Extract features and labels
X = df.drop(columns=['Defective']).values  # Features (all columns except the 'defective' column)
y = df['Defective'].values  # Labels (the 'class' column)

# Map 'Y' and 'N' labels to numerical values (e.g., 1 for 'Y' and 0 for 'N')
y = np.where(y == b'Y', 1, 0)

# Selecting only the desired columns from the features DataFrame
selected_columns = ['LOC_EXECUTABLE','CYCLOMATIC_COMPLEXITY','CYCLOMATIC_DENSITY', 'ESSENTIAL_COMPLEXITY', 'DESIGN_COMPLEXITY','DESIGN_COMPLEXITY','HALSTEAD_LENGTH','HALSTEAD_DIFFICULTY','HALSTEAD_LEVEL','HALSTEAD_EFFORT','HALSTEAD_ERROR_EST','HALSTEAD_CONTENT','HALSTEAD_PROG_TIME','LOC_COMMENTS','LOC_BLANK','LOC_CODE_AND_COMMENT','NUM_UNIQUE_OPERATORS','NUM_UNIQUE_OPERANDS','NUM_OPERATORS','NUM_OPERANDS','BRANCH_COUNT']
X_selected = df[selected_columns].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data for CNN input (assuming input shape is (number of samples, number of features, 1))
X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)
X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)

# Define CNN architecture
model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train_reshaped.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model with loading animation
num_epochs = 2000
with tqdm(total=num_epochs) as pbar:  # Initialize tqdm with total number of epochs
    for epoch in range(num_epochs):
        model.fit(X_train_reshaped, y_train, epochs=1, batch_size=10, validation_split=0.2, verbose=0)
        pbar.update(1)  # Update tqdm progress bar after each epoch

# Evaluate model
# Predict probabilities for the test set
y_pred_prob = model.predict(X_test_reshaped)

# Threshold probabilities to obtain predicted class labels
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Extract values from confusion matrix
true_negatives = conf_matrix[0][0]
false_positives = conf_matrix[0][1]
false_negatives = conf_matrix[1][0]
true_positives = conf_matrix[1][1]

# Calculate accuracy, detection rate (sensitivity), and true negative rate (TNR)
accuracy = accuracy_score(y_test, y_pred)
detection_rate = true_positives / (true_positives + false_negatives)
tnr = true_negatives / (true_negatives + false_positives)

# Print the results
print("Accuracy:", accuracy)
print("Detection Rate (Sensitivity):", detection_rate)
print("True Negative Rate (TNR):", tnr)







100%|██████████| 2000/2000 [11:36<00:00,  2.87it/s]

Accuracy: 0.803921568627451
Detection Rate (Sensitivity): 0.1
True Negative Rate (TNR): 0.975609756097561





RNN

In [None]:
import numpy as np
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense
from tqdm import tqdm  # Import tqdm for loading animation

# Load data from ARFF file
data, meta = arff.loadarff('/content/drive/MyDrive/nasa/MW1.arff')  # Replace 'CM1.arff' with your ARFF file path

# Convert data to DataFrame
df = pd.DataFrame(data)

# Extract features and labels
X = df.drop(columns=['Defective']).values  # Features (all columns except the 'defective' column)
y = df['Defective'].values  # Labels (the 'class' column)

# Map 'Y' and 'N' labels to numerical values (e.g., 1 for 'Y' and 0 for 'N')
y = np.where(y == b'Y', 1, 0)

# Selecting only the desired columns from the features DataFrame
selected_columns = ['LOC_EXECUTABLE','CYCLOMATIC_COMPLEXITY','CYCLOMATIC_DENSITY', 'ESSENTIAL_COMPLEXITY', 'DESIGN_COMPLEXITY','DESIGN_COMPLEXITY','HALSTEAD_LENGTH','HALSTEAD_DIFFICULTY','HALSTEAD_LEVEL','HALSTEAD_EFFORT','HALSTEAD_ERROR_EST','HALSTEAD_CONTENT','HALSTEAD_PROG_TIME','LOC_COMMENTS','LOC_BLANK','LOC_CODE_AND_COMMENT','NUM_UNIQUE_OPERATORS','NUM_UNIQUE_OPERANDS','NUM_OPERATORS','NUM_OPERANDS','BRANCH_COUNT']
X_selected = df[selected_columns].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data for RNN input (assuming input shape is (number of samples, number of features))
X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])

# Define Vanilla RNN architecture
model = Sequential([
    SimpleRNN(units=32, activation='relu', input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model with loading animation
num_epochs = 10
with tqdm(total=num_epochs) as pbar:  # Initialize tqdm with total number of epochs
    for epoch in range(num_epochs):
        model.fit(X_train_reshaped, y_train, epochs=100, batch_size=10, validation_split=0.2, verbose=0)
        pbar.update(1)  # Update tqdm progress bar after each epoch

# Evaluate model
y_pred_prob = model.predict(X_test_reshaped)

# Threshold probabilities to obtain predicted class labels
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Extract values from confusion matrix
true_negatives = conf_matrix[0][0]
false_positives = conf_matrix[0][1]
false_negatives = conf_matrix[1][0]
true_positives = conf_matrix[1][1]

# Calculate accuracy, detection rate (sensitivity), and true negative rate (TNR)
accuracy = accuracy_score(y_test, y_pred)
detection_rate = true_positives / (true_positives + false_negatives)
tnr = true_negatives / (true_negatives + false_positives)

# Print the results
print("Accuracy:", accuracy)
print("Detection Rate (Sensitivity):", detection_rate)
print("True Negative Rate (TNR):", tnr)


100%|██████████| 10/10 [02:34<00:00, 15.44s/it]


Accuracy: 0.8431372549019608
Detection Rate (Sensitivity): 0.2
True Negative Rate (TNR): 1.0


LSTM

In [None]:
import numpy as np
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from keras.models import Sequential
from keras.layers import LSTM, Dense
from tqdm import tqdm  # Import tqdm for loading animation

# Load data from ARFF file
data, meta = arff.loadarff('/content/drive/MyDrive/nasa/MW1.arff')  # Replace 'CM1.arff' with your ARFF file path

# Convert data to DataFrame
df = pd.DataFrame(data)

# Extract features and labels
X = df.drop(columns=['Defective']).values  # Features (all columns except the 'defective' column)
y = df['Defective'].values  # Labels (the 'class' column)

# Map 'Y' and 'N' labels to numerical values (e.g., 1 for 'Y' and 0 for 'N')
y = np.where(y == b'Y', 1, 0)

# Selecting only the desired columns from the features DataFrame
selected_columns = ['LOC_EXECUTABLE','CYCLOMATIC_COMPLEXITY','CYCLOMATIC_DENSITY', 'ESSENTIAL_COMPLEXITY', 'DESIGN_COMPLEXITY','DESIGN_COMPLEXITY','HALSTEAD_LENGTH','HALSTEAD_DIFFICULTY','HALSTEAD_LEVEL','HALSTEAD_EFFORT','HALSTEAD_ERROR_EST','HALSTEAD_CONTENT','HALSTEAD_PROG_TIME','LOC_COMMENTS','LOC_BLANK','LOC_CODE_AND_COMMENT','NUM_UNIQUE_OPERATORS','NUM_UNIQUE_OPERANDS','NUM_OPERATORS','NUM_OPERANDS','BRANCH_COUNT']
X_selected = df[selected_columns].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data for LSTM input (assuming input shape is (number of samples, number of time steps, number of features))
X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])

# Define LSTM architecture
model = Sequential([
    LSTM(units=32, activation='relu', input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model with loading animation
num_epochs = 10
with tqdm(total=num_epochs) as pbar:  # Initialize tqdm with total number of epochs
    for epoch in range(num_epochs):
        model.fit(X_train_reshaped, y_train, epochs=100, batch_size=10, validation_split=0.2, verbose=0)
        pbar.update(1)  # Update tqdm progress bar after each epoch

# Predict probabilities for the test set
y_pred_prob = model.predict(X_test_reshaped)

# Threshold probabilities to obtain predicted class labels
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Extract values from confusion matrix
true_negatives = conf_matrix[0][0]
false_positives = conf_matrix[0][1]
false_negatives = conf_matrix[1][0]
true_positives = conf_matrix[1][1]

# Calculate accuracy, detection rate (sensitivity), and true negative rate (TNR)
accuracy = accuracy_score(y_test, y_pred)
detection_rate = true_positives / (true_positives + false_negatives)
tnr = true_negatives / (true_negatives + false_positives)

# Print the results
print("Accuracy:", accuracy)
print("Detection Rate (Sensitivity):", detection_rate)
print("True Negative Rate (TNR):", tnr)



100%|██████████| 10/10 [02:28<00:00, 14.84s/it]


Accuracy: 0.8431372549019608
Detection Rate (Sensitivity): 0.2
True Negative Rate (TNR): 1.0


HYBRID

In [None]:
import numpy as np
from scipy.io import arff
import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from tqdm import tqdm  # Import tqdm for loading animation

# Load data from ARFF file
data, meta = arff.loadarff('/content/drive/MyDrive/nasa/PC3.arff')  # Replace 'CM1.arff' with your ARFF file path

# Convert data to DataFrame
df = pd.DataFrame(data)

# Extract features and labels
X = df.drop(columns=['Defective']).values  # Features (all columns except the 'defective' column)
y = df['Defective'].values  # Labels (the 'class' column)

# Map 'Y' and 'N' labels to numerical values (e.g., 1 for 'Y' and 0 for 'N')
y = np.where(y == b'Y', 1, 0)

# Selecting only the desired columns from the features DataFrame
selected_columns = ['LOC_EXECUTABLE','CYCLOMATIC_COMPLEXITY','CYCLOMATIC_DENSITY', 'ESSENTIAL_COMPLEXITY', 'DESIGN_COMPLEXITY','DESIGN_COMPLEXITY','HALSTEAD_LENGTH','HALSTEAD_DIFFICULTY','HALSTEAD_LEVEL','HALSTEAD_EFFORT','HALSTEAD_ERROR_EST','HALSTEAD_CONTENT','HALSTEAD_PROG_TIME','LOC_COMMENTS','LOC_BLANK','LOC_CODE_AND_COMMENT','NUM_UNIQUE_OPERATORS','NUM_UNIQUE_OPERANDS','NUM_OPERATORS','NUM_OPERANDS','BRANCH_COUNT']
X_selected = df[selected_columns].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data for CNN input (assuming input shape is (number of samples, number of features, 1))
X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)
X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)

# Define CNN architecture
model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    LSTM(64, return_sequences=True),  # RNN layer to capture temporal dependencies
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model with loading animation
num_epochs = 100
with tqdm(total=num_epochs) as pbar:  # Initialize tqdm with total number of epochs
    for epoch in range(num_epochs):
        model.fit(X_train_reshaped, y_train, epochs=1, batch_size=10, validation_split=0.2, verbose=0)
        pbar.update(1)  # Update tqdm progress bar after each epoch

# Evaluate model
# Predict probabilities for the test set
y_pred_prob = model.predict(X_test_reshaped)

# Threshold probabilities to obtain predicted class labels
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Extract values from confusion matrix
true_negatives = conf_matrix[0][0]
false_positives = conf_matrix[0][1]
false_negatives = conf_matrix[1][0]
true_positives = conf_matrix[1][1]

# Calculate accuracy, detection rate (sensitivity), and true negative rate (TNR)
accuracy = accuracy_score(y_test, y_pred)
detection_rate = true_positives / (true_positives + false_negatives)
tnr = true_negatives / (true_negatives + false_positives)

# Print the results
print("Accuracy:", accuracy)
print("Detection Rate (Sensitivity):", detection_rate)
print("True Negative Rate (TNR):", tnr)







100%|██████████| 100/100 [02:09<00:00,  1.30s/it]


Accuracy: 0.8240740740740741
Detection Rate (Sensitivity): 0.22857142857142856
True Negative Rate (TNR): 0.9392265193370166


GRU

In [None]:
import numpy as np
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from keras.models import Sequential
from keras.layers import GRU, Dense
from tqdm import tqdm  # Import tqdm for loading animation

# Load data from ARFF file
data, meta = arff.loadarff('/content/drive/MyDrive/nasa/MW1.arff')  # Replace 'CM1.arff' with your ARFF file path

# Convert data to DataFrame
df = pd.DataFrame(data)

# Extract features and labels
X = df.drop(columns=['Defective']).values  # Features (all columns except the 'defective' column)
y = df['Defective'].values  # Labels (the 'class' column)

# Map 'Y' and 'N' labels to numerical values (e.g., 1 for 'Y' and 0 for 'N')
y = np.where(y == b'Y', 1, 0)

# Selecting only the desired columns from the features DataFrame
selected_columns = ['LOC_EXECUTABLE','CYCLOMATIC_COMPLEXITY','CYCLOMATIC_DENSITY', 'ESSENTIAL_COMPLEXITY', 'DESIGN_COMPLEXITY','DESIGN_COMPLEXITY','HALSTEAD_LENGTH','HALSTEAD_DIFFICULTY','HALSTEAD_LEVEL','HALSTEAD_EFFORT','HALSTEAD_ERROR_EST','HALSTEAD_CONTENT','HALSTEAD_PROG_TIME','LOC_COMMENTS','LOC_BLANK','LOC_CODE_AND_COMMENT','NUM_UNIQUE_OPERATORS','NUM_UNIQUE_OPERANDS','NUM_OPERATORS','NUM_OPERANDS','BRANCH_COUNT']
X_selected = df[selected_columns].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data for GRU input (assuming input shape is (number of samples, number of time steps, number of features))
X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])

# Define GRU architecture
model = Sequential([
    GRU(units=32, activation='relu', input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model with loading animation
num_epochs = 10
with tqdm(total=num_epochs) as pbar:  # Initialize tqdm with total number of epochs
    for epoch in range(num_epochs):
        model.fit(X_train_reshaped, y_train, epochs=100, batch_size=10, validation_split=0.2, verbose=0)
        pbar.update(1)  # Update tqdm progress bar after each epoch

# Predict probabilities for the test set
y_pred_prob = model.predict(X_test_reshaped)

# Threshold probabilities to obtain predicted class labels
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Extract values from confusion matrix
true_negatives = conf_matrix[0][0]
false_positives = conf_matrix[0][1]
false_negatives = conf_matrix[1][0]
true_positives = conf_matrix[1][1]

# Calculate accuracy, detection rate (sensitivity), and true negative rate (TNR)
accuracy = accuracy_score(y_test, y_pred)
detection_rate = true_positives / (true_positives + false_negatives)
tnr = true_negatives / (true_negatives + false_positives)

# Print the results
print("Accuracy:", accuracy)
print("Detection Rate (Sensitivity):", detection_rate)
print("True Negative Rate (TNR):", tnr)


100%|██████████| 10/10 [02:52<00:00, 17.22s/it]


Accuracy: 0.803921568627451
Detection Rate (Sensitivity): 0.1
True Negative Rate (TNR): 0.975609756097561


GAN

In [None]:
import numpy as np
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from keras.models import Sequential
from keras.layers import Dense
from tqdm import tqdm  # Import tqdm for loading animation

# Load data from ARFF file
data, meta = arff.loadarff('/content/drive/MyDrive/nasa/MW1.arff')  # Replace 'CM1.arff' with your ARFF file path

# Convert data to DataFrame
df = pd.DataFrame(data)

# Extract features and labels
X = df.drop(columns=['Defective']).values  # Features (all columns except the 'defective' column)
y = df['Defective'].values  # Labels (the 'class' column)

# Map 'Y' and 'N' labels to numerical values (e.g., 1 for 'Y' and 0 for 'N')
y = np.where(y == b'Y', 1, 0)

# Selecting only the desired columns from the features DataFrame
selected_columns = ['LOC_EXECUTABLE','CYCLOMATIC_COMPLEXITY','CYCLOMATIC_DENSITY', 'ESSENTIAL_COMPLEXITY', 'DESIGN_COMPLEXITY','DESIGN_COMPLEXITY','HALSTEAD_LENGTH','HALSTEAD_DIFFICULTY','HALSTEAD_LEVEL','HALSTEAD_EFFORT','HALSTEAD_ERROR_EST','HALSTEAD_CONTENT','HALSTEAD_PROG_TIME','LOC_COMMENTS','LOC_BLANK','LOC_CODE_AND_COMMENT','NUM_UNIQUE_OPERATORS','NUM_UNIQUE_OPERANDS','NUM_OPERATORS','NUM_OPERANDS','BRANCH_COUNT']
X_selected = df[selected_columns].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the discriminator model
def build_discriminator(input_dim):
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=input_dim))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Define the generator model
def build_generator(latent_dim, output_dim):
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=latent_dim))
    model.add(Dense(output_dim, activation='sigmoid'))
    return model

# Define the combined generator and discriminator model (GAN)
def build_gan(generator, discriminator):
    discriminator.trainable = False
    model = Sequential()
    model.add(generator)
    model.add(discriminator)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

# Define the size of the random noise vector (latent space)
latent_dim = 10

# Build and compile the discriminator
discriminator = build_discriminator(X_train_scaled.shape[1])

# Build the generator
generator = build_generator(latent_dim, X_train_scaled.shape[1])

# Build and compile the GAN model
gan = build_gan(generator, discriminator)

# Training parameters
epochs = 100
batch_size = 32

# Train the GAN
for epoch in tqdm(range(epochs)):
    # Generate random noise as input to the generator
    noise = np.random.normal(0, 1, (batch_size, latent_dim))

    # Generate fake samples using the generator
    gen_samples = generator.predict(noise)

    # Combine real and fake samples
    X_combined = np.concatenate((X_train_scaled, gen_samples))

    # Assign labels for real and fake samples
    y_combined = np.concatenate((np.ones((len(X_train_scaled), 1)), np.zeros((batch_size, 1))))

    # Train the discriminator
    d_loss = discriminator.train_on_batch(X_combined, y_combined)

    # Train the generator (via the GAN model)
    noise = np.random.normal(0, 1, (batch_size, latent_dim))
    y_gen = np.ones((batch_size, 1))
    g_loss = gan.train_on_batch(noise, y_gen)

# Evaluate the discriminator on the test set
y_pred_prob = discriminator.predict(X_test_scaled)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate accuracy and confusion matrix
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(conf_matrix)


  0%|          | 0/100 [00:00<?, ?it/s]



  1%|          | 1/100 [00:01<03:03,  1.85s/it]



  2%|▏         | 2/100 [00:01<01:21,  1.21it/s]



  3%|▎         | 3/100 [00:02<00:48,  2.01it/s]



  4%|▍         | 4/100 [00:02<00:33,  2.89it/s]



  5%|▌         | 5/100 [00:02<00:25,  3.76it/s]



  6%|▌         | 6/100 [00:02<00:20,  4.60it/s]



  7%|▋         | 7/100 [00:02<00:17,  5.18it/s]



  8%|▊         | 8/100 [00:02<00:16,  5.75it/s]



  9%|▉         | 9/100 [00:02<00:14,  6.28it/s]



 10%|█         | 10/100 [00:02<00:13,  6.68it/s]



 11%|█         | 11/100 [00:03<00:12,  7.12it/s]



 12%|█▏        | 12/100 [00:03<00:12,  7.05it/s]



 13%|█▎        | 13/100 [00:03<00:11,  7.56it/s]



 15%|█▌        | 15/100 [00:03<00:09,  8.71it/s]



 16%|█▌        | 16/100 [00:03<00:09,  8.98it/s]



 18%|█▊        | 18/100 [00:03<00:08,  9.43it/s]



 19%|█▉        | 19/100 [00:03<00:08,  9.43it/s]



 20%|██        | 20/100 [00:04<00:08,  9.54it/s]



 21%|██        | 21/100 [00:04<00:08,  9.51it/s]



 22%|██▏       | 22/100 [00:04<00:08,  9.28it/s]



 23%|██▎       | 23/100 [00:04<00:08,  9.20it/s]



 24%|██▍       | 24/100 [00:04<00:08,  9.07it/s]



 25%|██▌       | 25/100 [00:04<00:08,  8.47it/s]



 26%|██▌       | 26/100 [00:04<00:09,  8.16it/s]



 27%|██▋       | 27/100 [00:04<00:08,  8.19it/s]



 28%|██▊       | 28/100 [00:04<00:08,  8.34it/s]



 29%|██▉       | 29/100 [00:05<00:08,  8.46it/s]



 30%|███       | 30/100 [00:05<00:08,  8.39it/s]



 31%|███       | 31/100 [00:05<00:08,  8.35it/s]



 32%|███▏      | 32/100 [00:05<00:08,  8.21it/s]



 33%|███▎      | 33/100 [00:05<00:08,  8.16it/s]



 34%|███▍      | 34/100 [00:05<00:08,  8.17it/s]



 35%|███▌      | 35/100 [00:05<00:08,  8.09it/s]



 36%|███▌      | 36/100 [00:05<00:07,  8.24it/s]



 37%|███▋      | 37/100 [00:06<00:07,  8.34it/s]



 38%|███▊      | 38/100 [00:06<00:07,  8.46it/s]



 39%|███▉      | 39/100 [00:06<00:07,  8.35it/s]



 40%|████      | 40/100 [00:06<00:07,  8.53it/s]



 41%|████      | 41/100 [00:06<00:07,  8.42it/s]



 42%|████▏     | 42/100 [00:06<00:06,  8.63it/s]



 43%|████▎     | 43/100 [00:06<00:06,  8.90it/s]



 44%|████▍     | 44/100 [00:06<00:06,  9.00it/s]



 45%|████▌     | 45/100 [00:06<00:06,  9.14it/s]



 46%|████▌     | 46/100 [00:07<00:05,  9.31it/s]



 47%|████▋     | 47/100 [00:07<00:05,  9.33it/s]



 48%|████▊     | 48/100 [00:07<00:05,  9.35it/s]



 49%|████▉     | 49/100 [00:07<00:05,  9.18it/s]



 51%|█████     | 51/100 [00:07<00:05,  9.68it/s]



 52%|█████▏    | 52/100 [00:07<00:05,  9.49it/s]



 53%|█████▎    | 53/100 [00:07<00:05,  8.99it/s]



 54%|█████▍    | 54/100 [00:07<00:04,  9.20it/s]



 55%|█████▌    | 55/100 [00:08<00:04,  9.31it/s]



 57%|█████▋    | 57/100 [00:08<00:04,  9.54it/s]



 58%|█████▊    | 58/100 [00:08<00:04,  9.22it/s]



 59%|█████▉    | 59/100 [00:08<00:04,  9.20it/s]



 60%|██████    | 60/100 [00:08<00:04,  9.19it/s]



 61%|██████    | 61/100 [00:08<00:04,  9.14it/s]



 62%|██████▏   | 62/100 [00:08<00:04,  8.59it/s]



 63%|██████▎   | 63/100 [00:08<00:04,  8.56it/s]



 64%|██████▍   | 64/100 [00:09<00:04,  8.19it/s]



 65%|██████▌   | 65/100 [00:09<00:04,  8.42it/s]



 66%|██████▌   | 66/100 [00:09<00:04,  8.23it/s]



 67%|██████▋   | 67/100 [00:09<00:04,  8.07it/s]



 68%|██████▊   | 68/100 [00:09<00:03,  8.48it/s]



 69%|██████▉   | 69/100 [00:09<00:03,  8.82it/s]



 70%|███████   | 70/100 [00:09<00:03,  8.92it/s]



 71%|███████   | 71/100 [00:09<00:03,  8.72it/s]



 72%|███████▏  | 72/100 [00:10<00:03,  8.39it/s]



 73%|███████▎  | 73/100 [00:10<00:03,  8.71it/s]



 74%|███████▍  | 74/100 [00:10<00:02,  8.93it/s]



 75%|███████▌  | 75/100 [00:10<00:02,  9.05it/s]



 76%|███████▌  | 76/100 [00:10<00:02,  8.38it/s]



 77%|███████▋  | 77/100 [00:10<00:02,  8.67it/s]



 78%|███████▊  | 78/100 [00:10<00:02,  8.94it/s]



 79%|███████▉  | 79/100 [00:10<00:02,  7.89it/s]



 80%|████████  | 80/100 [00:10<00:02,  7.30it/s]



 81%|████████  | 81/100 [00:11<00:02,  6.87it/s]



 82%|████████▏ | 82/100 [00:11<00:02,  6.93it/s]



 83%|████████▎ | 83/100 [00:11<00:02,  6.62it/s]



 84%|████████▍ | 84/100 [00:11<00:02,  6.58it/s]



 85%|████████▌ | 85/100 [00:11<00:02,  6.24it/s]



 86%|████████▌ | 86/100 [00:11<00:02,  6.00it/s]



 87%|████████▋ | 87/100 [00:12<00:02,  5.80it/s]



 88%|████████▊ | 88/100 [00:12<00:02,  5.76it/s]



 89%|████████▉ | 89/100 [00:12<00:01,  5.77it/s]



 90%|█████████ | 90/100 [00:12<00:01,  5.78it/s]



 91%|█████████ | 91/100 [00:12<00:01,  5.46it/s]



 92%|█████████▏| 92/100 [00:13<00:01,  5.54it/s]



 93%|█████████▎| 93/100 [00:13<00:01,  5.65it/s]



 94%|█████████▍| 94/100 [00:13<00:01,  5.69it/s]



 95%|█████████▌| 95/100 [00:13<00:00,  5.81it/s]



 96%|█████████▌| 96/100 [00:13<00:00,  5.79it/s]



 97%|█████████▋| 97/100 [00:13<00:00,  6.03it/s]



 98%|█████████▊| 98/100 [00:14<00:00,  6.20it/s]



 99%|█████████▉| 99/100 [00:14<00:00,  6.25it/s]



100%|██████████| 100/100 [00:14<00:00,  6.97it/s]


Accuracy: 0.19607843137254902
Confusion Matrix:
[[ 0 41]
 [ 0 10]]


BERT

In [35]:
import pandas as pd
from scipy.io import arff

# Load ARFF file
data, meta = arff.loadarff('/content/drive/MyDrive/nasa/PC5.arff')  # Replace 'your_arff_file.arff' with the path to your ARFF file

# Convert to DataFrame
df = pd.DataFrame(data)
df.shape
# Save DataFrame to CSV
df.to_csv('output.csv', index=False)  # Replace 'output.csv' with the desired output file path


In [5]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# Load data using pandas
data = pd.read_csv('/content/output.csv')
data['text'] = data[['LOC_EXECUTABLE','CYCLOMATIC_COMPLEXITY','CYCLOMATIC_DENSITY', 'ESSENTIAL_COMPLEXITY', 'DESIGN_COMPLEXITY','DESIGN_COMPLEXITY','HALSTEAD_LENGTH','HALSTEAD_DIFFICULTY','HALSTEAD_LEVEL','HALSTEAD_EFFORT','HALSTEAD_ERROR_EST','HALSTEAD_CONTENT','HALSTEAD_PROG_TIME','LOC_COMMENTS','LOC_BLANK','LOC_CODE_AND_COMMENT','NUM_UNIQUE_OPERATORS','NUM_UNIQUE_OPERANDS','NUM_OPERATORS','NUM_OPERANDS','BRANCH_COUNT']].apply(lambda x: ' '.join(map(str, x)), axis=1)
# Convert numerical data to text format
# For example, concatenate numerical values into strings
# Map True to 1 and False to 0
# Map 'Y' to 1 and 'N' to 0
data['Defective'] = data['Defective'].map({"b'Y'": 1, "b'N'": 0})



# Extract features and labels
X = data['text'].tolist()  # Text inputs
y = data['Defective'].tolist()  # Binary labels

# Tokenize inputs using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_inputs = tokenizer(X, padding=True, truncation=True, return_tensors='pt')

# Convert labels to tensors
labels = torch.tensor(y)

# Train-test split
train_inputs, test_inputs, train_labels, test_labels = train_test_split(encoded_inputs['input_ids'], labels, test_size=0.2, random_state=42)

# Create DataLoader
train_dataset = TensorDataset(train_inputs, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = TensorDataset(test_inputs, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Train the model
# (Same training loop as in the previous example)

# Evaluation
# (Same evaluation procedure as in the previous example)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
num_epochs=10
# Set the device (GPU or CPU)
device = torch.device('cpu' if torch.cuda.is_available() else 'cpu')

# Define the loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Move the model to the device
model.to(device)

# Set the model in training mode
model.train()

# Training loop
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_dataloader:
        # Move batch to the device
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from the DataLoader
        b_input_ids, b_labels = batch
        # Clear any previously calculated gradients
        optimizer.zero_grad()
        # Forward pass
        outputs = model(b_input_ids, labels=b_labels)
        # Get the loss
        loss = outputs.loss
        # Perform a backward pass to calculate gradients
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        # Accumulate the loss
        total_loss += loss.item()
    # Calculate the average loss for the epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}')

print("Training finished.")


Epoch 1/10, Average Loss: 0.5655
Epoch 2/10, Average Loss: 0.5337
Epoch 3/10, Average Loss: 0.5353
Epoch 4/10, Average Loss: 0.5250
Epoch 5/10, Average Loss: 0.5060
Epoch 6/10, Average Loss: 0.4966
Epoch 7/10, Average Loss: 0.4475
Epoch 8/10, Average Loss: 0.4332
Epoch 9/10, Average Loss: 0.3875
Epoch 10/10, Average Loss: 0.3603
Training finished.


In [None]:
import pandas as pd
from scipy.io import arff

# Load ARFF file
data, meta = arff.loadarff('CM1.arff')  # Replace 'your_arff_file.arff' with the path to your ARFF file

# Convert to DataFrame
df = pd.DataFrame(data)
df.shape
# Save DataFrame to CSV
df.to_csv('output.csv', index=False)  # Replace 'output.csv' with the desired output file path
print(df)

In [None]:
import numpy as np
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load data from ARFF file
data, meta = arff.loadarff('CM1.arff')  # Replace 'your_data.arff' with your ARFF file path
print(meta)

In [None]:
selected_columns = ['LOC_EXECUTABLE','CYCLOMATIC_COMPLEXITY', 'ESSENTIAL_COMPLEXITY', 'DESIGN_COMPLEXITY','DESIGN_COMPLEXITY','HALSTEAD_LENGTH','HALSTEAD_DIFFICULTY','HALSTEAD_LEVEL','HALSTEAD_EFFORT','HALSTEAD_CONTENT','HALSTEAD_PROG_TIME','LOC_COMMENTS','LOC_BLANK','LOC_CODE_AND_COMMENT','NUM_UNIQUE_OPERATORS','NUM_UNIQUE_OPERANDS','NUM_OPERATORS','NUM_OPERANDS','BRANCH_COUNT']
df = df[selected_columns]
print(new_df)

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os
from scipy.io import arff

# Directory containing the ARFF files
directory = '/content/drive/MyDrive/nasa'

# Attributes to search for
attributes_to_search = ['LOC_EXECUTABLE','CYCLOMATIC_COMPLEXITY','CYCLOMATIC_DENSITY', 'ESSENTIAL_COMPLEXITY', 'DESIGN_COMPLEXITY','DESIGN_COMPLEXITY','HALSTEAD_LENGTH','HALSTEAD_DIFFICULTY','HALSTEAD_LEVEL','HALSTEAD_EFFORT','HALSTEAD_ERROR_EST','HALSTEAD_CONTENT','HALSTEAD_PROG_TIME','LOC_COMMENTS','LOC_BLANK','LOC_CODE_AND_COMMENT','NUM_UNIQUE_OPERATORS','NUM_UNIQUE_OPERANDS','NUM_OPERATORS','NUM_OPERANDS','BRANCH_COUNT']

# List to store ARFF files containing the specified attributes
files_with_attributes = []

# Iterate over each file in the directory
for filename in os.listdir(directory):
    if filename.endswith('.arff'):
        filepath = os.path.join(directory, filename)
        # Read the ARFF file
        data, meta = arff.loadarff(filepath)
        # Get attribute names from meta
        attributes_in_file = meta.names()
        # Check if all attributes_to_search are present in the file
        if all(attr in attributes_in_file for attr in attributes_to_search):
            files_with_attributes.append(filename)

# Print the ARFF files containing the specified attributes
for file in files_with_attributes:
    print(file)


PC5.arff
MW1.arff
MC1.arff
PC4.arff
MC2.arff
PC1.arff
CM1.arff
PC3.arff
KC3.arff
