## Model training and result generation

In [34]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import shutil
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import (accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,confusion_matrix)
import joblib


# Load the training features and the label matrix.
X_train = np.load('/content/X_train.npy')
y_train = np.load('/content/y_train.npy')

# Missing-value handling, step 1: any column holding more than 200 NaN
# entries is dropped outright.
nan_counts = np.isnan(X_train).sum(axis=0)
columns_to_drop = [col for col, n_missing in enumerate(nan_counts) if n_missing > 200]
X_train_cleaned = np.delete(X_train, columns_to_drop, axis=1)

# Missing-value handling, step 2: the remaining NaNs are filled with the
# column mean (continuous columns) or the column mode (discrete columns).
# A column counts as continuous when its number of distinct values is at
# least `threshold` (2%) of the number of samples.
threshold = 0.02
samples_count = X_train_cleaned.shape[0]
unique_values_per_column = [np.unique(column).size for column in X_train_cleaned.T]

# Column indices (into X_train_cleaned) of each feature kind.
discrete_indices = []
continuous_indices = []

for col, unique_count in enumerate(unique_values_per_column):
    if unique_count / samples_count < threshold:
        # Discrete column: fill with the mode of the observed values.
        discrete_indices.append(col)
        missing = np.isnan(X_train_cleaned[:, col])
        observed = X_train_cleaned[~missing, col]
        # A column without any observed value is left as it is.
        if observed.size > 0:
            mode_result = stats.mode(observed)
            # Depending on the SciPy version, `mode` is a scalar or a
            # one-element array; both cases are accepted here.
            if mode_result.mode.size == 1:
                mode_value = mode_result.mode
            else:
                mode_value = mode_result.mode[0]
            X_train_cleaned[missing, col] = mode_value
    else:
        # Continuous column: fill with the mean of the observed values.
        continuous_indices.append(col)
        column_mean = np.nanmean(X_train_cleaned[:, col])
        missing = np.isnan(X_train_cleaned[:, col])
        X_train_cleaned[missing, col] = column_mean

# Per-column NaN count after imputation (kept for inspection).
nan_counts_after_filling = np.isnan(X_train_cleaned).sum(axis=0)

# Normalization
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_indices),
        ('cat', OneHotEncoder(handle_unknown='ignore'), discrete_indices)
    ])

# Use ColumnTransformer to convert X_train_cleaned
X_train_preprocessed = preprocessor.fit_transform(X_train_cleaned)

# Save the fitted preprocessor for later use
joblib.dump(preprocessor, '/content/preprocessor.joblib')


# Per-label feature selection: a random forest is fitted for every label
# and only the columns with a strictly positive importance are retained.
important_features_indices = {}

n_label_columns = y_train.shape[1]
for label_idx in range(n_label_columns):
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_preprocessed, y_train[:, label_idx])

    feature_importances = model.feature_importances_
    keep_mask = feature_importances > 0
    important_indices = np.where(keep_mask)[0]
    important_features = X_train_preprocessed[:, important_indices]

    # Store the reduced training matrix of this label on disk ...
    np.save(f'/content/importance_data/X_train_important_label_{label_idx}.npy', important_features)
    # ... and remember which columns were kept for it.
    important_features_indices[label_idx] = important_indices

# All per-label index arrays are written as a single file (a dict).
np.save('/content/important_features_indices.npy', important_features_indices)

# y_train is a two-dimensional array; each column represents one label.
y_train = np.load('/content/y_train.npy')


# For every label, write a (possibly oversampled) feature/target pair to
# /content/resample_data/.
for i in range(y_train.shape[1]):
    # Load the important-feature matrix saved for this label.
    X_train_important_path = f'/content/importance_data/X_train_important_label_{i}.npy'
    X_train_important = np.load(X_train_important_path)

    # Ratio of positive to negative samples. minlength=2 guarantees that
    # both class slots exist even when a label has no positive sample
    # (otherwise class_counts[1] would raise IndexError).
    class_counts = np.bincount(y_train[:, i], minlength=2)
    imbalance_ratio = class_counts[1] / class_counts[0] if class_counts[0] != 0 else np.inf

    # Output locations of the processed data.
    resampled_X_path = f'/content/resample_data/X_train_resampled_label_{i}.npy'
    resampled_y_path = f'/content/resample_data/y_train_resampled_label_{i}.npy'

    if imbalance_ratio < 0.35:
        # Minority positives: append one extra copy of every positive row.
        positive_indices = np.where(y_train[:, i] == 1)[0]
        X_to_duplicate = X_train_important[positive_indices]
        y_to_duplicate = y_train[positive_indices, i]

        X_duplicated = np.concatenate([X_train_important, X_to_duplicate])
        y_duplicated = np.concatenate([y_train[:, i], y_to_duplicate])

        # Save the oversampled data.
        np.save(resampled_X_path, X_duplicated)
        np.save(resampled_y_path, y_duplicated)
    else:
        # No oversampling needed: reuse the feature file unchanged and
        # save only the column of the current label as the target.
        shutil.copyfile(X_train_important_path, resampled_X_path)
        np.save(resampled_y_path, y_train[:, i])


# Sanity check: reload the resampled features and targets of every label
# (uncomment the prints to inspect their shapes).
for i in range(y_train.shape[1]):
    X_train_resampled = np.load(f'/content/resample_data/X_train_resampled_label_{i}.npy')
    # The targets live in the y_train_resampled_* file, not the X_* one.
    y_train_resampled = np.load(f'/content/resample_data/y_train_resampled_label_{i}.npy')
    # print(f"X_train_resampled_{i}", X_train_resampled.shape)
    # print(f"y_train_resampled_{i}", y_train_resampled.shape)


# Per-label evaluation results are collected in these lists
# (one entry is appended per label).
accuracies, f1_scores = [], []
precision_scores, recall_scores = [], []
roc_auc_scores, confusion_matrices = [], []
loss_values = []

# Define loss function
def custom_loss(y_true, y_pred_proba):
    """Return the mean binary cross-entropy (log loss) of the predictions.

    Args:
        y_true: Array-like of ground-truth binary labels (0 or 1).
        y_pred_proba: Array-like of predicted positive-class probabilities,
            broadcastable against ``y_true``.

    Returns:
        The cross-entropy averaged over all samples (a NumPy float).
    """
    epsilon = 1e-15
    # Accept plain Python sequences as well as NumPy arrays.
    y_true = np.asarray(y_true, dtype=float)
    # Clip away exact 0 and 1 so that the logarithms stay finite.
    y_pred_proba = np.clip(np.asarray(y_pred_proba, dtype=float), epsilon, 1 - epsilon)
    return -np.mean(y_true * np.log(y_pred_proba) + (1 - y_true) * np.log(1 - y_pred_proba))



# Train and evaluate one random forest per label on a 70/30 hold-out split.
# NOTE(review): the resampled arrays loaded here may contain duplicated
# positive rows that were added before this split, so copies of the same
# sample can land in both partitions and the metrics below are likely
# optimistic - confirm whether the split should happen before oversampling.
for i in range(y_train.shape[1]):
    X = np.load(f'/content/resample_data/X_train_resampled_label_{i}.npy')
    y = np.load(f'/content/resample_data/y_train_resampled_label_{i}.npy')

    # Partition the data set. Dedicated names are used so that the
    # module-level X_train / y_train / X_test are not overwritten (the
    # loop bound above reads y_train, and the cell must stay re-runnable).
    X_fit, X_val, y_fit, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

    # Train a random forest model.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_fit, y_fit)

    # Feature importances of the fitted model (kept for inspection).
    feature_importances = model.feature_importances_

    # Save the model; only `joblib` itself is imported, so the call has to
    # be qualified.
    joblib.dump(model, f'/content/models/model_label_{i}.joblib')

    # Predict hard labels and positive-class probabilities.
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)[:, 1]

    # Calculate each evaluation metric.
    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    roc_auc = roc_auc_score(y_val, y_pred_proba)
    confusion = confusion_matrix(y_val, y_pred)

    # Collect the metrics of this label.
    accuracies.append(acc)
    f1_scores.append(f1)
    precision_scores.append(precision)
    recall_scores.append(recall)
    roc_auc_scores.append(roc_auc)
    confusion_matrices.append(confusion)

    # Calculate the custom loss value.
    loss = custom_loss(y_val, y_pred_proba)
    loss_values.append(loss)

    print(f"Label {i} - Accuracy: {acc}, F1 Score: {f1}, Precision: {precision}, Recall: {recall}, ROC AUC: {roc_auc}, Confusion Matrix: {confusion}, Custom Loss: {loss}")

# Average every metric over the labels.
average_accuracy = np.mean(accuracies)
average_f1_score = np.mean(f1_scores)
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_roc_auc = np.mean(roc_auc_scores)
# The confusion matrices are summed element-wise to get the overall one.
overall_confusion_matrix = np.sum(confusion_matrices, axis=0)

# Average custom loss value.
average_loss = np.mean(loss_values)

print(f"Average Accuracy: {average_accuracy}")
print(f"Average F1 Score: {average_f1_score}")
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average ROC AUC: {average_roc_auc}")
print(f"Overall Confusion Matrix: \n{overall_confusion_matrix}")
print(f"Average Custom Loss: {average_loss}")


# Preprocess X_test with the same steps that were applied to X_train.
X_test = np.load('/content/X_test.npy')

# Remove the same columns that were removed from X_train.
X_test_cleaned = np.delete(X_test, columns_to_drop, axis=1)


# Missing values are filled with TRAINING statistics, so the test data is
# imputed with the same values the models were trained on. X_train_cleaned
# was imputed in place with its own column mean / mode, which leaves the
# column mean and mode unchanged, so they can be recomputed from it here.

# Fill missing values for continuous features.
for i in continuous_indices:
    column_mean = np.nanmean(X_train_cleaned[:, i])
    nan_mask = np.isnan(X_test_cleaned[:, i])
    X_test_cleaned[nan_mask, i] = column_mean

# Fill missing values for discrete features.
for i in discrete_indices:
    train_column = X_train_cleaned[:, i]
    non_nan_values = train_column[~np.isnan(train_column)]
    # A column without any observed training value is left as it is.
    if non_nan_values.size > 0:
        mode_result = stats.mode(non_nan_values)
        # `mode` is a scalar or a one-element array depending on SciPy.
        mode_value = mode_result.mode if mode_result.mode.size == 1 else mode_result.mode[0]
        nan_mask = np.isnan(X_test_cleaned[:, i])
        X_test_cleaned[nan_mask, i] = mode_value

# Apply the same scaling and encoding: load the fitted preprocessor ...
preprocessor = joblib.load('/content/preprocessor.joblib')

# ... and transform the test set with it.
X_test_preprocessed = preprocessor.transform(X_test_cleaned)


# Load the per-label indices of the retained features (stored as a dict,
# hence allow_pickle and .item()).
important_features_indices = np.load('/content/important_features_indices.npy', allow_pickle=True).item()


# One prediction column per label.
predictions = np.zeros((X_test_preprocessed.shape[0], len(important_features_indices)))

# Load the model of every label and predict with its own feature subset.
for label, indices in important_features_indices.items():
    model = joblib.load(f'/content/models/model_label_{label}.joblib')
    # Select the important features for the label.
    X_test_important = X_test_preprocessed[:, indices]
    # Predict using the corresponding model.
    predictions[:, label] = model.predict(X_test_important)

# Save the predictions under the same absolute path they are read back
# from below, independent of the current working directory.
np.save('/content/y_test.npy', predictions)

prediction = np.load('/content/y_test.npy')
print()
print("The size of final prediction", prediction.shape)


Label 0 - Accuracy: 0.7066666666666667, F1 Score: 0.06382978723404255, Precision: 0.375, Recall: 0.03488372093023256, ROC AUC: 0.5207020212997174, Confusion Matrix: [[209   5]
 [ 83   3]], Custom Loss: 0.6098601447036104
Label 1 - Accuracy: 0.8928571428571429, F1 Score: 0.8186046511627908, Precision: 0.967032967032967, Recall: 0.7096774193548387, ROC AUC: 0.8554771505376345, Confusion Matrix: [[237   3]
 [ 36  88]], Custom Loss: 0.44835687109284156
Label 2 - Accuracy: 0.9318801089918256, F1 Score: 0.8803827751196173, Precision: 0.989247311827957, Recall: 0.7931034482758621, ROC AUC: 0.9183953839813161, Confusion Matrix: [[250   1]
 [ 24  92]], Custom Loss: 0.40911395440259163
Label 3 - Accuracy: 0.8729729729729729, F1 Score: 0.7929515418502202, Precision: 0.9090909090909091, Recall: 0.703125, ROC AUC: 0.8427976497933884, Confusion Matrix: [[233   9]
 [ 38  90]], Custom Loss: 0.4546024046997047
Label 4 - Accuracy: 0.9110512129380054, F1 Score: 0.8630705394190872, Precision: 0.9541284403

## Code for deleting generated files

In [None]:
import os
import glob

# Remove every intermediate artefact written by the training cell.
# The patterns are processed in this order:
generated_file_patterns = (
    # important-feature matrices of each label
    '/content/importance_data/X_train_important_label_*.npy',
    # resampled feature matrices of each label
    '/content/resample_data/X_train_resampled_label_*.npy',
    # resampled targets of each label
    '/content/resample_data/y_train_resampled_label_*.npy',
    # trained models of each label
    '/content/models/model_label_*.joblib',
)

for pattern in generated_file_patterns:
    for filename in glob.glob(pattern):
        os.remove(filename)
