In [10]:
!pip install bayesian-optimization
!pip install smote_variants
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_curve, auc
from bayes_opt import BayesianOptimization
import smote_variants as sv
# Load data: the train and test tables are read from CSV files whose paths
# are relative to the notebook's working directory.
train_data = pd.read_csv('Desktop/Proj Work/TEST/FilteredTrain.csv')
test_data = pd.read_csv('Desktop/Proj Work/TEST/FilteredTest.csv')

# Preprocess data: drop the 'ID', 'SMILE ID' and 'LABEL' columns and keep
# every remaining column as a feature; .values yields plain NumPy arrays.
X_train = train_data.drop(['ID','SMILE ID', 'LABEL'], axis=1).values
X_test = test_data.drop(['ID', 'SMILE ID','LABEL'], axis=1).values

# Label encoding: the encoder is fitted on the training labels only and then
# reused on the test labels, so both splits share one label -> integer mapping.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['LABEL'].values)
y_test = label_encoder.transform(test_data['LABEL'].values)

# Oversample the training split only; X_test / y_test are left untouched.
# X_train / y_train are overwritten with the resampled arrays.
# NOTE(review): AHC() is built with default arguments (the run log reports
# random_state None), so the resampled data may differ between runs - confirm.
oversampler= sv.AHC()
X_train, y_train= oversampler.sample(X_train, y_train)

#Define activation function and vectorize it
def s_faf(x, j=0.25, k=0.75, m=0.5):
    """S-shaped fuzzy activation for a single scalar.

    Piecewise-quadratic ramp from 0 to 1:

    * ``x <= j``     -> ``0.0``
    * ``j < x <= m`` -> ``2 * ((x - j) / (k - j)) ** 2``
    * ``m < x < k``  -> ``1 - 2 * ((x - k) / (k - j)) ** 2``
    * otherwise      -> ``1.0`` (this branch is also taken for NaN, because
      every comparison against NaN is false)

    Args:
        x: Scalar pre-activation value.
        j: Lower knee; inputs at or below it map to 0.
        k: Upper knee; inputs at or above it map to 1.
        m: Crossover point between the two quadratic pieces.

    Returns:
        The activation as a float. Every branch returns a float so callers
        (notably ``np.vectorize``) never see an integer result.
    """
    if x <= j:
        return 0.0
    elif j < x <= m:
        return 2 * ((x - j) / (k - j)) ** 2
    elif m < x < k:
        return 1 - 2 * ((x - k) / (k - j)) ** 2
    else:
        return 1.0


# otypes is pinned on purpose: without it np.vectorize infers the output dtype
# from the result for the FIRST element only. A saturated first element (0 or
# 1) would make the whole output integer-typed and truncate every fractional
# activation to 0. Pinning otypes also lets empty inputs pass through.
vectorized_s_faf = np.vectorize(s_faf, otypes=[float])

def model_performance(enhancement_nodes, alpha):
    """Objective function: train one random-hidden-layer model and score it.

    Draws a fresh random hidden layer, fits a ridge readout on the hidden
    activations of the training data, picks a decision threshold from the ROC
    curve and returns the resulting test accuracy.

    Reads the notebook globals ``X_train``, ``y_train``, ``X_test``,
    ``y_test`` and ``vectorized_s_faf``.

    Args:
        enhancement_nodes: Number of hidden units. The optimizer proposes
            floats, so the value is truncated with ``int()``.
        alpha: Ridge regularisation strength.

    Returns:
        Accuracy on the test split at the selected threshold.

    NOTE(review): the threshold is chosen with the test labels and the value
    returned to the optimizer is test accuracy, so the reported figure is
    optimistically biased; a held-out validation split would avoid this.
    NOTE(review): the random weights are drawn from the unseeded global NumPy
    RNG, so the same (enhancement_nodes, alpha) can score differently per call.
    """
    enhancement_nodes = int(enhancement_nodes)
    input_size = X_train.shape[1]
    random_weights = np.random.uniform(-1, 1, (input_size, enhancement_nodes))
    biases = np.random.uniform(-1, 1, enhancement_nodes)

    def compute_hidden_layer(X):
        """Random affine projection of X followed by the s_faf activation."""
        G = np.dot(X, random_weights) + biases
        return vectorized_s_faf(G)

    H_train = compute_hidden_layer(X_train)
    H_test = compute_hidden_layer(X_test)

    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(H_train, y_train)

    # The intercept is deliberately left out of the raw scores: it is a
    # constant offset, and the threshold below is chosen in the same
    # raw-score space, so the resulting decisions are unaffected.
    raw_scores_train = np.dot(H_train, ridge_model.coef_)
    raw_scores_test = np.dot(H_test, ridge_model.coef_)

    # Youden's J (tpr - fpr) picks the ROC operating point.
    fpr, tpr, thresholds = roc_curve(y_test, raw_scores_test)
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]

    # roc_curve defines each operating point as "positive when
    # score >= threshold". The comparison must therefore be inclusive,
    # otherwise the sample sitting exactly on the chosen threshold is flipped
    # and the predictions no longer match the selected ROC point.
    y_pred_train = (raw_scores_train >= optimal_threshold) * 1
    y_pred_test = (raw_scores_test >= optimal_threshold) * 1

    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    print(f"Current Iteration: Training Accuracy: {train_accuracy}, Test Accuracy: {test_accuracy}")

    return test_accuracy
# Set up Bayesian optimization of model_performance over its two arguments.
# pbounds gives the (lower, upper) search interval for each parameter; the
# keys must match the argument names of model_performance.
# random_state=1 seeds the optimizer's own proposals only.
optimizer = BayesianOptimization(
    f=model_performance,
    pbounds={'enhancement_nodes': (10000, 20000), 'alpha': (10,300)},
    random_state=1,
    verbose=2 
)

# 30 initial points followed by 50 further iterations, i.e. 80 calls to
# model_performance in total.
optimizer.maximize(init_points=30, n_iter=50)

# optimizer.max holds the best observed target and the parameters behind it.
print("Best performance achieved:", optimizer.max['target'])
print("Optimal parameters found:", optimizer.max['params'])


Defaulting to user installation because normal site-packages is not writeable


2024-07-01 17:39:10,133:INFO:AHC: Running sampling via ('AHC', "{'strategy': 'min', 'n_jobs': 1, 'random_state': None, 'class_name': 'AHC'}")


Defaulting to user installation because normal site-packages is not writeable
|   iter    |  target   |   alpha   | enhanc... |
-------------------------------------------------
Current Iteration: Training Accuracy: 0.9957196361690743, Test Accuracy: 0.7548076923076923
| [0m1        [0m | [0m0.7548   [0m | [0m130.9    [0m | [0m1.72e+04 [0m |
Current Iteration: Training Accuracy: 0.9962546816479401, Test Accuracy: 0.7644230769230769
| [95m2        [0m | [95m0.7644   [0m | [95m10.03    [0m | [95m1.302e+04[0m |
Current Iteration: Training Accuracy: 0.9919743178170144, Test Accuracy: 0.7548076923076923
| [0m3        [0m | [0m0.7548   [0m | [0m52.56    [0m | [0m1.092e+04[0m |
Current Iteration: Training Accuracy: 0.9876939539860888, Test Accuracy: 0.7692307692307693
| [95m4        [0m | [95m0.7692   [0m | [95m64.02    [0m | [95m1.346e+04[0m |
Current Iteration: Training Accuracy: 0.9962546816479401, Test Accuracy: 0.7548076923076923
| [0m5        [0m | [0m

In [31]:
 # Original version (Shubham's)

 import numpy as np
 import pandas as pd
 from sklearn.linear_model import Ridge
 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import accuracy_score, roc_curve, auc

 # Load data: the train and test tables are read from CSV files whose paths
 # are relative to the notebook's working directory.
 train_data = pd.read_csv('Desktop/Proj Work/TEST/FilteredTrain.csv')
 test_data = pd.read_csv('Desktop/Proj Work/TEST/FilteredTest.csv')

 # Preprocess data: drop the 'ID', 'SMILE ID' and 'LABEL' columns and keep
 # every remaining column as a feature; .values yields plain NumPy arrays.
 X_train = train_data.drop(['ID', 'SMILE ID', 'LABEL'], axis=1).values
 X_test = test_data.drop(['ID', 'SMILE ID', 'LABEL'], axis=1).values

 # Label encoding: fitted on the training labels only, then reused on the
 # test labels so both splits share one label -> integer mapping.
 label_encoder = LabelEncoder()
 y_train = label_encoder.fit_transform(train_data['LABEL'].values)
 y_test = label_encoder.transform(test_data['LABEL'].values)


 # Define activation function and vectorize it
 def s_faf(x, j=0.25, k=0.75, m=0.5):
     """S-shaped fuzzy activation for a single scalar.

     Piecewise-quadratic ramp from 0 to 1:

     * ``x <= j``     -> ``0.0``
     * ``j < x <= m`` -> ``2 * ((x - j) / (k - j)) ** 2``
     * ``m < x < k``  -> ``1 - 2 * ((x - k) / (k - j)) ** 2``
     * otherwise      -> ``1.0`` (this branch is also taken for NaN, because
       every comparison against NaN is false)

     Args:
         x: Scalar pre-activation value.
         j: Lower knee; inputs at or below it map to 0.
         k: Upper knee; inputs at or above it map to 1.
         m: Crossover point between the two quadratic pieces.

     Returns:
         The activation as a float. Every branch returns a float so callers
         (notably ``np.vectorize``) never see an integer result.
     """
     if x <= j:
         return 0.0
     elif j < x <= m:
         return 2 * ((x - j) / (k - j)) ** 2
     elif m < x < k:
         return 1 - 2 * ((x - k) / (k - j)) ** 2
     else:
         return 1.0

 # otypes is pinned on purpose: without it np.vectorize infers the output
 # dtype from the result for the FIRST element only. A saturated first element
 # (0 or 1) would make the whole output integer-typed and truncate every
 # fractional activation to 0. Pinning otypes also lets empty inputs through.
 vectorized_s_faf = np.vectorize(s_faf, otypes=[float])

 # RVFL network setup: a random projection from the input features to
 # `enhancement_nodes` hidden units. Weights and biases are drawn uniformly
 # from [-1, 1) and are not updated anywhere in this cell.
 # NOTE(review): no seed is set in this cell, so each run draws different
 # weights and the printed metrics will vary between runs.
 enhancement_nodes = 18781
 input_size = X_train.shape[1]  # number of feature columns
 random_weights = np.random.uniform(-1, 1, (input_size, enhancement_nodes))
 biases = np.random.uniform(-1, 1, enhancement_nodes)

 def compute_hidden_layer(X):
     """Return the hidden-layer activations for the rows of X.

     X is projected through the cell-level ``random_weights``, shifted by
     ``biases`` and passed element-wise through ``vectorized_s_faf``.
     """
     pre_activation = np.add(np.dot(X, random_weights), biases)
     return vectorized_s_faf(pre_activation)

 # Hidden-layer activations for both splits (same random layer for each).
 H_train = compute_hidden_layer(X_train)
 H_test = compute_hidden_layer(X_test)

 # Solve the linear system for output weights
 ridge_model = Ridge(alpha=59)
 ridge_model.fit(H_train, y_train)
 output_weights = ridge_model.coef_

 # Compute raw scores for ROC analysis. The ridge intercept is left out: it is
 # a constant offset, and the threshold below is chosen in the same raw-score
 # space, so the decisions are unaffected.
 raw_scores_train = np.dot(H_train, output_weights)
 raw_scores_test = np.dot(H_test, output_weights)

 # Compute ROC curve and AUC
 # NOTE(review): the curve (and hence the threshold) is derived from the test
 # labels, so the test accuracy printed below is optimistically biased.
 fpr, tpr, thresholds = roc_curve(y_test, raw_scores_test)
 roc_auc = auc(fpr, tpr)

 # Determine the optimal threshold (maximises Youden's J = tpr - fpr)
 optimal_idx = np.argmax(tpr - fpr)
 optimal_threshold = thresholds[optimal_idx]

 # Predictions using the optimal threshold. roc_curve defines each operating
 # point as "positive when score >= threshold", so the comparison has to be
 # inclusive; a strict '>' flips the sample that sits exactly on the threshold.
 y_pred_train = (raw_scores_train >= optimal_threshold) * 1
 y_pred_test = (raw_scores_test >= optimal_threshold) * 1

 # Evaluate model
 train_accuracy = accuracy_score(y_train, y_pred_train)
 test_accuracy = accuracy_score(y_test, y_pred_test)

 print("Training Accuracy:", train_accuracy)
 print("Testing Accuracy:", test_accuracy)
 print("Optimal Threshold:", optimal_threshold)
 print("AUC:", roc_auc)


Training Accuracy: 0.99122106943336
Testing Accuracy: 0.7644230769230769
Optimal Threshold: 0.5236027269442505
AUC: 0.792973124300112


In [35]:
#RVFL+


!pip install optuna
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error
import optuna
import pandas as pd

# LabelEncoder is used below but is not among this cell's imports; importing it
# here keeps the cell runnable without relying on an earlier cell's state.
from sklearn.preprocessing import LabelEncoder

# Load data: paths are relative to the notebook's working directory.
train_data = pd.read_csv('Desktop/Proj Work/TEST/FilteredTrain.csv')
test_data = pd.read_csv('Desktop/Proj Work/TEST/FilteredTest.csv')

# Preprocess data: drop the 'ID', 'SMILE ID' and 'LABEL' columns and keep every
# remaining column as a feature; .values yields plain NumPy arrays.
X_train = train_data.drop(['ID', 'SMILE ID', 'LABEL'], axis=1).values
X_test = test_data.drop(['ID', 'SMILE ID', 'LABEL'], axis=1).values

# Label encoding: fitted on the training labels only, then reused on the test
# labels so both splits share one label -> integer mapping.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['LABEL'].values)
y_test = label_encoder.transform(test_data['LABEL'].values)

class RVFL(BaseEstimator, RegressorMixin, ClassifierMixin):
    """Random Vector Functional Link network with a ridge-regression readout.

    Inputs are standardised, projected through a fixed random ``tanh`` hidden
    layer, concatenated with the standardised inputs themselves (the direct
    link) and fed to ``sklearn.linear_model.Ridge``.

    Args:
        n_hidden: Number of random hidden units.
        alpha: Ridge regularisation strength.
        random_state: Seed for the random hidden layer (``None`` = unseeded).

    Attributes set by ``fit``:
        scaler: The fitted ``StandardScaler``.
        W, b: Random hidden-layer weights and biases, uniform in [-1, 1).
        ridge: The fitted ``Ridge`` readout.
    """

    def __init__(self, n_hidden=20, alpha=0.1, random_state=None):
        self.n_hidden = n_hidden
        self.alpha = alpha
        self.random_state = random_state

    def _init_weights(self, n_features):
        """Draw the hidden-layer weights and biases for ``n_features`` inputs."""
        rng = np.random.default_rng(self.random_state)
        self.W = rng.uniform(-1, 1, size=(n_features, self.n_hidden))
        self.b = rng.uniform(-1, 1, size=self.n_hidden)

    def _augment(self, X_scaled):
        """Return ``[X_scaled | tanh(X_scaled @ W + b)]`` for the readout."""
        hidden = np.tanh(X_scaled @ self.W + self.b)
        return np.hstack((X_scaled, hidden))  # Augment with original features

    def fit(self, X, y):
        """Fit the scaler, draw the random layer and train the ridge readout.

        Returns:
            ``self``, so calls can be chained.
        """
        self.scaler = StandardScaler()
        X = self.scaler.fit_transform(X)
        n_samples, n_features = X.shape
        self._init_weights(n_features)

        self.ridge = Ridge(alpha=self.alpha)
        self.ridge.fit(self._augment(X), y)
        return self

    def predict(self, X):
        """Return the raw (continuous) ridge outputs for the rows of ``X``."""
        X = self.scaler.transform(X)
        return self.ridge.predict(self._augment(X))

    def score(self, X, y):
        """Score predictions on ``X`` against ``y``.

        * More than two distinct values in ``y``: mean squared error.
          NOTE(review): this is a lower-is-better value, unlike the accuracy
          branch; callers that maximise the score should take care.
        * Otherwise: accuracy, with labels decoded by thresholding the ridge
          output at 0.5 (assumes the two classes are encoded as 0 and 1).

        Ridge outputs are unbounded, so decoding with ``np.round`` would emit
        labels such as -1 or 2 that can never match a 0/1 target. Thresholding
        keeps every prediction inside {0, 1}. The strict ``>`` reproduces
        ``np.round``'s result of 0 for an output of exactly 0.5.
        """
        y_pred = self.predict(X)
        if len(np.unique(y)) > 2:
            return mean_squared_error(y, y_pred)
        else:
            return accuracy_score(y, (y_pred > 0.5).astype(int))

# Define objective function for Optuna
def objective(trial):
    """Optuna objective: fit an RVFL with sampled hyperparameters and score it.

    Samples ``n_hidden``, ``alpha`` (log scale) and ``random_state`` from the
    trial, fits on the notebook-level ``X_train`` / ``y_train`` and returns
    ``RVFL.score`` evaluated on ``X_test`` / ``y_test``.
    """
    # Dict entries are evaluated top to bottom, so the parameters are
    # requested from the trial in the order n_hidden, alpha, random_state.
    sampled = {
        'n_hidden': trial.suggest_int('n_hidden', 15000, 20000),
        'alpha': trial.suggest_float('alpha', 1e-4, 1e2, log=True),
        'random_state': trial.suggest_int('random_state', 1, 100),
    }
    candidate = RVFL(**sampled)
    candidate.fit(X_train, y_train)
    return candidate.score(X_test, y_test)

# Run the Optuna hyperparameter search: at most 50 trials or 600 seconds,
# whichever comes first, maximising the value returned by `objective`.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, timeout=600)

print("Best parameters:", study.best_params)
print("Best score:", study.best_value)

# Train final model with best parameters (the keys of best_params match the
# RVFL constructor arguments: n_hidden, alpha, random_state).
best_params = study.best_params
final_model = RVFL(**best_params)
final_model.fit(X_train, y_train)

# Predict and evaluate on the test dataset. predict() returns unbounded ridge
# outputs, so labels are decoded by thresholding at 0.5. np.round would turn
# outputs such as -0.6 or 1.7 into the labels -1 / 2, which can never match a
# 0/1 target. The strict '>' keeps np.round's result (0) for exactly 0.5.
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_test, (y_pred > 0.5).astype(int))
print("Final model accuracy on test set:", accuracy)


[I 2024-07-01 20:32:21,198] A new study created in memory with name: no-name-527788cd-926a-4abd-95f7-b47c553f701e


Defaulting to user installation because normal site-packages is not writeable


[I 2024-07-01 20:32:23,397] Trial 0 finished with value: 0.7644230769230769 and parameters: {'n_hidden': 16818, 'alpha': 0.3933935728043142, 'random_state': 92}. Best is trial 0 with value: 0.7644230769230769.
[I 2024-07-01 20:32:25,444] Trial 1 finished with value: 0.7692307692307693 and parameters: {'n_hidden': 16285, 'alpha': 0.0038604243116641267, 'random_state': 92}. Best is trial 1 with value: 0.7692307692307693.
[I 2024-07-01 20:32:27,845] Trial 2 finished with value: 0.7692307692307693 and parameters: {'n_hidden': 18105, 'alpha': 0.00017078844055669248, 'random_state': 98}. Best is trial 1 with value: 0.7692307692307693.
[I 2024-07-01 20:32:30,263] Trial 3 finished with value: 0.7596153846153846 and parameters: {'n_hidden': 17623, 'alpha': 0.33121281438825506, 'random_state': 97}. Best is trial 1 with value: 0.7692307692307693.
[I 2024-07-01 20:32:32,571] Trial 4 finished with value: 0.7740384615384616 and parameters: {'n_hidden': 19904, 'alpha': 32.45227604714787, 'random_stat

Best parameters: {'n_hidden': 18746, 'alpha': 0.05674163513104797, 'random_state': 6}
Best score: 0.8028846153846154
Final model accuracy on test set: 0.8028846153846154
