<a href="https://colab.research.google.com/github/sripriyakonjarla/Machine_Learning/blob/main/tuning_bart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import torch
from transformers import BartTokenizer, BartModel

# Load the pretrained BART tokenizer and model from the same checkpoint name.
# Both objects are module-level and are read by generate_embeddings() below.
model_name = "facebook/bart-base"  # Checkpoint to load: 'facebook/bart-base' or 'facebook/bart-large'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartModel.from_pretrained(model_name)

# Read the spreadsheet that holds the texts to embed
excel_file_path = 'testingData2.xlsx'  # Replace with your file path
df = pd.read_excel(excel_file_path)

# The texts are taken from the column named 'Input' (capital I, matching the lookup below)
answers = df['Input'].tolist()  # Adjust the column name to match your sheet

# Function to generate embeddings using BART
def generate_embeddings(texts):
    """Embed each text as the padding-aware mean of BART's last hidden states.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        texts: Iterable of values to embed. Every item is passed through
            ``str()`` first, so non-string cells are embedded as their
            string form.

    Returns:
        A ``torch.Tensor`` with one row per input text: the mean of that
        text's hidden states taken over its non-padding positions only.

    Note:
        A plain ``mean(dim=1)`` also averages the hidden states at padding
        positions, which makes a text's embedding depend on the longest text
        in its batch. Excluding padding removes that dependence. Embeddings
        created with the plain mean are not directly comparable with these;
        regenerate stored embeddings that must share a feature space.
    """
    texts = [str(text) for text in texts]
    # Tokenize; shorter texts are padded up to the longest text in the batch
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state

    # Per the Hugging Face BART documentation, when no decoder_input_ids are
    # passed the decoder is fed input_ids shifted one position to the right
    # (a start token is prepended). The hidden state at position j therefore
    # belongs to input position j-1, so the padding mask is shifted the same
    # way: position 0 is always real, position j is real iff input j-1 is.
    attention_mask = inputs["attention_mask"]
    decoder_mask = torch.cat(
        [torch.ones_like(attention_mask[:, :1]), attention_mask[:, :-1]], dim=1
    )

    # Mask-weighted mean over the sequence dimension
    weights = decoder_mask.unsqueeze(-1).to(embeddings.dtype)
    token_counts = weights.sum(dim=1).clamp(min=1.0)  # guard against division by zero
    mean_embeddings = (embeddings * weights).sum(dim=1) / token_counts
    return mean_embeddings

# Embed the answers in fixed-size chunks so no single forward pass has to
# hold the whole dataset
batch_size = 16
all_embeddings = [
    generate_embeddings(answers[start:start + batch_size])
    for start in range(0, len(answers), batch_size)
]

# Stack the per-chunk tensors row-wise into one tensor (one row per answer)
all_embeddings = torch.cat(all_embeddings, dim=0)

# Wrap the tensor in a DataFrame and write it out without the index column
embeddings_df = pd.DataFrame(all_embeddings.numpy())
embeddings_df.to_excel('testing_embeddings.xlsx', index=False)  # Adjust the save path as needed
print("Embeddings generated and saved successfully.")



In [None]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier  # Example classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Read the table of precomputed embeddings plus labels
data = pd.read_excel('bart_embeddings.xlsx')

# Positional split: every column except the last is a feature,
# the last column is taken to be the target
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Fit the standardizer on the features, then apply it to those same features
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Install CatBoost into the running kernel's environment (%pip rather than !pip),
# pinned to the version this notebook's recorded output shows was installed.
%pip install -q catboost==1.2.7

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Classifier line-up, keyed by the label used when reporting scores
models = {
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "MLP": MLPClassifier(max_iter=1000, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss'),
    "CatBoost": CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, verbose=0),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "Extra Trees": ExtraTreesClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    ),
}

# Start every classifier with an empty score record
metrics = {label: {} for label in models}

# Fit each classifier on the full scaled feature matrix and score it on those
# same rows, so the numbers reported here are training-set scores
for clf_label, clf in models.items():
    clf.fit(X_scaled, y)
    train_pred = clf.predict(X_scaled)

    # Precision, recall and F1 are averaged across classes, weighted by support
    metrics[clf_label].update({
        'Accuracy': accuracy_score(y, train_pred),
        'Precision': precision_score(y, train_pred, average='weighted'),
        'Recall': recall_score(y, train_pred, average='weighted'),
        'F1 Score': f1_score(y, train_pred, average='weighted'),
    })

# Report one line per classifier
for clf_label, scores in metrics.items():
    print(f"{clf_label}: {scores}")



SVM: {'Accuracy': 0.8053571428571429, 'Precision': 0.8093708704359247, 'Recall': 0.8053571428571429, 'F1 Score': 0.8058759372323357}
KNN: {'Accuracy': 0.756547619047619, 'Precision': 0.7567349396097982, 'Recall': 0.756547619047619, 'F1 Score': 0.7565872206031347}
Decision Tree: {'Accuracy': 0.9791666666666666, 'Precision': 0.9795091545818201, 'Recall': 0.9791666666666666, 'F1 Score': 0.9791168193041022}
Gaussian Naive Bayes: {'Accuracy': 0.5369047619047619, 'Precision': 0.564503007601679, 'Recall': 0.5369047619047619, 'F1 Score': 0.5260989623311423}
Random Forest: {'Accuracy': 0.9791666666666666, 'Precision': 0.9792658791997, 'Recall': 0.9791666666666666, 'F1 Score': 0.979191341907501}
Logistic Regression: {'Accuracy': 0.9321428571428572, 'Precision': 0.9321610767299515, 'Recall': 0.9321428571428572, 'F1 Score': 0.9321450170150424}
MLP: {'Accuracy': 0.9648809523809524, 'Precision': 0.9648745872835096, 'Recall': 0.9648809523809524, 'F1 Score': 0.9648772290495987}
XGBoost: {'Accuracy': 0

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import sys  # For flushing output

# Classifier line-up, keyed by the label used when reporting scores
models = {
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "MLP": MLPClassifier(max_iter=1000, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss'),
    "CatBoost": CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, verbose=0),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "Extra Trees": ExtraTreesClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    ),
}

# Start every classifier with an empty record; each is replaced once scored
metrics = {label: {} for label in models}

# Order in which the fields of a record are printed
report_fields = ('Hyperparameters', 'Accuracy', 'Precision', 'Recall', 'F1 Score')

# Fit each classifier on the full scaled feature matrix and score it on those
# same rows (training-set scores), reporting as soon as each one finishes
for clf_label, clf in models.items():
    clf.fit(X_scaled, y)
    train_pred = clf.predict(X_scaled)

    # Precision, recall and F1 are averaged across classes, weighted by support
    record = {
        'Accuracy': accuracy_score(y, train_pred),
        'Precision': precision_score(y, train_pred, average='weighted'),
        'Recall': recall_score(y, train_pred, average='weighted'),
        'F1 Score': f1_score(y, train_pred, average='weighted'),
    }
    # Keep the full parameter set next to the scores it produced
    record['Hyperparameters'] = clf.get_params()
    metrics[clf_label] = record

    print(f"{clf_label}:")
    for field in report_fields:
        print(f"  {field}: {record[field]}")
    # Flush with the separator so the block shows up before the next fit starts
    print("-" * 80, flush=True)


SVM:
  Hyperparameters: {'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
  Accuracy: 0.8053571428571429
  Precision: 0.8093708704359247
  Recall: 0.8053571428571429
  F1 Score: 0.8058759372323357
--------------------------------------------------------------------------------
KNN:
  Hyperparameters: {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
  Accuracy: 0.756547619047619
  Precision: 0.7567349396097982
  Recall: 0.756547619047619
  F1 Score: 0.7565872206031347
--------------------------------------------------------------------------------
Decision Tree:
  Hyperparameters: {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'm



AdaBoost:
  Hyperparameters: {'algorithm': 'SAMME.R', 'estimator': None, 'learning_rate': 1.0, 'n_estimators': 100, 'random_state': 42}
  Accuracy: 0.6916666666666667
  Precision: 0.6977385403048453
  Recall: 0.6916666666666667
  F1 Score: 0.6935174327068164
--------------------------------------------------------------------------------
Extra Trees:
  Hyperparameters: {'bootstrap': False, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
  Accuracy: 0.9791666666666666
  Precision: 0.9795091545818201
  Recall: 0.9791666666666666
  F1 Score: 0.9791168193041022
--------------------------------------------------------------------------------
Gradient 

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import sys  # For flushing output

# cross_validate is not in this cell's import list, so bring it in here
from sklearn.model_selection import cross_validate

# Define models
models = {
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "MLP": MLPClassifier(max_iter=1000, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss'),
    "CatBoost": CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, verbose=0),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "Extra Trees": ExtraTreesClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
}

# Initialize a dictionary to hold metrics
metrics = {clf_name: {} for clf_name in models}
cv_folds = 5

# Report label -> scikit-learn scorer name (class-weighted averages, as before)
cv_scorers = {
    'Accuracy': 'accuracy',
    'Precision': 'precision_weighted',
    'Recall': 'recall_weighted',
    'F1 Score': 'f1_weighted',
}

# Evaluate each model using cross-validation
for clf_name, clf in models.items():
    # One multi-metric pass: every fold is fitted once and all four metrics are
    # computed from that same fit. Separate cross_val_score calls per metric
    # refit every model four times and, for unseeded models, score each metric
    # on a different fit.
    cv_results = cross_validate(
        clf, X_scaled, y, cv=cv_folds, scoring=list(cv_scorers.values())
    )

    # Mean and standard deviation across folds for each metric
    fold_summary = {}
    for label, scorer in cv_scorers.items():
        fold_scores = cv_results[f'test_{scorer}']
        fold_summary[f'{label} Mean'] = np.mean(fold_scores)
        fold_summary[f'{label} Std'] = np.std(fold_scores)
    fold_summary['Hyperparameters'] = clf.get_params()
    metrics[clf_name] = fold_summary

    # Print results
    print(f"{clf_name}:")
    print(f"  Hyperparameters: {metrics[clf_name]['Hyperparameters']}")
    print(f"  Accuracy Mean: {metrics[clf_name]['Accuracy Mean']:.4f}, Accuracy Std: {metrics[clf_name]['Accuracy Std']:.4f}")
    print(f"  Precision Mean: {metrics[clf_name]['Precision Mean']:.4f}, Precision Std: {metrics[clf_name]['Precision Std']:.4f}")
    print(f"  Recall Mean: {metrics[clf_name]['Recall Mean']:.4f}, Recall Std: {metrics[clf_name]['Recall Std']:.4f}")
    print(f"  F1 Score Mean: {metrics[clf_name]['F1 Score Mean']:.4f}, F1 Score Std: {metrics[clf_name]['F1 Score Std']:.4f}")
    print("-" * 80)

    # Flush to make sure the output is immediately displayed
    sys.stdout.flush()


SVM:
  Hyperparameters: {'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
  Accuracy Mean: 0.5268, Accuracy Std: 0.0418
  Precision Mean: 0.5343, Precision Std: 0.0425
  Recall Mean: 0.5268, Recall Std: 0.0418
  F1 Score Mean: 0.5243, F1 Score Std: 0.0395
--------------------------------------------------------------------------------
KNN:
  Hyperparameters: {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
  Accuracy Mean: 0.4821, Accuracy Std: 0.0270
  Precision Mean: 0.4902, Precision Std: 0.0261
  Recall Mean: 0.4821, Recall Std: 0.0270
  F1 Score Mean: 0.4822, F1 Score Std: 0.0263
------------------------------------------------------------------------------

KeyboardInterrupt: 

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import catboost as cb
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# StandardScaler is used below but is not in this cell's import list
from sklearn.preprocessing import StandardScaler

# Load the dataset from an Excel file
data = pd.read_excel('bart_embeddings.xlsx')

# Assume the last column is the target variable
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target variable

# Standardize features.
# NOTE(review): the scaler is fit on all rows before cross-validation, so each
# fold's scaling statistics include its held-out rows.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Base models for stacking
base_learners = [
    ('svm', SVC(kernel='rbf', gamma='scale', degree=4, C=10)),
    ('knn', KNeighborsClassifier(weights='uniform', n_neighbors=7, metric='manhattan')),
    ('dt', DecisionTreeClassifier(min_samples_split=2, min_samples_leaf=4, max_features='sqrt', max_depth=25, criterion='gini')),
    ('gnb', GaussianNB(var_smoothing=1e-09)),
    ('rf', RandomForestClassifier(n_estimators=100, min_samples_split=10, min_samples_leaf=1, max_features='sqrt', max_depth=20, class_weight='balanced', bootstrap=True)),
    ('logreg', LogisticRegression(penalty='l2', C=0.01)),
    # verbose is left at its default: the MLP is refit for every inner and outer
    # fold, and per-iteration loss lines from each fit would bury the results
    ('mlp', MLPClassifier(solver='adam', learning_rate='constant', hidden_layer_sizes=(50,), activation='logistic', max_iter=500)),
    ('catboost', cb.CatBoostClassifier(learning_rate=0.2, iterations=200, depth=4, verbose=0)),
    ('xgboost', xgb.XGBClassifier(subsample=0.8, n_estimators=200, max_depth=7, learning_rate=0.1, gamma=1, colsample_bytree=1.0)),
    ('extratrees', ExtraTreesClassifier(n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_features='sqrt', bootstrap=True)),
    ('gb', GradientBoostingClassifier(subsample=1.0, n_estimators=200, max_features='sqrt', max_depth=6, learning_rate=0.3))
]

# Meta-classifier for stacking
meta_model = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)

# Create the Stacking Classifier
stacking_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_model,
    cv=5  # Inner cross-validation used to build the meta-model's training features
)

# Metrics evaluated during cross-validation. Precision, recall and F1 use
# support-weighted averaging: the default 'binary' average raises on a
# multiclass target, and weighted averaging matches the other evaluation cells.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

# Perform cross-validation on the full dataset (training data only)
cv_results = cross_validate(stacking_model, X_scaled, y, cv=5, scoring=scoring)

# Print the cross-validation results for each metric (mean ± std across folds)
print(f"Cross-validation accuracy: {cv_results['test_accuracy'].mean():.4f} ± {cv_results['test_accuracy'].std():.4f}")
print(f"Cross-validation precision: {cv_results['test_precision'].mean():.4f} ± {cv_results['test_precision'].std():.4f}")
print(f"Cross-validation recall: {cv_results['test_recall'].mean():.4f} ± {cv_results['test_recall'].std():.4f}")
print(f"Cross-validation F1-score: {cv_results['test_f1'].mean():.4f} ± {cv_results['test_f1'].std():.4f}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import xgboost as xgb
import catboost as cb

# StandardScaler is not in this cell's import list, so bring it in here
from sklearn.preprocessing import StandardScaler

# Held-out test set: embedding columns plus a label column named 'output'.
# NOTE(review): the embedding cell writes testing_embeddings.xlsx with embedding
# columns only, so the 'output' labels must be added to the file before this
# cell runs. Confirm how that column is produced.
df_test = pd.read_excel('testing_embeddings.xlsx')
X_test = df_test.drop(columns=['output'])  # Adjust to your actual feature columns
y_test = df_test['output']  # Adjust to your actual target column

# Training set: the same file the tuning cells use, last column = target.
# Models must be fit on this data, not on the test set. Fitting and scoring on
# the same test rows reports memorisation rather than generalisation.
df_train = pd.read_excel('bart_embeddings.xlsx')
X_train = df_train.iloc[:, :-1]
y_train = df_train.iloc[:, -1]

# Features are matched by position, so both tables need the same width.
# Assumes both files list the embedding dimensions in the same order (TODO confirm).
if X_train.shape[1] != X_test.shape[1]:
    raise ValueError(
        f"Feature count mismatch: training data has {X_train.shape[1]} columns, "
        f"test data has {X_test.shape[1]}"
    )

# Standardize with statistics learned from the training features only, then
# apply that same transform to the test features
feature_scaler = StandardScaler().fit(X_train.to_numpy())
X_train_scaled = feature_scaler.transform(X_train.to_numpy())
X_test_scaled = feature_scaler.transform(X_test.to_numpy())

# Tuned hyperparameters for each classifier
models = {
    "SVM": SVC(kernel='rbf', gamma='scale', degree=4, C=10),
    "KNN": KNeighborsClassifier(weights='uniform', n_neighbors=7, metric='manhattan'),
    "DecisionTree": DecisionTreeClassifier(min_samples_split=2, min_samples_leaf=4, max_features='sqrt', max_depth=25, criterion='gini'),
    "GaussianNB": GaussianNB(var_smoothing=1e-09),
    "RF": RandomForestClassifier(n_estimators=100, min_samples_split=10, min_samples_leaf=1, max_features='sqrt', max_depth=20, class_weight='balanced', bootstrap=True),
    "Logistic Regression": LogisticRegression(penalty='l2', C=0.01),
    "MLP": MLPClassifier(solver='adam', learning_rate='constant', hidden_layer_sizes=(50,), activation='logistic', max_iter=500),
    # verbose=0 keeps per-iteration training logs out of the results
    "CatBoost": cb.CatBoostClassifier(learning_rate=0.2, iterations=200, depth=4, verbose=0),
    # class_weight is not an XGBClassifier parameter, so it is not passed here
    "XGBoost": xgb.XGBClassifier(subsample=0.8, n_estimators=200, max_depth=7, learning_rate=0.1, gamma=1, colsample_bytree=1.0),
    "AdaBoost": AdaBoostClassifier(n_estimators=50, learning_rate=1.0),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_features='sqrt', max_depth=None, bootstrap=True),
    "GradientBoosting": GradientBoostingClassifier(subsample=1.0, n_estimators=200, max_features='sqrt', max_depth=6, learning_rate=0.3)
}

# Dictionary to store the actual and predicted results
results = {}

# Train on the training set, predict on the held-out test set
for clf_name, clf in models.items():
    clf.fit(X_train_scaled, y_train)

    # Predict on the test set
    y_pred = clf.predict(X_test_scaled)

    # Calculate confusion matrix and accuracy
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    # Store the results for each model
    results[clf_name] = {
        "Actual": y_test.tolist(),
        "Predicted": y_pred.tolist(),
        "Confusion Matrix": cm,
        "Accuracy": accuracy
    }

# Print results for each model
for clf_name, result in results.items():
    print(f"\n{clf_name} Results:")

    # Create a DataFrame to show Actual vs Predicted values in a table format
    comparison_df = pd.DataFrame({
        'Actual': result["Actual"],
        'Predicted': result["Predicted"]
    })

    print(comparison_df)  # Print the actual vs predicted values for the current model
    print(f"Accuracy: {result['Accuracy']}")
    print(f"Confusion Matrix:\n{result['Confusion Matrix']}")
    print("\n" + "-"*50)  # Just a separator for clarity between models
