<a href="https://colab.research.google.com/github/sripriyakonjarla/Machine_Learning/blob/main/gpt2_rest_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_validate

# Load the dataset
data = pd.read_excel('gpt2_embeddings.xlsx')

# Assume the last column is the target variable
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target

# Split BEFORE fitting any transformer so the held-out rows never influence
# the scaler statistics or the PCA components (prevents train/test leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits
feature_scaler = StandardScaler()
X_train_std = feature_scaler.fit_transform(X_train_raw)
X_test_std = feature_scaler.transform(X_test_raw)

# Fit PCA (retain 99.99% of variance) on the training rows only
pca = PCA(n_components=0.9999)
X_train = pca.fit_transform(X_train_std)
X_test = pca.transform(X_test_std)

# Full-dataset views kept under their original names; they are produced with
# the train-fitted scaler/PCA, so they carry no information from the test rows.
X_scaled = feature_scaler.transform(X)
X_pca = pca.transform(X_scaled)



In [None]:
from sklearn.neural_network import MLPClassifier

# Base multilayer perceptron; the search below tunes its architecture/optimizer.
mlp = MLPClassifier(random_state=42, max_iter=2000)

# Discrete search space sampled by RandomizedSearchCV
param_distributions = dict(
    hidden_layer_sizes=[(50,), (100,)],
    activation=['logistic', 'tanh', 'relu'],
    solver=['adam', 'sgd'],
    learning_rate=['constant', 'adaptive'],
)

# 20 random candidates, each scored by macro-F1 over 10 folds, on all cores
random_search = RandomizedSearchCV(
    mlp,
    param_distributions,
    n_iter=20,
    scoring='f1_macro',
    cv=10,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

print("MLP Best Parameters:", random_search.best_params_)

# Re-evaluate the winning configuration with 10-fold CV on several metrics
best_mlp = random_search.best_estimator_
mlp_metric_labels = {
    'accuracy': 'Accuracy',
    'precision_macro': 'Precision',
    'recall_macro': 'Recall',
    'f1_macro': 'F1',
}
cv_score = cross_validate(
    best_mlp,
    X_train,
    y_train,
    cv=10,
    scoring=list(mlp_metric_labels),
    return_train_score=False,
)

# Mean and standard deviation of each metric across folds, rounded to 4 places
results = {}
for scorer_name, label in mlp_metric_labels.items():
    fold_scores = cv_score['test_' + scorer_name]
    results[label + ' Mean'] = round(fold_scores.mean(), 4)
    results[label + ' STD'] = round(fold_scores.std(), 4)

print("MLP Evaluation Metrics:",results)

In [None]:
from sklearn.linear_model import LogisticRegression

# Define the Logistic Regression model (liblinear supports both l1 and l2 penalties)
log_reg = LogisticRegression(solver='liblinear', random_state=42)

# Set up the parameter grid for RandomizedSearchCV
param_distributions = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1.0, 10.0],
}

# The grid is fully discrete and holds only 2 * 4 = 8 combinations. Asking for
# more iterations than that makes scikit-learn warn and silently truncate, so
# cap n_iter at the real number of candidates.
n_candidates = len(param_distributions['penalty']) * len(param_distributions['C'])

# Set up the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=log_reg,
    param_distributions=param_distributions,
    n_iter=min(20, n_candidates),
    cv=10,
    scoring='f1_macro',
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

print("Logistic Regression Best Parameters:", random_search.best_params_)

# Perform cross-validation on the best model
best_log_reg = random_search.best_estimator_
cv_score = cross_validate(best_log_reg, X_train, y_train, cv=10, scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'], return_train_score=False)

# Collect and round the results (mean / std of each metric over the 10 folds)
results = {
    'Accuracy Mean': round(cv_score['test_accuracy'].mean(), 4),
    'Accuracy STD': round(cv_score['test_accuracy'].std(), 4),
    'Precision Mean': round(cv_score['test_precision_macro'].mean(), 4),
    'Precision STD': round(cv_score['test_precision_macro'].std(), 4),
    'Recall Mean': round(cv_score['test_recall_macro'].mean(), 4),
    'Recall STD': round(cv_score['test_recall_macro'].std(), 4),
    'F1 Mean': round(cv_score['test_f1_macro'].mean(), 4),
    'F1 STD': round(cv_score['test_f1_macro'].std(), 4),
}

print("Logistic Regression Evaluation Metrics:", results)




Logistic Regression Best Parameters: {'penalty': 'l2', 'C': 0.1}
Logistic Regression Evaluation Metrics: {'Accuracy Mean': 0.6117, 'Accuracy STD': 0.0339, 'Precision Mean': 0.6137, 'Precision STD': 0.0334, 'Recall Mean': 0.6174, 'Recall STD': 0.0342, 'F1 Mean': 0.6144, 'F1 STD': 0.0333}


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE  # For handling class imbalance

# imblearn's Pipeline (unlike sklearn's) lets a sampler such as SMOTE be a step;
# the sampler then runs only while fitting, never on validation/prediction data.
from imblearn.pipeline import Pipeline as ImbPipeline

# Optional: Scaling the features (this isn't strictly necessary for Decision Trees but may help)
scaler = StandardScaler()

# Optional: Handling class imbalance using SMOTE
# NOTE(review): SMOTE's default neighbourhood needs enough minority samples in
# every training fold — confirm against the class counts in y_train.
smote = SMOTE(random_state=42)

# Define the Decision Tree model
dtree = DecisionTreeClassifier(random_state=42, class_weight='balanced')  # Use class_weight='balanced' to handle class imbalance

# Scaling and oversampling live INSIDE the pipeline so they are re-fitted on the
# training part of each CV fold. Resampling the whole training set up front
# would put synthetic points derived from validation rows into the training
# folds and inflate every cross-validated score.
dtree_pipeline = ImbPipeline(steps=[
    ('scaler', scaler),
    ('smote', smote),
    ('dtree', dtree),
])

# Set up the parameter grid for RandomizedSearchCV
# (keys are prefixed with the pipeline step name of the tree)
param_distributions = {
    'dtree__criterion': ['gini', 'entropy'],
    'dtree__max_depth': [None, 5, 10, 15, 20, 25],
    'dtree__min_samples_split': [2, 5, 10, 20],
    'dtree__min_samples_leaf': [1, 2, 4, 6, 10],
    'dtree__max_features': [None, 'sqrt', 'log2'],
}

# Set up the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=dtree_pipeline,
    param_distributions=param_distributions,
    n_iter=20,
    cv=StratifiedKFold(n_splits=10),
    scoring='f1_macro',
    random_state=42,
    n_jobs=-1
)

# Fit on the original (non-resampled) training data; the pipeline resamples per fold
random_search.fit(X_train, y_train)

print("Best Parameters:", random_search.best_params_)

# best_dtree is the whole fitted pipeline (scaler + SMOTE + tree), so it can be
# applied directly to data in the same representation as X_train.
best_dtree = random_search.best_estimator_

# Validation folds now contain only real samples, so the metrics are honest
cv_score = cross_validate(best_dtree, X_train, y_train, cv=StratifiedKFold(n_splits=10),
                          scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'], return_train_score=False)

# Mean / std of each metric across the 10 folds, rounded to 4 places
results = {
    'Accuracy Mean': round(cv_score['test_accuracy'].mean(), 4),
    'Accuracy STD': round(cv_score['test_accuracy'].std(), 4),
    'Precision Mean': round(cv_score['test_precision_macro'].mean(), 4),
    'Precision STD': round(cv_score['test_precision_macro'].std(), 4),
    'Recall Mean': round(cv_score['test_recall_macro'].mean(), 4),
    'Recall STD': round(cv_score['test_recall_macro'].std(), 4),
    'F1 Mean': round(cv_score['test_f1_macro'].mean(), 4),
    'F1 STD': round(cv_score['test_f1_macro'].std(), 4),
}

print("Evaluation Metrics:",results)

Best Parameters: {'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': None, 'max_depth': 15, 'criterion': 'gini'}
Evaluation Metrics: {'Accuracy Mean': 0.5468, 'Accuracy STD': 0.0674, 'Precision Mean': 0.5522, 'Precision STD': 0.0672, 'Recall Mean': 0.5469, 'Recall STD': 0.0676, 'F1 Mean': 0.5456, 'F1 STD': 0.0642}


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

# Base gradient-boosted tree classifier; the search below tunes its main knobs.
# NOTE(review): XGBClassifier expects integer class labels 0..k-1 — confirm the
# target column is already encoded that way.
xgb = XGBClassifier(random_state=42)

# Search space. 'class_weight' was removed: it is a scikit-learn convention,
# not an XGBClassifier parameter, so it had no effect on the model and only
# added a dead dimension to the search. If class weighting is needed, pass
# per-sample weights to fit() instead — TODO confirm whether it is required.
param_distributions = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01,  0.1],
    'subsample': [ 0.8, 1.0],
    'colsample_bytree': [ 0.8, 1.0],
    'gamma': [0, 1],
}

# One splitter shared by the search and the final evaluation so both use the
# same stratified 10-fold scheme.
xgb_cv = StratifiedKFold(n_splits=10)

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    n_iter=20,
    cv=xgb_cv,
    scoring='f1_macro',
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

print("Best Parameters:", random_search.best_params_)

best_xgb = random_search.best_estimator_

# Re-evaluate the winning configuration on several metrics
cv_score = cross_validate(best_xgb, X_train, y_train, cv=xgb_cv,
                          scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'], return_train_score=False)

# Mean / std of each metric across the 10 folds, rounded to 4 places
results = {
    'Accuracy Mean': round(cv_score['test_accuracy'].mean(), 4),
    'Accuracy STD': round(cv_score['test_accuracy'].std(), 4),
    'Precision Mean': round(cv_score['test_precision_macro'].mean(), 4),
    'Precision STD': round(cv_score['test_precision_macro'].std(), 4),
    'Recall Mean': round(cv_score['test_recall_macro'].mean(), 4),
    'Recall STD': round(cv_score['test_recall_macro'].std(), 4),
    'F1 Mean': round(cv_score['test_f1_macro'].mean(), 4),
    'F1 STD': round(cv_score['test_f1_macro'].std(), 4),
}

print("Evaluation Metrics:",results)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-e2e97e8bd7b2>", line 27, in <cell line: 27>
    random_search.fit(X_train, y_train)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py", line 1019, in fit
    self._run_search(evaluate_candidates)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py", line 1960, in _run_search
    evaluate_candidates(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py", line 965, in evaluate_candidates
    out = parallel(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/parallel.py", line 74, in __call__
    return super().__call__(iterable_with_

TypeError: object of type 'NoneType' has no len()

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_validate

# Base CatBoost classifier: multi-class objective, training log suppressed
catboost_model = CatBoostClassifier(loss_function='MultiClass', silent=True)

# Discrete search space; the loss function is fixed on the estimator above and
# is deliberately not part of the search.
param_distributions = dict(
    iterations=[100, 200, 500],
    depth=[4, 6, 8, 10],
    learning_rate=[0.01, 0.1, 0.2],
    l2_leaf_reg=[1, 3, 5, 7],
)

# Randomized search: 20 sampled candidates, 10-fold CV, macro-F1, all cores
random_search = RandomizedSearchCV(
    catboost_model,
    param_distributions,
    n_iter=20,
    scoring='f1_macro',
    cv=10,
    n_jobs=-1,
    random_state=42,
)

# Run the search on the training split
random_search.fit(X_train, y_train)

# Report the winning hyper-parameters
print("Best Parameters:", random_search.best_params_)

best_catboost = random_search.best_estimator_

# Stratified 10-fold splitter for the final evaluation
cv = StratifiedKFold(n_splits=10)

# Cross-validate the best model; train scores are requested as well so that
# training accuracy can be reported next to validation accuracy.
cv_score = cross_validate(
    best_catboost,
    X_train,
    y_train,
    cv=cv,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
    return_train_score=True,
)


def _mean_std(fold_values):
    """Return (mean, std) of the per-fold scores, each rounded to 4 places."""
    return round(fold_values.mean(), 4), round(fold_values.std(), 4)


# (report label, key in cv_score) in the order the summary should list them
catboost_report_layout = [
    ('Accuracy', 'test_accuracy'),
    ('Training Accuracy', 'train_accuracy'),
    ('Precision', 'test_precision_macro'),
    ('Recall', 'test_recall_macro'),
    ('F1', 'test_f1_macro'),
]

# Assemble the rounded summary
results = {}
for label, score_key in catboost_report_layout:
    results[label + ' Mean'], results[label + ' STD'] = _mean_std(cv_score[score_key])

# Show the evaluation summary
print("Evaluation Metrics:", results)