In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification

# Generate a sample dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42)
feature_names = [f'feature_{i}' for i in range(X.shape[1])]
X = pd.DataFrame(X, columns=feature_names)
y = pd.Series(y, name='target')

# Identify numerical and categorical features (assuming some are categorical for demonstration)
# In a real scenario, you would analyze your data to determine feature types
numerical_features = [f'feature_{i}' for i in range(15)]
categorical_features = [f'feature_{i}' for i in range(15, 20)] # Assuming last 5 are categorical

# Create preprocessing pipelines for numerical and categorical features
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a column transformer to apply different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

# Create a pipeline that includes preprocessing
preprocess_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Apply preprocessing to the data
X_processed = preprocess_pipeline.fit_transform(X)

print("Data loading and preprocessing complete.")
print("Shape of original data:", X.shape)
print("Shape of processed data:", X_processed.shape)

Data loading and preprocessing complete.
Shape of original data: (1000, 20)
Shape of processed data: (1000, 5015)


In [2]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

print("Data split complete.")
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Data split complete.
Shape of X_train: (800, 5015)
Shape of X_test: (200, 5015)
Shape of y_train: (800,)
Shape of y_test: (200,)


In [3]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

# Define the base estimator (Random Forest)
base_estimator = RandomForestClassifier(n_estimators=50, random_state=42)

# Define the AdaBoost model with the Random Forest base estimator
model = AdaBoostClassifier(estimator=base_estimator, n_estimators=50, random_state=42)

print("AdaBoost-Random Forest model defined.")
print(model)

AdaBoost-Random Forest model defined.
AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=50,
                                                    random_state=42),
                   random_state=42)


In [4]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for hyperparameter tuning
# Parameters for AdaBoostClassifier
# 'n_estimators': number of boosting rounds
# 'learning_rate': contribution of each weak learner

# Parameters for RandomForestClassifier (as the base estimator)
# 'estimator__n_estimators': number of trees in the forest
# 'estimator__max_depth': maximum depth of the trees
# 'estimator__min_samples_split': minimum number of samples required to split an internal node
# 'estimator__min_samples_leaf': minimum number of samples required to be at a leaf node

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'estimator__n_estimators': [50, 100],
    'estimator__max_depth': [10, 20, None],
    'estimator__min_samples_split': [2, 5],
    'estimator__min_samples_leaf': [1, 2]
}

print("Hyperparameter grid defined.")
print(param_grid)

Hyperparameter grid defined.
{'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0], 'estimator__n_estimators': [50, 100], 'estimator__max_depth': [10, 20, None], 'estimator__min_samples_split': [2, 5], 'estimator__min_samples_leaf': [1, 2]}


In [5]:
# Perform hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

print("Performing hyperparameter tuning...")
grid_search.fit(X_train, y_train)

print("Hyperparameter tuning complete.")
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Performing hyperparameter tuning...


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# Use the best estimator found by GridSearchCV, or the original model if tuning was interrupted
# In a real scenario, you would use grid_search.best_estimator_ after successful tuning
# For now, we'll use the original 'model' if grid_search was interrupted.
# If grid_search completed successfully, you would use grid_search.best_estimator_
try:
    best_model = grid_search.best_estimator_
except NameError:
    best_model = model
    print("GridSearchCV was interrupted. Using the initial model for evaluation.")


# Predict on the test data
y_pred = best_model.predict(X_test)

# Evaluate the model using real-world metrics
print("Model Evaluation:")
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("AUC Score:", roc_auc_score(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))