In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np

In [2]:
data = pd.read_csv('ecoli.csv')
data

Unnamed: 0,mcg,gvh,lip,chg,aac,alm1,alm2,label
0,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,0.07,0.40,0.48,0.5,0.54,0.35,0.44,cp
2,0.56,0.40,0.48,0.5,0.49,0.37,0.46,cp
3,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp
...,...,...,...,...,...,...,...,...
331,0.74,0.56,0.48,0.5,0.47,0.68,0.30,pp
332,0.71,0.57,0.48,0.5,0.48,0.35,0.32,pp
333,0.61,0.60,0.48,0.5,0.44,0.39,0.38,pp
334,0.59,0.61,0.48,0.5,0.42,0.42,0.37,pp


In [3]:
data['label'].value_counts()

label
cp     143
im      77
pp      52
imU     35
om      20
omL      5
imS      2
imL      2
Name: count, dtype: int64

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
x = data.iloc[:,0:7]
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state=42)

In [12]:
# Create Gradient Boosting Classifier
gbc = GradientBoostingClassifier(
    loss='log_loss',  # Default loss function that supports multi-class
    learning_rate=0.1,  # Step size shrinkage used in update to prevent overfitting
    n_estimators=100,  # Number of boosting stages
    subsample=1.0,  # Fraction of samples used for fitting the individual base learners
    criterion='friedman_mse',  # Function to measure the quality of a split
    min_samples_split=2,  # Minimum number of samples required to split an internal node
    min_samples_leaf=1,  # Minimum number of samples required to be at a leaf node
    max_depth=3,  # Maximum depth of the individual base learners
    random_state=42,  # Random state for reproducibility
    max_features=None,  # Number of features to consider when looking for the best split
    verbose=0,  # Verbosity level
    max_leaf_nodes=None,  # Maximum number of leaf nodes in each individual base learner
    warm_start=False,  # Whether to reuse the solution of the previous call to fit
    validation_fraction=0.1,  # Proportion of training data set aside as validation set
    n_iter_no_change=None,  # Number of iterations with no improvement to wait before stopping
    tol=0.0001,  # Tolerance for the early stopping
    ccp_alpha=0.0  # Complexity parameter used for Minimal Cost-Complexity Pruning
)

# Train the model
gbc.fit(X_train, y_train)

# Make predictions
y_pred = gbc.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7794
Classification Report:
              precision    recall  f1-score   support

          cp       0.91      0.97      0.94        32
          im       0.58      0.54      0.56        13
         imS       0.00      0.00      0.00         1
         imU       0.50      0.50      0.50         4
          om       1.00      0.83      0.91         6
         omL       1.00      1.00      1.00         1
          pp       0.70      0.64      0.67        11

    accuracy                           0.78        68
   macro avg       0.67      0.64      0.65        68
weighted avg       0.79      0.78      0.78        68



In [6]:
# Define parameter ranges
learning_rates = [0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 1.0]
n_estimators_list = [100, 200, 300, 400, 500]
subsamples = [0.2, 0.5, 0.6, 0.7, 0.8, 1.0]
train_test_ratios = [(0.9, 0.1), (0.8, 0.2), (0.7, 0.3), (0.6, 0.4), (0.5, 0.5)]

# List to store results
results = []

# Loop through all combinations of parameters
for lr in learning_rates:
    for n_est in n_estimators_list:
        for subsample in subsamples:
            for train_ratio, test_ratio in train_test_ratios:
                # Split data into training and testing sets
                X_train, X_test, y_train, y_test = train_test_split(
                    x, y, test_size=test_ratio, train_size=train_ratio, random_state=42
                )
                
                # Create Gradient Boosting Classifier with current parameters
                gbc = GradientBoostingClassifier(
                    learning_rate=lr,
                    n_estimators=n_est,
                    subsample=subsample,
                    criterion='friedman_mse',
                    min_samples_split=2,
                    min_samples_leaf=1,
                    max_depth=3,
                    random_state=42,
                    max_features=None,
                    verbose=0,
                    max_leaf_nodes=None,
                    warm_start=False,
                    validation_fraction=0.1,
                    n_iter_no_change=None,
                    tol=0.0001,
                    ccp_alpha=0.0
                )

                # Train the model
                gbc.fit(X_train, y_train)

                # Make predictions
                y_pred = gbc.predict(X_test)

                # Evaluate the model
                accuracy = accuracy_score(y_test, y_pred)
                classification_rep = classification_report(y_test, y_pred, output_dict=True)

                # Store the results
                results.append({
                    'learning_rate': lr,
                    'n_estimators': n_est,
                    'subsample': subsample,
                    'train_test_ratio': f"{int(train_ratio*100)}:{int(test_ratio*100)}",
                    'accuracy': accuracy,
                    'classification_report': classification_rep
                })

# Convert results to a DataFrame for easier analysis
results_df = pd.DataFrame(results)

# Display results
print(results_df)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

      learning_rate  n_estimators  subsample train_test_ratio  accuracy  \
0               0.1           100        0.2            90:10  0.823529   
1               0.1           100        0.2            80:20  0.088235   
2               0.1           100        0.2            70:30  0.811881   
3               0.1           100        0.2            60:40  0.814815   
4               0.1           100        0.2            50:50  0.839286   
...             ...           ...        ...              ...       ...   
1045            1.0           500        1.0            90:10  0.147059   
1046            1.0           500        1.0            80:20  0.838235   
1047            1.0           500        1.0            70:30  0.831683   
1048            1.0           500        1.0            60:40  0.829630   
1049            1.0           500        1.0            50:50  0.827381   

                                  classification_report  
0     {'cp': {'precision': 0.947368421052

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
results_df

Unnamed: 0,learning_rate,n_estimators,subsample,train_test_ratio,accuracy,classification_report
0,0.1,100,0.2,90:10,0.823529,"{'cp': {'precision': 0.9473684210526315, 'reca..."
1,0.1,100,0.2,80:20,0.088235,"{'cp': {'precision': 0.0, 'recall': 0.0, 'f1-s..."
2,0.1,100,0.2,70:30,0.811881,"{'cp': {'precision': 0.92, 'recall': 1.0, 'f1-..."
3,0.1,100,0.2,60:40,0.814815,"{'cp': {'precision': 0.9538461538461539, 'reca..."
4,0.1,100,0.2,50:50,0.839286,"{'cp': {'precision': 0.9230769230769231, 'reca..."
...,...,...,...,...,...,...
1045,1.0,500,1.0,90:10,0.147059,"{'cp': {'precision': 1.0, 'recall': 0.05263157..."
1046,1.0,500,1.0,80:20,0.838235,"{'cp': {'precision': 0.9142857142857143, 'reca..."
1047,1.0,500,1.0,70:30,0.831683,"{'cp': {'precision': 0.9583333333333334, 'reca..."
1048,1.0,500,1.0,60:40,0.829630,"{'cp': {'precision': 0.9523809523809523, 'reca..."


In [8]:
# Define parameter ranges
param_grid = {
    'learning_rate': [0.001, 0.01, 0.1, 1.0],
    'n_estimators': [100, 200, 300, 400, 500],
    'subsample': [0.5, 0.6, 0.7, 0.8, 1.0],
}

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Create Gradient Boosting Classifier
gbc = GradientBoostingClassifier(
    criterion='friedman_mse',
    min_samples_split=2,
    min_samples_leaf=1,
    max_depth=3,
    random_state=42,
    max_features=None,
    verbose=0,
    max_leaf_nodes=None,
    warm_start=False,
    validation_fraction=0.1,
    n_iter_no_change=None,
    tol=0.0001,
    ccp_alpha=0.0
)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=gbc, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
best_score = grid_search.best_score_
# Print best parameters
print("Best Parameters:", best_params)

# Use best parameters to train the final model 
best_gbc = GradientBoostingClassifier(**best_params)
best_gbc.fit(X_train, y_train)

# Make predictions
y_pred = best_gbc.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)




Best Parameters: {'learning_rate': 0.01, 'n_estimators': 500, 'subsample': 0.5}
Accuracy: 0.8382352941176471
Classification Report:
               precision    recall  f1-score   support

          cp       0.91      0.97      0.94        32
          im       0.75      0.69      0.72        13
         imS       0.00      0.00      0.00         1
         imU       0.60      0.75      0.67         4
          om       1.00      0.83      0.91         6
         omL       1.00      1.00      1.00         1
          pp       0.73      0.73      0.73        11

    accuracy                           0.84        68
   macro avg       0.71      0.71      0.71        68
weighted avg       0.83      0.84      0.83        68



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
x = data.iloc[:,0:7]
y = data['label']

# Define parameter ranges
param_grid = {
    'learning_rate': [0.001, 0.01, 0.1, 1.0],
    'n_estimators': [100, 200, 300, 400, 500],
    'subsample': [0.5, 0.6, 0.7, 0.8, 1.0],
}

# Define different test ratios
test_ratios = [0.1, 0.2, 0.3, 0.4, 0.5]

for test_ratio in test_ratios:
    print(f"Test ratio: {test_ratio}")
    print("--------------------------------------")
    
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=test_ratio, random_state=42
    )

    # Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=GradientBoostingClassifier(random_state=42),
                               param_grid=param_grid, cv=5, scoring='accuracy')

    # Fit the grid search to the data
    grid_search.fit(X_train, y_train)

    # Get the best parameters and best score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # Use best parameters to train the final model 
    best_gbc = GradientBoostingClassifier(**best_params)
    best_gbc.fit(X_train, y_train)

    # Make predictions
    y_pred = best_gbc.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    # Print best parameters and best score
    print("Best Parameters:", best_params)
    print("Best Score:", best_score)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_rep)
    print("--------------------------------------")
    print("--------------------------------------")


Test ratio: 0.1
--------------------------------------




Best Parameters: {'learning_rate': 0.1, 'n_estimators': 400, 'subsample': 0.5}
Best Score: 0.8411475409836064
Accuracy: 0.8823529411764706
Classification Report:
               precision    recall  f1-score   support

          cp       1.00      0.89      0.94        19
          im       0.71      1.00      0.83         5
         imU       1.00      1.00      1.00         2
          om       1.00      0.67      0.80         3
         omL       1.00      1.00      1.00         1
          pp       0.60      0.75      0.67         4

    accuracy                           0.88        34
   macro avg       0.89      0.89      0.87        34
weighted avg       0.91      0.88      0.89        34

--------------------------------------
--------------------------------------
Test ratio: 0.2
--------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Parameters: {'learning_rate': 0.01, 'n_estimators': 500, 'subsample': 0.5}
Best Score: 0.8696715583508036
Accuracy: 0.8382352941176471
Classification Report:
               precision    recall  f1-score   support

          cp       0.91      0.97      0.94        32
          im       0.75      0.69      0.72        13
         imS       0.00      0.00      0.00         1
         imU       0.60      0.75      0.67         4
          om       1.00      0.83      0.91         6
         omL       1.00      1.00      1.00         1
          pp       0.73      0.73      0.73        11

    accuracy                           0.84        68
   macro avg       0.71      0.71      0.71        68
weighted avg       0.83      0.84      0.83        68

--------------------------------------
--------------------------------------
Test ratio: 0.3
--------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Parameters: {'learning_rate': 0.1, 'n_estimators': 500, 'subsample': 0.8}
Best Score: 0.8638297872340426
Accuracy: 0.8316831683168316
Classification Report:
               precision    recall  f1-score   support

          cp       0.94      0.96      0.95        46
          im       0.70      0.70      0.70        20
         imL       0.00      0.00      0.00         0
         imS       0.00      0.00      0.00         1
         imU       0.70      0.64      0.67        11
          om       1.00      0.71      0.83         7
         omL       1.00      1.00      1.00         1
          pp       0.76      0.87      0.81        15

    accuracy                           0.83       101
   macro avg       0.64      0.61      0.62       101
weighted avg       0.83      0.83      0.83       101

--------------------------------------
--------------------------------------
Test ratio: 0.4
--------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Parameters: {'learning_rate': 0.01, 'n_estimators': 500, 'subsample': 1.0}
Best Score: 0.865609756097561
Accuracy: 0.8222222222222222
Classification Report:
               precision    recall  f1-score   support

          cp       0.94      0.97      0.95        62
          im       0.76      0.73      0.75        30
         imS       0.00      0.00      0.00         2
         imU       0.56      0.64      0.60        14
          om       0.83      0.62      0.71         8
         omL       1.00      1.00      1.00         1
          pp       0.74      0.78      0.76        18

    accuracy                           0.82       135
   macro avg       0.69      0.68      0.68       135
weighted avg       0.81      0.82      0.82       135

--------------------------------------
--------------------------------------
Test ratio: 0.5
--------------------------------------
Best Parameters: {'learning_rate': 0.01, 'n_estimators': 500, 'subsample': 0.6}
Best Score: 0.8807486631016

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# belum tau dipake atau tidak ya ges 

In [13]:

# Define parameter ranges
learning_rates = [0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 1.0]
n_estimators_list = [100, 200, 300, 400, 500]
subsamples = [0.2, 0.5, 0.6, 0.7, 0.8, 1.0]

# List to store results
results = []

# Loop through all combinations of parameters
for lr in learning_rates:
    for n_est in n_estimators_list:
        for subsample in subsamples:
            # Create Gradient Boosting Classifier with current parameters
            gbc = GradientBoostingClassifier(
                learning_rate=lr,
                n_estimators=n_est,
                subsample=subsample,
                criterion='friedman_mse',
                min_samples_split=2,
                min_samples_leaf=1,
                max_depth=3,
                random_state=42,
                max_features=None,
                verbose=0,
                max_leaf_nodes=None,
                warm_start=False,
                validation_fraction=0.1,
                n_iter_no_change=None,
                tol=0.0001,
                ccp_alpha=0.0
            )
            
            # Train the model
            gbc.fit(X_train, y_train)
            
            # Make predictions
            y_pred = gbc.predict(X_test)
            
            # Evaluate the model
            accuracy = accuracy_score(y_test, y_pred)
            classification_rep = classification_report(y_test, y_pred, output_dict=True)
            
            # Store the results
            results.append({
                'learning_rate': lr,
                'n_estimators': n_est,
                'subsample': subsample,
                'accuracy': accuracy,
            })

# Convert results to a DataFrame for easier analysis
results_df = pd.DataFrame(results)

# Display results
print(results_df)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

     learning_rate  n_estimators  subsample  accuracy  \
0              0.1           100        0.2  1.000000   
1              0.1           100        0.5  1.000000   
2              0.1           100        0.6  1.000000   
3              0.1           100        0.7  1.000000   
4              0.1           100        0.8  1.000000   
..             ...           ...        ...       ...   
205            1.0           500        0.5  0.633333   
206            1.0           500        0.6  1.000000   
207            1.0           500        0.7  0.966667   
208            1.0           500        0.8  0.966667   
209            1.0           500        1.0  1.000000   

                                 classification_report  
0    {'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc...  
1    {'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc...  
2    {'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc...  
3    {'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc...  
4    {'0': {'precision': 1.0, 

In [14]:
results_df

Unnamed: 0,learning_rate,n_estimators,subsample,accuracy,classification_report
0,0.1,100,0.2,1.000000,"{'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc..."
1,0.1,100,0.5,1.000000,"{'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc..."
2,0.1,100,0.6,1.000000,"{'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc..."
3,0.1,100,0.7,1.000000,"{'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc..."
4,0.1,100,0.8,1.000000,"{'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc..."
...,...,...,...,...,...
205,1.0,500,0.5,0.633333,"{'0': {'precision': 0.0, 'recall': 0.0, 'f1-sc..."
206,1.0,500,0.6,1.000000,"{'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc..."
207,1.0,500,0.7,0.966667,"{'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc..."
208,1.0,500,0.8,0.966667,"{'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc..."
