In [166]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split,RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
import xgboost as xgb
import catboost as cb

import lightgbm as lgb

In [107]:
preprocessed_date_data=pd.read_csv("date_fruits1.csv")

In [108]:
preprocessed_date_data.columns

Index(['AREA', 'PERIMETER', 'MAJOR_AXIS', 'MINOR_AXIS', 'ECCENTRICITY',
       'EQDIASQ', 'SOLIDITY', 'CONVEX_AREA', 'EXTENT', 'ROUNDNESS',
       'COMPACTNESS', 'SHAPEFACTOR_1', 'SHAPEFACTOR_4', 'MeanRR', 'MeanRG',
       'MeanRB', 'StdDevRR', 'StdDevRB', 'SkewRR', 'SkewRG', 'SkewRB',
       'EntropyRR', 'KurtosisRR', 'KurtosisRG', 'EntropyRG', 'ALLdaub4RR',
       'Class'],
      dtype='object')

In [109]:
preprocessed_date_data.shape

(825, 27)

In [85]:
x=preprocessed_date_data.drop("Class",axis=1)
y=preprocessed_date_data["Class"]

In [172]:
X_train, X_test, y_train, y_test = train_test_split(x, y,test_size=0.2,random_state=42,stratify=y)

In [173]:
print(f"""
X_train shape: {X_train.shape}
X_test shape: {X_test.shape}
y_train shape: {y_train.shape}
y_test shape: {y_test.shape}
""")


X_train shape: (660, 26)
X_test shape: (165, 26)
y_train shape: (660,)
y_test shape: (165,)



XGBOOST 
Notes:¶
* I will use "RandomizeGridSearch" to select parameters
* I will use "StratifiedKFold" because we are dealing with imbalanced class distributions. (You can see below)¶

In [176]:
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(X_train, y_train)

fine tuning of xgboost

In [187]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'eval_metric': ["mlogloss"],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [0, 3, 4]
        }

In [195]:
skf = StratifiedKFold(n_splits=5, shuffle = True, random_state = 0)

randomized_search = RandomizedSearchCV(xgb_clf, param_distributions=params, n_iter=5, cv=skf.split(X_train,y_train), verbose=3, random_state=0)

randomized_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END colsample_bytree=0.8, eval_metric=mlogloss, gamma=2, learning_rate=0.05, max_depth=3, min_child_weight=1, subsample=0.6;, score=0.871 total time=   0.3s
[CV 2/5] END colsample_bytree=0.8, eval_metric=mlogloss, gamma=2, learning_rate=0.05, max_depth=3, min_child_weight=1, subsample=0.6;, score=0.879 total time=   0.3s
[CV 3/5] END colsample_bytree=0.8, eval_metric=mlogloss, gamma=2, learning_rate=0.05, max_depth=3, min_child_weight=1, subsample=0.6;, score=0.894 total time=   0.2s
[CV 4/5] END colsample_bytree=0.8, eval_metric=mlogloss, gamma=2, learning_rate=0.05, max_depth=3, min_child_weight=1, subsample=0.6;, score=0.886 total time=   0.2s
[CV 5/5] END colsample_bytree=0.8, eval_metric=mlogloss, gamma=2, learning_rate=0.05, max_depth=3, min_child_weight=1, subsample=0.6;, score=0.833 total time=   0.3s
[CV 1/5] END colsample_bytree=0.8, eval_metric=mlogloss, gamma=1, learning_rate=0.1, max_depth=4, min_child_we

In [196]:
print('Best hyperparameters:', randomized_search.best_params_)

Best hyperparameters: {'subsample': 0.8, 'min_child_weight': 10, 'max_depth': 0, 'learning_rate': 0.1, 'gamma': 1.5, 'eval_metric': 'mlogloss', 'colsample_bytree': 1.0}


In [197]:
from xgboost import XGBClassifier
xgb = XGBClassifier(subsample = randomized_search.best_params_["subsample"],
                      min_child_weight = randomized_search.best_params_["min_child_weight"],
                      max_depth = randomized_search.best_params_["max_depth"],
                      learning_rate = randomized_search.best_params_["learning_rate"],
                      gamma = randomized_search.best_params_["gamma"],
                      eval_metric = randomized_search.best_params_["eval_metric"],
                      colsample_bytree = randomized_search.best_params_["colsample_bytree"])

In [198]:
xgb.fit(X_train, y_train)

In [199]:
from sklearn.metrics import accuracy_score


train_pred_xgb = xgb.predict(X_train)
train_acc_xgb = accuracy_score(y_train,train_pred_xgb)
print('Train Accuracy: ', train_acc_xgb)
 
test_pred_xgb = xgb.predict(X_test)
test_acc_xgb = accuracy_score(y_test,test_pred_xgb)
print('Test Accuracy:', test_acc_xgb)


Train Accuracy:  0.9454545454545454
Test Accuracy: 0.9272727272727272


In [200]:
print(classification_report(y_train,train_pred_xgb))

              precision    recall  f1-score   support

           0       0.97      0.87      0.92        45
           1       0.86      0.82      0.84        74
           2       0.94      0.96      0.95       160
           3       0.91      0.96      0.94        53
           4       0.98      0.98      0.98       131
           5       0.99      0.99      0.99       134
           6       0.89      0.90      0.90        63

    accuracy                           0.95       660
   macro avg       0.94      0.93      0.93       660
weighted avg       0.95      0.95      0.95       660



In [201]:
print(classification_report(y_test,test_pred_xgb))

              precision    recall  f1-score   support

           0       1.00      0.91      0.95        11
           1       0.80      0.67      0.73        18
           2       0.95      0.97      0.96        40
           3       0.92      0.85      0.88        13
           4       0.97      1.00      0.99        33
           5       1.00      1.00      1.00        34
           6       0.74      0.88      0.80        16

    accuracy                           0.93       165
   macro avg       0.91      0.90      0.90       165
weighted avg       0.93      0.93      0.93       165



CATBOOST

fine tuning

In [174]:
from sklearn.model_selection import GridSearchCV


param_grid = {
    'iterations': [100, 500, 1000],  # Number of trees (boosting rounds)
    'learning_rate': [0.01, 0.1],  # Learning rate
    'depth': [4, 6],  # Depth of trees
}

# Initialize CatBoost classifier
catboost_classifier = cb.CatBoostClassifier(loss_function='MultiClass', verbose=100)

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=catboost_classifier, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters from grid search
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

# Train CatBoost classifier with the best hyperparameters
best_catboost_classifier = cb.CatBoostClassifier(loss_function='MultiClass', verbose=100, **best_params)
best_catboost_classifier.fit(X_train, y_train, eval_set=(X_test, y_test))

# Make predictions on the testing set
y_pred = best_catboost_classifier.predict(X_test)
y2=best_catboost_classifier.predict(X_train)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
0:	learn: 1.9226466	total: 3.15ms	remaining: 312ms
99:	learn: 0.8918809	total: 185ms	remaining: 0us
[CV] END ........depth=4, iterations=100, learning_rate=0.01; total time=   0.2s
0:	learn: 1.9178869	total: 1.92ms	remaining: 190ms
99:	learn: 0.8807547	total: 164ms	remaining: 0us
[CV] END ........depth=4, iterations=100, learning_rate=0.01; total time=   0.2s
0:	learn: 1.9183016	total: 1.4ms	remaining: 139ms
99:	learn: 0.8901327	total: 202ms	remaining: 0us
[CV] END ........depth=4, iterations=100, learning_rate=0.01; total time=   0.2s
0:	learn: 1.7221004	total: 1.69ms	remaining: 167ms
99:	learn: 0.2046377	total: 172ms	remaining: 0us
[CV] END .........depth=4, iterations=100, learning_rate=0.1; total time=   0.2s
0:	learn: 1.6766019	total: 1.53ms	remaining: 151ms
99:	learn: 0.2164740	total: 159ms	remaining: 0us
[CV] END .........depth=4, iterations=100, learning_rate=0.1; total time=   0.2s
0:	learn: 1.6810585	total: 1.5ms	re

In [175]:
clf_cat=classification_report(y_pred,y_test)
print(clf_cat)
clf2=classification_report(y2,y_train)
print(clf2)

              precision    recall  f1-score   support

           0       0.82      0.90      0.86        10
           1       0.61      0.92      0.73        12
           2       1.00      0.93      0.96        43
           3       0.92      0.92      0.92        13
           4       0.97      0.97      0.97        33
           5       1.00      1.00      1.00        34
           6       0.88      0.70      0.78        20

    accuracy                           0.92       165
   macro avg       0.89      0.91      0.89       165
weighted avg       0.93      0.92      0.92       165

              precision    recall  f1-score   support

           0       0.84      1.00      0.92        38
           1       0.78      0.88      0.83        66
           2       0.98      0.95      0.97       165
           3       1.00      0.93      0.96        57
           4       0.98      0.98      0.98       132
           5       0.99      0.98      0.99       136
           6       0.86 

LIGHTGBM

In [None]:
lgb_model = lgb.LGBMClassifier()

# Define the hyperparameters grid to search
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'reg_alpha': [0.0, 0.1, 0.5],
    'reg_lambda': [0.0, 0.1, 0.5]
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)


# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)