In [23]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score,roc_auc_score,roc_curve


In [113]:
# Load the Dry Bean dataset from the Excel workbook; the path is relative to
# the notebook's working directory.
data = pd.read_excel(io='Dry_Bean_Dataset.xlsx')

# Project-local cleaning helper, called in the next cell.
from Data_Cleaning import outlier_replacer

In [114]:
# Run the project's outlier handling on the loaded frame. The helper returns
# three values and, per the cell output, prints the column list together with
# the IQR and lower/upper bound lists.
# NOTE(review): presumably x = features, y = target, x_cols = feature names --
# confirm against Data_Cleaning.outlier_replacer.
x, y, x_cols = outlier_replacer(data)

Columns list 
 ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent', 'Solidity', 'roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4']

Inter_Quartile_Range list 
 [25004.0, 273.68950000000007, 123.19137971902833, 41.18357126472196, 0.27480199520019677, 0.09453823854376209, 25579.5, 64.37846326571864, 0.06821796924643408, 0.004342711665307752, 0.08477250399742942, 0.07180116379321289, 0.0013715027478648162, 0.0010167480731758682, 0.11464769352273418, 0.004179647584361068]

 Lower_Bound list 
 [-1178.0, 292.9892499999998, 68.51656313215878, 114.07281314149485, 1.0201039390339628, 0.5741203881303729, -1654.75, 118.50030853819551, 0.6163065734778252, 0.9791563308156301, 0.7049375730172388, 0.6547670013051465, 0.003842663289784868, -0.0003716018817849318, 0.40938705023979677, 0.9874334240797669]

Upper_Bound list 
 [98838.0, 1387.7472500000001, 561.2820820082721, 278.8070982

In [77]:
# Registry of candidate classifiers, keyed by the display name printed in the
# evaluation loop. Every estimator is created with library defaults.
# NOTE(review): no random_state is passed, so the tree/ensemble models will
# give slightly different scores on each run.
models_dict = dict([
    ("LogisticRegression", LogisticRegression()),
    ("SupportVector", SVC()),
    ("NaiveBayes", GaussianNB()),
    ("KnnClassifier", KNeighborsClassifier()),
    ("DecisionTree", DecisionTreeClassifier()),
    ("RandomForest", RandomForestClassifier()),
    ("AdaBoost", AdaBoostClassifier()),
    ("GradientBoost", GradientBoostingClassifier()),
    ("XgbClassifier", XGBClassifier()),
])

In [78]:
# Bind the generic split names used by the modelling cells below.
# NOTE(review): x_train_pca / x_test_pca / y_train_le / y_test_le are not
# defined in the visible cells (judging by the names: PCA-transformed features
# and label-encoded targets). They must come from earlier cells -- confirm the
# notebook survives Restart & Run All.
x_train, x_test = x_train_pca, x_test_pca
y_train, y_test = y_train_le, y_test_le

In [79]:
# Fit every candidate classifier on the training split, then report accuracy
# and support-weighted F1 / precision / recall on both the training and test
# splits so over-fitting is visible side by side.
#
# ROC-AUC is intentionally not reported here: for a multiclass target,
# roc_auc_score needs per-class probability estimates (predict_proba), not the
# hard labels returned by predict(), and a default SVC() does not expose
# predict_proba.
for name, model in models_dict.items():
    model.fit(x_train, y_train)

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Training-split metrics.
    train_acc = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    train_prec = precision_score(y_train, y_train_pred, average='weighted')
    train_recall = recall_score(y_train, y_train_pred, average='weighted')

    # Test-split metrics.
    test_acc = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    test_prec = precision_score(y_test, y_test_pred, average='weighted')
    test_recall = recall_score(y_test, y_test_pred, average='weighted')

    print(name)
    print("Training AccuracyScore:{:.4f}".format(train_acc))
    print("Training F1Score:{:.4f}".format(train_f1))
    print("Training PrecisionScore:{:.4f}".format(train_prec))
    print("Training RecallScore:{:.4f}".format(train_recall))
    print("Test AccuracyScore:{:.4f}".format(test_acc))
    print("Test F1Score:{:.4f}".format(test_f1))
    print("Test PrecisionScore:{:.4f}".format(test_prec))
    print("Test RecallScore:{:.4f}".format(test_recall))
    print('\n')


LogisticRegression
Training AccuracyScore:0.9325
Training F1Score:0.9325
Training PrecisionScore:0.9326
Training RecallScore:0.9325
Test AccuracyScore:0.9315
Test F1Score:0.9317
Test PrecisionScore:0.9326
Test RecallScore:0.9315


SupportVector
Training AccuracyScore:0.9406
Training F1Score:0.9407
Training PrecisionScore:0.9410
Training RecallScore:0.9406
Test AccuracyScore:0.9404
Test F1Score:0.9405
Test PrecisionScore:0.9411
Test RecallScore:0.9404


NaiveBayes
Training AccuracyScore:0.9068
Training F1Score:0.9066
Training PrecisionScore:0.9070
Training RecallScore:0.9068
Test AccuracyScore:0.9082
Test F1Score:0.9079
Test PrecisionScore:0.9091
Test RecallScore:0.9082


KnnClassifier
Training AccuracyScore:0.9559
Training F1Score:0.9559
Training PrecisionScore:0.9560
Training RecallScore:0.9559
Test AccuracyScore:0.9345
Test F1Score:0.9346
Test PrecisionScore:0.9350
Test RecallScore:0.9345


DecisionTree
Training AccuracyScore:1.0000
Training F1Score:1.0000
Training PrecisionScore:1.0



AdaBoost
Training AccuracyScore:0.7617
Training F1Score:0.7446
Training PrecisionScore:0.7762
Training RecallScore:0.7617
Test AccuracyScore:0.7656
Test F1Score:0.7490
Test PrecisionScore:0.7755
Test RecallScore:0.7656


GradientBoost
Training AccuracyScore:0.9563
Training F1Score:0.9564
Training PrecisionScore:0.9565
Training RecallScore:0.9563
Test AccuracyScore:0.9376
Test F1Score:0.9377
Test PrecisionScore:0.9383
Test RecallScore:0.9376


XgbClassifier
Training AccuracyScore:0.9996
Training F1Score:0.9996
Training PrecisionScore:0.9996
Training RecallScore:0.9996
Test AccuracyScore:0.9490
Test F1Score:0.9491
Test PrecisionScore:0.9494
Test RecallScore:0.9490




In [37]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyperparameter values for the random forest; RandomizedSearchCV
# samples n_iter combinations from these lists.
rf_params = {
    "n_estimators": [100, 200, 500, 1000],
    "criterion": ['gini', 'entropy', 'log_loss'],
    "max_depth": [5, 8, 25, None],
    "max_features": ['sqrt', 'log2', None],
    "min_samples_split": [2, 8, 15, 20],
}

# Best parameters found per model name, filled in by the loop below.
model_param = {}
randomcv_models = [("RandomForest", models_dict["RandomForest"], rf_params)]
for name, model, params in randomcv_models:
    # random_state pins which candidates are sampled, so the search (and the
    # best_params_ it reports) can be reproduced; 42 matches the seed used by
    # the XGBoost search later in the notebook.
    # NOTE(review): the variable name `random` is kept for compatibility with
    # other cells, but it shadows the stdlib `random` module if imported.
    random = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=10,
        cv=3,
        verbose=2,
        n_jobs=-1,
        random_state=42,
    )
    random.fit(x_train, y_train)
    model_param[name] = random.best_params_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [38]:
# Show the best random forest hyperparameters recorded by the search cell.
best_rf_params = model_param["RandomForest"]
print(best_rf_params)

{'n_estimators': 500, 'min_samples_split': 15, 'max_features': 'log2', 'max_depth': 25, 'criterion': 'log_loss'}


In [58]:
# Random forest rebuilt with the hyperparameters shown in the printed search
# result above. The values are transcribed by hand, so they go stale if the
# search is re-run and picks a different combination.
model2 = RandomForestClassifier(
    n_estimators=500,
    criterion='log_loss',
    max_depth=25,
    max_features='log2',
    min_samples_split=15,
)
model2.fit(x_train, y_train)

In [59]:
# Hard label predictions from the tuned forest on the training and test splits.
y_train_pred2, y_test_pred2 = (
    model2.predict(features) for features in (x_train, x_test)
)

In [60]:
# Training-split scores for model2: plain accuracy plus support-weighted
# F1 / precision / recall (computed in that order).
train_acc2 = accuracy_score(y_train, y_train_pred2)
train_f12, train_prec2, train_recall2 = (
    scorer(y_train, y_train_pred2, average='weighted')
    for scorer in (f1_score, precision_score, recall_score)
)

# Same set of scores on the held-out test split.
test_acc2 = accuracy_score(y_test, y_test_pred2)
test_f12, test_prec2, test_recall2 = (
    scorer(y_test, y_test_pred2, average='weighted')
    for scorer in (f1_score, precision_score, recall_score)
)


In [61]:

# Print the model2 scores in the same layout as the model-comparison loop:
# four training lines, four test lines, then a blank separator.
for _template, _score in (
    ("Training AccuracyScore:{:.4f}", train_acc2),
    ("Training F1Score:{:.4f}", train_f12),
    ("Training PrecisionScore:{:.4f}", train_prec2),
    ("Training RecallScore:{:.4f}", train_recall2),
    ("Test AccuracyScore:{:.4f}", test_acc2),
    ("Test F1Score:{:.4f}", test_f12),
    ("Test PrecisionScore:{:.4f}", test_prec2),
    ("Test RecallScore:{:.4f}", test_recall2),
):
    print(_template.format(_score))
print('\n')

Training AccuracyScore:0.9781
Training F1Score:0.9781
Training PrecisionScore:0.9782
Training RecallScore:0.9781
Test AccuracyScore:0.9418
Test F1Score:0.9418
Test PrecisionScore:0.9420
Test RecallScore:0.9418




In [65]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Sampling distributions for the randomized search (not a fixed grid).
# scipy's uniform(loc, scale) draws from [loc, loc + scale]; randint(low, high)
# draws integers from [low, high).
param_dist = {
    'n_estimators': randint(low=50, high=400),
    'max_depth': randint(low=3, high=25),
    'learning_rate': uniform(loc=0.01, scale=0.3),    # 0.01 .. 0.31
    'subsample': uniform(loc=0.6, scale=0.4),         # 0.6 .. 1.0
    'colsample_bytree': uniform(loc=0.6, scale=0.4),  # 0.6 .. 1.0
    'min_child_weight': randint(low=1, high=10),
    'gamma': uniform(loc=0, scale=0.5),               # 0 .. 0.5
    'reg_alpha': uniform(loc=0, scale=1),             # 0 .. 1
    'reg_lambda': uniform(loc=0.1, scale=1),          # 0.1 .. 1.1
}

# Base estimator whose hyperparameters are tuned.
xgb = XGBClassifier(eval_metric='mlogloss')

# 100 sampled candidates x 5-fold CV, scored by accuracy; random_state fixes
# which candidates are drawn.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

# Run the search on the training split only.
random_search.fit(x_train, y_train)

# best_score_ is the cross-validated score of the best candidate, not a
# test-set score.
print("Best parameters found: ", random_search.best_params_)
print("Best accuracy: ", random_search.best_score_)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found:  {'colsample_bytree': np.float64(0.8738286191519333), 'gamma': np.float64(0.043934055621328405), 'learning_rate': np.float64(0.05164743203260611), 'max_depth': 11, 'min_child_weight': 1, 'n_estimators': 314, 'reg_alpha': np.float64(0.5956387406078443), 'reg_lambda': np.float64(0.5715761885501583), 'subsample': np.float64(0.7647363656589075)}
Best accuracy:  0.9469708381739415
