In [79]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, r2_score, roc_auc_score, f1_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# from sklearn.tree import plot_tree
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import OneHotEncoder 
# from sklearn.compose import make_column_transformer, make_column_selector

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, KFold

# import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")


In [80]:
df = pd.read_csv(r'C:\Users\DAI.STUDENTSDC\Desktop\Machine Learning\Data Sets\Cases\Glass Identification\Glass.csv')
df.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,building_windows_float_processed
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,building_windows_float_processed
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,building_windows_float_processed
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,building_windows_float_processed
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,building_windows_float_processed


In [81]:
df.Type.value_counts()

Type
building_windows_non_float_processed    76
building_windows_float_processed        70
headlamps                               29
vehicle_windows_float_processed         17
containers                              13
tableware                                9
Name: count, dtype: int64

In [82]:
X = df.drop(columns=['Type'], axis=1)
y = df['Type']

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24, stratify=y)

## Bagging with DIfferent algos

In [84]:
lr = LogisticRegression(random_state=24)
bagg=BaggingClassifier(estimator=lr, random_state=24)

bagg.fit(X_train, y_train)
y_pred = bagg.predict(X_test)
print("lr Accuracy: ", accuracy_score(y_test, y_pred))
print("f1_score: ", f1_score(y_test, y_pred, average='macro'))


lr Accuracy:  0.6461538461538462
f1_score:  0.5471277399848828


In [85]:
dtc = DecisionTreeClassifier(random_state=24)
bagg=BaggingClassifier(estimator=dtc, random_state=24)
bagg.fit(X_train, y_train)

y_pred = bagg.predict(X_test)
print("dtr Accuracy: ", accuracy_score(y_test, y_pred))
print("f1_score: ", f1_score(y_test, y_pred, average='macro'))


dtr Accuracy:  0.6615384615384615
f1_score:  0.697313615006784


In [86]:
svm = SVC(random_state=24)
bagg=BaggingClassifier(estimator=svm, random_state=24)
bagg.fit(X_train, y_train)
y_pred = bagg.predict(X_test)

print("svm Accuracy: ", accuracy_score(y_test, y_pred))
print("f1_score: ", f1_score(y_test, y_pred, average='macro'))

svm Accuracy:  0.35384615384615387
f1_score:  0.08712121212121211


## GridSearch

In [87]:
bagg=BaggingClassifier(random_state=24)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
# pipe = Pipeline([('LR', lr), ('DTC', dtc), ('SVC', svm)])
params = {'estimator': [lr, svm, dtc], 'n_estimators': [10, 50, 100]}

gcv = GridSearchCV(bagg, param_grid=params, cv=kfold,scoring='neg_log_loss', verbose=1)

In [88]:
gcv.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [89]:
print(gcv.best_score_)
print(gcv.best_params_)

y_pred = gcv.predict(X_test)
print("bagg Accuracy: ", accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred, average='macro'))

-0.7417111834484823
{'estimator': DecisionTreeClassifier(random_state=24), 'n_estimators': 100}
bagg Accuracy:  0.7076923076923077
0.7192349560770613


###  Using GridSearchCV try different Estiamtors(lr, svm, dtc) with Bagging

#### With LinearClassifier

In [90]:
bagg=BaggingClassifier(estimator=lr, random_state=24)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
# bagg.get_params()

In [None]:
params = { 'estimator__C': np.linspace(0, 1, 5), 'estimator__l1_ratio': np.linspace(0.001,5,10),
          'n_estimators': [10, 50, 100]}

gcv = GridSearchCV(bagg, param_grid=params, cv=kfold,scoring='neg_log_loss', verbose=1)

In [92]:
gcv.fit(X_train, y_train)

Fitting 5 folds for each of 150 candidates, totalling 750 fits


In [93]:
print(gcv.best_score_)
print(gcv.best_params_)

y_pred = gcv.predict(X_test)
print("lr bagg Accuracy: ", accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred, average='macro'))

-0.955891025612272
{'estimator__C': 1.0, 'estimator__l1_ratio': 0.001, 'n_estimators': 50}
lr bagg Accuracy:  0.6307692307692307
0.4574452003023431


#### With Decision Tree Classifier

In [94]:
#With Decision Tree Classifier
bagg_dtc=BaggingClassifier(estimator=dtc, random_state=24)
bagg_dtc.get_params()

{'bootstrap': True,
 'bootstrap_features': False,
 'estimator__ccp_alpha': 0.0,
 'estimator__class_weight': None,
 'estimator__criterion': 'gini',
 'estimator__max_depth': None,
 'estimator__max_features': None,
 'estimator__max_leaf_nodes': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__monotonic_cst': None,
 'estimator__random_state': 24,
 'estimator__splitter': 'best',
 'estimator': DecisionTreeClassifier(random_state=24),
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 24,
 'verbose': 0,
 'warm_start': False}

In [95]:
params = { 'estimator__max_depth': [None, 3, 5],
          'estimator__min_samples_split': [2, 5, 10],
          'estimator__min_samples_leaf': [1,5,10],
          'n_estimators': [10, 50, 100]}

gcv_dtc = GridSearchCV(bagg_dtc, param_grid=params, cv=kfold,scoring='neg_log_loss', verbose=1)

In [96]:
gcv_dtc.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [97]:
print(gcv_dtc.best_score_)
print(gcv_dtc.best_params_)

y_pred = gcv_dtc.predict(X_test)
print("lr bagg Accuracy: ", accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred, average='macro'))

-0.7331355036894752
{'estimator__max_depth': None, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 5, 'n_estimators': 50}
lr bagg Accuracy:  0.7076923076923077
0.743814080656186


#### With SVC

In [98]:
#With SVC
bagg_svm=BaggingClassifier(estimator=svm, random_state=24)
# bagg_svm.get_params()

In [99]:
params = { 'estimator__C': np.linspace(0, 1, 5),
          'estimator__decision_function_shape': ['ovr', 'ovo'],
          'n_estimators': [10, 50, 100]}

gcv_svm = GridSearchCV(bagg_svm, param_grid=params, cv=kfold,scoring='neg_log_loss', verbose=1)

# %%
gcv_svm.fit(X_train, y_train)

# %%
print(gcv_svm.best_score_)
print(gcv_svm.best_params_)

y_pred = gcv_svm.predict(X_test)
print("lr bagg Accuracy: ", accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred, average='macro'))

Fitting 5 folds for each of 30 candidates, totalling 150 fits
-11.857810881319313
{'estimator__C': 0.25, 'estimator__decision_function_shape': 'ovr', 'n_estimators': 100}
lr bagg Accuracy:  0.35384615384615387
0.08712121212121211
