In [1]:
from catboost import CatBoostClassifier, CatBoostRegressor
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
 
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

from sklearn.tree import plot_tree
from sklearn.metrics import roc_auc_score,accuracy_score,r2_score
from sklearn.model_selection import train_test_split, GridSearchCV,KFold, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.naive_bayes import GaussianNB



import warnings
warnings.filterwarnings('ignore')


In [2]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [3]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
productivity_prediction_of_garment_employees = fetch_ucirepo(id=597) 
  
# data (as pandas dataframes) 
X = productivity_prediction_of_garment_employees.data.features 
y = productivity_prediction_of_garment_employees.data.targets 
  
# metadata 
print(productivity_prediction_of_garment_employees.metadata) 
  
# variable information 
print(productivity_prediction_of_garment_employees.variables) 


{'uci_id': 597, 'name': 'Productivity Prediction of Garment Employees', 'repository_url': 'https://archive.ics.uci.edu/dataset/597/productivity+prediction+of+garment+employees', 'data_url': 'https://archive.ics.uci.edu/static/public/597/data.csv', 'abstract': 'This dataset includes important attributes of the garment manufacturing process and the productivity of the employees which had been collected manually and also been validated by the industry experts.', 'area': 'Business', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate', 'Time-Series'], 'num_instances': 1197, 'num_features': 14, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['actual_productivity'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2020, 'last_updated': 'Mon Feb 26 2024', 'dataset_doi': '10.24432/C51S6D', 'creators': [], 'intro_paper': {'ID': 399, 'type': 'NATIVE', 'title': 'Mining the productivity dat

In [4]:
X

Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers
0,1/1/2015,Quarter1,sweing,Thursday,8,0.80,26.16,1108.0,7080,98,0.0,0,0,59.0
1,1/1/2015,Quarter1,finishing,Thursday,1,0.75,3.94,,960,0,0.0,0,0,8.0
2,1/1/2015,Quarter1,sweing,Thursday,11,0.80,11.41,968.0,3660,50,0.0,0,0,30.5
3,1/1/2015,Quarter1,sweing,Thursday,12,0.80,11.41,968.0,3660,50,0.0,0,0,30.5
4,1/1/2015,Quarter1,sweing,Thursday,6,0.80,25.90,1170.0,1920,50,0.0,0,0,56.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192,3/11/2015,Quarter2,finishing,Wednesday,10,0.75,2.90,,960,0,0.0,0,0,8.0
1193,3/11/2015,Quarter2,finishing,Wednesday,8,0.70,3.90,,960,0,0.0,0,0,8.0
1194,3/11/2015,Quarter2,finishing,Wednesday,7,0.65,3.90,,960,0,0.0,0,0,8.0
1195,3/11/2015,Quarter2,finishing,Wednesday,9,0.75,2.90,,1800,0,0.0,0,0,15.0


In [5]:
y

Unnamed: 0,actual_productivity
0,0.940725
1,0.886500
2,0.800570
3,0.800570
4,0.800382
...,...
1192,0.628333
1193,0.625625
1194,0.625625
1195,0.505889


In [6]:
X.isnull()

Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192,False,False,False,False,False,False,False,True,False,False,False,False,False,False
1193,False,False,False,False,False,False,False,True,False,False,False,False,False,False
1194,False,False,False,False,False,False,False,True,False,False,False,False,False,False
1195,False,False,False,False,False,False,False,True,False,False,False,False,False,False


In [7]:
X.isnull().sum()

date                       0
quarter                    0
department                 0
day                        0
team                       0
targeted_productivity      0
smv                        0
wip                      506
over_time                  0
incentive                  0
idle_time                  0
idle_men                   0
no_of_style_change         0
no_of_workers              0
dtype: int64

In [8]:
#X= X.drop('wip', 'date' axis=1)

X

In [9]:
X_train, y_train, X_test, y_test= train_test_split(X,y, random_state=24, test_size=0.3)

### CatBoost

In [10]:
#Without one Hot Encoding

In [11]:
list(X.columns[X.dtypes==object])

['date', 'quarter', 'department', 'day']

In [26]:
cgbm= CatBoostRegressor(random_state=24, cat_features=list(X.columns[X.dtypes==object]))
kfold = KFold(shuffle=True, n_splits=5, random_state=24)
params= {'n_estimators':[10,50],'max_depth':[2,3,4],'learning_rate': np.linspace(0.001,1,5)}
gcv = GridSearchCV(cgbm, param_grid=params,cv =kfold, scoring='r2', verbose=3)
gcv.fit(X,y)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
0:	learn: 0.1734897	total: 12.8ms	remaining: 115ms
1:	learn: 0.1734608	total: 23.6ms	remaining: 94.5ms
2:	learn: 0.1734310	total: 39.3ms	remaining: 91.7ms
3:	learn: 0.1734010	total: 50ms	remaining: 75ms
4:	learn: 0.1733719	total: 56ms	remaining: 56ms
5:	learn: 0.1733388	total: 65.9ms	remaining: 43.9ms
6:	learn: 0.1733090	total: 75.5ms	remaining: 32.4ms
7:	learn: 0.1732773	total: 90.5ms	remaining: 22.6ms
8:	learn: 0.1732441	total: 107ms	remaining: 11.9ms
9:	learn: 0.1732170	total: 118ms	remaining: 0us
[CV 1/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.004 total time=   0.1s
0:	learn: 0.1743432	total: 6.98ms	remaining: 62.8ms
1:	learn: 0.1743071	total: 18.8ms	remaining: 75.3ms
2:	learn: 0.1742733	total: 36.5ms	remaining: 85.2ms
3:	learn: 0.1742385	total: 53.1ms	remaining: 79.7ms
4:	learn: 0.1742039	total: 60.1ms	remaining: 60.1ms
5:	learn: 0.1741697	total: 69.9ms	remaining: 46.6ms
6:	learn: 0.1741445	tota

In [27]:
print(gcv.best_params_)
print(gcv.best_score_)

{'learning_rate': 0.5005, 'max_depth': 4, 'n_estimators': 50}
0.48298250837096585


In [14]:
#With One Hot Encoding

In [17]:
cgbm= CatBoostRegressor(random_state=24)
ohe= OneHotEncoder(handle_unknown='ignore')
kfold = KFold(shuffle=True, n_splits=5, random_state=24)
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude=object)  ),
                             (ohe, make_column_selector(dtype_include=object) ),
                            verbose_feature_names_out=False)
pipe= Pipeline([('CT',ct), ('MODEL', cgbm)])
params= {'MODEL__max_depth':[2,3,4],'MODEL__n_estimators':[10,50],'MODEL__learning_rate': np.linspace(0.001,1,5)}
gcv = GridSearchCV(pipe, param_grid=params,cv =kfold, scoring='r2', verbose=3)
gcv.fit(X,y)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
0:	learn: 0.1734866	total: 889us	remaining: 8.01ms
1:	learn: 0.1734531	total: 1.59ms	remaining: 6.36ms
2:	learn: 0.1734221	total: 2.35ms	remaining: 5.49ms
3:	learn: 0.1733899	total: 3.08ms	remaining: 4.61ms
4:	learn: 0.1733591	total: 3.79ms	remaining: 3.79ms
5:	learn: 0.1733290	total: 4.59ms	remaining: 3.06ms
6:	learn: 0.1732992	total: 5.41ms	remaining: 2.32ms
7:	learn: 0.1732679	total: 6.17ms	remaining: 1.54ms
8:	learn: 0.1732392	total: 6.91ms	remaining: 767us
9:	learn: 0.1732087	total: 7.66ms	remaining: 0us
[CV 1/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=0.004 total time=   0.0s
0:	learn: 0.1743341	total: 926us	remaining: 8.34ms
1:	learn: 0.1742962	total: 1.73ms	remaining: 6.91ms
2:	learn: 0.1742627	total: 2.53ms	remaining: 5.9ms
3:	learn: 0.1742265	total: 3.4ms	remaining: 5.1ms
4:	learn: 0.1741931	total: 4.19ms	remaining: 4.19ms
5:	learn: 0.1741631	total: 4.97ms	remaining: 3.31m

In [28]:
print(gcv.best_params_)
print(gcv.best_score_)

{'learning_rate': 0.5005, 'max_depth': 4, 'n_estimators': 50}
0.48298250837096585


### XGBoost

In [None]:

# ohe = OneHotEncoder(sparse_output=False).set_output(transform='pandas')
# ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude=object)  ),
#                              (ohe, make_column_selector(dtype_include=object) ),
#                             verbose_feature_names_out=False).set_output(transform='pandas')


In [None]:
# lgbm = LGBMRegressor(random_state=24)
# pipe = Pipeline([('CT',ct),('LGBM',lgbm)])
# kfold = KFold(shuffle=True, n_splits=5, random_state=24)
# params= {'LGBM__max_depth':[2,3,4],
#          'LGBM__n_estimators':[10,50],
#          'LGBM__learning_rate': np.linspace(0.001,1,5)}
# gcv = GridSearchCV(pipe, param_grid=params,cv =kfold,scoring='r2', verbose=3)
# gcv.fit(X,y)



In [19]:
xgbm = XGBRegressor(random_state=24)
ohe= OneHotEncoder(handle_unknown='ignore')
kfold = KFold(shuffle=True, n_splits=5, random_state=24)
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude=object)  ),
                             (ohe, make_column_selector(dtype_include=object) ),
                            verbose_feature_names_out=False)
pipe= Pipeline([('CT',ct), ('MODEL', xgbm)])
params= {'MODEL__n_estimators':[10,50], 'MODEL__max_depth':[2,3,4],
         'MODEL__learning_rate':np.linspace(0.001, 1,5)}
gcv = GridSearchCV(pipe, param_grid=params,cv =kfold,scoring='r2', verbose=3)
gcv.fit(X,y)
         


Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=0.005 total time=   0.0s
[CV 2/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=-0.006 total time=   0.0s
[CV 3/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=0.004 total time=   0.0s
[CV 4/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=-0.002 total time=   0.0s
[CV 5/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=0.003 total time=   0.0s
[CV 1/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=50;, score=0.025 total time=   0.0s
[CV 2/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=50;, score=0.010 total time=   0.0s
[CV 3/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=50;, score=0.024 total time=   0.0s


In [20]:
print(gcv.best_params_)
print(gcv.best_score_)

{'MODEL__learning_rate': 0.5005, 'MODEL__max_depth': 3, 'MODEL__n_estimators': 10}
0.4755805492401123


### Light GBM

In [23]:
lgbm= LGBMClassifier(random_state=24)
ohe= OneHotEncoder(handle_unknown='ignore')
kfold = KFold(shuffle=True, n_splits=5, random_state=24)
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude=object)  ),
                             (ohe, make_column_selector(dtype_include=object) ),
                            verbose_feature_names_out=False)
pipe= Pipeline([('CT',ct), ('MODEL', xgbm)])
params= {'MODEL__n_estimators':[10,50], 'MODEL__max_depth':[2,3,4],
         'MODEL__learning_rate':np.linspace(0.001, 1,5)}
gcv = GridSearchCV(pipe, param_grid=params,cv =kfold,scoring='r2', verbose=3)
gcv.fit(X,y)
         

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=0.005 total time=   0.0s
[CV 2/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=-0.006 total time=   0.0s
[CV 3/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=0.004 total time=   0.0s
[CV 4/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=-0.002 total time=   0.0s
[CV 5/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=10;, score=0.003 total time=   0.0s
[CV 1/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=50;, score=0.025 total time=   0.0s
[CV 2/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=50;, score=0.010 total time=   0.0s
[CV 3/5] END MODEL__learning_rate=0.001, MODEL__max_depth=2, MODEL__n_estimators=50;, score=0.024 total time=   0.0s


In [25]:
print(gcv.best_params_)
print(gcv.best_score_)

{'MODEL__learning_rate': 0.5005, 'MODEL__max_depth': 3, 'MODEL__n_estimators': 10}
0.4755805492401123
