In [236]:
import numpy as np
import pandas as pd

from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline

from sklearn.metrics import r2_score


import warnings
warnings.filterwarnings("ignore")


In [237]:
import pandas as pd

# Fetch the dataset
from ucimlrepo import fetch_ucirepo
productivity_prediction_of_garment_employees = fetch_ucirepo(id=597) 

# Get features and target
X = productivity_prediction_of_garment_employees.data.features 
y = productivity_prediction_of_garment_employees.data.targets 


# Now you can see metadata and variables
print("Dataset metadata:")
print(productivity_prediction_of_garment_employees.metadata)
print("\nVariable information:")
print(productivity_prediction_of_garment_employees.variables)


# Concatenate X and y using pd.concat
df = pd.concat([X, y], axis=1)

Dataset metadata:
{'uci_id': 597, 'name': 'Productivity Prediction of Garment Employees', 'repository_url': 'https://archive.ics.uci.edu/dataset/597/productivity+prediction+of+garment+employees', 'data_url': 'https://archive.ics.uci.edu/static/public/597/data.csv', 'abstract': 'This dataset includes important attributes of the garment manufacturing process and the productivity of the employees which had been collected manually and also been validated by the industry experts.', 'area': 'Business', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate', 'Time-Series'], 'num_instances': 1197, 'num_features': 14, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['actual_productivity'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2020, 'last_updated': 'Mon Feb 26 2024', 'dataset_doi': '10.24432/C51S6D', 'creators': [], 'intro_paper': {'ID': 399, 'type': 'NATIVE', 'title': 'Mining th

In [238]:
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day_of_week'] = df['date'].dt.dayofweek

df.drop('date', axis=1, inplace=True)

df.head()

Unnamed: 0,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity,year,month,day_of_week
0,Quarter1,sweing,Thursday,8,0.8,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725,2015,1,3
1,Quarter1,finishing,Thursday,1,0.75,3.94,,960,0,0.0,0,0,8.0,0.8865,2015,1,3
2,Quarter1,sweing,Thursday,11,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057,2015,1,3
3,Quarter1,sweing,Thursday,12,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057,2015,1,3
4,Quarter1,sweing,Thursday,6,0.8,25.9,1170.0,1920,50,0.0,0,0,56.0,0.800382,2015,1,3


In [239]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   quarter                1197 non-null   object 
 1   department             1197 non-null   object 
 2   day                    1197 non-null   object 
 3   team                   1197 non-null   int64  
 4   targeted_productivity  1197 non-null   float64
 5   smv                    1197 non-null   float64
 6   wip                    691 non-null    float64
 7   over_time              1197 non-null   int64  
 8   incentive              1197 non-null   int64  
 9   idle_time              1197 non-null   float64
 10  idle_men               1197 non-null   int64  
 11  no_of_style_change     1197 non-null   int64  
 12  no_of_workers          1197 non-null   float64
 13  actual_productivity    1197 non-null   float64
 14  year                   1197 non-null   int32  
 15  mont

In [240]:
df.drop(columns=['wip'], axis=1, inplace=True)

In [241]:
df.isnull().sum()   

quarter                  0
department               0
day                      0
team                     0
targeted_productivity    0
smv                      0
over_time                0
incentive                0
idle_time                0
idle_men                 0
no_of_style_change       0
no_of_workers            0
actual_productivity      0
year                     0
month                    0
day_of_week              0
dtype: int64

In [242]:
scores_df = pd.DataFrame(columns=['model', 'r2_score', 'best socre'])

### XGBoost

In [243]:
from xgboost import XGBRegressor

In [244]:
X=df.drop(columns=['actual_productivity'], axis=1)
y = df['actual_productivity']

In [245]:
label_encoder = LabelEncoder()

X['day'] = label_encoder.fit_transform(X['day'])
X['quarter'] = label_encoder.fit_transform(X['quarter'])
X['department'] = label_encoder.fit_transform(X['department'])

In [246]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

In [247]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   quarter                1197 non-null   int32  
 1   department             1197 non-null   int32  
 2   day                    1197 non-null   int32  
 3   team                   1197 non-null   int64  
 4   targeted_productivity  1197 non-null   float64
 5   smv                    1197 non-null   float64
 6   over_time              1197 non-null   int64  
 7   incentive              1197 non-null   int64  
 8   idle_time              1197 non-null   float64
 9   idle_men               1197 non-null   int64  
 10  no_of_style_change     1197 non-null   int64  
 11  no_of_workers          1197 non-null   float64
 12  year                   1197 non-null   int32  
 13  month                  1197 non-null   int32  
 14  day_of_week            1197 non-null   int32  
dtypes: f

In [248]:
xgbm = XGBRegressor(random_state=24, enable_categorical=True)

kfold = KFold(n_splits=5, shuffle=True, random_state=24)

params = {
    'n_estimators': [10, 50, 100],
    'max_depth': [2, 3],
    'learning_rate': np.linspace(0.001, 1, 5)
}

gcv = GridSearchCV(
    estimator=xgbm, 
    param_grid=params, 
    cv=kfold, 
    scoring='r2',
    verbose=1
)

gcv.fit(X_test, y_test)

print(f"Best parameters: {gcv.best_params_}")
print(f"Best score: {gcv.best_score_}")



y_pred = gcv.best_estimator_.predict(X_test)

r2 = r2_score(y_test, y_pred)
print(f"R2 score: {r2}")


new_row = pd.DataFrame({
    'model': ['XGBoost'],
    'r2_score': [r2],
    'best socre': [gcv.best_score_],
})

scores_df = pd.concat([scores_df, new_row], ignore_index=True)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best parameters: {'learning_rate': 0.25075, 'max_depth': 2, 'n_estimators': 50}
Best score: 0.42665596524939503
R2 score: 0.7297102107362653


### Light GBM

In [249]:
from lightgbm import LGBMRegressor

In [250]:
X = df.drop(columns=['actual_productivity'], axis=1)
y = df['actual_productivity']

In [251]:
label_encoder = LabelEncoder()

X['day'] = label_encoder.fit_transform(X['day'])
X['quarter'] = label_encoder.fit_transform(X['quarter'])
X['department'] = label_encoder.fit_transform(X['department'])

In [252]:
X = df.drop(columns=['actual_productivity'], axis=1)
y = df['actual_productivity']

In [253]:
label_encoder = LabelEncoder()

X['day'] = label_encoder.fit_transform(X['day'])
X['quarter'] = label_encoder.fit_transform(X['quarter'])
X['department'] = label_encoder.fit_transform(X['department'])

In [254]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)

In [255]:
lgbm = LGBMRegressor(random_state=24, verbose=0)

kfold = KFold(n_splits=5, shuffle=True, random_state=24)

params={
    'n_estimators': [10,50,100],
    'max_depth': [2,2,3],
    'learning_rate': np.linspace(0.001,1,5)
}               

gcv = GridSearchCV(
    estimator=lgbm, 
    param_grid=params, 
    cv=kfold, 
    scoring='r2',
    verbose=1
)

gcv.fit(X_train, y_train)

print(f"Best parameters: {gcv.best_params_}")
print(f"Best score: {gcv.best_score_}")



y_pred = gcv.best_estimator_.predict(X_test)

r2 =  r2_score(y_test, y_pred)
print(f"R2 Score: {r2}")


new_row = pd.DataFrame({
    'model': ['LightGBM'],
    'r2_score': [r2],
    'best socre': [gcv.best_score_],
})

scores_df = pd.concat([scores_df, new_row], ignore_index=True)


Fitting 5 folds for each of 45 candidates, totalling 225 fits
Best parameters: {'learning_rate': 0.25075, 'max_depth': 2, 'n_estimators': 50}
Best score: 0.4434591229600763
R2 Score: 0.5137020650405978


### Cat Boost

In [256]:
from catboost import CatBoostRegressor 

In [257]:
X = df.drop(columns=['actual_productivity'], axis=1)
y = df['actual_productivity']

In [258]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)

In [None]:
cgbm = CatBoostRegressor(random_state=24, verbose=0, cat_features=['day', 'quarter', 'department'])

kfold = KFold(n_splits=5, shuffle=True, random_state=24)

params={
    'n_estimators': [10,50,100],
    'max_depth': [2,2,3],
    'learning_rate': np.linspace(0.001,1,5)
}               

gcv = GridSearchCV(
    estimator=cgbm, 
    param_grid=params, 
    cv=kfold, 
    scoring='r2',
    verbose=1
)


gcv.fit(X_train, y_train)

print(f"Best parameters: {gcv.best_params_}")
print(f"Best score: {gcv.best_score_}")


y_pred = gcv.best_estimator_.predict(X_test)

r2 = r2_score(y_test, y_pred)
print(f"R2 Score: {r2}")


new_row = pd.DataFrame({
    'model': ['CatBoost-withcat'],
    'r2_score': [r2],
    'best socre': [gcv.best_score_],
})

scores_df = pd.concat([scores_df, new_row], ignore_index=True)


Fitting 5 folds for each of 45 candidates, totalling 225 fits
Best parameters: {'learning_rate': 0.5005, 'max_depth': 3, 'n_estimators': 100}
Best score: 0.5034374298889207
R2 Score: 0.6760762260077868


In [263]:
X = df.drop(columns=['actual_productivity'], axis=1)
y = df['actual_productivity']

In [264]:
label_encoder = LabelEncoder()

X['day'] = label_encoder.fit_transform(X['day'])
X['quarter'] = label_encoder.fit_transform(X['quarter'])
X['department'] = label_encoder.fit_transform(X['department'])

In [265]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)

In [266]:
cgbm = CatBoostRegressor(random_state=24, verbose=0)

kfold = KFold(n_splits=5, shuffle=True, random_state=24)

params={
    'n_estimators': [10,50,100],
    'max_depth': [2,2,3],
    'learning_rate': np.linspace(0.001,1,5)
}               

gcv = GridSearchCV(
    estimator=cgbm, 
    param_grid=params, 
    cv=kfold, 
    scoring='r2',
    verbose=1
)

gcv.fit(X_train, y_train)

print(f"Best parameters: {gcv.best_params_}")
print(f"Best score: {gcv.best_score_}")

y_pred = gcv.best_estimator_.predict(X_test)

r2 = r2_score(y_test, y_pred)
print(f"R2 Score: {r2}")


new_row = pd.DataFrame({
    'model': ['CatBoost-withoutcat'],
    'r2_score': [r2],
    'best socre': [gcv.best_score_],
})

scores_df = pd.concat([scores_df, new_row], ignore_index=True)


Fitting 5 folds for each of 45 candidates, totalling 225 fits
Best parameters: {'learning_rate': 0.25075, 'max_depth': 3, 'n_estimators': 100}
Best score: 0.4691565783235476
R2 Score: 0.5325469547067959


In [268]:
scores_df

Unnamed: 0,model,r2_score,best socre
0,XGBoost,0.72971,0.426656
1,LightGBM,0.513702,0.443459
2,CatBoost-withcat,0.676076,0.503437
3,CatBoost-withoutcat,0.532547,0.469157
