In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from pycaret.regression import *

In [2]:
df = pd.read_csv("garments_worker_productivity.csv", header = 0) # read in data
df = pd.get_dummies(df, columns=["quarter","department","day","team"], drop_first=True) # create dummy variables for categorical data
df = df.dropna() # drop rows with NaN values
df = df.drop('date', axis=1) # drop dates

In [3]:
X_train, X_test = train_test_split(df, test_size = .2, random_state = 42)

In [4]:
s = setup(data = X_train, test_data = X_test, target = 'actual_productivity', numeric_features = ['department_sweing', 'team_11', 'idle_men', 'smv', 'incentive', 'idle_time', 'over_time'],  fold_strategy = 'timeseries', fold = 3, session_id = 123)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,actual_productivity
2,Original Data,"(552, 32)"
3,Missing Values,False
4,Numeric Features,30
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(552, 33)"


In [5]:
best = compare_models(sort = 'MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0381,0.0054,0.0725,0.7607,0.0465,0.0684,0.0433
rf,Random Forest Regressor,0.0422,0.0052,0.0719,0.7671,0.0459,0.0756,0.05
gbr,Gradient Boosting Regressor,0.0438,0.0052,0.0717,0.7685,0.0458,0.076,0.02
omp,Orthogonal Matching Pursuit,0.0495,0.0054,0.0735,0.7579,0.0465,0.0843,0.5067
dt,Decision Tree Regressor,0.0504,0.0093,0.096,0.5845,0.0628,0.0924,0.0033
lightgbm,Light Gradient Boosting Machine,0.0525,0.0064,0.0799,0.7127,0.0507,0.0902,0.0133
lr,Linear Regression,0.0546,0.0066,0.0799,0.7061,0.0497,0.0903,0.6367
ridge,Ridge Regression,0.0562,0.0066,0.0806,0.7043,0.051,0.0947,0.5067
br,Bayesian Ridge,0.058,0.0073,0.0843,0.6721,0.0531,0.0973,0.4967
ada,AdaBoost Regressor,0.0605,0.0072,0.0849,0.6771,0.0536,0.1001,0.02


In [6]:
predictions = predict_model(best, data = X_test)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,0.0289,0.0036,0.0597,0.8286,0.0393,0.0571


In [7]:
predictions

Unnamed: 0,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity,...,team_4,team_5,team_6,team_7,team_8,team_9,team_10,team_11,team_12,Label
658,0.8,22.52,1615.0,6780,88,0.0,0,0,56.5,0.900136,...,0,0,0,0,0,0,0,0,0,0.900500
1166,0.7,23.41,1180.0,4560,30,0.0,0,1,38.0,0.700246,...,0,0,1,0,0,0,0,0,0,0.698104
451,0.7,29.12,1143.0,10440,40,0.0,0,0,58.0,0.700614,...,0,0,0,0,0,1,0,0,0,0.688705
593,0.7,18.79,913.0,4020,30,0.0,0,0,33.5,0.700185,...,0,0,1,0,0,0,0,0,0,0.699746
563,0.8,22.52,21385.0,7020,88,0.0,0,0,58.5,0.900158,...,0,0,0,0,0,0,0,0,0,0.910533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
898,0.7,21.25,1583.0,6000,0,0.0,0,1,50.0,0.417917,...,0,0,0,0,0,0,1,0,0,0.520710
43,0.8,11.61,539.0,6975,50,0.0,0,0,31.0,0.879714,...,0,0,1,0,0,0,0,0,0,0.798326
286,0.8,25.90,1292.0,10170,50,0.0,0,0,56.5,0.800129,...,0,0,0,0,1,0,0,0,0,0.801253
890,0.8,30.10,541.0,7140,38,0.0,0,0,59.0,0.800137,...,0,1,0,0,0,0,0,0,0,0.764018


In [8]:
import plotly.express as px
fig = px.scatter(predictions, x = "incentive", y= ["actual_productivity"])

fig.show()