# 9. Use pycaret to automatically identify the best model for this dataset.

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from pycaret.regression import *

In [2]:
df=pd.read_csv("garments_worker_productivity.csv")
df.head()

Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
0,1/1/2015,Quarter1,sweing,Thursday,8,0.8,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725
1,1/1/2015,Quarter1,finishing,Thursday,1,0.75,3.94,,960,0,0.0,0,0,8.0,0.8865
2,1/1/2015,Quarter1,sweing,Thursday,11,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
3,1/1/2015,Quarter1,sweing,Thursday,12,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
4,1/1/2015,Quarter1,sweing,Thursday,6,0.8,25.9,1170.0,1920,50,0.0,0,0,56.0,0.800382


In [3]:
df.isnull().sum()

date                       0
quarter                    0
department                 0
day                        0
team                       0
targeted_productivity      0
smv                        0
wip                      506
over_time                  0
incentive                  0
idle_time                  0
idle_men                   0
no_of_style_change         0
no_of_workers              0
actual_productivity        0
dtype: int64

In [4]:
df['wip']=df['wip'].replace(np.NaN,0)

In [5]:
df['date']=df['date'].astype('category')
df['date']=df['date'].cat.codes

In [6]:
df['quarter']=df['quarter'].astype('category')
df['quarter']=df['quarter'].cat.codes

In [7]:
df['department']=df['department'].astype('category')
df['department']=df['department'].cat.codes

In [8]:
df['day']=df['day'].astype('category')
df['day']=df['day'].cat.codes

In [9]:
y=df['actual_productivity']
X=df.drop('actual_productivity',axis=1)

X

Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers
0,0,0,2,3,8,0.80,26.16,1108.0,7080,98,0.0,0,0,59.0
1,0,0,1,3,1,0.75,3.94,0.0,960,0,0.0,0,0,8.0
2,0,0,2,3,11,0.80,11.41,968.0,3660,50,0.0,0,0,30.5
3,0,0,2,3,12,0.80,11.41,968.0,3660,50,0.0,0,0,30.5
4,0,0,2,3,6,0.80,25.90,1170.0,1920,50,0.0,0,0,56.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192,51,1,0,5,10,0.75,2.90,0.0,960,0,0.0,0,0,8.0
1193,51,1,0,5,8,0.70,3.90,0.0,960,0,0.0,0,0,8.0
1194,51,1,0,5,7,0.65,3.90,0.0,960,0,0.0,0,0,8.0
1195,51,1,0,5,9,0.75,2.90,0.0,1800,0,0.0,0,0,15.0


In [10]:
X_train, X_test = train_test_split(df, test_size = .2, random_state = 143)

In [11]:
s=setup(data=X_train,test_data=X_test,target='actual_productivity',numeric_features = ['date','quarter','quarter','day','team','targeted_productivity','smv','wip','over_time','incentive','idle_time','idle_men','no_of_style_change','no_of_workers'],fold_strategy = 'timeseries', fold = 3, session_id = 123)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,actual_productivity
2,Original Data,"(957, 15)"
3,Missing Values,False
4,Numeric Features,14
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(957, 14)"


In [12]:
best = compare_models(sort = 'MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0838,0.0184,0.1353,0.381,0.0834,0.1524,0.12
rf,Random Forest Regressor,0.0851,0.0174,0.1317,0.4134,0.0813,0.1543,0.24
gbr,Gradient Boosting Regressor,0.0908,0.0183,0.1349,0.3847,0.0833,0.1625,0.0367
lightgbm,Light Gradient Boosting Machine,0.0944,0.0188,0.1371,0.3647,0.0844,0.1669,0.1033
ada,AdaBoost Regressor,0.1058,0.0207,0.1439,0.3004,0.0878,0.1821,0.0267
omp,Orthogonal Matching Pursuit,0.1093,0.0251,0.1586,0.1499,0.0966,0.1923,0.0067
lr,Linear Regression,0.1114,0.0235,0.1532,0.2069,0.0936,0.1956,10.6
lar,Least Angle Regression,0.1116,0.0235,0.1534,0.2047,0.0938,0.1961,0.12
ridge,Ridge Regression,0.1134,0.0235,0.1534,0.2049,0.0938,0.2003,1.2567
dt,Decision Tree Regressor,0.1183,0.0376,0.1932,-0.2667,0.1184,0.2064,0.0267


In [13]:
predictions = predict_model(best, data = X_test)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,0.0607,0.0111,0.1055,0.6193,0.0643,0.1059


In [14]:
predictions

Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity,Label
554,26,0,2,2,12,0.75,15.26,1276.0,1440,45,0.0,0,0,35.0,0.750451,0.744431
1158,50,1,2,4,12,0.80,15.26,1069.0,4080,63,0.0,0,0,34.0,0.800402,0.800399
640,46,0,2,1,10,0.80,22.52,1142.0,6720,88,0.0,0,0,56.0,0.900130,0.904634
988,49,0,2,2,8,0.75,29.40,622.0,6240,56,0.0,0,0,57.0,0.750750,0.752104
381,12,3,2,3,11,0.70,14.89,1863.0,10260,50,0.0,0,0,57.0,0.700170,0.700432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,1,1,1,1,7,0.80,2.90,0.0,1440,0,0.0,0,0,8.0,0.670076,0.598603
99,23,0,2,4,10,0.75,28.08,1209.0,10530,45,0.0,0,0,58.5,0.750545,0.754982
254,5,1,2,5,10,0.50,22.40,947.0,3390,23,0.0,0,0,56.5,0.499980,0.493935
73,22,0,2,0,12,0.80,11.61,1037.0,7200,50,0.0,0,0,32.0,0.800246,0.800192
