In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import os
import warnings
# Silence all library warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in file_names):
        print(file_path)


In [None]:
# Load the raw dataset; `date` is parsed into datetimes while reading.
csv_path = "/kaggle/input/productivity-prediction-of-garment-employees/garments_worker_productivity.csv"
df = pd.read_csv(csv_path, parse_dates=['date'])

# Report (rows, columns), then preview the first rows.
print(df.shape)
df.head()

- quarter : A portion of the month. A month was divided into four quarters
- team : Associated team number with the instance 
- no_of_workers : Number of workers in each team 
- no_of_style_change : Number of changes in the style of a particular product
- targeted_productivity : Targeted productivity set by the Authority for each team for each day. 
- smv : Standard Minute Value, it is the allocated time for a task 
- wip : Work in progress. Includes the number of unfinished items for products 
- overtime : Represents the amount of overtime by each team in minutes
- incentive : Represents the amount of financial incentive (in BDT) that enables or motivates a particular course of action.
- idle_time : The amount of time when the production was interrupted due to several reasons 
- idle_men : The number of workers who were idle due to production interruption
- actual_productivity : The actual % of productivity that was delivered by the workers. It ranges from 0-1.

In [None]:
# Count fully duplicated records: rows whose values repeat an earlier row in every column.
df.duplicated().sum()

In [None]:
# Summarise each column's dtype and non-null count to spot missing values
# and columns that were read with an unexpected type.
df.info()

In [None]:
# Count the missing values per column.
# NOTE(review): per the original note, only the work-in-progress column (`wip`)
# contains nulls — confirm against the output of this cell.
df.isnull().sum()

In [None]:
# Split the frame by dtype so each group can be explored separately:
# object (string) columns are treated as categorical, everything else
# ends up in `numerical`.
object_dtype = ['object']
category = df.select_dtypes(include=object_dtype)
numerical = df.select_dtypes(exclude=object_dtype)

## Categorical

In [None]:
# List the distinct values of every categorical column to spot typos
# or inconsistent labels.
for column_name, values in category.items():
    print(column_name)
    print(values.unique())
    print()

In [None]:
# Clean the `department` labels by removing leading/trailing whitespace,
# so values that differ only by stray spaces collapse into one category.
category = category.assign(department=category['department'].str.strip())

## Numerical

In [None]:
# Summary statistics for the non-object columns.
numerical.describe()

In [None]:
# Horizontal box plots of the columns in `numerical`, to eyeball spread and outliers.
# NOTE(review): all columns are drawn in one plot, so columns on a much larger
# scale may dominate it — confirm the figure is readable.
# The trailing `;` suppresses the textual repr of the returned plot object.
sns.catplot(kind='box', data=numerical, orient='h');

In [None]:
# Put the cleaned categorical columns back next to the remaining columns
# (categorical ones first), aligned on the row index.
parts = [category, numerical]
df2 = pd.concat(parts, axis='columns')

## Missing values

In [None]:
# Show every row that has a missing value in at least one column.
has_missing = df2.isna().any(axis='columns')
df2[has_missing]

In [None]:
# Which departments do the incomplete rows belong to?
df2.loc[df2.isna().any(axis='columns'), 'department'].unique()

# All the missing values are `wip` (work in progress) entries of the finishing department.
# The finishing department might have to wait for products from other departments,
# so a missing value plausibly means there was no work in progress (`wip` = 0).

In [None]:
# Treat a missing `wip` as "no work in progress" and replace it with 0.
df2 = df2.fillna({'wip': 0.0})

In [None]:
# Drop the `date` column: it is not used as a feature in this notebook.
# Reassigning with `errors='ignore'` (instead of an in-place drop) keeps this
# cell safe to re-run; the in-place version raised KeyError on a second run
# because `date` was already gone.
df2 = df2.drop(columns=['date'], errors='ignore')

# One-hot encode the categorical columns. `drop_first=True` drops one level
# per categorical feature so the dummy columns are not perfectly collinear.
df2_dummies = pd.get_dummies(df2, drop_first=True)

In [None]:
# Keep the names of all predictor columns (everything except the target)
# so boolean selection masks can be mapped back to feature names later.
features = df2_dummies.columns.drop('actual_productivity')

# Feature selection
---

### 1) Sequential Feature Selector
Forward-SFS is a greedy procedure that iteratively finds the best new feature to add to the set of selected features. 
- Concretely, we initially start with zero features and **find the one feature that maximizes a cross-validated score** when an estimator is trained on this single feature. 
- Once that first feature is selected, we repeat the procedure by adding a new feature to the set of selected features. 
- The procedure stops when the desired number of selected features is reached, as determined by the n_features_to_select parameter.

Backward-SFS follows the same idea but works in the opposite direction: instead of starting with no features and greedily adding features, we start with all the features and greedily remove features from the set. The `direction` parameter controls whether forward or backward SFS is used.

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor

# Feature matrix (every encoded column except the target) and target vector.
X = df2_dummies.drop(columns=['actual_productivity'])
y = df2_dummies['actual_productivity']

# Seed the forest so the features picked by the selector are reproducible
# across runs; 112 is the seed already used for the train/test splits.
rf = RandomForestRegressor(random_state=112)

`sklearn.feature_selection.SequentialFeatureSelector` is a transformer that performs Sequential Feature Selection. We can treat it like other transformers i.e. use it in `Pipeline`, `ColumnTransformer`.

In [None]:
# Greedy sequential selection of 7 features using the random forest as the
# scoring estimator. `direction` chooses the search mode: 'forward' adds one
# feature at a time, 'backward' starts from all features and removes them.
sfs = SequentialFeatureSelector(
    estimator=rf,
    n_features_to_select=7,
    direction='forward',
)
sfs.fit(X, y)

In [None]:
# Get the names of the selected features: `get_support()` returns a boolean
# mask over the columns of X, which are the same columns stored in `features`.
features[sfs.get_support()]

In [None]:
# Reduce X to the columns chosen by the fitted selector.
X_selected = sfs.transform(X)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

# Hold out 20% of the rows for evaluation; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=112)

# NOTE(review): `X_selected` comes from a selector that was fitted on all rows,
# so the held-out rows already influenced which features were kept. The score
# below is therefore likely optimistic; fit the selector on the training split
# only (e.g. inside a Pipeline) for an unbiased estimate.

# Seed the forest as well, otherwise the reported MSE changes on every run.
model = RandomForestRegressor(random_state=112).fit(X_train, y_train)

# Mean squared error on the held-out rows.
y_pred = model.predict(X_test)
print(f'mse = {mse(y_true=y_test, y_pred=y_pred)}')

In [None]:
# Plot the distribution of the target so the MSE above can be judged
# against the spread of `actual_productivity`.
# Author's reading: for a model built with only 7 features, the error is
# fairly small compared to this distribution.
df2['actual_productivity'].hist();

### 2) Recursive feature elimination
Given an estimator that assigns weights to features (e.g., the coefficients of a linear model), the goal of recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. 
- First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through any specific attribute (such as coef_, feature_importances_). 
- Then, **the least important features are pruned** from current set of features. 
- That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached.

SFS differs from RFE in that it does not require the underlying model to expose a `coef_` or `feature_importances_` attribute. SFS is based on the score obtained by the model in each iteration rather than on the features' importance.

In [None]:
from sklearn.feature_selection import RFE

# Feature matrix (every encoded column except the target) and target vector.
X = df2_dummies.drop(columns=['actual_productivity'])
y = df2_dummies['actual_productivity']

# RFE repeatedly fits the estimator and prunes one feature per iteration
# (`step=1`) until 7 remain. The forest is seeded so that its importances,
# and therefore the surviving features, are reproducible across runs.
estimator = RandomForestRegressor(random_state=112)
rfe = RFE(estimator, n_features_to_select=7, step=1)
rfe.fit(X, y)

In [None]:
# Show the names of the features RFE kept: `support_` is a boolean mask over
# the columns of X, which are the same columns stored in `features`.
features[rfe.support_]

In [None]:
# Keep only the RFE-selected columns, then hold out 20% of the rows (seeded split).
X_train, X_test, y_train, y_test = train_test_split(rfe.transform(X), y, test_size=0.2, random_state=112)

# NOTE(review): `rfe` was fitted on all rows, so the held-out rows already
# influenced the feature choice and the score below is likely optimistic.

# Seed the forest so the reported MSE is the same on every run.
model = RandomForestRegressor(random_state=112).fit(X_train, y_train)

# Mean squared error on the held-out rows.
y_pred = model.predict(X_test)
print(f'mse = {mse(y_true=y_test, y_pred=y_pred)}')

# Add to Pipeline
---
We can combine one-hot encoding, standardization, and feature selection in a single workflow by using `ColumnTransformer` and `Pipeline`. This section shows an example.

In [None]:
# Start again from the un-encoded frame: the pipeline will do the encoding
# itself, so the split happens before any preprocessing.
target_col = 'actual_productivity'
X = df2.drop(columns=[target_col])
y = df2[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=112
)

X_train

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Seeded forests keep both the selected features and the final score
# reproducible across runs (112 is the seed used for the splits).
rf = RandomForestRegressor(random_state=112)
sfs = SequentialFeatureSelector(rf, n_features_to_select=7, direction='forward')

# Choose the columns to encode by dtype instead of the hard-coded positions
# [0, 1, 2], which silently depended on the categorical columns being the
# first three columns of the frame.
categorical_cols = X_train.select_dtypes(exclude='number').columns.tolist()

# One-hot encode the categorical columns and pass the numeric ones through.
# `handle_unknown='ignore'` keeps `predict` from failing if the test split
# contains a category level that was absent from the training split.
ct = ColumnTransformer([
    ("ohe", OneHotEncoder(handle_unknown='ignore'), categorical_cols)
], remainder='passthrough')

# Encoding -> scaling -> feature selection -> model. Every step is fitted
# only on the data passed to `pipe.fit`.
pipe = Pipeline([
    ("ohe", ct),
    ("standardize", StandardScaler()),
    ("feature selection", sfs),
    ("model", RandomForestRegressor(random_state=112))
])

In [None]:
# Fit all pipeline steps (encoding, scaling, feature selection, model)
# on the training split only.
pipe.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out rows (mean squared error).
y_pred = pipe.predict(X_test)
test_mse = mse(y_true=y_test, y_pred=y_pred)
print(f'mse = {test_mse}')