In [1]:
#importing libraries
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import plotly_express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from plotly.offline import plot, iplot,init_notebook_mode

In [3]:
data = pd.read_csv('../input/productivity-prediction-of-garment-employees/garments_worker_productivity.csv')
data.head()

Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
0,1/1/2015,Quarter1,sweing,Thursday,8,0.8,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725
1,1/1/2015,Quarter1,finishing,Thursday,1,0.75,3.94,,960,0,0.0,0,0,8.0,0.8865
2,1/1/2015,Quarter1,sweing,Thursday,11,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
3,1/1/2015,Quarter1,sweing,Thursday,12,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
4,1/1/2015,Quarter1,sweing,Thursday,6,0.8,25.9,1170.0,1920,50,0.0,0,0,56.0,0.800382


In [8]:
data.date.unique()

array(['1/1/2015', '1/3/2015', '1/4/2015', '1/5/2015', '1/6/2015',
       '1/7/2015', '1/8/2015', '1/10/2015', '1/11/2015', '1/12/2015',
       '1/13/2015', '1/14/2015', '1/15/2015', '1/17/2015', '1/18/2015',
       '1/19/2015', '1/20/2015', '1/21/2015', '1/22/2015', '1/24/2015',
       '1/25/2015', '1/26/2015', '1/27/2015', '1/28/2015', '1/29/2015',
       '1/31/2015', '2/1/2015', '2/2/2015', '2/3/2015', '2/4/2015',
       '2/5/2015', '2/7/2015', '2/8/2015', '2/9/2015', '2/10/2015',
       '2/11/2015', '2/12/2015', '2/14/2015', '2/15/2015', '2/16/2015',
       '2/17/2015', '2/18/2015', '2/19/2015', '2/22/2015', '2/23/2015',
       '2/24/2015', '2/25/2015', '2/26/2015', '2/28/2015', '3/1/2015',
       '3/2/2015', '3/3/2015', '3/4/2015', '3/5/2015', '3/7/2015',
       '3/8/2015', '3/9/2015', '3/10/2015', '3/11/2015'], dtype=object)

In [12]:
data[["date","quarter"]].drop_duplicates()

Unnamed: 0,date,quarter
0,1/1/2015,Quarter1
19,1/3/2015,Quarter1
40,1/4/2015,Quarter1
61,1/5/2015,Quarter1
83,1/6/2015,Quarter1
104,1/7/2015,Quarter1
126,1/8/2015,Quarter2
148,1/10/2015,Quarter2
170,1/11/2015,Quarter2
193,1/12/2015,Quarter2


In [4]:
df = data.copy()

In [5]:
df.shape

(1197, 15)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date                   1197 non-null   object 
 1   quarter                1197 non-null   object 
 2   department             1197 non-null   object 
 3   day                    1197 non-null   object 
 4   team                   1197 non-null   int64  
 5   targeted_productivity  1197 non-null   float64
 6   smv                    1197 non-null   float64
 7   wip                    691 non-null    float64
 8   over_time              1197 non-null   int64  
 9   incentive              1197 non-null   int64  
 10  idle_time              1197 non-null   float64
 11  idle_men               1197 non-null   int64  
 12  no_of_style_change     1197 non-null   int64  
 13  no_of_workers          1197 non-null   float64
 14  actual_productivity    1197 non-null   float64
dtypes: f

# Data Description
1. The dataset contains 1197 rows and 15 columns
 
Attribute Information:

1. date : Date in MM-DD-YYYY
2. day : Day of the Week
3. quarter : A portion of the month. A month was divided into four quarters
4. department : Associated department with the instance
5. teamno : Associated team number with the instance
6. noofworkers : Number of workers in each team 
7. noofstylechange : Number of changes in the style of a particular product
8. targetedproductivity : Targeted productivity set by the Authority for each team for each day.
9. smv : Standard Minute Value, it is the allocated time for a task 
10. wip : Work in progress. Includes the number of unfinished items for products 
11. overtime : Represents the amount of overtime by each team in minutes
12. incentive : Represents the amount of financial incentive (in BDT) that enables or motivates a particular course of action.
13. idletime : The amount of time when the production was interrupted due to several reasons 
14. idlemen : The number of workers who were idle due to production interruption
15. actual_productivity : The actual % of productivity that was delivered by the workers. It ranges from 0-1.

Date,quarter, department and day are object datatypes and rest are int or float types

In [6]:
df.isna().sum()

date                       0
quarter                    0
department                 0
day                        0
team                       0
targeted_productivity      0
smv                        0
wip                      506
over_time                  0
incentive                  0
idle_time                  0
idle_men                   0
no_of_style_change         0
no_of_workers              0
actual_productivity        0
dtype: int64

# Data Cleaning - Feature engineering

In [13]:
df['date'] = pd.to_datetime(df['date'])
df['month_name'] = df['date'].dt.month_name() #create month name

In [14]:
df['overtime_in_hours'] = df['over_time'].apply(lambda x: x/60) #create a new feature which has overtime measured in hours

In [15]:
df['quarter'].value_counts()

Quarter1    360
Quarter2    335
Quarter4    248
Quarter3    210
Quarter5     44
Name: quarter, dtype: int64

In [16]:
df['department'].value_counts().index.to_list() #We can see there is an extra space, which led to 3 categories, we'll fix it and also fix the spelling of sewing

['sweing', 'finishing ', 'finishing']

In [17]:
df['department'] = df['department'].apply(lambda x: 'finishing' if x == ('finishing ' or 'finishing' ) else 'sewing' )

In [18]:
df['department'].value_counts().index.to_list()

['sewing', 'finishing']

In [19]:
df['day'].value_counts() #Friday is not a working day

Wednesday    208
Sunday       203
Tuesday      201
Thursday     199
Monday       199
Saturday     187
Name: day, dtype: int64

In [20]:
dept = df.department.value_counts().reset_index()
dept.rename(columns = {'index':'department', 'department':'total_num'},inplace=True)
dept

Unnamed: 0,department,total_num
0,sewing,940
1,finishing,257


# Univariate analysis of categorical variables

In [21]:
fig = go.Figure(data=[go.Pie(labels=dept.department.to_list(),
                             values=dept.total_num.to_list())])
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=dict(colors=['green', 'yellow'], line=dict(color='#000000', width=2)))
fig.show()

In [22]:
qdf = df['quarter'].value_counts().reset_index()
qdf.rename(columns={'index': 'quarters', 'quarter': 'days_in_each_quarter'}, inplace=True)
fig = go.Figure(data=[go.Pie(labels=qdf.quarters.to_list(), values=qdf.days_in_each_quarter.to_list(), pull=[0, 0, 0, 0, 0.4], hole=.3)])
fig.show()

# Univariate analysis of continuous variables

**Histogram is representation of the distribution of numerical data**

In [23]:
fig = px.histogram(df, x="targeted_productivity", nbins=20, template='plotly_dark')
fig.update_layout(title='Distribution of Targeted productivity', title_x=0.5)
fig.show()

**A violin plot is a method of plotting numeric data. It is similar to a box plot, with the addition of a rotated kernel density plot on each side**

In [18]:
fig = px.violin(df, y="actual_productivity", box=True, points='all')
fig.update_layout(title='Distribution of actual_productivity', title_x=0.5)
fig.show()

In [19]:
fig = px.histogram(df, x="smv", 
                   marginal="violin", template='ggplot2' # or violin, rug
                  )
fig.update_layout(title='Distribution of smv(standard minute value)', title_x=0.5)

fig.show()

In [20]:
fig = make_subplots(rows=1, cols=2)
y1= df['no_of_workers']
y2=df['incentive']
x=df['month_name']

fig.add_trace(go.Bar( x=x,y=y1), row=1, col=1)
fig.add_trace(go.Bar(x=x, y=y2), row=1, col=2)

fig.update_layout(title='Plots of number of workers and incentive paid in each month', title_x=0.5)

fig.show()

# Although less number of people worked in March, high incentive was paid, which means they are working overtime.

In [21]:
fig = px.scatter(df, x="date", y="idle_time",color="team", template="plotly_dark")
fig.update_layout(title='Idle time spent by Teams', title_x=0.5)
fig.show()

# Team 8 and 7 spent most idle time

In [22]:
fig = px.scatter(df, y="idle_men", x="date", color="idle_men", facet_col="team", 
                 title="Number of Idle_men in Teams", template="plotly_dark")
fig.update_layout(title='Count of Idle men in Teams', title_x=0.5)
fig.update_xaxes(showgrid=False)

fig.show()

# Bivariate Analysis

# Actual productivity w.r.t departments

In [23]:
fig = px.violin(df, y="actual_productivity", box=True, points='all', color='department')
fig.show()

In [24]:
fig = px.scatter(df, y="actual_productivity",x="targeted_productivity", color='department'
             )
fig.update_layout(title_text='Actual vs Targeted Productivity',title_x=0.5,font=dict(
        family="Courier New, monospace",
        size=18,
        color="rosybrown"
    ))

fig.show()

Both departments are exceeding targets

In [25]:
fig = px.bar(df, x="department", y="actual_productivity", barmode="group", color="department",
             facet_col="quarter")
fig.update_layout(title_text='Productivity of departments in each quarter',title_x=0.5,font=dict(
        family="Courier New, monospace",
        size=18,
        color="maroon"
    ))

fig.show()

# Productivity in each month

In [26]:
fig = go.Figure()
fig.add_trace(go.Bar(
    y=df['month_name'].to_list(),
    x=df['no_of_workers'].to_list(),
    name='Sewing',
    orientation='h',
    marker=dict(
        color='rgba(246, 78, 139, 0.6)',
        line=dict(color='rgba(246, 78, 139, 1.0)', width=3)
    )
))
fig.add_trace(go.Bar(
    y=df['month_name'].to_list(),
    x=df['no_of_workers'].to_list(),
    name='Finishing',
    orientation='h',
    marker=dict(
        color='rgba(58, 71, 80, 0.6)',
        line=dict(color='rgba(58, 71, 80, 1.0)', width=3)
    )
))

fig.update_layout(barmode='stack')
fig.show()

In [27]:
daydf = df[['day', 'actual_productivity']].groupby('day').mean()
daydf = daydf.sort_values('actual_productivity', ascending=False).reset_index()
fig = px.bar(daydf, x='day', y='actual_productivity',
             hover_data=['day', 'actual_productivity'], color='actual_productivity',
             labels={'actual_productivity':'Productivity'}, height=500)
fig.show()

**Surprisingly Saturday is the most productive with an average of 0.75, maybe because Friday is holiday which makes sense, after a break your productivity is more**

In [28]:
teamdf = df[['team', 'actual_productivity']].groupby('team').mean()
teamdf = teamdf.sort_values('actual_productivity', ascending=False).reset_index()
teamdf
fig = px.bar(teamdf, x="team", y="actual_productivity", color='actual_productivity')
fig.update_layout(title_text='Productivity by Teams',title_x=0.5,font=dict(
        family="Courier New, monospace",
        size=18,
        color="green"
    ))
fig.show()

# Team 1 is the most productive followed by Team 3

# Number of workers in each Team

In [29]:
fig = px.bar(df, x="team", y="no_of_workers", barmode="group", color='department'
             )
fig.update_layout(title_text='Team size',title_x=0.5,font=dict(
        family="Courier New, monospace",
        size=18,
        color="red"
    ))

fig.show()  

# Incentives paid to different teams

In [30]:
idf = df[['team', 'incentive']].groupby('team').mean().sort_values('incentive',ascending=False).reset_index()
fig = px.bar(idf, x="team", y="incentive", barmode="group"
             )
fig.update_layout(title_text='Incentive paid to each team',title_x=0.5,font=dict(
        family="Courier New, monospace",
        size=18,
        color="green"
    ))

fig.show()  

**Team 9 gets the highest incentive on avg, let us check the box plots to see if any outliers exist**

In [31]:
fig = px.box(df, x="team", y="incentive")
fig.update_layout(title_text='Incentive paid to each team - boxplot',title_x=0.5,font=dict(
        family="Courier New, monospace",
        size=10,
        color="red"
    ))
fig.show()

**Because of the outlier, team 9 avg incentive value is very high****

# Only sewing department gets incentive

In [32]:
fig = px.box(df, x="department", y="incentive")
fig.show()

# Over-time vs Teams, department

In [33]:
fig = px.box(df, x="department", y="overtime_in_hours")

fig.update_layout(title_text='Overtime spent(in hours) vs Departments',xaxis_title = 'Department',yaxis_title='Overtime in minutes', title_x=0.5,font=dict(
        family="Courier New, monospace",
        size=18,
        color="mediumvioletred"
    ))

fig.show()

**the median value of financing department working overtime is 1 day , it is not paid incentive, while sewing department works 4 days as overtime**

# Work in progress in departments, teams, quarters

In [34]:
fig = px.box(df, x="department", y="wip")
fig.show()

## Work in progress in teams

In [35]:
fig = px.box(df, x="team", y="wip")
fig.show()

# Work in progress in teams

In [36]:
fig = px.box(df, x="quarter", y="wip")
fig.show()

**Notebook is under construction .. Do Appreciate if you find it useful and checkout this dataset to create visualisations and submitting tasks**
**Thank you**