In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as mpl
import seaborn as sb

import plotly_express as pe
import plotly.graph_objects as pgo
import plotly.io as pio
import plotly.figure_factory as pff

from plotly.subplots import make_subplots
from plotly.offline import plot, iplot, init_notebook_mode

In [None]:
# Load the garment worker productivity CSV from the relative ../input directory
dataset = pd.read_csv('../input/productivity-prediction-of-garment-employees/garments_worker_productivity.csv')

In [None]:
# To have a look through the dataset at an initial level (first rows only)
dataset.head()

In [None]:
# Number of rows or instances, and columns or attributes
dataset.shape

In [None]:
# Column names, dtypes and non-null counts
dataset.info()

In [None]:
# Summary statistics of the columns
dataset.describe()

In [None]:
# Number of NULL values in each column
dataset.isna().sum()

In [None]:
# Number of Non-NULL values in each column
dataset.notna().sum()

In [None]:
# Parse the raw date strings into timestamps once, store them back,
# and derive the month name of every row from the parsed values
parsed_dates = pd.to_datetime(dataset['date'])
dataset['date'] = parsed_dates
dataset['month'] = parsed_dates.dt.month_name()

In [None]:
# Number of rows recorded under each quarter label
dataset['quarter'].value_counts()

In [None]:
# Number of rows per raw department label (before any clean-up)
dataset['department'].value_counts()

In [None]:
# Distinct raw department labels, to spot spelling/whitespace variants
dataset['department'].unique()

In [None]:
# Cleaning and fixing dataset for the correct Department value
dataset['department'] = dataset['department'].replace('sweing', 'Sewing')

In [None]:
# Cleaning and fixing dataset for the correct Department value
dataset['department'] = dataset['department'].replace('finishing ', 'Finishing')

In [None]:
# Cleaning and fixing dataset for the correct Department value
dataset['department'] = dataset['department'].replace('finishing', 'Finishing')

In [None]:
# Row counts per department after the label clean-up
dataset['department'].value_counts()

In [None]:
# Distinct department labels after the clean-up
dataset['department'].unique()

In [None]:
# Number of rows recorded for each day label
dataset['day'].value_counts()

In [None]:
# Distinct day labels present in the data
dataset['day'].unique()

In [None]:
# Rows per department as a two-column frame: 'Departments' (label) and 'Total' (count).
# The index is named before reset_index so the resulting column labels do not depend on
# the pandas version: pandas >= 2.0 names the value_counts() result 'count' and its index
# 'department', which made the previous rename({'index': ..., 'department': ...})
# produce the wrong columns.
temp_department = (
    dataset['department']
    .value_counts()
    .rename_axis('Departments')
    .reset_index(name = 'Total')
)

temp_department

In [None]:
# Donut chart of the share of rows belonging to each department
labels_1, values_1 = (temp_department[column].to_list() for column in ('Departments', 'Total'))
department_pie = pgo.Pie(labels = labels_1, values = values_1)
figure_1 = pgo.Figure(data = [department_pie])

# Two slice colours with a white outline around every slice
department_marker = {'colors': ['#101B8E', '#C91313'], 'line': {'color': '#FFFFFF', 'width': 4}}
figure_1.update_traces(hole = .15, marker = department_marker, textfont_size = 24,
                       textinfo = 'percent', hoverinfo = 'label+value')
figure_1.update_layout(font_size = 16, title_text = "Segregated Departments relating to each of the instance")

figure_1.show()

In [None]:
# Rows per quarter as a two-column frame: 'Quarters' (label) and 'Total' (count).
# Naming the index before reset_index keeps the column labels stable across pandas
# versions; on pandas >= 2.0 the previous rename({'index': ..., 'quarter': ...})
# no longer matched the columns produced by value_counts().reset_index().
temp_quarter = (
    dataset['quarter']
    .value_counts()
    .rename_axis('Quarters')
    .reset_index(name = 'Total')
)
temp_quarter

In [None]:
# Donut chart of the share of rows per quarter; the fifth slice is pulled out of the ring
labels_2, values_2 = (temp_quarter[column].to_list() for column in ('Quarters', 'Total'))
quarter_pie = pgo.Pie(labels = labels_2, values = values_2)
figure_2 = pgo.Figure(data = [quarter_pie])

# Five slice colours with a white outline around every slice
quarter_marker = {'colors': ['#9E4CC4','#68BA67','#E1E71F', '#2EAFD3', '#D57474'],
                  'line': {'color': '#FFFFFF', 'width': 4}}
figure_2.update_traces(pull = [0, 0, 0, 0, 0.2], hole = .15, marker = quarter_marker, textfont_size = 24,
                       textinfo = 'percent', hoverinfo = 'label+value')
figure_2.update_layout(font_size = 16, title_text = "Total number of each quarter accounted for in the entire tenure")

figure_2.show()

In [None]:
figure_3 = pe.violin(dataset, y = ["targeted_productivity", "actual_productivity"], box = True, points = 'all', 
                     color = 'department')
figure_3.update_layout(title = 'Distribution of targeted_productivity vs actual_productivity per department', font_size = 16, 
                       template = 'ggplot2')
figure_3.update_xaxes( title_text = "productivities")
figure_3.update_yaxes( title_text = "% productivity [0-1]")

figure_3.show()

In [None]:
figure_3 = pe.box(dataset, y = ["targeted_productivity", "actual_productivity"], points = 'outliers', 
                     color = 'department')
figure_3.update_layout(title = 'Distribution of targeted_productivity vs actual_productivity per department', font_size = 16, 
                       template = 'ggplot2')
figure_3.update_xaxes( title_text = "productivities")
figure_3.update_yaxes( title_text = "% productivity [0-1]")

figure_3.show()

In [None]:
# Targeted vs actual productivity, coloured by department, marker size driven by incentive
figure_13 = pe.scatter(data_frame = dataset, x = "targeted_productivity", y = "actual_productivity",
                       color = 'department', size = "incentive", size_max = 60, template = "plotly_white")
figure_13.update_layout(font_size = 16,
                        title_text = 'Comparision of targeted_productivity vs actual_productivity per department')

figure_13.show()

In [None]:
# Percentage of rows inside / outside the top right corner of the above scatter plot
# (targeted_productivity > 0.45, actual_productivity > 0.6 and incentive > 40).
# The counts are divided by the actual number of rows instead of a hard-coded 1197,
# so the percentages stay correct if the dataset is filtered or refreshed.
top_right_points = (
    (dataset['targeted_productivity'] > 0.45)
    & (dataset['actual_productivity'] > 0.6)
    & (dataset['incentive'] > 40)
)
top_right_points.value_counts() / len(dataset) * 100

In [None]:
# Targeted vs actual productivity, coloured by department, marker size driven by no_of_workers
figure_8 = pe.scatter(data_frame = dataset, x = "targeted_productivity", y = "actual_productivity",
                      color = 'department', size = "no_of_workers", size_max = 10, template = "plotly_white")
figure_8.update_layout(font_size = 16,
                       title_text = 'Comparision of targeted_productivity vs actual_productivity per department')

figure_8.show()

In [None]:
# Mean values of Actual Productivity of the Sewing Department per quarter
q1_mean_sewing = dataset.loc[(dataset['quarter'] == 'Quarter1') & (dataset['department'] == 'Sewing'), 
                             'actual_productivity'].mean()
q2_mean_sewing = dataset.loc[(dataset['quarter'] == 'Quarter2') & (dataset['department'] == 'Sewing'), 
                             'actual_productivity'].mean()
q3_mean_sewing = dataset.loc[(dataset['quarter'] == 'Quarter3') & (dataset['department'] == 'Sewing'), 
                             'actual_productivity'].mean()
q4_mean_sewing = dataset.loc[(dataset['quarter'] == 'Quarter4') & (dataset['department'] == 'Sewing'), 
                             'actual_productivity'].mean()
q5_mean_sewing = dataset.loc[(dataset['quarter'] == 'Quarter5') & (dataset['department'] == 'Sewing'), 
                             'actual_productivity'].mean()

In [None]:
# Mean values of Actual Productivity of the Finishing Department per quarter
q1_mean_finishing = dataset.loc[(dataset['quarter'] == 'Quarter1') & (dataset['department'] == 'Finishing'), 
                             'actual_productivity'].mean()
q2_mean_finishing = dataset.loc[(dataset['quarter'] == 'Quarter2') & (dataset['department'] == 'Finishing'), 
                             'actual_productivity'].mean()
q3_mean_finishing = dataset.loc[(dataset['quarter'] == 'Quarter3') & (dataset['department'] == 'Finishing'), 
                             'actual_productivity'].mean()
q4_mean_finishing = dataset.loc[(dataset['quarter'] == 'Quarter4') & (dataset['department'] == 'Finishing'), 
                             'actual_productivity'].mean()
q5_mean_finishing = dataset.loc[(dataset['quarter'] == 'Quarter5') & (dataset['department'] == 'Finishing'), 
                             'actual_productivity'].mean()

In [None]:
# Grouped bars of the mean actual_productivity per quarter for both departments.
# The x labels are written out in the same Quarter1..Quarter5 order as the q*_mean_*
# values they annotate. Taking them from dataset['quarter'].unique() only lined up when
# the rows happened to appear in that order and all five quarters were present.
quarter_order = ['Quarter1', 'Quarter2', 'Quarter3', 'Quarter4', 'Quarter5']

figure_21 = pgo.Figure()
figure_21.add_trace(pgo.Bar(y = [q1_mean_sewing, q2_mean_sewing, q3_mean_sewing, q4_mean_sewing, q5_mean_sewing],
                            x = quarter_order, name = 'Sewing', marker =
                            dict(color = 'rgba(233, 226, 22, 0.6)', line = dict(color = 'rgba(233, 226, 22, 1.0)', width = 3))
                           ))
figure_21.add_trace(pgo.Bar(y = [q1_mean_finishing, q2_mean_finishing, q3_mean_finishing, q4_mean_finishing, q5_mean_finishing],
                            x = quarter_order, name = 'Finishing', marker =
                            dict(color = 'rgba(14, 223, 221, 0.6)', line = dict(color = 'rgba(14, 223, 221, 1.0)', width = 3))
                           ))

figure_21.update_layout(title = 'Comparision of actual_productivity of the departments in each quarter', font_size = 16,
                        barmode = 'group', template = "ggplot2")
figure_21.update_xaxes(title_text = "quarter")
figure_21.update_yaxes(title_text = "actual_productivity")

figure_21.show()

In [None]:
# Mean values of Actual Productivity of the Sewing Department per month
january_mean_sewing = dataset.loc[(dataset['month'] == 'January') & (dataset['department'] == 'Sewing'), 
                                  'actual_productivity'].mean()
february_mean_sewing = dataset.loc[(dataset['month'] == 'February') & (dataset['department'] == 'Sewing'), 
                                   'actual_productivity'].mean()
march_mean_sewing = dataset.loc[(dataset['month'] == 'March') & (dataset['department'] == 'Sewing'), 
                                'actual_productivity'].mean()

In [None]:
# Mean values of Actual Productivity of the Finishing Department per month
january_mean_finishing = dataset.loc[(dataset['month'] == 'January') & (dataset['department'] == 'Finishing'), 
                                  'actual_productivity'].mean()
february_mean_finishing = dataset.loc[(dataset['month'] == 'February') & (dataset['department'] == 'Finishing'), 
                                   'actual_productivity'].mean()
march_mean_finishing = dataset.loc[(dataset['month'] == 'March') & (dataset['department'] == 'Finishing'), 
                                'actual_productivity'].mean()

In [None]:
# Horizontal grouped bars of the mean actual_productivity per month for both departments.
# The month labels are written out in the same January/February/March order as the
# *_mean_* values they annotate. Taking them from dataset['month'].unique() only lined
# up when the rows happened to appear in that order and exactly these months were present.
month_order = ['January', 'February', 'March']

figure_10 = pgo.Figure()
figure_10.add_trace(pgo.Bar(x = [january_mean_sewing, february_mean_sewing, march_mean_sewing], y = month_order,
                            name = 'Sewing', orientation = 'h', marker = dict(color = 'rgba(139, 216, 162, 0.6)',
                                                                              line = dict(color = 'rgba(139, 216, 162, 1.0)',
                                                                                          width = 3))))
figure_10.add_trace(pgo.Bar(x = [january_mean_finishing, february_mean_finishing, march_mean_finishing],
                            y = month_order, name = 'Finishing', orientation = 'h',
                            marker = dict(color='rgba(203, 53, 247, 0.6)', line = dict(color = 'rgba(203, 53, 247, 1.0)',
                                                                                     width = 3))))
figure_10.update_layout(title = 'Comparision of actual_productivity in each month of both departments', font_size = 16,
                        barmode = 'group', template = "ggplot2")
figure_10.update_xaxes(title_text = "actual_productivity")
figure_10.update_yaxes(title_text = "month")

figure_10.show()

In [None]:
# Per-day means of productivity and over_time, ordered from the most to the
# least productive day, with 'day' restored as a regular column
days_data = (
    dataset
    .groupby('day')[['actual_productivity', 'over_time', 'targeted_productivity']]
    .mean()
    .sort_values(by = 'actual_productivity', ascending = False)
    .reset_index()
)

days_data

In [None]:
# One bar per day (height: mean actual_productivity), coloured by the mean over_time
figure_11 = pe.bar(data_frame = days_data, x = 'day', y = 'actual_productivity', color = 'over_time',
                   hover_data = ['targeted_productivity', 'actual_productivity', 'day', 'over_time'])
figure_11.update_layout(template = "ggplot2", font_size = 14,
                        title = 'actual_productivity in descending order across days with an indication of over_time per day')

figure_11.show()

In [None]:
team_data_sewing = dataset.loc[dataset['department'] == 'Sewing', ['team', 'actual_productivity', 'no_of_workers', 
                                                                   'targeted_productivity']]
team_data_sewing_mean = team_data_sewing[['team', 'actual_productivity', 'targeted_productivity']].groupby(
    'team').mean()
team_data_sewing_sum = team_data_sewing[['team', 'no_of_workers']].groupby('team').sum()
team_data_sewing_mean['no_of_workers'] = team_data_sewing_sum
team_data_sewing_mean = team_data_sewing_mean.sort_values('actual_productivity', ascending = False).reset_index()

team_data_sewing_mean

In [None]:
team_data_finishing = dataset.loc[dataset['department'] == 'Finishing', ['team', 'actual_productivity', 'no_of_workers', 
                                                                   'targeted_productivity']]
team_data_finishing_mean = team_data_finishing[['team', 'actual_productivity', 'targeted_productivity']].groupby(
    'team').mean()
team_data_finishing_sum = team_data_finishing[['team', 'no_of_workers']].groupby('team').sum()
team_data_finishing_mean['no_of_workers'] = team_data_finishing_sum
team_data_finishing_mean = team_data_finishing_mean.sort_values('actual_productivity', ascending = False).reset_index()

team_data_finishing_mean

In [None]:
# Side-by-side bar panels (Sewing, Finishing) of per-team actual_productivity;
# both panels colour their bars by no_of_workers through one shared colour axis
figure_12 = make_subplots(rows = 1, cols = 2, shared_yaxes = True, subplot_titles=("Sewing", "Finishing"))
for panel_column, team_summary in enumerate((team_data_sewing_mean, team_data_finishing_mean), start = 1):
    team_bars = pgo.Bar(x = team_summary['team'], y = team_summary['actual_productivity'],
                        marker = {'color': team_summary['no_of_workers'], 'coloraxis': "coloraxis"})
    figure_12.add_trace(team_bars, row = 1, col = panel_column)

figure_12.update_layout(showlegend = False, template = "ggplot2", font_size = 16,
                        title = 'mean actual_productivity of different teams with an indication of average team size')
figure_12.update_coloraxes(colorbar_title_text = "no_of_workers")
figure_12.update_yaxes(title_text = "actual_productivity")
figure_12.update_xaxes(title_text = "team")

figure_12.show()

In [None]:
figure_4 = make_subplots(rows = 1, cols = 2, subplot_titles = ("no_of_workers", "incentive"))
y_axis_1 = dataset['no_of_workers']
y_axis_2 = dataset['incentive']
x_axis = dataset['month']
figure_4.add_trace(pgo.Bar(x = x_axis, y = y_axis_1, marker_color = '#A11F29'), row = 1, col = 1)
figure_4.add_trace(pgo.Bar(x = x_axis, y = y_axis_2, marker_color = '#0B0B6E'), row = 1, col = 2)
figure_4.update_layout(title = 'Comparision of number of workers with the incentives paid in each month', font_size = 16, 
                       showlegend = False, template = "plotly_white")
figure_4.update_xaxes(title_text = "month")

figure_4.show()

In [None]:
figure_7 = make_subplots(rows = 1, cols = 2, subplot_titles = ("idle_time", "idle_men"))
y_axis_3 = dataset['idle_time'] 
y_axis_4 = dataset['idle_men']
x_axis = dataset['month']
figure_7.add_trace(pgo.Bar(x = x_axis, y = y_axis_3, marker_color = '#134605'), row = 1, col = 1)
figure_7.add_trace(pgo.Bar(x = x_axis, y = y_axis_4, marker_color = '#B5AF08'), row = 1, col = 2)
figure_7.update_layout(title = 'Overview of total idle time with the number of idle men', font_size = 16, showlegend = False, 
                       template = "plotly_white")
figure_7.update_xaxes(title_text = "month")

figure_7.show()

In [None]:
figure_5 = pe.scatter(dataset, x = "date", y = "idle_time", size = "no_of_workers", color = "team", template = "plotly_white", 
                      size_max = 10)
figure_5.update_layout(title = 'Idle time spent by each of the Teams', font_size = 16)

figure_5.show()

In [None]:
figure_6 = pe.scatter(dataset, x = "date", y = "idle_men", size = "no_of_workers", color = "team", template = "plotly_white", 
                      size_max = 10)
figure_6.update_layout(title = 'Number of Idle men in each of the Teams', font_size = 16)

figure_6.show()

In [None]:
team_sewing_incentive = dataset.loc[dataset['department'] == 'Sewing', ['team', 'incentive', 'over_time']]
team_sewing_incentive_sum = team_sewing_incentive[['team', 'incentive', 'over_time']].groupby('team').sum()
team_sewing_incentive_sum = team_sewing_incentive_sum.sort_values('incentive', ascending = False).reset_index()

team_sewing_incentive_sum.describe()

In [None]:
team_finishing_incentive = dataset.loc[dataset['department'] == 'Finishing', ['team', 'incentive', 'over_time']]
team_finishing_incentive_sum = team_finishing_incentive[['team', 'incentive', 'over_time']].groupby('team').sum()
team_finishing_incentive_sum = team_finishing_incentive_sum.sort_values('incentive', ascending = False).reset_index()

team_finishing_incentive_sum.describe()

In [None]:
# Side-by-side bar panels (Sewing, Finishing) of the total incentive per team;
# both panels colour their bars by total over_time through one shared Viridis colour axis
figure_14 = make_subplots(rows = 1, cols = 2, shared_yaxes = True, subplot_titles=("Sewing", "Finishing"))
for panel_column, incentive_summary in enumerate((team_sewing_incentive_sum, team_finishing_incentive_sum), start = 1):
    incentive_bars = pgo.Bar(x = incentive_summary['team'], y = incentive_summary['incentive'],
                             marker = {'color': incentive_summary['over_time'], 'coloraxis': "coloraxis"})
    figure_14.add_trace(incentive_bars, row = 1, col = panel_column)

figure_14.update_layout(showlegend = False, template = "ggplot2", font_size = 14,
                        title = 'Total incentives given to each team per department with an overall overview of over_time')
figure_14.update_coloraxes(colorscale = "Viridis", colorbar_title_text = "over_time")
figure_14.update_yaxes(title_text = "incentive")
figure_14.update_xaxes(title_text = "team")

figure_14.show()

In [None]:
figure_15 = pe.box(dataset, x = 'team', y = "incentive", color = 'department', points = 'outliers')
figure_15.update_layout(title = 'incentive given to each team per department', font_size = 16, 
                       template = 'ggplot2')

figure_15.show()

In [None]:
figure_16 = pe.box(dataset, x = 'department', y = "incentive", color = 'month', points = 'outliers')
figure_16.update_layout(title = 'incentive given to both departments every month', font_size = 16, 
                       template = 'ggplot2')

figure_16.show()

In [None]:
dataset['over_time_hours'] = dataset['over_time'] / 60

In [None]:
figure_17 = pe.box(dataset, x = 'team', y = "over_time_hours", color = 'department', points = 'outliers')
figure_17.update_layout(title = 'over_time_hours by each team per department', font_size = 16, 
                       template = 'ggplot2')

figure_17.show()

In [None]:
figure_18 = pe.box(dataset, x = 'team', y = "wip", color = 'department', points = 'outliers')
figure_18.update_layout(title = 'Number of unfinished items for products by each team per department', font_size = 16, 
                       template = 'ggplot2')

figure_18.show()

In [None]:
figure_19 = pe.box(dataset, x = 'quarter', y = "wip", color = 'department', points = 'outliers')
figure_19.update_layout(title = 'Number of unfinished items for products in each quarter per department', font_size = 16, 
                       template = 'ggplot2')

figure_19.show()

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn import datasets, linear_model, svm
from scipy.stats.mstats import normaltest

In [None]:
# Multiple linear regression of actual_productivity on five numeric features,
# trained on an 80/20 train/test split with standardised inputs.
labelEncoder = LabelEncoder()
# NOTE(review): 'department' is integer-encoded in place here but is not one of the
# features below; after this cell, comparisons such as department == 'Sewing' in the
# earlier cells no longer match - confirm whether the encoding is still needed.
dataset['department'] = labelEncoder.fit_transform(dataset['department'])
X = dataset[['no_of_workers', 'targeted_productivity', 'incentive', 'over_time', 'smv']]
y = dataset['actual_productivity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# The scaler is fitted on the training rows only and then re-used for the test rows
standardScaler = StandardScaler()
X_train_sm = standardScaler.fit_transform(X_train)
mlr = LinearRegression()
mlr.fit(X_train_sm, y_train)
X_test_sm = standardScaler.transform(X_test)
y_pred_sm = mlr.predict(X_test_sm)

# r2_score expects (y_true, y_pred); the score is not symmetric, so passing the
# predictions first reported a different (wrong) value.
print(f'R2 score is {r2_score(y_test, y_pred_sm)}')
# The mean of the squared residuals is the mean squared error, not a residual sum of squares
print("Mean squared error (MSE): %.2f" % np.mean((y_pred_sm - y_test) ** 2))
print(f'Co-efficients are {mlr.coef_}')
print(f'Intercept is {mlr.intercept_}')

In [None]:
# Run scipy's normaltest on the raw actual_productivity values
normaltest(dataset['actual_productivity'].to_numpy())

In [None]:
# Read the same CSV again into a separate frame, so the cells below start from the raw values
textile_data = pd.read_csv("../input/productivity-prediction-of-garment-employees/garments_worker_productivity.csv")

In [None]:
# Histograms of all the different numerical attributes to detect their frequency distribution
textile_data.hist(bins=10,figsize=(20,15))
mpl.show()

In [None]:
# Distribution of actual_productivity in 10 bins
# NOTE(review): seaborn documents distplot as deprecated (histplot/displot replace it) - confirm
# against the installed seaborn version before relying on this cell.
sb.distplot(textile_data['actual_productivity'], bins = 10)

In [None]:
# Distribution of targeted_productivity in 10 bins
# NOTE(review): same distplot deprecation caveat as for the actual_productivity plot.
sb.distplot(textile_data['targeted_productivity'], bins = 10)

In [None]:
# To determine the Pearson's Correlation Coefficient between the numeric attributes.
# Only numeric columns are passed to corr(): textile_data still holds string columns
# (date, quarter, department, day), and on pandas >= 2.0 DataFrame.corr raises on
# non-numeric data instead of silently dropping it.
mpl.figure(figsize = (14, 10))
corr_matrix = textile_data.select_dtypes(include = 'number').corr(method = 'pearson')
sb.heatmap(corr_matrix, square = True, annot = True)

In [None]:
# Comparision of smv with no_of_workers for both the Sewing and Finishing departments
sb.lmplot(data = textile_data, x = "no_of_workers", y = "smv")

In [None]:
textile_data['department'] = textile_data['department'].str.strip(' ')
textile_data['department'].value_counts()

In [None]:
sb.scatterplot(data = textile_data, x = "no_of_workers", y = "smv", hue = "department").set_title('Comparision of smv with no_of_workers for both the Sewing and Finishing departments')

In [None]:
# Box plot of actual_productivity for binned values of no_of_workers

# Right-closed bins: (-inf, 20], (20, 40], (40, 60], (60, 80], (80, inf).
# The previous if/elif chain tested ">= 21", ">= 41" and ">= 61", so fractional counts
# that fall between two ranges (e.g. 20.5 or 40.5) and missing values dropped through to
# the final else and were labelled '80+'. pd.cut places every value in its proper range,
# leaves missing values missing, and replaces the row-by-row Python loop.
bin_edges = [-np.inf, 20, 40, 60, 80, np.inf]
bin_labels = ['0 - 20', '21 - 40', '41 - 60', '61 - 80', '80+']
no_of_workers_bin = pd.cut(textile_data['no_of_workers'], bins = bin_edges, labels = bin_labels)

# Add column to dataframe
textile_data['no_of_workers_bins'] = no_of_workers_bin

# Create boxplot, with the bins drawn in ascending order rather than order of appearance
sb.boxplot(data = textile_data, x = 'no_of_workers_bins', y = 'actual_productivity', order = bin_labels)

In [None]:
# Work on the string form of the date column, swap '/' for '-'
# ('12/04/2015' becomes '12-04-2015'), then parse as month-day-year;
# values that do not match the format are coerced to NaT
dashed_dates = textile_data['date'].astype('str').str.replace('/', '-', regex = True)
textile_data['date'] = pd.to_datetime(dashed_dates, format = '%m-%d-%Y', errors = 'coerce')

In [None]:
mpl.figure(figsize = (10, 6))
sb.lineplot(data = textile_data, x = "date", y = "targeted_productivity", hue = "department", ci= None).set_title('Variation of Targeted Productivity over entire time duration for the Sewing and the Finishing department')

In [None]:
mpl.figure(figsize = (10, 6))
sb.lineplot(data = textile_data, x = "date", y = "actual_productivity", hue = "department", ci= None).set_title('Variation of Actual Productivity over entire time duration for the Sewing and the Finishing department')

In [None]:
mpl.figure(figsize = (10, 6))
sweing_data = pd.pivot_table(textile_data[textile_data['department'] == 'sweing'].loc[:, ['date','targeted_productivity','actual_productivity']], index = "date", values = ["targeted_productivity","actual_productivity"], aggfunc = np.mean)
sb.lineplot(data = sweing_data.reset_index().melt(id_vars = ["date"], value_vars = ["actual_productivity","targeted_productivity"]), x = "date", y = "value", hue = "variable").set_title('Variation between Actual Productivity and Targeted Productivity over entire time duration for the Sewing department')

In [None]:
mpl.figure(figsize = (10, 6))
finishing_data = pd.pivot_table(textile_data[textile_data['department'] == 'finishing'].loc[:,['date','targeted_productivity','actual_productivity']], index = "date", values = ["targeted_productivity","actual_productivity"], aggfunc = np.mean)
sb.lineplot(data = finishing_data.reset_index().melt(id_vars = ["date"], value_vars = ["actual_productivity","targeted_productivity"]), x = "date", y = "value", hue = "variable").set_title('Variation between Actual Productivity and Targeted Productivity over entire time duration for the Finishing department')