In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA
import plotly.express as px
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

%matplotlib inline

from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
# Initialise Plotly's offline notebook mode so Plotly figures can render in the notebook.
init_notebook_mode(connected=True)
# Use seaborn's "whitegrid" style for the plots drawn below.
sns.set(style="whitegrid")


import warnings
import re
# NOTE(review): this suppresses every warning for the rest of the notebook,
# including deprecation warnings raised by the libraries used below —
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Name of the column every model below tries to predict.
target_col = 'actual_productivity'

# Read the raw CSV and fold the two misspelled department labels
# ('finishing ' with a trailing space, and 'sweing') into their intended values.
df = pd.read_csv(
    '/kaggle/input/productivity-prediction-of-garment-employees/garments_worker_productivity.csv'
).assign(
    department=lambda raw: raw['department'].replace({'finishing ': 'finishing', 'sweing' : 'sewing'})
)
df

In [None]:
# Descriptive statistics of the freshly loaded frame (taken before the missing
# `wip` values are filled in further down). The result is kept in a variable
# because the outlier-removal cell later looks up its "25%" and "75%" rows.
df_summary = df.describe()
df_summary

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Fill missing `wip` values with the column mean, parse `date` into real
# datetimes, and order the rows chronologically.
df = df.assign(
    wip=df['wip'].fillna(df['wip'].mean()),
    date=pd.to_datetime(df['date']),
).sort_values('date')

In [None]:
# Pairwise scatter plots / histograms of the frame's columns.
sns.pairplot(data=df)

In [None]:
# One box plot per numeric feature on a 2 x 5 grid, used to eyeball outliers.
col_names = ['team', 'targeted_productivity', 'smv', 'wip', 'over_time', 'incentive', 'idle_time', 'idle_men', 'no_of_style_change', 'no_of_workers']
fig, axes = plt.subplots(2, 5, figsize=(20, 10))

# Pair each subplot with exactly one column. Errors are no longer swallowed:
# the previous bare `except: pass` hid real failures and, because the column
# counter only advanced on success, silently re-tried the same column on the
# next axis.
for ax, col_name in zip(axes.ravel(), col_names):
    # Passing the series as `y` asks for a vertical box explicitly instead of
    # relying on how the installed seaborn treats a positional argument
    # combined with orient="v".
    sns.boxplot(y=df[col_name], ax=ax)

## Removing outliers

In [None]:
# Columns whose box plots show outliers; each one is filtered in turn below.
outlier_cols = [
    "targeted_productivity",
    "wip",
    "idle_time",
    "idle_men",
    "no_of_style_change",
]
def remove_outliers(df, col, whisker=1.5):
    """Return the rows of ``df`` whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``. The
    quartiles are computed from ``df[col]`` itself, i.e. from the frame that
    is actually passed in, rather than from a summary table captured earlier
    in the notebook (which would be stale after imputation or after a
    previous filtering pass).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    col : str
        Name of the numeric column to test.
    whisker : float, default 1.5
        Multiple of the inter-quartile range added beyond each quartile.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``lower <= df[col] <= upper``. Rows where
        ``col`` is NaN fail both comparisons and are therefore dropped too.
    """
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - (whisker * iqr)
    upper_bound = q3 + (whisker * iqr)
    # When iqr == 0 the fences collapse to a single value, so only rows equal
    # to that value survive.
    return df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    
# Filter the frame one column at a time; each pass works on the output of the
# previous one.
for outlier_col in outlier_cols:
    df = remove_outliers(df, outlier_col)

# These three columns are discarded once they have been used for filtering.
df = df.drop(columns=["idle_time", "idle_men", "no_of_style_change"])

## Productivity vs Overtime

In [None]:
# Daily mean of productivity and overtime: one row per date, with `date`
# restored as an ordinary column.
df_temp = (
    df.groupby('date')[['actual_productivity', 'over_time']]
    .mean()
    .reset_index()
)

# Two line plots sharing the x axis (row position), each with its own y axis.
plt.figure(figsize=(15, 8))
sns.lineplot(data=df_temp['actual_productivity'], color="orange")
ax2 = plt.twinx()
sns.lineplot(data=df_temp['over_time'], color="red", ax=ax2);

In [None]:
# Append indicator columns for `day` and `department`, then drop the original
# categorical columns (and `quarter`, which is not encoded).
df = pd.concat(
    [df, pd.get_dummies(df['day']), pd.get_dummies(df['department'])],
    axis=1,
).drop(columns=['quarter', 'department', 'day'])

In [None]:
# All column names except the prediction target.
cols = df.columns.tolist()
cols.remove(target_col)

In [None]:
# Move the target column to the far right of the frame.
df = df.loc[:, [*cols, target_col]]

In [None]:
# Correlation heat map of the features and the target.
plt.figure(figsize=(20, 8))
# `df` still contains the datetime `date` column at this point. Recent pandas
# releases no longer drop non-numeric columns from corr() automatically, so
# restrict the frame to numeric / boolean columns explicitly; this form also
# works on older pandas.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2g', cmap="YlGnBu");

In [None]:
# `date` is not used as a model input; remove it and rebuild the feature list.
df = df.drop(columns=['date'])
cols = df.columns.tolist()
cols.remove(target_col)

In [None]:
# Feature matrix (DataFrame) and target as a single-column 2-D NumPy array.
X = df.loc[:, cols]
y = df[target_col].to_numpy().reshape(-1, 1)

## Feature selection

In [None]:
# Fit an ordinary linear regression on all features and use its coefficients
# as a rough importance score. `y` is 2-D, so `coef_` has one row per target;
# row 0 is the only one.
# NOTE(review): the features are not rescaled before fitting, so coefficient
# magnitudes reflect each column's units as well as its influence.
model = linear_model.LinearRegression()
model.fit(X, y)
importance = model.coef_[0]
for feature_name, score in zip(X.columns, importance):
    print('Feature: %s, Score: %.5f' % (feature_name, score))
plt.bar(range(len(importance)), importance)
plt.show()

In [None]:
# Fit a decision tree on all features and report its impurity-based feature
# importances.
# random_state is fixed (same seed as the train/test split) so the tree, and
# therefore the importance scores, are reproducible across re-runs.
model = DecisionTreeRegressor(random_state=42)
model.fit(X, y)
importance = model.feature_importances_
for feature_name, score in zip(X.columns, importance):
    print('Feature: %s, Score: %.5f' % (feature_name, score))
plt.bar(range(len(importance)), importance)
plt.show()

## PCA

In [None]:
# Project the feature matrix onto its first `comp` principal components.
# NOTE(review): X is passed to PCA without standardisation, so columns with
# large numeric ranges will dominate the components — confirm this is intended.
comp = 2
pca = PCA(n_components=comp)
X_pca = pca.fit(X).transform(X)

print(f"Variance explained: {round(sum(pca.explained_variance_ratio_)*100, 2)}%")

In [None]:
# Wrap the projected array in a DataFrame. Column names (pca_1, pca_2, ...)
# are derived from the array's width so this cell keeps working if the number
# of components is changed, instead of failing on a hard-coded two-name list.
X_pca_df = pd.DataFrame(
    X_pca,
    columns=[f'pca_{k}' for k in range(1, X_pca.shape[1] + 1)],
)

In [None]:
# Attach the target to the PCA coordinates by row position.
# `df` was sorted by date and filtered for outliers, so its index is shuffled
# and has gaps, whereas the PCA frame does not carry that index. A label-based
# concat(axis=1) would therefore pair components with the wrong targets and
# introduce NaN rows; the rows of X_pca are in the same order as `df`, so a
# positional assignment is the correct pairing.
data_temp = X_pca_df.assign(**{target_col: df[target_col].to_numpy()})
fig = px.scatter_3d(data_temp, x='pca_1', y='pca_2', z=target_col, color=target_col, opacity=0.7, size_max=2)
fig.show()

## Train-test split

In [None]:
# Hold out 20% of the PCA-projected rows for evaluation; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca_df,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Fit a linear regression on the PCA features and score it on the held-out set.
# The `normalize` argument has been removed from scikit-learn's
# LinearRegression, so passing it raises TypeError on current releases.
# Ordinary least squares predictions do not depend on feature rescaling, so
# dropping it should leave the metrics unchanged up to floating-point noise.
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

# Compute the MSE once and reuse it for the RMSE.
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse}")
print(f"RMSE: {math.sqrt(mse)}")
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")

In [None]:
# Predictions next to the true values (both flattened from single-column
# arrays), plus the squared error of each prediction.
pred_df = pd.DataFrame({
    'pred': y_pred.ravel(),
    'actual': y_test.ravel(),
})
pred_df['error'] = (pred_df['actual'] - pred_df['pred']) ** 2

In [None]:
# Squared error as a function of the predicted value.
sns.lineplot(x='pred', y='error', data=pred_df);