In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from tensorflow import keras
import tensorflow as tf
%matplotlib inline

In [None]:
# Read the garment-worker productivity CSV into `df`; the trailing expression
# displays its (rows, columns) shape.
df = pd.read_csv(
    filepath_or_buffer='../input/productivity-prediction-of-garment-employees/garments_worker_productivity.csv'
)
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

In [None]:
# True when the 'date' column is still stored with the generic object dtype.
df.dtypes['date'] == 'object'

# **Changing date column type to datetime64**

In [None]:
# Parse the 'date' column into pandas datetimes.
df['date'] = df['date'].pipe(pd.to_datetime)

In [None]:
# Confirm the dtype of 'date' after conversion.
df.dtypes['date']

In [None]:
# Use the 'date' column as the row index (it is removed from the regular columns).
df = df.set_index('date')

In [None]:
# First five rows, now indexed by date.
df.head(n=5)

# **Filling missing WIP values with time-based interpolation (`interpolate(method='time')`)**

In [None]:
# Fill missing WIP values by interpolating along the date index.
# The result is assigned back to the column: calling interpolate(..., inplace=True)
# on `df['wip']` mutates an intermediate Series (chained assignment), which raises a
# FutureWarning in recent pandas and leaves `df` unchanged under Copy-on-Write.
df['wip'] = df['wip'].interpolate(method='time')

In [None]:
# Peek at the 'wip' column (kept as a one-column frame) after filling.
df.loc[:, ['wip']].head(n=5)

In [None]:
# Count of WIP values that are still missing.
df['wip'].isnull().sum()

In [None]:
# Histogram of 'idle_time' to inspect how its values are distributed.
df['idle_time'].plot.hist()
plt.show()

In [None]:
# Histogram of 'idle_men' to inspect how its values are distributed.
df['idle_men'].plot.hist()
plt.show()

In [None]:
# Histogram of 'no_of_style_change' to inspect how its values are distributed.
df['no_of_style_change'].plot.hist()
plt.show()

# **Dropping the 'idle_men', 'idle_time' and 'no_of_style_change' columns because most of their values are 0**

In [None]:
# Remove the three columns inspected in the histograms above, then preview the result.
df = df.drop(columns=['idle_men', 'idle_time', 'no_of_style_change'])
df.head(n=5)

In [None]:
# Distinct department labels as they appear in the raw data.
pd.unique(df['department'])

In [None]:
# Remove every space character from the department labels so that variants that
# differ only by spaces collapse into one label; then list the distinct labels.
df['department'] = df['department'].str.replace(' ', '', regex=False)
pd.unique(df['department'])

In [None]:
# Share of rows per department, labelled with percentages (two decimals).
department_counts = df['department'].value_counts()
department_counts.plot.pie(autopct="%.2f")
plt.show()

In [None]:
# Share of rows per quarter.
quarter_axes = df['quarter'].value_counts().plot.pie()
quarter_axes.set_title("Quarters")
plt.show()

In [None]:
# Distinct quarter labels, in order of first appearance; reused by later loops.
quarters = pd.unique(df['quarter'])

# **Department in each Quarter**

In [None]:
# One pie chart per quarter showing the department mix; the x label carries the
# number of rows recorded in that quarter.
for quarter in quarters:
    qcounter = df.loc[df['quarter'] == quarter, 'department'].value_counts()
    plt.pie(qcounter, autopct="%.2f", labels=qcounter.index)
    plt.title(f"Department in {quarter}")
    plt.xlabel(f"Total:{qcounter.sum()}")
    plt.show()
    print("\n")

In [None]:
# Horizontal bar chart of how many rows fall on each day of the week.
day_axes = df['day'].value_counts().plot.barh()
day_axes.set_title("Total working days")
day_axes.set_xlabel('Frequency')
plt.show()

# **Work In Progress (WIP) by day of the week**

In [None]:
# Apply the seaborn look once, before any figure is created, instead of on every
# loop iteration. Matplotlib 3.6 renamed its bundled seaborn styles to
# 'seaborn-v0_8*' and later releases dropped the old names, so prefer the new name
# and fall back to the old one only on installations that do not ship it.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# Distinct day names, in order of first appearance; reused by later loops.
days = df['day'].unique()
for day in days:
    # Histogram of WIP restricted to the rows recorded on this day.
    plt.title(f"Work In Progress on {day}s")
    wip_day = df['wip'][df['day'] == day]
    wip_day.plot(kind="hist", rwidth=0.95, color='orange')
    plt.show()
    

# **Targeted productivity vs Actual productivity**

In [None]:
# Summary statistics for the targeted and the actual productivity columns.
t_vs_a = ['targeted_productivity', 'actual_productivity']
df.loc[:, t_vs_a].describe()

In [None]:
# Overlay the distributions of targeted and actual productivity as polygons.
productivity_pair = df.loc[:, ['targeted_productivity', 'actual_productivity']]
sns.histplot(data=productivity_pair, element='poly')
plt.show()

# **Analysis on incentives**

In [None]:
# Histogram of all incentive values.
df['incentive'].plot.hist()
plt.show()

# **Incentives > 0**

In [None]:
# Line plot of the strictly positive incentives against the date index, on a
# logarithmic y axis.
positive_incentives = df.loc[df['incentive'] > 0, 'incentive']
positive_incentives.plot.line()
plt.yscale('log')
plt.show()

# **Incentives per Quarter**

In [None]:
# One histogram per quarter of the strictly positive incentives.
for quarter in quarters:
    paid_in_quarter = (df['quarter'] == quarter) & (df['incentive'] > 0)
    plt.title(f"Incentives in {quarter}")
    df.loc[paid_in_quarter, 'incentive'].plot.hist(rwidth=0.95)
    plt.show()
    print('\n')

# **Incentives per weekday**

In [None]:
# One histogram per day of the week of the strictly positive incentives.
for day in days:
    paid_on_day = (df['day'] == day) & (df['incentive'] > 0)
    plt.title(f"Incentives on {day}s")
    df.loc[paid_on_day, 'incentive'].plot.hist(rwidth=0.95)
    plt.show()
    print('\n')

# **Incentives for sewing vs. incentives for finishing**

In [None]:
# One histogram per department of the strictly positive incentives.
# The threshold is `> 0`, matching the per-quarter and per-day incentive plots
# (this cell previously used `> 1`, which made the comparison inconsistent).
depts = df['department'].unique()
for dept in depts:
    plt.title(f"Incentives on {dept}")
    df['incentive'][(df['department'] == dept) & (df['incentive'] > 0)].plot(kind='hist', rwidth=0.95)
    plt.show()
    print('\n')

# **Result: the finishing department receives higher incentives than the sewing department**

# **Team and Over time**

In [None]:
# Distinct values of the 'team' column, in order of first appearance.
x_axis = pd.unique(df['team'])

In [None]:
# Mean overtime of each team, in the same order as `x_axis`.
y_axis = [
    df.loc[df['team'] == team_value, 'over_time'].mean()
    for team_value in x_axis
]

In [None]:
# Average overtime per team. The x values are the distinct entries of the 'team'
# column (see `x_axis`), so the axis is labelled as the team, not as a team size.
sns.lineplot(x=x_axis, y=y_axis)
plt.title("Team and Overtime")
plt.xlabel('Team')
plt.ylabel('Average Overtime')
plt.show()

# **One Hot encoding**

In [None]:
# One-hot encode the categorical columns.
# `dtype='uint8'` pins the dummy dtype explicitly: pandas 2.x defaults to bool, and
# a frame mixing bool and float columns converts to an object array, which the
# Keras model fitted later in this notebook cannot consume. uint8 is what older
# pandas produced by default, so results stay the same across versions.
cols_to_encode = ['quarter', 'department', 'day']
encoded_cols = pd.get_dummies(df[cols_to_encode], drop_first=True, dtype='uint8')
# drop_first=True removes the first level of each column.
# First values dropped: quarter_Quarter1, department_finishing, day_Monday
encoded_cols.head()

In [None]:
# The original categorical columns are no longer needed once encoded.
df = df.drop(columns=cols_to_encode)

# **Feature Scaling**

In [None]:
# Every remaining column except the two productivity columns will be rescaled.
cols_to_scale = df.columns.drop(['targeted_productivity', 'actual_productivity'])
cols_to_scale

In [None]:
# Preview the frame before scaling; show only the first rows instead of dumping
# the whole DataFrame into the notebook output.
df.head()

In [None]:
# Fit a min-max scaler on the selected columns and transform them.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split made later in the notebook, so test rows influence the scaling range.
scale = MinMaxScaler()
features_to_scale = df[cols_to_scale]
scalled = scale.fit(features_to_scale).transform(features_to_scale)

In [None]:
# Write each scaled column back into `df`, matching columns by position.
for position, column in enumerate(cols_to_scale):
    df[column] = scalled[:, position]

In [None]:
# Preview the frame after scaling; show only the first rows instead of dumping
# the whole DataFrame into the notebook output.
df.head()

In [None]:
# Place the one-hot columns to the left of the scaled numeric columns.
new_df = pd.concat(
    [encoded_cols, df],
    axis='columns',
)

In [None]:
# (rows, columns) of the combined modelling frame.
new_df.shape

In [None]:
# First five rows of the combined modelling frame.
new_df.head(n=5)

# **Splitting and training**

In [None]:
# Target: actual productivity. Features: every other column of `new_df`.
y = new_df['actual_productivity']
x = new_df.drop(columns=['actual_productivity'])

In [None]:
# Shapes of the feature matrix and the target vector.
tuple(part.shape for part in (x, y))

In [None]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split, and
# therefore every score reported below, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [None]:
# Baseline regressors to compare. The decision tree is seeded so that its score is
# reproducible between runs; the other estimators are constructed as before.
models = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    SVR(kernel='linear'),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=42),
]

In [None]:
# Fit every baseline on the full feature set and print the value returned by
# its .score() on the held-out rows.
for estimator in models:
    print(f"Model: {estimator}")
    estimator.fit(x_train, y_train)
    test_score = estimator.score(x_test, y_test)
    print(f"Score: {test_score}")
    print('\n')

In [None]:
# List the column names of the modelling frame (used to pick the reduced feature set below).
new_df.columns

In [None]:
# Reduced feature set: drop the target, the quarter and day dummies, and the
# incentive column. The target is unchanged.
columns_excluded = [
    'actual_productivity',
    'quarter_Quarter2', 'quarter_Quarter3', 'quarter_Quarter4', 'quarter_Quarter5',
    'day_Saturday', 'day_Sunday', 'day_Thursday', 'day_Tuesday', 'day_Wednesday',
    'incentive',
]
y2 = new_df['actual_productivity']
x2 = new_df.drop(columns=columns_excluded)

In [None]:
# First five rows of the reduced feature set.
x2.head(n=5)

In [None]:
# Shapes of the reduced feature matrix and its target vector.
tuple(part.shape for part in (x2, y2))

In [None]:
# Hold out 30% of the rows of the reduced feature set. A fixed random_state makes
# the split, the grid searches and the reported scores reproducible.
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size=0.3, random_state=42)

In [None]:
# Refit every baseline on the reduced feature set and print the value returned
# by its .score() on the held-out rows.
for estimator in models:
    print(f"Model: {estimator}")
    estimator.fit(x2_train, y2_train)
    test_score = estimator.score(x2_test, y2_test)
    print(f"Score: {test_score}")
    print('\n')

# **Hyperparameter tuning on KNR and SVR**

In [None]:
# Grid for the KNN regressor, evaluated with 5-fold cross-validation.
# 'euclidean' is omitted: with the default p=2, 'minkowski' is the same distance,
# so listing both only repeated a third of the fits without adding candidates.
param_grid = {
    'n_neighbors': [3, 5, 11, 39, 51, 75],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'manhattan'],
}
clf = GridSearchCV(
    KNeighborsRegressor(),
    param_grid,
    cv=5,
)

In [None]:
# Grid for the support-vector regressor, evaluated with 5-fold cross-validation.
param_grid2 = dict(
    kernel=['rbf', 'sigmoid', 'linear'],
    gamma=['scale', 'auto'],
    C=[1, 5, 10, 25, 40, 100],
)
clf2 = GridSearchCV(estimator=SVR(), param_grid=param_grid2, cv=5)

In [None]:
# Run the KNN grid search on the reduced training set.
clf.fit(X=x2_train, y=y2_train)

In [None]:
# Run the SVR grid search on the reduced training set.
clf2.fit(X=x2_train, y=y2_train)

In [None]:
# Display the best estimator found by the KNeighborsRegressor grid search (`clf`).
clf.best_estimator_

In [None]:
# Build the final KNN regressor from the hyperparameters the grid search selected
# in this run, rather than from values copied by hand from an earlier run (which
# go stale whenever the split or the grid changes). Then fit it on the training
# rows and show the value returned by .score() on the held-out rows.
model_knr = KNeighborsRegressor(**clf.best_params_)
model_knr.fit(x2_train, y2_train)
model_knr.score(x2_test, y2_test)

In [None]:
# Display the best estimator found by the SVR grid search (`clf2`).
clf2.best_estimator_

In [None]:
# Build the final SVR from the hyperparameters the grid search selected in this
# run, rather than from a value copied by hand from an earlier run. Then fit it on
# the training rows and show the value returned by .score() on the held-out rows.
model_svr = SVR(**clf2.best_params_)
model_svr.fit(x2_train, y2_train)
model_svr.score(x2_test, y2_test)

In [None]:
# Save the modelling frame to disk. index=False means the date index is not
# written to the file.
new_df.to_csv(path_or_buf='cleaned_data.csv', index=False)

In [None]:
# KNN predictions for the held-out rows and for the training rows.
y2_test_predict, y2_train_predict = (
    model_knr.predict(features) for features in (x2_test, x2_train)
)


In [None]:
# Pair the actual values with the KNN predictions, separately for test and train.
test = y2_test.to_frame('Y2 test').assign(**{'Y2 Predicted test': y2_test_predict})
train = y2_train.to_frame('Y2 train').assign(**{'Y2 Predicted train': y2_train_predict})

In [None]:
# Actual vs. predicted productivity for the KNN model on the held-out rows.
knr_test_axes = sns.scatterplot(data=test, x='Y2 test', y='Y2 Predicted test')
knr_test_axes.set_title('Test data')
knr_test_axes.set_xlabel('Actual')
knr_test_axes.set_ylabel('Predicted')
plt.show()

In [None]:
# Actual vs. predicted productivity for the KNN model on the training rows.
knr_train_axes = sns.scatterplot(data=train, x='Y2 train', y='Y2 Predicted train')
knr_train_axes.set_title('Train data')
knr_train_axes.set_xlabel('Actual')
knr_train_axes.set_ylabel('Predicted')
plt.show()

# **KNeighborsRegressor is overfitted**

# **SVR**

In [None]:
# SVR predictions for the held-out rows and for the training rows.
y2_test_predict_svr, y2_train_predict_svr = (
    model_svr.predict(features) for features in (x2_test, x2_train)
)

In [None]:
# Pair the actual values with the SVR predictions, separately for test and train.
test_svr = y2_test.to_frame('Y2 test').assign(**{'Y2 Predicted test': y2_test_predict_svr})
train_svr = y2_train.to_frame('Y2 train').assign(**{'Y2 Predicted train': y2_train_predict_svr})

In [None]:
# Actual vs. predicted productivity for the SVR model on the held-out rows.
svr_test_axes = sns.scatterplot(data=test_svr, x='Y2 test', y='Y2 Predicted test')
svr_test_axes.set_title('Test data(SVR)')
svr_test_axes.set_xlabel('Actual')
svr_test_axes.set_ylabel('Predicted')
plt.show()

In [None]:
# Actual vs. predicted productivity for the SVR model on the training rows.
svr_train_axes = sns.scatterplot(data=train_svr, x='Y2 train', y='Y2 Predicted train')
svr_train_axes.set_title('Train data(SVR)')
svr_train_axes.set_xlabel('Actual')
svr_train_axes.set_ylabel('Predicted')
plt.show()

In [None]:
# Mean squared error of the SVR predictions on the held-out rows.
mean_squared_error(y_true=y2_test, y_pred=y2_test_predict_svr)

# **Using an ANN**

In [None]:
# Small fully connected network: one hidden layer of 17 linear units followed by a
# single linear output, trained with Adam on mean squared error.
# The input width is read from the training matrix instead of being hard-coded, so
# the model still builds if the feature set changes.
n_features = x_train.shape[1]
model = keras.Sequential([
    keras.layers.Dense(17, input_shape=(n_features,), activation='linear'),
    keras.layers.Dense(1, activation='linear'),
])

model.compile(
    optimizer='adam',
    loss='mean_squared_error',
)

In [None]:
# Train the network on the full-feature training rows for 100 epochs.
model.fit(x=x_train, y=y_train, epochs=100)

In [None]:
# Loss (mean squared error) of the trained network on the held-out rows.
model.evaluate(x=x_test, y=y_test)

In [None]:
# Network predictions flattened to 1-D, then paired with the actual values for
# the held-out and the training rows.
y_pred_test = model.predict(x_test).flatten()
y_pred_train = model.predict(x_train).flatten()
test_ann = y_test.to_frame('Y test').assign(**{'Y test predicted': y_pred_test})
train_ann = y_train.to_frame('Y train').assign(**{'Y train predicted': y_pred_train})

In [None]:
# Actual vs. predicted productivity for the network on the held-out rows.
ann_test_axes = sns.scatterplot(data=test_ann, x='Y test', y='Y test predicted')
ann_test_axes.set_title('Test data(ANN)')
ann_test_axes.set_xlabel('Actual')
ann_test_axes.set_ylabel('Predicted')
plt.show()

In [None]:
# Actual vs. predicted productivity for the network on the training rows.
ann_train_axes = sns.scatterplot(data=train_ann, x='Y train', y='Y train predicted')
ann_train_axes.set_title('Train data(ANN)')
ann_train_axes.set_xlabel('Actual')
ann_train_axes.set_ylabel('Predicted')
plt.show()