In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# I tried to achieve an MAE lower than the baseline (0.15) using XGBoost.

In [None]:
# Load the garment-worker productivity CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv('/kaggle/input/productivity-prediction-of-garment-employees/garments_worker_productivity.csv')

In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any warnings raised by the modelling code below.
warnings.filterwarnings('ignore')

In [None]:
# Display the loaded DataFrame as the cell output.
df

# 1. Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

In [None]:
# Parse the 'date' column into pandas datetimes so that the .dt accessor can be used on it later.
df['date']=pd.to_datetime(df['date'])

In [None]:
# Show descriptive summary statistics for df.
df.describe()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Bar chart of how many rows fall on each distinct targeted_productivity value.
targeted_counts = df.groupby('targeted_productivity')['targeted_productivity'].count()
targeted_counts.plot(kind='bar')

In [None]:
# actual_productivity is the continuous regression target, so counting rows per exact
# value yields a jagged line that says little about the shape of the distribution.
# A binned histogram shows the distribution directly.
ax = df['actual_productivity'].plot.hist(bins=30)
ax.set_xlabel('actual_productivity')
ax.set_ylabel('count')
ax

In [None]:
# Bar chart of how many rows share each distinct smv value.
smv_counts = df.groupby('smv')['smv'].count()
smv_counts.plot(kind='bar')

In [None]:
# Line plot of how many rows share each distinct wip value.
wip_counts = df.groupby('wip')['wip'].count()
wip_counts.plot(kind='line')

# It seems that many of the features are not normally distributed.

In [None]:
# Correlate only the numeric columns. df still contains non-numeric columns
# (e.g. the datetime 'date' column), and pandas >= 2.0 no longer drops those
# silently inside corr() but raises instead.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr())

In [None]:
# Derive the calendar month (1-12) of each record from the parsed 'date' column.
df['month']=df['date'].dt.month

In [None]:
# Display df again, now including the new 'month' column.
df

# 2. First Step Analysis: LinearRegression

In [None]:
# Separate the regression target from the features.
# The raw date is left out of the features, and so is the target itself.
target_col = 'actual_productivity'
y = df[target_col]
x = df.drop(columns=['date', target_col])

In [None]:
# One-hot encode the categorical (object/category dtype) columns of x; numeric columns pass through unchanged.
x=pd.get_dummies(x)

In [None]:
# Fill the missing values in each column with that column's mean.
# NOTE(review): the means are computed on the full data set, before the train/test split
# further down, so the held-out rows influence the imputed values.
x=x.fillna(x.mean())

In [None]:
# Display the feature frame after encoding and imputation.
x

In [None]:
# Cast the 'team' column to strings so that the following get_dummies call one-hot encodes it
# instead of leaving it as a number.
x['team']=x['team'].astype(str)

In [None]:
# One-hot encode again; this pass picks up the 'team' column, which now holds strings.
x=pd.get_dummies(x)

In [None]:
# Display the fully encoded feature frame.
x

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression,Ridge,LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [None]:
# Use 80% of the rows for training and hold out the rest for testing;
# random_state=0 fixes the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y,train_size=0.8,random_state=0)

In [None]:
# Baseline: ordinary least squares on the features as they stand at this point
# (i.e. before the log transform applied later in the notebook).
model_lr = LinearRegression()
model_lr.fit(x_train, y_train)

pred_train = model_lr.predict(x_train)
pred_test = model_lr.predict(x_test)

# Compute every metric first, then report them together.
rmse_train = np.sqrt(mean_squared_error(y_train, pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))
mae_train = mean_absolute_error(y_train, pred_train)
mae_test = mean_absolute_error(y_test, pred_test)
r2_test = model_lr.score(x_test, y_test)

print("train_RMSE:", rmse_train)
print("test_RMSE:", rmse_test)
print("train_MAE:", mae_train)
print("test_MAE:", mae_test)
print("R^2:{}".format(r2_test))

# 3. Logarithmic Transformation

In [None]:
from scipy.stats import skew

In [None]:
# Put the target next to its log(1 + y) transform to compare how skewed each one is.
AP=pd.DataFrame({'ap':y,'log(ap+1)':np.log1p(y)})
# '\n' prints a blank line after the table. The original literal was a mis-encoded
# backslash-n (yen sign followed by 'n') and was printed verbatim.
print(AP, '\n')

print('ap skew        :',skew(AP['ap']))
print('log(ap+1) skew:', skew(AP['log(ap+1)']))

# Histograms of the raw and the log-transformed target.
AP.hist()

I should not apply a logarithmic transformation to "y".

In [None]:
# Skewness of every feature column; scipy.stats.skew is applied column by column.
x_skew = x.apply(skew)
print(x_skew)

I tried applying a logarithmic transformation to the features whose skewness is greater than 0.75.

In [None]:
# Keep only the features whose skewness exceeds 0.75.
x_skew = x_skew[x_skew > 0.75]
print('-----Skewness greater than 0.75-----')
print(x_skew)
# From here on x_skew holds the column labels of those features, not their skewness values.
x_skew = x_skew.index

# Replace the selected columns with log(1 + value), overwriting them in x.
# NOTE(review): because x is overwritten and x_skew is rebound to an Index, this cell is not
# safe to re-run without first re-running the cells that build x and x_skew.
x[x_skew] = np.log1p(x[x_skew])
# Display the transformed columns.
x[x_skew]

In [None]:
# Display the feature frame after the log transform.
x

# 4.Prediction Model

# 1)LinearRegression

In [None]:
# Split again so that the train/test frames contain the log-transformed features;
# the parameters are the same as in the earlier split (80% train, random_state=0).
x_train, x_test, y_train, y_test = train_test_split(x, y,train_size=0.8,random_state=0)

In [None]:
# Fresh, unfitted linear regression to be trained on the log-transformed features.
model_lr=LinearRegression()

In [None]:
# Fit on the training split and predict for both splits.
model_lr.fit(x_train, y_train)
pred_train = model_lr.predict(x_train)
pred_test = model_lr.predict(x_test)

# Compute every metric first, then report them together.
rmse_train = np.sqrt(mean_squared_error(y_train, pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))
mae_train = mean_absolute_error(y_train, pred_train)
mae_test = mean_absolute_error(y_test, pred_test)
r2_test = model_lr.score(x_test, y_test)

print("train_RMSE:", rmse_train)
print("test_RMSE:", rmse_test)
print("train_MAE:", mae_train)
print("test_MAE:", mae_test)
print("R^2:{}".format(r2_test))

# 2) Ridge

In [None]:
def rmse_cv(model):
    """Return the per-fold RMSE of `model` under 5-fold cross-validation.

    The model is scored on the notebook-level training split (`x_train`,
    `y_train`). The scorer reports the negated mean squared error, so the
    sign is flipped before the square root is taken.

    Returns:
        Array with one RMSE value per fold.
    """
    neg_mse_per_fold = cross_val_score(
        model,
        x_train,
        y_train,
        scoring="neg_mean_squared_error",
        cv=5,
    )
    return np.sqrt(-neg_mse_per_fold)

In [None]:
# Candidate regularisation strengths and the mean cross-validated RMSE of each one
# on the training split.
alphas = [0.3,0.4, 0.5, 0.6,0.7]
cv_rg = [rmse_cv(Ridge(alpha = alpha)).mean()
            for alpha in alphas]
cv_rg = pd.Series(cv_rg, index = alphas)

print('Ridge RMSE loss:')
print(cv_rg, '\n')

print('Ridge RMSE loss Mean:')
print(cv_rg.mean())

# Build the final Ridge model with the alpha that scored the lowest RMSE.
# Before, model_rg was a default Ridge(), so the alpha search above had no
# effect on the model that was actually fitted and evaluated.
model_rg = Ridge(alpha = cv_rg.idxmin())

# figure/figsize are spelled with plain ASCII letters here; the original used the
# single-character "fi" ligature, which only works through identifier normalisation.
plt.figure(figsize=(10, 5))
plt.plot(cv_rg)
plt.grid()
plt.title('Validation - by regularization strength')
plt.xlabel('Alpha')
plt.ylabel('RMSE')
plt.show()

In [None]:
# Fit the Ridge model on the training split and score it on the held-out split.
model_rg.fit(x_train, y_train)
pred1 = model_rg.predict(x_test)

ridge_rmse_test = np.sqrt(mean_squared_error(y_test, pred1))
ridge_mae_test = mean_absolute_error(y_test, pred1)
ridge_r2_test = model_rg.score(x_test, y_test)

print("test_RMSE:", ridge_rmse_test)
print("test_MAE:", ridge_mae_test)
print("R^2:{}".format(ridge_r2_test))

# 3)LassoCV

In [None]:
# LassoCV picks its alpha from the candidate list below while fitting.
model_ls = LassoCV(
    alphas = [1, 0.1, 0.001, 0.0005]).fit(x_train, y_train)

# Cross-validate once and reuse the scores. The original re-ran the whole
# cross-validation three times just to print three views of the same numbers.
lasso_rmse = rmse_cv(model_ls)

print('Lasso regression RMSE loss:')
print(lasso_rmse)

print('Average loss:', lasso_rmse.mean())
print('Minimum loss:', lasso_rmse.min())
print('Best alpha  :', model_ls.alpha_)

In [None]:
# Fit the Lasso model on the training split and score it on the held-out split.
model_ls.fit(x_train, y_train)
pred2 = model_ls.predict(x_test)

lasso_rmse_test = np.sqrt(mean_squared_error(y_test, pred2))
lasso_mae_test = mean_absolute_error(y_test, pred2)
lasso_r2_test = model_ls.score(x_test, y_test)

print("test_RMSE:", lasso_rmse_test)
print("test_MAE:", lasso_mae_test)
print("R^2:{}".format(lasso_r2_test))

# 4)XGBoost

In [None]:
import xgboost as xgb

# Wrap the training split in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=x_train, label=y_train)

# Shallow trees with a learning rate (eta) of 0.1.
params = dict(max_depth=3, eta=0.1)

# Cross-validate for up to 1000 boosting rounds, stopping early once the
# evaluation metric has not improved for 50 consecutive rounds.
cross_val = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    early_stopping_rounds=50,
)
cross_val

In [None]:
# Mean test and train RMSE per boosting round from the XGBoost cross-validation.
# figure/figsize are spelled with plain ASCII letters here; the original used the
# single-character "fi" ligature, which only works through identifier normalisation.
curve_columns = ["test-rmse-mean", "train-rmse-mean"]
plt.figure(figsize=(8, 6))
plt.plot(cross_val.loc[0:, curve_columns])
# Label the two curves; without a legend they cannot be told apart.
plt.legend(curve_columns)
plt.grid()
plt.xlabel('num_boost_round')
plt.ylabel('RMSE')
plt.show()

In [None]:
# Gradient-boosted regressor with depth-3 trees and a learning rate of 0.1.
# n_estimators=110 is presumably read off the cross-validation run above - TODO confirm.
xgb_settings = dict(n_estimators=110, max_depth=3, learning_rate=0.1)
model_xgb = xgb.XGBRegressor(**xgb_settings)
model_xgb.fit(x_train, y_train)
pred3 = model_xgb.predict(x_test)

# Cross-validated RMSE on the training split, followed by the held-out metrics.
print('xgboost RMSE loss:')
print(rmse_cv(model_xgb).mean())

xgb_rmse_test = np.sqrt(mean_squared_error(y_test, pred3))
xgb_mae_test = mean_absolute_error(y_test, pred3)
xgb_r2_test = model_xgb.score(x_test, y_test)

print("test_RMSE:", xgb_rmse_test)
print("test_MAE:", xgb_mae_test)
print("R^2:{}".format(xgb_r2_test))

# Of the four models, XGBoost with the log-transformed features seems to perform best: its MAE (0.08) is below the baseline performance error (0.15).