In [27]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [28]:
df = pd.read_csv("data/data.csv", parse_dates=['WINDOW_WEEK'])
df.head()

Unnamed: 0,WORKER_EMAIL,WINDOW_WEEK,TOTAL_WORK_BLOCK_PLANNED,DESIRED_WORKING_HOURS
0,abhishek.gurung@es.cloudfactory.com,2024-01-07,32.0,48.0
1,abhishek.gurung@es.cloudfactory.com,2024-01-14,40.0,48.0
2,abhishek.gurung@es.cloudfactory.com,2024-01-21,36.0,48.0
3,abhishek.gurung@es.cloudfactory.com,2024-01-28,46.0,48.0
4,abhishek.gurung@es.cloudfactory.com,2024-02-04,34.0,48.0


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 721 entries, 0 to 720
Data columns (total 4 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   WORKER_EMAIL              721 non-null    object        
 1   WINDOW_WEEK               721 non-null    datetime64[ns]
 2   TOTAL_WORK_BLOCK_PLANNED  721 non-null    float64       
 3   DESIRED_WORKING_HOURS     684 non-null    float64       
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 22.7+ KB


In [30]:
df['YEAR'] = df['WINDOW_WEEK'].dt.year
df['MONTH'] = df['WINDOW_WEEK'].dt.month
df['DAY'] = df['WINDOW_WEEK'].dt.day
df.head()

Unnamed: 0,WORKER_EMAIL,WINDOW_WEEK,TOTAL_WORK_BLOCK_PLANNED,DESIRED_WORKING_HOURS,YEAR,MONTH,DAY
0,abhishek.gurung@es.cloudfactory.com,2024-01-07,32.0,48.0,2024,1,7
1,abhishek.gurung@es.cloudfactory.com,2024-01-14,40.0,48.0,2024,1,14
2,abhishek.gurung@es.cloudfactory.com,2024-01-21,36.0,48.0,2024,1,21
3,abhishek.gurung@es.cloudfactory.com,2024-01-28,46.0,48.0,2024,1,28
4,abhishek.gurung@es.cloudfactory.com,2024-02-04,34.0,48.0,2024,2,4


In [31]:
df['MOVING_AVERAGE'] = df.groupby('WORKER_EMAIL')['TOTAL_WORK_BLOCK_PLANNED'].transform(lambda x: x.rolling(window=2).mean())
df.head()

Unnamed: 0,WORKER_EMAIL,WINDOW_WEEK,TOTAL_WORK_BLOCK_PLANNED,DESIRED_WORKING_HOURS,YEAR,MONTH,DAY,MOVING_AVERAGE
0,abhishek.gurung@es.cloudfactory.com,2024-01-07,32.0,48.0,2024,1,7,
1,abhishek.gurung@es.cloudfactory.com,2024-01-14,40.0,48.0,2024,1,14,36.0
2,abhishek.gurung@es.cloudfactory.com,2024-01-21,36.0,48.0,2024,1,21,38.0
3,abhishek.gurung@es.cloudfactory.com,2024-01-28,46.0,48.0,2024,1,28,41.0
4,abhishek.gurung@es.cloudfactory.com,2024-02-04,34.0,48.0,2024,2,4,40.0


In [32]:
X = df.drop(columns=['WINDOW_WEEK', 'TOTAL_WORK_BLOCK_PLANNED'], axis=1)
y = df['TOTAL_WORK_BLOCK_PLANNED']

In [33]:
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

In [42]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler

label_encoder = LabelEncoder()

preprocessor = ColumnTransformer(
    [
        ("SimpleImputer", imputer, num_features),
        ("LabelEncoder", label_encoder, cat_features)
    ]
)

In [43]:
X = preprocessor.fit_transform(X).reshape(-1, 1)

ValueError: Specifying the columns using strings is only supported for dataframes.

In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((576, 6), (145, 6))

In [37]:
y_train.shape, y_test.shape

((576,), (145,))

In [38]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_squared = r2_score(true, predicted)
    return mae, rmse, r2_squared

In [39]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred  = model.predict(X_test)

    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model performance for Training Set")
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print("---------------------------")

    print("Model Performance for Testing Set")
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print("="*35)
    print("\n")

Linear Regression
Model performance for Training Set
- Mean Absolute Error: 5.8412
- Root Mean Squared Error: 8.2413
- R2 Score: 0.6642
---------------------------
Model Performance for Testing Set
- Mean Absolute Error: 5.6545
- Root Mean Squared Error: 8.1606
- R2 Score: 0.6659


Lasso
Model performance for Training Set
- Mean Absolute Error: 5.9264
- Root Mean Squared Error: 8.3500
- R2 Score: 0.6552
---------------------------
Model Performance for Testing Set
- Mean Absolute Error: 5.7194
- Root Mean Squared Error: 8.1873
- R2 Score: 0.6638


Ridge
Model performance for Training Set
- Mean Absolute Error: 5.8422
- Root Mean Squared Error: 8.2413
- R2 Score: 0.6642
---------------------------
Model Performance for Testing Set
- Mean Absolute Error: 5.6555
- Root Mean Squared Error: 8.1595
- R2 Score: 0.6660


K-Neighbors Regressor
Model performance for Training Set
- Mean Absolute Error: 6.0347
- Root Mean Squared Error: 8.0193
- R2 Score: 0.6820
---------------------------
Model P

In [40]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2 Score']).sort_values(by=["R2 Score"], ascending=False)

Unnamed: 0,Model Name,R2 Score
7,CatBoosting Regressor,0.797622
5,Random Forest Regressor,0.794973
8,AdaBoost Regressor,0.781806
6,XGBRegressor,0.720453
2,Ridge,0.666037
0,Linear Regression,0.665949
1,Lasso,0.663758
4,Decision Tree,0.647554
3,K-Neighbors Regressor,0.564849


In [445]:
model = AdaBoostRegressor()

In [446]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r_squared = r2_score(y_test, y_pred)
print("Accuracy of the model is: {:.2f}".format(r_squared))

Accuracy of the model is: 0.74


### New Data

In [447]:
df_new = pd.read_csv("data/data.csv", parse_dates=["WINDOW_WEEK"])
df_new.head()

Unnamed: 0,WORKER_EMAIL,WINDOW_WEEK,TOTAL_WORK_BLOCK_PLANNED,DESIRED_WORKING_HOURS
0,abhishek.gurung@es.cloudfactory.com,2024-01-07,32.0,48.0
1,abhishek.gurung@es.cloudfactory.com,2024-01-14,40.0,48.0
2,abhishek.gurung@es.cloudfactory.com,2024-01-21,36.0,48.0
3,abhishek.gurung@es.cloudfactory.com,2024-01-28,46.0,48.0
4,abhishek.gurung@es.cloudfactory.com,2024-02-04,34.0,48.0


In [448]:
df_new.isna().sum()

WORKER_EMAIL                 0
WINDOW_WEEK                  0
TOTAL_WORK_BLOCK_PLANNED     0
DESIRED_WORKING_HOURS       37
dtype: int64

In [449]:
df_new.fillna(0, inplace=True)
df_new.head()

Unnamed: 0,WORKER_EMAIL,WINDOW_WEEK,TOTAL_WORK_BLOCK_PLANNED,DESIRED_WORKING_HOURS
0,abhishek.gurung@es.cloudfactory.com,2024-01-07,32.0,48.0
1,abhishek.gurung@es.cloudfactory.com,2024-01-14,40.0,48.0
2,abhishek.gurung@es.cloudfactory.com,2024-01-21,36.0,48.0
3,abhishek.gurung@es.cloudfactory.com,2024-01-28,46.0,48.0
4,abhishek.gurung@es.cloudfactory.com,2024-02-04,34.0,48.0


In [450]:
avg_worked_hours = df_new.groupby("WORKER_EMAIL")['TOTAL_WORK_BLOCK_PLANNED'].mean()
df_new['MOVING_AVERAGE'] = df_new['WORKER_EMAIL'].map(avg_worked_hours)
df_new.head()


Unnamed: 0,WORKER_EMAIL,WINDOW_WEEK,TOTAL_WORK_BLOCK_PLANNED,DESIRED_WORKING_HOURS,MOVING_AVERAGE
0,abhishek.gurung@es.cloudfactory.com,2024-01-07,32.0,48.0,34.909091
1,abhishek.gurung@es.cloudfactory.com,2024-01-14,40.0,48.0,34.909091
2,abhishek.gurung@es.cloudfactory.com,2024-01-21,36.0,48.0,34.909091
3,abhishek.gurung@es.cloudfactory.com,2024-01-28,46.0,48.0,34.909091
4,abhishek.gurung@es.cloudfactory.com,2024-02-04,34.0,48.0,34.909091


In [451]:
next_week = df_new['WINDOW_WEEK'].max() + timedelta(days=7)
df_new["WINDOW_WEEK"] = next_week
df_new

Unnamed: 0,WORKER_EMAIL,WINDOW_WEEK,TOTAL_WORK_BLOCK_PLANNED,DESIRED_WORKING_HOURS,MOVING_AVERAGE
0,abhishek.gurung@es.cloudfactory.com,2024-03-24,32.0,48.0,34.909091
1,abhishek.gurung@es.cloudfactory.com,2024-03-24,40.0,48.0,34.909091
2,abhishek.gurung@es.cloudfactory.com,2024-03-24,36.0,48.0,34.909091
3,abhishek.gurung@es.cloudfactory.com,2024-03-24,46.0,48.0,34.909091
4,abhishek.gurung@es.cloudfactory.com,2024-03-24,34.0,48.0,34.909091
...,...,...,...,...,...
716,winnie.kutto@es.cloudfactory.com,2024-03-24,0.0,48.0,9.714286
717,winnie.kutto@es.cloudfactory.com,2024-03-24,0.0,48.0,9.714286
718,winnie.kutto@es.cloudfactory.com,2024-03-24,8.0,48.0,9.714286
719,winnie.kutto@es.cloudfactory.com,2024-03-24,30.0,48.0,9.714286


In [452]:
df_new = df_new[['WORKER_EMAIL', 'WINDOW_WEEK', 'DESIRED_WORKING_HOURS', 'MOVING_AVERAGE']].drop_duplicates()
df_new.shape

(122, 4)

In [453]:
df_new.head()

Unnamed: 0,WORKER_EMAIL,WINDOW_WEEK,DESIRED_WORKING_HOURS,MOVING_AVERAGE
0,abhishek.gurung@es.cloudfactory.com,2024-03-24,48.0,34.909091
11,abhishek.shakya02@es.cloudfactory.com,2024-03-24,48.0,27.272727
22,albert.kipkogei@es.cloudfactory.com,2024-03-24,48.0,8.0
23,alfred.ouko@es.cloudfactory.com,2024-03-24,46.0,12.666667
29,alice.siocha@es.cloudfactory.com,2024-03-24,48.0,4.0


In [454]:
from datetime import timedelta

In [455]:
df_new['YEAR'] = df_new['WINDOW_WEEK'].dt.year
df_new['MONTH'] = df_new['WINDOW_WEEK'].dt.month
df_new['DAY'] = df_new['WINDOW_WEEK'].dt.day
df_new.head()

Unnamed: 0,WORKER_EMAIL,WINDOW_WEEK,DESIRED_WORKING_HOURS,MOVING_AVERAGE,YEAR,MONTH,DAY
0,abhishek.gurung@es.cloudfactory.com,2024-03-24,48.0,34.909091,2024,3,24
11,abhishek.shakya02@es.cloudfactory.com,2024-03-24,48.0,27.272727,2024,3,24
22,albert.kipkogei@es.cloudfactory.com,2024-03-24,48.0,8.0,2024,3,24
23,alfred.ouko@es.cloudfactory.com,2024-03-24,46.0,12.666667,2024,3,24
29,alice.siocha@es.cloudfactory.com,2024-03-24,48.0,4.0,2024,3,24


In [456]:
num_features = df_new.select_dtypes(exclude="object").columns
cat_features = df_new.select_dtypes(include="object").columns

In [457]:
num_features

Index(['WINDOW_WEEK', 'DESIRED_WORKING_HOURS', 'MOVING_AVERAGE', 'YEAR',
       'MONTH', 'DAY'],
      dtype='object')

In [458]:
cat_features

Index(['WORKER_EMAIL'], dtype='object')

In [459]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

for col in cat_features:
    df_new[col] = label_encoder.fit_transform(df_new[col])

df_new

Unnamed: 0,WORKER_EMAIL,WINDOW_WEEK,DESIRED_WORKING_HOURS,MOVING_AVERAGE,YEAR,MONTH,DAY
0,0,2024-03-24,48.0,34.909091,2024,3,24
11,1,2024-03-24,48.0,27.272727,2024,3,24
22,2,2024-03-24,48.0,8.000000,2024,3,24
23,3,2024-03-24,46.0,12.666667,2024,3,24
29,4,2024-03-24,48.0,4.000000,2024,3,24
...,...,...,...,...,...,...,...
690,117,2024-03-24,48.0,21.454545,2024,3,24
701,118,2024-03-24,48.0,4.000000,2024,3,24
702,119,2024-03-24,48.0,16.000000,2024,3,24
703,120,2024-03-24,40.0,28.363636,2024,3,24


In [460]:
X_new = df_new[['WORKER_EMAIL',	'DESIRED_WORKING_HOURS',	'YEAR',	'MONTH',	'DAY',	'MOVING_AVERAGE']]

In [461]:
predictions = model.predict(X_new)
df_new.loc[X_new.index, 'TOTAL_WORK_BLOCK_PLANNED'] = predictions
df_new

Unnamed: 0,WORKER_EMAIL,WINDOW_WEEK,DESIRED_WORKING_HOURS,MOVING_AVERAGE,YEAR,MONTH,DAY,TOTAL_WORK_BLOCK_PLANNED
0,0,2024-03-24,48.0,34.909091,2024,3,24,36.480000
11,1,2024-03-24,48.0,27.272727,2024,3,24,20.267516
22,2,2024-03-24,48.0,8.000000,2024,3,24,8.857143
23,3,2024-03-24,46.0,12.666667,2024,3,24,19.714286
29,4,2024-03-24,48.0,4.000000,2024,3,24,5.934426
...,...,...,...,...,...,...,...,...
690,117,2024-03-24,48.0,21.454545,2024,3,24,20.096552
701,118,2024-03-24,48.0,4.000000,2024,3,24,5.934426
702,119,2024-03-24,48.0,16.000000,2024,3,24,20.096552
703,120,2024-03-24,40.0,28.363636,2024,3,24,31.950000
