In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np

# 1. Toy absenteeism dataset: one list per column, four records in total
data = {
    # Calendar day of each record, kept as text here and parsed below
    'date': ['2023-11-01', '2023-11-02', '2023-11-03', '2023-11-04'],
    # Absence reason code: numeric in storage, categorical in meaning
    'reason': [0, 1, 2, 1],
    # Continuous numeric features
    'Age': [25, 30, 35, 30],
    'Distance to Work': [10.0, 20.0, 15.0, 25.0],
    'Transportation Expense': [200, 250, 300, 220],
    # Regression target (y)
    'Absenteeism Time in Hours': [5, 8, 6, 7],
}

# Build the frame and parse 'date' into datetime in one chained expression
df = pd.DataFrame(data).assign(date=lambda frame: pd.to_datetime(frame['date']))

# Target variable as a NumPy array
y = df['Absenteeism Time in Hours'].to_numpy()


In [None]:
# Drop 'date': the raw timestamp is not used as a model input.
# errors='ignore' keeps this cell re-runnable: df is overwritten here, so on a
# second execution 'date' is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['date'], errors='ignore')

# Preprocessing: one-hot encode the categorical 'reason' code and standardise
# the continuous columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['reason']),  # OneHotEncode 'reason'
        ('num', StandardScaler(), ['Age', 'Distance to Work', 'Transportation Expense']),  # Scale numeric columns
    ],
    # 0 => always return a dense array. With the default threshold the result
    # can become a SciPy sparse matrix once 'reason' has many distinct codes,
    # and the processed features are handed straight to Keras afterwards.
    sparse_threshold=0,
)

# Transform the features (everything except the target column).
# NOTE(review): the transformer is fitted on every row here, before the
# train/test split performed on X_processed in the next cell, so the scaling
# statistics include the test rows. Fitting on the training rows only would
# avoid that leakage, but requires splitting before this step.
X_processed = preprocessor.fit_transform(df.drop(columns=['Absenteeism Time in Hours']))


In [None]:
# Hold out 25% of the processed rows for testing (fixed seed for the split)
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.25,
    random_state=42,
)

# Feed-forward regression network declared as a single layer list:
# 16 -> 8 -> 4 ReLU units, then one linear output unit.
model = Sequential([
    # First layer also declares the input width (number of processed features)
    Dense(units=16, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(units=8, activation='relu'),
    Dense(units=4, activation='relu'),
    # Linear activation: unbounded output suited to a regression target
    Dense(units=1, activation='linear'),
])

# Optimise mean squared error with Adam; track mean absolute error as well
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train, holding back the last 20% of the training rows for validation
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=4,
    validation_split=0.2,
    verbose=1,
)

# Score on the held-out test rows
loss, mae = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Loss: {loss}, Test MAE: {mae}")

# Predict on the same test rows
predictions = model.predict(X_test)
print("Predictions:", predictions)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Example dataset: same layout as before, but 'reason' now spans codes 0 to 28
data = {
    'date': ['2023-11-01', '2023-11-02', '2023-11-03', '2023-11-04'],
    'reason': [0, 1, 2, 28],
    'Age': [25, 30, 35, 30],
    'Distance to Work': [10.0, 20.0, 15.0, 25.0],
    'Transportation Expense': [200, 250, 300, 220],
    # Target variable
    'Absenteeism Time in Hours': [5, 8, 6, 7],
}

# Build the frame and parse 'date' to datetime (it is dropped from X below)
df = pd.DataFrame(data).assign(date=lambda frame: pd.to_datetime(frame['date']))

# Target variable, kept as a pandas Series
y = df['Absenteeism Time in Hours']

# Feature frame: everything except 'date' and the target column
X = df.drop(['date', 'Absenteeism Time in Hours'], axis=1)

# Group the 'reason' values into custom groups with a plain helper function
def group_reasons(reason):
    """Map a raw reason code to one of six coarse group ids (0-5).

    Codes 0-4 map to 0, 5-9 to 1, 10-14 to 2, 15-19 to 3 and 20-24 to 4.
    Every other value, which includes the codes 25-28, maps to 5.
    """
    # Inclusive (low, high) bounds; the position in the tuple is the group id
    inclusive_bounds = ((0, 4), (5, 9), (10, 14), (15, 19), (20, 24))
    for group_id, (low, high) in enumerate(inclusive_bounds):
        if low <= reason <= high:
            return group_id
    # Nothing matched: catch-all group (covers 25-28)
    return 5

# Replace the raw 'reason' code with its coarse group id from group_reasons;
# the original column is dropped because the grouped one supersedes it.
X = X.assign(reason_grouped=X['reason'].apply(group_reasons)).drop(columns=['reason'])

# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        # The category list is fixed to every id group_reasons can return (0-5).
        # The encoded width therefore does not depend on which groups happen to
        # land in the training split, and a group that first appears in the
        # test split is not an "unknown category" error.
        ('cat', OneHotEncoder(categories=[list(range(6))]), ['reason_grouped']),
        ('num', StandardScaler(), ['Age', 'Distance to Work', 'Transportation Expense']),  # Scale numeric columns
    ],
    # 0 => always return a dense array, which is what the network is fed below
    sparse_threshold=0,
)

# A Keras Sequential model is not a scikit-learn estimator, and Pipeline.fit
# rejects un-prefixed keyword arguments such as epochs=..., so the network
# cannot be the final pipeline step. The pipeline therefore holds only the
# preprocessing, and the network is trained on its output.
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Apply preprocessing
])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit the preprocessing on the training rows only, then reuse the fitted
# transformer for the test rows so no test statistics leak into the scaling.
X_train_processed = full_pipeline.fit_transform(X_train)
X_test_processed = full_pipeline.transform(X_test)

# Define the model
model = Sequential()

# Input width is the number of *processed* features (6 one-hot + 3 scaled),
# not the number of raw columns in X.
model.add(Dense(units=16, activation='relu', input_shape=(X_train_processed.shape[1],)))

# Hidden layers
model.add(Dense(units=8, activation='relu'))
model.add(Dense(units=4, activation='relu'))

# Output layer for regression
model.add(Dense(units=1, activation='linear'))  # Use linear activation for regression

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model; targets are passed as NumPy arrays, matching the earlier
# training cell. model.fit returns the Keras History object.
history = model.fit(
    X_train_processed,
    y_train.to_numpy(),
    epochs=50,
    batch_size=4,
    validation_split=0.2,
    verbose=1,
)

# Evaluate the model on the processed test rows (the same representation it
# was trained on)
loss, mae = model.evaluate(X_test_processed, y_test.to_numpy(), verbose=1)
print(f"Test Loss: {loss}, Test MAE: {mae}")

# Predict on the test data
predictions = model.predict(X_test_processed)
print("Predictions:", predictions)
