In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Column the model is trained to predict.
TARGET_COLUMN = 'actual_productivity'

# Define categorical and numerical columns.
# `day` and `month` are derived from `date` by add_date_features() below.
categorical_columns = ['quarter', 'department', 'day', 'month']
numerical_columns = ['targeted_productivity', 'smv', 'wip', 'over_time', 'incentive']
feature_columns = categorical_columns + numerical_columns


def add_date_features(frame):
    """Return a copy of *frame* with `date` parsed and `day`/`month` derived.

    `date` is parsed with ``pd.to_datetime(errors='coerce')`` and rows whose
    date cannot be parsed are dropped. `day` (day of month) and `month` are
    then written from the parsed date; a column already named `day` or
    `month` in *frame* is replaced. The input frame is not modified.

    The same function is applied to the training data and to any sample that
    is scored later, so both go through identical feature derivation.
    """
    parsed_dates = pd.to_datetime(frame['date'], errors='coerce')
    # assign() works on a copy, so the caller's frame is left untouched.
    dated = frame.assign(date=parsed_dates).dropna(subset=['date'])
    return dated.assign(day=dated['date'].dt.day, month=dated['date'].dt.month)


# Load data. NOTE: despite the `.xls` extension the file is parsed as
# delimited text; a real binary Excel workbook would need pd.read_excel.
data = pd.read_csv('emp.xls')

# Remember the file's own column layout so a raw record can be labelled
# correctly when scoring a sample further down.
raw_columns = list(data.columns)

# Parse the date once and derive the calendar features. (The previous
# `pd.to_numeric(errors='ignore')` pass is deprecated in pandas and only
# round-tripped the parsed dates through integers, so it is gone.)
data = add_date_features(data)

# Drop rows with missing values only in the columns the model actually
# reads; gaps in unused columns no longer throw away training rows.
data = data.dropna(subset=feature_columns + [TARGET_COLUMN])

# Preprocess categorical and numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ('num', 'passthrough', numerical_columns),
    ])

# Fit and transform data (only the declared feature columns are passed in)
X = preprocessor.fit_transform(data[feature_columns])
y = data[TARGET_COLUMN]

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train decision tree regressor
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(X_train, y_train)

# Predict on test set
y_pred = regressor.predict(X_test)

# Calculate RMSE (Root Mean Squared Error). The square root is taken
# explicitly because the `squared=False` keyword is deprecated in newer
# scikit-learn releases; this form works on old and new versions alike.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"Root Mean Squared Error: {rmse:.4f}")

# Predict performance for trial sample.
# The row is one raw record in the file's column order, so it is labelled
# with `raw_columns` and then run through the same feature derivation as the
# training data. The date must be a quoted string: the bare expression
# 1/1/2015 is arithmetic division, not a date.
# NOTE(review): the last value looks like this record's own recorded
# actual_productivity and the row appears to come from the dataset itself,
# so this is presumably not an out-of-sample check -- confirm.
trial_sample = [["1/1/2015", "Quarter1", "sweing", "Thursday", 8, 0.8, 26.16, 1108, 7080, 98, 0, 0, 0, 59, 0.940725424]]
trial_sample_df = add_date_features(pd.DataFrame(trial_sample, columns=raw_columns))

# Encode categorical variables
trial_sample_encoded = preprocessor.transform(trial_sample_df[feature_columns])

# Predict performance for trial sample
predicted_performance = regressor.predict(trial_sample_encoded)[0]
print(f"Predicted performance for trial sample: {predicted_performance}")



Root Mean Squared Error: 0.0968
Predicted performance for trial sample: 0.940725424
