In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Load the cleaned dataset
df = pd.read_csv('cleaned_attendance_data.csv')

# The target variable is expected in a column named 'attendance'.
# Fall back to a synthetic (random) target ONLY when the dataset does not
# already provide one, so real labels are never silently overwritten.
# A model trained on the synthetic target cannot do better than chance;
# replace it with your actual target variable for meaningful results.
if 'attendance' not in df.columns:
    print("Note: 'attendance' column not found; generating a synthetic target.")
    np.random.seed(42)  # fixed seed keeps the synthetic labels reproducible
    df['attendance'] = np.random.choice([0, 1], size=len(df))

# Report the number of missing values per column
print(df.isnull().sum())

# Missing values could be imputed (mean/median) or dropped;
# for simplicity, drop every row that contains any missing value
df = df.dropna()

# Candidate categorical features that should be one-hot encoded
categorical_columns = [
    "gender",
    "marital_status",
    "department",
    "position",
    "employment_status",
    "shift_timing",
    "past_event_attendance",
    "event_type_preference",
    "event_timing_preference",
    "family_commitments",
    "health_issues",
    "transportation",
    "event_location",
    "prior_commitments",
]

# Keep only the candidates that are actually present in the DataFrame
# (the original listing order is preserved)
available_columns = set(df.columns)
existing_categorical_columns = [
    name for name in categorical_columns if name in available_columns
]

# One-hot encode the present categorical columns, dropping the first level
# of each to avoid redundant indicator columns
df = pd.get_dummies(
    df,
    columns=existing_categorical_columns,
    drop_first=True,
)

# Derive numeric calendar features (day, month, year) from the event date
event_dates = pd.to_datetime(df['event_date'])
for part in ('day', 'month', 'year'):
    df[f'event_{part}'] = getattr(event_dates.dt, part)

# Express the event time (HH:MM:SS) as minutes since midnight
event_times = pd.to_datetime(df['event_time'], format='%H:%M:%S')
df['event_minutes'] = event_times.dt.hour * 60 + event_times.dt.minute

# The raw date and time columns are no longer needed once the numeric
# features have been derived
df = df.drop(columns=['event_date', 'event_time'])

# Print the dtypes so any remaining non-numeric column can be spotted
print(df.dtypes)

# Split the data into features and target variable
X = df.drop('attendance', axis=1)
y = df['attendance']

# Split into training and testing sets BEFORE scaling, so that statistics
# of the held-out test rows never influence the preprocessing (fitting the
# scaler on the full dataset would leak test information into training)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the scaled values are written into independent
# frames rather than into slices derived from X
X_train = X_train.copy()
X_test = X_test.copy()

# Normalize numerical features: fit the scaler on the training split only,
# then apply the same learned mean/std to the test split
numerical_columns = X.select_dtypes(include=[np.number]).columns.tolist()
scaler = StandardScaler()
if numerical_columns:  # nothing to scale when no numeric columns exist
    X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
    X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

# Build a 100-tree random forest (seeded for reproducibility) and fit it on
# the training split; fit() returns the estimator itself
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = model.predict(X_test)

# Score the predictions against the true test labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

# Report the overall accuracy followed by the per-class metrics
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


age                                   0
number_of_children                    0
length_of_service                     0
work_hours                            0
overtime_days                         0
vacation_days                         0
work_days_last_month                  0
absent_days_last_month                0
event_interest                        0
event_date                            0
event_time                            0
gender_M                              0
marital_status_Single                 0
department_HR                         0
department_IT                         0
department_Marketing                  0
department_Sales                      0
position_HR Manager                   0
position_Marketing Specialist         0
position_Sales Executive              0
position_Software Engineer            0
employment_status_Full-time           0
employment_status_Part-time           0
shift_timing_Night                    0
past_event_attendance_Yes             0
