In [13]:
import pandas as pd
import numpy as np
from faker import Faker

fake = Faker()

# Seed for reproducibility
np.random.seed(42)

# Define the number of records
num_records = 100

# Generate synthetic data
data = {
    "age": np.random.randint(22, 60, size=num_records),
    "gender": np.random.choice(["M", "F"], size=num_records),
    "marital_status": np.random.choice(["Single", "Married"], size=num_records),
    "number_of_children": np.random.randint(0, 5, size=num_records),
    "department": np.random.choice(["IT", "HR", "Finance", "Marketing", "Sales"], size=num_records),
    "position": np.random.choice(["Software Engineer", "HR Manager", "Accountant", "Marketing Specialist", "Sales Executive"], size=num_records),
    "employment_status": np.random.choice(["Full-time", "Part-time", "Contract"], size=num_records),
    "shift_timing": np.random.choice(["Day", "Night"], size=num_records),
    "length_of_service": np.random.randint(1, 15, size=num_records),
    "work_hours": np.random.choice([20, 30, 40], size=num_records),
    "overtime_days": np.random.randint(0, 5, size=num_records),
    "vacation_days": np.random.randint(0, 5, size=num_records),
    "past_event_attendance": np.random.choice(["Yes", "No"], size=num_records),
    "work_days_last_month": np.random.randint(15, 23, size=num_records),
    "absent_days_last_month": np.random.randint(0, 5, size=num_records),
    "event_interest": np.random.randint(1, 11, size=num_records),
    "event_type_preference": np.random.choice(["Social", "Professional", "Family-friendly"], size=num_records),
    "event_timing_preference": np.random.choice(["Weekday", "Weekend", "Evening"], size=num_records),
    "family_commitments": np.random.choice(["Yes", "No"], size=num_records),
    "health_issues": np.random.choice(["None", "Mild", "Severe"], size=num_records),
    "transportation": np.random.choice(["Own vehicle", "Public transport"], size=num_records),
    "event_date": [fake.date_this_year() for _ in range(num_records)],
    "event_time": [fake.time() for _ in range(num_records)],
    "event_location": np.random.choice(["Office", "Off-site"], size=num_records),
    "prior_commitments": np.random.choice(["Yes", "No"], size=num_records),
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV
file_path = "attendance_data.csv"
df.to_csv(file_path, index=False)

file_path


'attendance_data.csv'

In [5]:
# Data cleaning

In [14]:
# Check for missing values
from sklearn.preprocessing import StandardScaler
from scipy import stats

missing_values = df.isnull().sum()

# Remove duplicates
df_cleaned = df.drop_duplicates()

# Encode categorical variables
categorical_columns = ["gender", "marital_status", "department", "position", "employment_status", "shift_timing",
                       "past_event_attendance", "event_type_preference", "event_timing_preference", 
                       "family_commitments", "health_issues", "transportation", "event_location", 
                       "prior_commitments"]
df_encoded = pd.get_dummies(df_cleaned, columns=categorical_columns, drop_first=True)

# Normalize numerical features
numerical_columns = ["age", "number_of_children", "length_of_service", "work_hours", "overtime_days", 
                     "vacation_days", "work_days_last_month", "absent_days_last_month", "event_interest"]

scaler = StandardScaler()
df_encoded[numerical_columns] = scaler.fit_transform(df_encoded[numerical_columns])

# Check for outliers (simple method using z-score)
z_scores = np.abs(stats.zscore(df_encoded[numerical_columns]))
outliers = np.where(z_scores > 3)

# Removing rows with outliers (for simplicity)
df_no_outliers = df_encoded[(z_scores < 3).all(axis=1)]

# Save the cleaned dataset
cleaned_file_path = "cleaned_attendance_data.csv"
df_no_outliers.to_csv(cleaned_file_path, index=False)

print(f"Missing values: \n{missing_values}")
print(f"Original Dataset Shape: {df_cleaned.shape}")
print(f"Encoded Dataset Shape: {df_encoded.shape}")
print(f"Final Cleaned Dataset Shape: {df_no_outliers.shape}")
print(f"Cleaned dataset saved to: {cleaned_file_path}")

Missing values: 
age                        0
gender                     0
marital_status             0
number_of_children         0
department                 0
position                   0
employment_status          0
shift_timing               0
length_of_service          0
work_hours                 0
overtime_days              0
vacation_days              0
past_event_attendance      0
work_days_last_month       0
absent_days_last_month     0
event_interest             0
event_type_preference      0
event_timing_preference    0
family_commitments         0
health_issues              0
transportation             0
event_date                 0
event_time                 0
event_location             0
prior_commitments          0
dtype: int64
Original Dataset Shape: (100, 25)
Encoded Dataset Shape: (100, 35)
Final Cleaned Dataset Shape: (100, 35)
Cleaned dataset saved to: cleaned_attendance_data.csv


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Load the cleaned dataset
df = pd.read_csv('attendance_data.csv')

# Assume the target variable is in a column named 'attendance'
# For this example, we'll generate a synthetic target variable
# Replace this with your actual target variable
np.random.seed(42)
df['attendance'] = np.random.choice([0, 1], size=len(df))

# Check for missing values and handle them if necessary
print(df.isnull().sum())

# If there are missing values, you can fill them with the mean or median, or drop the rows/columns
# For simplicity, we'll drop rows with any missing values
df = df.dropna()

# Define the categorical columns to encode
categorical_columns = ["gender", "marital_status", "department", "position", "employment_status", "shift_timing",
                       "past_event_attendance", "event_type_preference", "event_timing_preference", 
                       "family_commitments", "health_issues", "transportation", "event_location", 
                       "prior_commitments"]

# Ensure all specified categorical columns exist in the DataFrame
existing_categorical_columns = [col for col in categorical_columns if col in df.columns]

# Encode categorical variables
df = pd.get_dummies(df, columns=existing_categorical_columns, drop_first=True)

# Ensure all columns are of numerical type
print(df.dtypes)

# Split the data into features and target variable
X = df.drop('attendance', axis=1)
y = df['attendance']

# Normalize numerical features
numerical_columns = X.select_dtypes(include=[np.number]).columns.tolist()
scaler = StandardScaler()
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

age                         0
gender                      0
marital_status              0
number_of_children          0
department                  0
position                    0
employment_status           0
shift_timing                0
length_of_service           0
work_hours                  0
overtime_days               0
vacation_days               0
past_event_attendance       0
work_days_last_month        0
absent_days_last_month      0
event_interest              0
event_type_preference       0
event_timing_preference     0
family_commitments          0
health_issues              30
transportation              0
event_date                  0
event_time                  0
event_location              0
prior_commitments           0
attendance                  0
dtype: int64
age                                    int64
number_of_children                     int64
length_of_service                      int64
work_hours                             int64
overtime_days              

ValueError: could not convert string to float: '2024-03-20'

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Load the cleaned dataset
df = pd.read_csv('cleaned_attendance_data.csv')

# Assume the target variable is in a column named 'attendance'
# For this example, we'll generate a synthetic target variable
# Replace this with your actual target variable
np.random.seed(42)
df['attendance'] = np.random.choice([0, 1], size=len(df))

# Check for missing values and handle them if necessary
print(df.isnull().sum())

# If there are missing values, you can fill them with the mean or median, or drop the rows/columns
# For simplicity, we'll drop rows with any missing values
df = df.dropna()

# Define the categorical columns to encode
categorical_columns = ["gender", "marital_status", "department", "position", "employment_status", "shift_timing",
                       "past_event_attendance", "event_type_preference", "event_timing_preference", 
                       "family_commitments", "health_issues", "transportation", "event_location", 
                       "prior_commitments"]

# Ensure all specified categorical columns exist in the DataFrame
existing_categorical_columns = [col for col in categorical_columns if col in df.columns]

# Encode categorical variables
df = pd.get_dummies(df, columns=existing_categorical_columns, drop_first=True)

# Ensure all columns are of numerical type
print(df.dtypes)

# Split the data into features and target variable
X = df.drop('attendance', axis=1)
y = df['attendance']

# Normalize numerical features
numerical_columns = X.select_dtypes(include=[np.number]).columns.tolist()
scaler = StandardScaler()
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

age                                   0
number_of_children                    0
length_of_service                     0
work_hours                            0
overtime_days                         0
vacation_days                         0
work_days_last_month                  0
absent_days_last_month                0
event_interest                        0
event_date                            0
event_time                            0
gender_M                              0
marital_status_Single                 0
department_HR                         0
department_IT                         0
department_Marketing                  0
department_Sales                      0
position_HR Manager                   0
position_Marketing Specialist         0
position_Sales Executive              0
position_Software Engineer            0
employment_status_Full-time           0
employment_status_Part-time           0
shift_timing_Night                    0
past_event_attendance_Yes             0


ValueError: could not convert string to float: '2024-05-08'