# In [None]:
# train_model.py
# Run this script in the same folder as your project.
# It will create new 'model' and 'scaler' files.

import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

print("Starting model retraining process...")

# --- 1. Load The Original Training Data ---
# IMPORTANT: You need the original CSV file used for training.
# The file is expected in the current working directory and is parsed as
# semicolon-separated. Please REPLACE 'Absenteeism_at_work.csv' with the name
# of your actual training data file if it differs.
try:
    # Keep the try body to the single call that can raise FileNotFoundError.
    raw_df = pd.read_csv('Absenteeism_at_work.csv', sep=';')
except FileNotFoundError:
    print("\n---")
    print("ERROR: Training data file not found.")
    print("Please download 'Absenteeism_at_work.csv' from the UCI repository or use your original training file.")
    print("Stopping script.")
    # `exit()` is a helper injected by the `site` module for interactive use
    # and is not guaranteed to exist (e.g. under `python -S`); it also ends
    # the process with status 0. Raising SystemExit directly always works and
    # the non-zero status tells callers/CI that retraining did not happen.
    raise SystemExit(1) from None
else:
    print("Successfully loaded 'Absenteeism_at_work.csv'.")


# --- 2. Preprocessing the Data ---
# This section mimics the preprocessing from your module to ensure consistency.
# Input:  raw_df (as loaded from the CSV).
# Output: df with 17 feature columns followed by 'Excessive_Absenteeism'.
# NOTE(review): this section requires 'Absenteeism time in hours',
# 'Reason for Absence' and 'Date' columns in the CSV - verify against your file.
print("Preprocessing data...")

# Create the target variable: 1 for excessive absenteeism, 0 for moderate.
# Rows strictly above the median number of hours are labelled 1.
median_absenteeism = raw_df['Absenteeism time in hours'].median()
targets = np.where(raw_df['Absenteeism time in hours'] > median_absenteeism, 1, 0)
raw_df['Excessive_Absenteeism'] = targets

# Drop the original absenteeism column
df = raw_df.drop(['Absenteeism time in hours'], axis=1)

# Collapse the reason codes into four 0/1 group flags:
#   group 1: codes 1-14, group 2: codes 15-17, group 3: codes 18-21,
#   group 4: codes 22 and above. Code 0 (and missing values) map to all zeros.
# The flags are derived directly from the code ranges instead of slicing
# get_dummies(..., drop_first=True) output, because that approach
#   * drops whichever code happens to be the smallest one present (not
#     necessarily 0), silently losing a real reason,
#   * yields NaN for a group when none of its codes occur in the data, and
#   * produces bool columns on pandas >= 2.0 but uint8 on older versions.
reason_codes = df['Reason for Absence']
reason_type_1 = reason_codes.between(1, 14).astype(int).rename('Reason_1')
reason_type_2 = reason_codes.between(15, 17).astype(int).rename('Reason_2')
reason_type_3 = reason_codes.between(18, 21).astype(int).rename('Reason_3')
reason_type_4 = (reason_codes >= 22).astype(int).rename('Reason_4')

# Drop the original reason column and append the four group flags
df = df.drop(['Reason for Absence'], axis=1)
df = pd.concat([df, reason_type_1, reason_type_2, reason_type_3, reason_type_4], axis=1)

# Rename ALL columns by position.
# NOTE: this raises ValueError unless the frame has exactly 22 columns, but a
# CSV whose columns are in a different order would be mislabelled silently.
df.columns = [
    'ID', 'Date', 'Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age',
    'Work load Average/day ', 'Hit target', 'Disciplinary failure', 'Education', 'Son',
    'Social drinker', 'Social smoker', 'Pet', 'Weight', 'Height', 'Body mass index',
    'Excessive_Absenteeism', 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4'
]

# Parse 'Date' (day/month/year) and keep only the month as a feature.
# The day of the week is not a model feature, so it is not materialised.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
df['Month Value'] = df['Date'].dt.month
df = df.drop(['Date'], axis=1)

# Map Education into two categories: 1 -> 0, 2/3/4 -> 1.
# Any other value becomes NaN because it is absent from the mapping.
df['Education'] = df['Education'].map({1: 0, 2: 1, 3: 1, 4: 1})

# Drop columns that are not needed for the model as per your module
df = df.drop(['ID', 'Service time', 'Work load Average/day ', 'Distance from Residence to Work'], axis=1)

# Fix the final column order: features first, target last.
df = df[[
    'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
    'Transportation expense', 'Age', 'Hit target', 'Disciplinary failure',
    'Education', 'Son', 'Social drinker', 'Social smoker', 'Pet',
    'Weight', 'Height', 'Body mass index', 'Excessive_Absenteeism'
]]

print("Preprocessing complete.")

# --- 3. Define Features (Inputs) and Target ---
unscaled_inputs = df.drop('Excessive_Absenteeism', axis=1)
targets = df['Excessive_Absenteeism']


# --- 4. Scale the Data ---
print("Scaling features...")

# Only the columns listed here are standardized; the remaining columns are
# passed to the model unchanged. The CustomScaler in your module was designed
# to do this, here we do it manually.
columns_to_scale = [
    'Month Value', 'Transportation expense', 'Age', 'Hit target',
    'Son', 'Pet', 'Weight', 'Height', 'Body mass index'
]
# Create the scaler
scaler = StandardScaler()
# Fit the scaler ONLY on the columns that need scaling.
# NOTE(review): the scaler is fitted on every row, before the train/test
# split below, so the reported test accuracy is computed on data the scaler
# has already seen. Confirm this is acceptable for your evaluation.
# NOTE(review): the pickled scaler therefore only knows `columns_to_scale`;
# presumably whatever loads it passes exactly those columns - verify against
# the consuming module.
scaler.fit(unscaled_inputs[columns_to_scale])
# Transform the columns and create a new DataFrame
scaled_columns = scaler.transform(unscaled_inputs[columns_to_scale])
scaled_inputs_df = pd.DataFrame(scaled_columns, columns=columns_to_scale)

# Get the columns that were not scaled
unscaled_columns = unscaled_inputs.drop(columns_to_scale, axis=1)

# Concatenate the unscaled and scaled columns to create the final inputs.
# The index is reset so both frames align row-by-row on a fresh 0..n-1 index.
inputs = pd.concat([unscaled_columns.reset_index(drop=True), scaled_inputs_df], axis=1)

# The concat above puts all unscaled columns first and all scaled columns
# last. Restore the feature order established during preprocessing so the
# model is fitted on the same column order that the preprocessing produces;
# scikit-learn records that order and expects it again at prediction time.
inputs = inputs[unscaled_inputs.columns.tolist()]

print("Scaling complete.")

# --- 5. Train the Model ---
print("Splitting data and training the model...")

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
split_parts = train_test_split(
    inputs,
    targets,
    test_size=0.2,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

# Logistic Regression with the 'liblinear' solver (chosen for compatibility).
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(solver='liblinear').fit(x_train, y_train)

# Mean accuracy on the held-out rows
accuracy = model.score(x_test, y_test)
print(f"Model training complete. Accuracy: {accuracy:.4f}")


# --- 6. Save the New Model and Scaler ---
print("Saving new 'model' and 'scaler' files...")

# Pickle each artifact to an extension-less file in the current working
# directory: the model first, then the scaler. Existing files are overwritten.
for artifact_path, artifact in (('model', model), ('scaler', scaler)):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

print("\n✅ Success! New 'model' and 'scaler' files have been created.")
print("You can now re-run your original Jupyter Notebook cell.")
