In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import pickle
import json
from datetime import datetime

In [14]:
# Read the raw income CSV into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
income_csv_path = '../flask_ml/data/Income_dataset.csv'
df = pd.read_csv(income_csv_path)

In [15]:
# Label column the classifier predicts.
target_variable = 'Income'

# The ten input columns the model is trained on. Keep this list aligned with
# the feature selection used by the run that was failing.
features_to_include = [
    'workclass',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
]

# Partition the selected features by dtype: numeric columns vs. everything else.
selected_feature_frame = df[features_to_include]
numeric_columns = selected_feature_frame.select_dtypes(include=np.number).columns.tolist()
categorical_columns = selected_feature_frame.select_dtypes(exclude=np.number).columns.tolist()

# Independent working copy holding only the model inputs plus the label,
# so later assignments do not touch the originally loaded frame.
columns_to_load = [*features_to_include, target_variable]
df_processed = df[columns_to_load].copy()

# --- Target Encoding ---
# Collapse the label column to integer 0/1 via an explicit value -> code map.
unique_target = df_processed[target_variable].unique()
if len(unique_target) <= 2:
    # At most two distinct labels: the least frequent one is taken as the
    # positive class. With a single distinct label, that label is positive.
    value_counts = df_processed[target_variable].value_counts()
    if len(value_counts) > 1:
        positive_label = value_counts.idxmin()
    else:
        positive_label = unique_target[0]
    target_map = dict(
        (val, 1 if val == positive_label else 0) for val in unique_target
    )
    print(f"Target mapping used (Auto-detected positive class '{positive_label}'): {target_map}")
else:
    # More than two raw labels: any label whose text contains '>50K' is
    # mapped to 1, everything else to 0.
    print(f"Warning: Target variable '{target_variable}' has more than 2 values. Attempting mapping.")
    positive_label = '>50K' # Or find dynamically
    target_map = dict(
        (val, 1 if positive_label in str(val) else 0) for val in unique_target
    )
    print(f"Target mapping used: {target_map}")

# Replace the raw labels with their 0/1 codes in the working copy.
df_processed[target_variable] = df_processed[target_variable].map(target_map)

# Feature matrix (raw, un-encoded columns) and encoded label vector.
X_orig = df_processed[features_to_include]
y = df_processed[target_variable]

# --- Train/Validation/Test Split (on original features) ---
# Both stages hold out 20% with the same seed and stratify on the label.
split_options = {"test_size": 0.20, "random_state": 42}

# Stage 1: 80% train+validation, 20% test.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_orig,
    y,
    stratify=y,
    **split_options,
)

# Stage 2: 20% of the train+validation part becomes validation
# (16% of the total), leaving 64% of the total for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    stratify=y_train_val,
    **split_options,
)

print(
    f"Train shape: {X_train.shape}, "
    f"Validation shape: {X_val.shape}, "
    f"Test shape: {X_test.shape}"
)

# --- Define Preprocessing Steps ---
# Numeric branch: fill missing values with the median, then standardise.
# The scaler only ever sees the numeric columns.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode as a dense array, dropping the first category per feature
# (drop='first' matches the previous logic).
# NOTE(review): per scikit-learn's OneHotEncoder documentation, with
# handle_unknown='ignore' a category unseen during fit is encoded as all
# zeros, which is the same encoding the dropped first category gets.
# Confirm that ambiguity is acceptable for inference.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each column list to its branch. Any column not listed in either
# branch is passed through unchanged.
column_routes = [
    ('num', numeric_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
]
preprocessor = ColumnTransformer(column_routes, remainder='passthrough')

# --- Create the Full Model Pipeline ---
# Preprocessing followed by logistic regression, fitted and used as one estimator.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
])

# --- Train the Full Pipeline ---
# Fitting the pipeline fits the preprocessor and the classifier on the
# training split only.
print("Training the model pipeline...")
model_pipeline.fit(X_train, y_train)
print("Training complete.")

# --- Evaluate ---
# Predict on both held-out splits first, then score and report each one.
y_val_pred = model_pipeline.predict(X_val)
y_test_pred = model_pipeline.predict(X_test)

val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.4f}")

test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# --- Get Final Feature Names AFTER Preprocessing ---
# This is important for the metadata and consistency checks
def strip_transformer_prefix(feature_name):
    """Return ``feature_name`` without its leading ``<transformer>__`` prefix.

    Only the FIRST ``__`` is treated as the separator, so a column name or
    category value that itself contains ``__`` is kept intact instead of
    being truncated to the text after its last ``__``. A name with no ``__``
    is returned unchanged.
    """
    _, separator, remainder = feature_name.partition('__')
    return remainder if separator else feature_name


try:
    final_feature_names = [
        strip_transformer_prefix(name)
        for name in model_pipeline.named_steps['preprocessor'].get_feature_names_out()
    ]
    print(f"Number of features after preprocessing: {len(final_feature_names)}")
except Exception as e:
    # Deliberate best-effort: a failure here must not block saving the model.
    # The placeholder makes the failure visible in the saved metadata.
    print(f"Could not get feature names from ColumnTransformer: {e}")
    final_feature_names = ["error_getting_names"] # Placeholder


# --- Save Model and Preprocessor ---
# The whole fitted pipeline (preprocessor + classifier) is pickled as one
# object, so loading it restores both together.
model_path = 'income_prediction_pipeline.pkl' # Changed name to reflect it's a pipeline
with open(model_path, mode='wb') as pipeline_file:
    pickle.dump(model_pipeline, pipeline_file)
print(f"Model pipeline saved to '{model_path}'")

# --- Create and Save Metadata ---
# The metadata records both the raw columns the pipeline expects as input
# and the feature names produced after its preprocessor step has run.

# Provenance and headline metrics for the saved pipeline.
model_info = {
    "pipeline_file": model_path,
    "target_variable": target_variable,
    # Class name of the final estimator, e.g. LogisticRegression.
    "model_type_in_pipeline": type(model_pipeline.named_steps['classifier']).__name__,
    "creation_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "validation_accuracy": float(val_accuracy),
    "test_accuracy": float(test_accuracy)
}

feature_info = {
    # Raw columns expected as input to the pipeline.
    "input_features_ordered": features_to_include,
    "input_numeric_features": numeric_columns,
    "input_categorical_features": categorical_columns,
    "input_feature_count": len(features_to_include),
    # Feature names after the pipeline's preprocessor step.
    "processed_feature_names_ordered": final_feature_names,
    "processed_feature_count": len(final_feature_names)
}

# Human-readable summary of the preprocessing inside the pipeline.
preprocessing_in_pipeline = {
    "numeric_strategy": "Impute Median -> StandardScaler",
    "categorical_strategy": "Impute Mode -> OneHotEncoder (drop='first')",
}

# Split behind the reported accuracies: 80% of 80% train, 20% of 80%
# validation, 20% test.
evaluation_split = {
    "train_percentage": 0.64,
    "validation_percentage": 0.16,
    "test_percentage": 0.20,
    "random_state": 42
}

metadata = {
    "model_info": model_info,
    "feature_info": feature_info,
    "preprocessing_in_pipeline": preprocessing_in_pipeline,
    "evaluation_split": evaluation_split,
}

metadata_path = 'model_pipeline_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=4)

print(f"Metadata saved to '{metadata_path}'")

Target mapping used (Auto-detected positive class ' >50K'): {' <=50K': 0, ' >50K': 1}
Train shape: (31258, 10), Validation shape: (7815, 10), Test shape: (9769, 10)
Training the model pipeline...
Training complete.
Validation Accuracy: 0.8518
Test Accuracy: 0.8504
Number of features after preprocessing: 92
Model pipeline saved to 'income_prediction_pipeline.pkl'
Metadata saved to 'model_pipeline_metadata.json'




In [12]:
# Create and save metadata (written to 'model_metadata.json').
# NOTE(review): columns_to_drop, all_original_features,
# included_original_features, encoded_features and X are not defined in any
# cell visible in this notebook, and this cell's execution count (12) comes
# before the cells above it. Presumably it depends on earlier or removed
# cells; confirm it still runs after Restart Kernel -> Run All.
# This cell also rebinds `metadata` and `metadata_path`.
model_section = {
    "target_variable": "Income",
    "excluded_features": columns_to_drop + ["Income"],
    "creation_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "model_file": model_path,
    "validation_accuracy": float(val_accuracy),
    "test_accuracy": float(test_accuracy)
}

feature_section = {
    "original_features": all_original_features,
    "included_original_features": included_original_features,
    "numeric_features": numeric_columns,
    "categorical_features": categorical_columns,
    "one_hot_encoded_features": encoded_features,
    "final_feature_list": X.columns.tolist(),
    "feature_count": len(X.columns)
}

# NOTE(review): the split string below differs from the 0.64 / 0.16 / 0.20
# split recorded by the pipeline metadata cell; verify which one is intended.
preprocessing_section = {
    "scaling": "StandardScaler",
    "missing_values": "numeric filled with median, remaining rows dropped",
    "categorical_encoding": "one-hot with drop_first=True",
    "train_val_test_split": "80:10:10"
}

metadata = {
    "model_info": model_section,
    "feature_info": feature_section,
    "preprocessing": preprocessing_section,
}

metadata_path = 'model_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=4)

print(f"Metadata saved to '{metadata_path}'")

Metadata saved to 'model_metadata.json'
