In [4]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.impute import SimpleImputer
import os

save_dir = './model_sklearn'  # relative to current notebook location
os.makedirs(save_dir, exist_ok=True)

class CustomBinner(BaseEstimator, TransformerMixin):
    def __init__(self, bins, labels):
        self.bins = bins
        self.labels = labels
        
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        # Convert to numpy array if it's a DataFrame
        if isinstance(X, pd.DataFrame):
            X = X.iloc[:, 0].values
        elif isinstance(X, pd.Series):
            X = X.values
            
        # Ensure we have a 1D array
        if X.ndim > 1 and X.shape[1] == 1:
            X = X.ravel()
            
        # Perform binning
        binned = pd.cut(X, bins=self.bins, labels=self.labels)
        
        # Convert to DataFrame for compatibility
        return pd.DataFrame(binned)

def create_preprocessing_pipeline():
    # Define binning parameters
    trust_bins = [-np.inf, 5, 7, 8, 9, np.inf]
    trust_labels = ['Very Low', 'Low', 'Moderate', 'High', 'Very High']
    price_bins = [-np.inf, 5, 6, 7, 8, np.inf]
    price_labels = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
    
    # Create numeric transformers with binning AND encoding
    trust_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('binner', CustomBinner(bins=trust_bins, labels=trust_labels)),
        ('encoder', OneHotEncoder(drop='first', sparse_output=False))
    ])
    
    price_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('binner', CustomBinner(bins=price_bins, labels=price_labels)),
        ('encoder', OneHotEncoder(drop='first', sparse_output=False))
    ])
    
    # Create categorical transformer
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(drop='first', sparse_output=False))
    ])
    
    # Define feature groups
    numeric_features = {
        'trust': ['iag_trust_confidence_scale11'],
        'price': ['iag_value_price_of_policy_reflects_scale11']
    }
    
    categorical_features = [
        'iag_business_unit_ug',
        'iag_age_band_auto',
        'iag_tenure_band_enum',
        'iag_product_type_auto',
        'iag_region_ug'
    ]
    
    # Create the column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('trust', trust_transformer, numeric_features['trust']),
            ('price', price_transformer, numeric_features['price']),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='drop',
        verbose_feature_names_out=False  # Simplified feature names
    )
    
    return preprocessor

def create_full_pipeline():
    preprocessor = create_preprocessing_pipeline()
    
    # Create full pipeline with logistic regression
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000))
    ])
    
    return pipeline

class ModelTrainer:
    def __init__(self):
        self.pipeline = create_full_pipeline()
        self.feature_columns = [
            'iag_trust_confidence_scale11',
            'iag_value_price_of_policy_reflects_scale11',
            'iag_business_unit_ug',
            'iag_age_band_auto',
            'iag_tenure_band_enum',
            'iag_product_type_auto',
            'iag_region_ug'
        ]
        
    def prepare_target(self, df):
        return (df['Likely to recommend'] == 'Promote').astype(int)
        
    def fit_and_evaluate(self, df):
        # Prepare features and target
        X = df[self.feature_columns]
        y = self.prepare_target(df)
        
        # Handle missing values in target
        mask = ~pd.isna(y)
        X = X[mask]
        y = y[mask]
        
        print(f"Training with {len(X)} samples after removing missing target values")
        
        # Fit pipeline
        self.pipeline.fit(X, y)
        
        # Make predictions
        y_pred = self.pipeline.predict(X)
        y_pred_proba = self.pipeline.predict_proba(X)[:, 1]
        
        # Calculate metrics
        metrics = {
            'accuracy': (y_pred == y).mean(),
            'roc_auc': roc_auc_score(y, y_pred_proba),
            'classification_report': classification_report(y, y_pred, output_dict=True)
        }
        
        return metrics
    
    def get_feature_columns(self):
        return self.feature_columns

# Example usage
if __name__ == "__main__":
    # Load data
    df = pd.read_excel('../../data/IAG.xlsx')
    
    # Initialize and train model
    trainer = ModelTrainer()
    metrics = trainer.fit_and_evaluate(df)
    
    # Print results
    print("\nModel Performance:")
    print(f"Accuracy: {metrics['accuracy']:.3f}")
    print(f"ROC AUC: {metrics['roc_auc']:.3f}")
    print("\nClassification Report:")
    print(pd.DataFrame(metrics['classification_report']).transpose())
    
    # Save pipeline
    import joblib
    model_path = os.path.join(save_dir, 'promotion_pipeline.joblib')
    joblib.dump(trainer.pipeline, model_path)
    print(f"Model saved to: {os.path.abspath(model_path)}")

Training with 9264 samples after removing missing target values

Model Performance:
Accuracy: 0.819
ROC AUC: 0.885

Classification Report:
              precision    recall  f1-score      support
0              0.741403  0.706393  0.723474  3113.000000
1              0.854875  0.875305  0.864969  6151.000000
accuracy       0.818545  0.818545  0.818545     0.818545
macro avg      0.798139  0.790849  0.794222  9264.000000
weighted avg   0.816744  0.818545  0.817422  9264.000000
Model saved to: e:\vscode projects\iag_analysis\iag analysis 2\sql\model_sklearn\promotion_pipeline.joblib


In [8]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import classification_report, roc_auc_score
import os
import pickle

# Create directory for statsmodels
save_dir = './model_statsmodels'
os.makedirs(save_dir, exist_ok=True)

class StatsmodelsRecommendationPipeline:
    def __init__(self):
        self.trust_bins = [-np.inf, 5, 7, 8, 9, np.inf]
        self.trust_labels = ['Very Low', 'Low', 'Moderate', 'High', 'Very High']
        
        self.price_bins = [-np.inf, 5, 6, 7, 8, np.inf]
        self.price_labels = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
        
        self.features = [
            'iag_trust_confidence_scale11',
            'iag_value_price_of_policy_reflects_scale11',
            'iag_business_unit_ug',
            'iag_age_band_auto',
            'iag_tenure_band_enum',
            'iag_product_type_auto',
            'iag_region_ug'
        ]
        self.model = None
        self.feature_names = None
        
    def prepare_data(self, df):
        print("\nStarting data preparation...")
        
        # Create copy and drop missing values
        df_clean = df.copy()
        df_clean = df_clean.dropna(subset=self.features + ['Likely to recommend'])
        
        print("\nShape after dropping missing values:", df_clean.shape)
        
        # Create binned versions of numeric features
        df_clean['iag_trust_confidence_binned'] = pd.cut(
            df_clean['iag_trust_confidence_scale11'].astype(float),
            bins=self.trust_bins,
            labels=self.trust_labels
        )
        
        df_clean['iag_value_price_binned'] = pd.cut(
            df_clean['iag_value_price_of_policy_reflects_scale11'].astype(float),
            bins=self.price_bins,
            labels=self.price_labels
        )
        
        print("\nUnique values in binned features:")
        print("Trust bins:", df_clean['iag_trust_confidence_binned'].value_counts())
        print("\nPrice bins:", df_clean['iag_value_price_binned'].value_counts())
        
        # Define categorical features including binned ones
        categorical_features = [
            'iag_trust_confidence_binned',
            'iag_value_price_binned',
            'iag_business_unit_ug',
            'iag_age_band_auto',
            'iag_tenure_band_enum',
            'iag_product_type_auto',
            'iag_region_ug'
        ]
        
        # Create dummies with proper prefixes
        X = pd.get_dummies(df_clean[categorical_features], drop_first=True)
        
        # Store feature names for prediction
        self.feature_names = X.columns.tolist()
        
        # Convert all columns to float64
        X = X.astype(np.float64)
        
        # Add constant
        X = sm.add_constant(X)
        
        # Convert target to binary
        y = (df_clean['Likely to recommend'] == 'Promote').astype(np.int64)
        
        return X, y, df_clean
    
    def fit(self, X, y):
        try:
            print(f"\nFitting model with {X.shape[1]} features...")
            
            # Convert to numpy arrays
            X_np = X.to_numpy(dtype=np.float64)
            y_np = y.to_numpy(dtype=np.int64)
            
            # Fit model
            model = sm.Logit(y_np, X_np)
            self.model = model.fit(method='bfgs', maxiter=1000)
            
            return self.model
            
        except Exception as e:
            print(f"Model fitting failed: {str(e)}")
            return None
    
    def predict(self, X, preprocessed=False):
        """Make predictions on new data"""
        if self.model is None:
            raise ValueError("Model needs to be fitted first")
            
        if preprocessed:
            # Data is already processed, just ensure constant
            if 'const' not in X.columns:
                X = sm.add_constant(X)
            X_np = X.to_numpy(dtype=np.float64)
        else:
            # Process new data the same way as training data
            df_clean = X.copy()
            
            # Create binned versions
            df_clean['iag_trust_confidence_binned'] = pd.cut(
                df_clean['iag_trust_confidence_scale11'].astype(float),
                bins=self.trust_bins,
                labels=self.trust_labels
            )
            
            df_clean['iag_value_price_binned'] = pd.cut(
                df_clean['iag_value_price_of_policy_reflects_scale11'].astype(float),
                bins=self.price_bins,
                labels=self.price_labels
            )
            
            # Create dummies
            categorical_features = [
                'iag_trust_confidence_binned',
                'iag_value_price_binned',
                'iag_business_unit_ug',
                'iag_age_band_auto',
                'iag_tenure_band_enum',
                'iag_product_type_auto',
                'iag_region_ug'
            ]
            
            X_processed = pd.get_dummies(df_clean[categorical_features], drop_first=True)
            
            # Ensure all training features are present
            for feature in self.feature_names:
                if feature not in X_processed.columns:
                    X_processed[feature] = 0
                    
            # Reorder columns to match training data
            X_processed = X_processed[self.feature_names]
            
            # Add constant
            X_processed = sm.add_constant(X_processed)
            X_np = X_processed.to_numpy(dtype=np.float64)
            
        # Make predictions
        y_pred_proba = self.model.predict(X_np)
        y_pred = (y_pred_proba > 0.5).astype(int)
        
        return y_pred, y_pred_proba

    def evaluate(self, X, y):
        """Evaluate model performance"""
        # Use preprocessed=True since X is already processed during training
        y_pred, y_pred_proba = self.predict(X, preprocessed=True)
        
        metrics = {
            'accuracy': (y_pred == y).mean(),
            'roc_auc': roc_auc_score(y, y_pred_proba),
            'classification_report': classification_report(y, y_pred, output_dict=True),
            'pseudo_r2': self.model.prsquared
        }
        
        return metrics
    
    def get_feature_importance(self):
        """Get feature importance metrics"""
        if self.model is None:
            raise ValueError("Model needs to be fitted first")
            
        feature_names = ['const'] + self.feature_names
        
        summary_df = pd.DataFrame({
            'Feature': feature_names,
            'Coefficient': self.model.params,
            'P_Value': self.model.pvalues,
            'Odds_Ratio': np.exp(self.model.params)
        }).sort_values('P_Value')
        
        return summary_df

# Training and evaluation
print("Loading data...")
df = pd.read_excel('../../data/IAG.xlsx')

# Initialize pipeline
pipeline = StatsmodelsRecommendationPipeline()

# Prepare data
X, y, df_clean = pipeline.prepare_data(df)

# Fit model
model = pipeline.fit(X, y)

if model is not None:
    # Get feature importance
    feature_importance = pipeline.get_feature_importance()
    print("\nSignificant Features (p < 0.05):")
    significant_features = feature_importance[feature_importance['P_Value'] < 0.05]
    print(significant_features[['Feature', 'Coefficient', 'P_Value', 'Odds_Ratio']])
    
    # Get performance metrics
    metrics = pipeline.evaluate(X, y)
    print("\nModel Performance:")
    print(f"Accuracy: {metrics['accuracy']:.3f}")
    print(f"ROC AUC: {metrics['roc_auc']:.3f}")
    print(f"McFadden's Pseudo R-squared: {metrics['pseudo_r2']:.3f}")
    
    # Save the pipeline
    model_path = os.path.join(save_dir, 'recommendation_pipeline.pkl')
    with open(model_path, 'wb') as f:
        pickle.dump(pipeline, f)
    print(f"\nModel saved to: {os.path.abspath(model_path)}")
    
else:
    print("Model fitting failed. Please check the error messages above.")

Loading data...

Starting data preparation...

Shape after dropping missing values: (8457, 48)

Unique values in binned features:
Trust bins: iag_trust_confidence_binned
Very High    3645
High         1551
Moderate     1435
Low           972
Very Low      854
Name: count, dtype: int64

Price bins: iag_value_price_binned
Excellent    4065
Very Good    1639
Poor         1154
Good         1040
Fair          559
Name: count, dtype: int64

Fitting model with 32 features...
Optimization terminated successfully.
         Current function value: 0.388857
         Iterations: 273
         Function evaluations: 274
         Gradient evaluations: 274

Significant Features (p < 0.05):
                                  Feature  Coefficient        P_Value  \
4   iag_trust_confidence_binned_Very High     4.159600  2.668145e-146   
3        iag_trust_confidence_binned_High     3.101562   1.858237e-87   
2    iag_trust_confidence_binned_Moderate     2.028339   3.219937e-41   
8        iag_value_price_b