In [None]:
import io
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)

In [None]:
## Helper class bundling EDA, decile charts, CSI, feature importance, ROC/AUC plots, classification reports, SHAP values and pickle I/O

class evaluation:
    """Stateless collection of model-evaluation and persistence helpers.

    No method reads or writes instance attributes; each one works purely on
    the arguments it is given, so a single ``evaluation()`` instance can be
    shared freely across the notebook.
    """

    def eda(self, df):
        """Print a quick structural overview of ``df``.

        Reports the number of distinct values in the ``'id'`` column, any
        fully duplicated rows, and the names of the categorical and numerical
        columns.

        Parameters:
            df (pd.DataFrame): Frame to inspect; must contain an ``'id'`` column.

        Raises:
            KeyError: If ``df`` has no ``'id'`` column.
        """
        unique_id_count = df['id'].nunique()
        print(f"Number of unique 'id': {unique_id_count}")

        # keep=False flags every member of a duplicated group, not only the repeats.
        duplicates = df[df.duplicated(keep=False)]

        if not duplicates.empty:
            print("Duplicate records found:")
            print(duplicates)
        else:
            print("No duplicate records found.")

        categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
        # 'number' also picks up int32/float32/etc., which an explicit
        # ['int64', 'float64'] list would silently leave out.
        numerical_columns = df.select_dtypes(include='number').columns.tolist()

        print("Categorical Columns:")
        for col in categorical_columns:
            print(col)

        print("\nNumerical Columns:")
        for col in numerical_columns:
            print(col)

    def decile_probs(self, series, n_bins=10):
        """Return the quantile bin edges of ``series``.

        Parameters:
            series (pd.Series or 1-D array-like): Scores to bin.
            n_bins (int): Number of quantile bins requested (default 10, i.e.
                deciles). Duplicate edges are dropped, so fewer bins may come back.

        Returns:
            np.ndarray: Bin edges as returned by ``pd.qcut(..., retbins=True)``.
        """
        _, bin_edges = pd.qcut(series, n_bins, labels=False, duplicates='drop', retbins=True)

        return bin_edges

    def decile_chart(self, prob, actual, reference_prob):
        """Build a decile summary table of ``actual`` outcomes binned by ``prob``.

        Scores are assigned to the bins described by ``reference_prob`` (for
        example the edges produced by ``decile_probs`` on a reference sample).
        Scores outside the reference range are clipped into the outermost
        bins so that no record is silently dropped; records whose score is
        NaN are still excluded.

        Parameters:
            prob (array-like): Predicted probabilities, one per record.
            actual (array-like): Outcome per record; ``0`` is counted as
                "Good" and ``1`` as "Bad". Paired with ``prob`` by position.
            reference_prob (array-like): Monotonically increasing bin edges.

        Returns:
            pd.DataFrame: One row per non-empty bin, highest bin first, with
            counts, formatted percentage columns, cumulative percentages and
            an ``Area`` column whose sum divided by 100 approximates the AUC.

        Raises:
            ValueError: If fewer than two bin edges are supplied.
        """
        edges = np.asarray(reference_prob, dtype=float)
        if edges.ndim != 1 or edges.size < 2:
            raise ValueError("reference_prob must contain at least two bin edges")

        # np.asarray pairs prob/actual by position, so differing pandas
        # indexes cannot misalign the two inputs.
        df = pd.DataFrame({
            'Probability': np.clip(np.asarray(prob, dtype=float), edges[0], edges[-1]),
            'Actual': np.asarray(actual),
        })

        # Assign every record to a reference bin (0 = lowest scores).
        df['Decile'] = pd.cut(
            df['Probability'],
            bins=edges,
            labels=False,
            include_lowest=True
        )

        # Aggregate counts per bin; bins without records do not appear.
        decile_summary = df.groupby('Decile').agg(
            Total=('Actual', 'count'),
            Good=('Actual', lambda x: (x == 0).sum()),
            Bad=('Actual', lambda x: (x == 1).sum())
        ).reset_index()

        # Look the range up by bin number: a positional assignment would
        # mislabel every row after the first empty bin.
        prob_ranges = [f"[{lo:.6f} - {hi:.6f}]" for lo, hi in zip(edges[:-1], edges[1:])]
        decile_summary['Probability Range'] = [
            prob_ranges[int(decile)] for decile in decile_summary['Decile']
        ]

        # 'Bad Rate' is the share of bads inside the bin; '%Bad' is the share
        # of all bads that the bin captures.
        decile_summary['Bad Rate'] = decile_summary['Bad'] / decile_summary['Total']
        decile_summary['%Good'] = decile_summary['Good'] / decile_summary['Good'].sum()
        decile_summary['%Bad'] = decile_summary['Bad'] / decile_summary['Bad'].sum()
        decile_summary['%Pop'] = decile_summary['Total'] / decile_summary['Total'].sum()

        # Convert to percentage strings for display.
        for col in ['Bad Rate', '%Good', '%Bad', '%Pop']:
            decile_summary[col] = decile_summary[col].map(lambda x: f"{x:.1%}")

        # Highest-score bin first, so cumulative columns read like a gains table.
        decile_summary = decile_summary.iloc[::-1].reset_index(drop=True)

        # Cumulative metrics, in percent.
        decile_summary['Cumm Pop'] = decile_summary['Total'].cumsum() / decile_summary['Total'].sum() * 100
        decile_summary['Cumm Good'] = decile_summary['Good'].cumsum() / decile_summary['Good'].sum() * 100
        decile_summary['Cumm Bad'] = decile_summary['Bad'].cumsum() / decile_summary['Bad'].sum() * 100

        # Trapezoidal area under the (Cumm Good, Cumm Bad) curve: width is the
        # step in Cumm Good, height the mean of the previous and current Cumm Bad.
        cum_good = decile_summary['Cumm Good']
        cum_bad = decile_summary['Cumm Bad']
        decile_summary['Area'] = (
            cum_good - cum_good.shift(1, fill_value=0)
        ) * (
            (cum_bad + cum_bad.shift(1, fill_value=0)) / 2
        ) / 100

        return decile_summary

    def calculate_csi(self, df_reference, df_current, bins=10):
        """
        Iteratively calculates the CSI for each numeric column in the dataset,
        handling NaNs, infinite values, and constant columns.

        Bin edges are equal-width bins over the reference column. Both columns
        are converted to bin *proportions* (not densities), so the score does
        not depend on the feature's scale. Current values outside the
        reference range are counted in the outermost bins instead of being
        dropped. Columns that are constant in either frame, or missing from
        ``df_current``, are skipped.

        Parameters:
            df_reference (pd.DataFrame): Reference (historical) dataset.
            df_current (pd.DataFrame): Current (new) dataset.
            bins (int): Number of bins for grouping feature distributions.

        Returns:
            pd.DataFrame: DataFrame with feature names and their CSI scores,
            sorted by CSI descending (empty, with both columns, when no
            column qualifies).
        """

        csi_results = []

        # All numeric dtypes qualify, not just int64/float64.
        for col in df_reference.select_dtypes(include='number').columns:
            if col not in df_current.columns:
                continue

            # Remove NaNs and infinite values
            ref_col = df_reference[col].replace([np.inf, -np.inf], np.nan).dropna()
            cur_col = df_current[col].replace([np.inf, -np.inf], np.nan).dropna()

            # Skip columns with constant values
            if ref_col.nunique() < 2 or cur_col.nunique() < 2:
                continue

            # Bin edges come from the reference data only.
            ref_counts, ref_bins = np.histogram(ref_col.to_numpy(dtype=float), bins=bins)

            # np.histogram ignores values outside the edges; clipping keeps
            # drifted values visible in the first/last bin.
            cur_values = np.clip(cur_col.to_numpy(dtype=float), ref_bins[0], ref_bins[-1])
            cur_counts, _ = np.histogram(cur_values, bins=ref_bins)

            # Normalize counts to proportions.
            ref_dist = ref_counts / ref_counts.sum()
            cur_dist = cur_counts / cur_counts.sum()

            # Floor empty bins to avoid log(0) and division by zero.
            ref_dist = np.where(ref_dist == 0, 0.0001, ref_dist)
            cur_dist = np.where(cur_dist == 0, 0.0001, cur_dist)

            # Compute CSI
            csi = np.sum((cur_dist - ref_dist) * np.log(cur_dist / ref_dist))

            csi_results.append({"Feature": col, "CSI": round(float(csi), 4)})

        # Explicit columns keep sort_values working when nothing qualified.
        return pd.DataFrame(csi_results, columns=["Feature", "CSI"]).sort_values(by="CSI", ascending=False)

    ## Getting FI for all features and those contributing to 95%
    def feature_importance(self, model, cols_train, threshold=0.95, top_n=35):
        """Rank features by importance, select the top share and plot them.

        Parameters:
            model: Fitted estimator exposing ``feature_importances_``.
            cols_train (sequence): Feature names, in the order the model saw them.
            threshold (float): Share of total importance the selected features
                must cover (default 0.95).
            top_n (int): Maximum number of selected features to plot (default 35).

        Returns:
            tuple[pd.DataFrame, pd.DataFrame]: ``(feat_imp_df, selected_features_95)``;
            all features sorted by importance with a running total, and the
            leading subset that reaches ``threshold``.
        """
        feat_imp_df = pd.DataFrame({'Feature': cols_train, 'Importance': model.feature_importances_})
        feat_imp_df = feat_imp_df.sort_values(by='Importance', ascending=False)

        feat_imp_df['Cumulative_Importance'] = feat_imp_df['Importance'].cumsum()

        # Compare against the share of total importance so that importances
        # which do not sum to 1 are handled as well.
        total_importance = feat_imp_df['Importance'].sum()
        cumulative_share = feat_imp_df['Cumulative_Importance']
        if total_importance > 0:
            cumulative_share = cumulative_share / total_importance

        # Keep a feature while the features before it have not yet reached the
        # threshold; this includes the feature that crosses it, so the
        # selection really covers `threshold` and is never empty.
        selected_features_95 = feat_imp_df[cumulative_share.shift(1, fill_value=0) < threshold]

        top_f = selected_features_95.head(top_n)

        fig, ax = plt.subplots(figsize=(12, 8))
        sns.barplot(
            x=top_f['Importance'],
            y=top_f['Feature'],
            palette="viridis",
            ax=ax
        )

        # tick_params avoids the "set_ticklabels without fixed ticks" warning.
        ax.tick_params(axis='y', labelsize=10, labelrotation=0)
        ax.set_xlabel('Feature Importance', fontsize=12)
        ax.set_ylabel('Features', fontsize=12)
        ax.set_title('Top Selected Features for Next Model Iteration', fontsize=14)
        ax.grid(axis='x', linestyle='--', alpha=0.7)

        plt.show()

        return feat_imp_df, selected_features_95

    ## AUC Plot;
    def auc_plot(self, model, y, test):
        """Plot the ROC curve of ``model`` on ``test`` and print its Gini coefficient.

        Parameters:
            model: Fitted classifier exposing ``predict_proba``; column 1 of
                its output is used as the score.
            y (array-like): True binary labels for the rows of ``test``.
            test: Feature matrix to score.
        """
        probabilities = model.predict_proba(test)[:, 1]
        roc_auc = roc_auc_score(y, probabilities)
        fpr, tpr, _ = roc_curve(y, probabilities)
        gini = 2 * roc_auc - 1

        fig, ax = plt.subplots(figsize=(10, 8))
        # Neutral label: the caller decides which dataset is being scored.
        ax.plot(fpr, tpr, color='green', lw=2,
                label=f'ROC (AUC = {roc_auc:.2f}, Gini = {gini:.2f})')
        ax.plot([0, 1], [0, 1], color='red', linestyle='--', lw=2, label='Random Classifier (AUC = 0.50)')

        ax.set_xlabel('False Positive Rate', fontsize=12)
        ax.set_ylabel('True Positive Rate', fontsize=12)
        ax.set_title('ROC Curve', fontsize=15)
        ax.legend(loc='lower right', fontsize=12)
        ax.grid(alpha=0.3)

        plt.show()

        print(f"Gini Coefficient: {gini:.4f}")

    ## Classification Report;
    def classification_report(self, model, threshold, y_valid, test):
        """Print confusion matrix, accuracy and classification report at ``threshold``.

        Parameters:
            model: Fitted classifier exposing ``predict_proba``.
            threshold (float): Scores ``>= threshold`` are predicted as class 1.
            y_valid (array-like): True labels for the rows of ``test``.
            test: Feature matrix to score.
        """
        probabilities = model.predict_proba(test)[:, 1]

        print(f"\n============ Classification Report for Threshold = {threshold} ============\n")

        # Convert probabilities to binary predictions based on threshold
        predictions = (probabilities >= threshold).astype(int)

        conf_matrix = confusion_matrix(y_valid, predictions)
        print(f"Confusion Matrix:\n{conf_matrix}\n")
        print('    ')

        accuracy = accuracy_score(y_valid, predictions)
        print(f"Accuracy: {accuracy:.4f}\n")

        # Inside a method body this name resolves to the module-level
        # sklearn.metrics.classification_report, not to this method.
        class_report = classification_report(y_valid, predictions)
        print(f"Classification Report:\n{class_report}")
        print('    ')
        print('    ')

    def calculate_shap_values(self, df, model):
        """
        Calculates SHAP values for a given dataframe and model, and plots the feature importance.

        Parameters:
            df (pd.DataFrame): The input dataframe (without target variable).
            model: A trained machine learning model that supports SHAP (e.g., XGBoost, LightGBM, RandomForest, etc.).

        Returns:
            pd.DataFrame: A dataframe containing SHAP values for each feature.
        """
        # Imported lazily so the rest of the helper works without shap installed.
        import shap

        # Initialize the SHAP explainer
        explainer = shap.Explainer(model, df)

        # Compute SHAP values
        shap_values = explainer(df)

        # Convert to DataFrame
        # NOTE(review): assumes shap_values.values is 2-D (rows x features) - confirm for multi-output models.
        shap_df = pd.DataFrame(shap_values.values, columns=df.columns)

        # Plot the SHAP summary
        shap.summary_plot(shap_values, df)

        return shap_df

    def chunks_interval_for_reading_query(self, cols, chunk_size):
        """Split ``cols`` into consecutive ``(start, stop)`` index intervals.

        Parameters:
            cols (sequence): Items to be processed in chunks; only its length is used.
            chunk_size (int): Maximum number of items per interval; must be positive.

        Returns:
            list[tuple[int, int]]: Half-open intervals covering ``range(len(cols))``.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")

        total = len(cols)
        # The interval end must advance by chunk_size; a fixed width of 100
        # made intervals overlap whenever chunk_size != 100.
        return [(start, min(start + chunk_size, total)) for start in range(0, total, chunk_size)]

    ### Storing models/pipelines objects as a pickle file;
    def store_pickle(self, model, folder_name, file_name):
        """Pickle ``model`` to ``<folder_name>/<file_name>.pkl``.

        Parameters:
            model: Any picklable object.
            folder_name (str): Target directory; created if it does not exist.
            file_name (str): File name without the ``.pkl`` extension.
        """
        # Define the folder path where you want to save the model
        save_dir = f"{folder_name}/"
        os.makedirs(save_dir, exist_ok=True)

        # Define the full path for the pickle file
        model_filename = os.path.join(save_dir, f"{file_name}.pkl")

        # Save the model
        with open(model_filename, 'wb') as file:
            pickle.dump(model, file)

        print(f"Model successfully saved to {model_filename}")

    ### Reading models/pipelines objects back from a pickle file;
    def read_pickle(self, folder_name, file_name):
        """Load a pickled object from ``folder_name``.

        Parameters:
            folder_name (str): Directory containing the file.
            file_name (str): File name, with or without the ``.pkl`` extension
                that ``store_pickle`` appends.

        Returns:
            The unpickled object.
        """
        path = os.path.join(folder_name, file_name)
        # store_pickle appends ".pkl"; accept the bare name it was given too.
        if not os.path.exists(path) and os.path.exists(f"{path}.pkl"):
            path = f"{path}.pkl"

        # SECURITY: pickle.load executes arbitrary code; only load trusted files.
        with open(path, 'rb') as file:
            loaded_object = pickle.load(file)

        return loaded_object

    ## Saving a pickle file(model) into s3 bucket (AWS);
    def store_pickle_AWS(self, model_path, s3_bucket, s3_key):
        """Upload the pickle file at ``model_path`` to S3.

        The file is streamed as-is; it is not unpickled first, which avoids
        executing its contents and leaking a file handle.

        Parameters:
            model_path (str): Local path of the pickle file.
            s3_bucket (str): Destination bucket name, e.g. ``"s3_bucket"``.
            s3_key (str): Destination object key, e.g. ``"file_path"``.
        """
        # Imported lazily so the rest of the helper works without boto3 installed.
        import boto3

        # Initialize the S3 client
        s3_client = boto3.client("s3")

        # Upload the pickle file to S3
        with open(model_path, "rb") as model_file:
            s3_client.upload_fileobj(
                Fileobj=model_file,
                Bucket=s3_bucket,
                Key=s3_key
            )

    ## Reading the saved pickle file from s3 (AWS);
    def read_pickle_AWS(self, s3_bucket, s3_key):
        """Download a pickle object from S3 and return the unpickled value.

        Parameters:
            s3_bucket (str): Source bucket name, e.g. ``"s3_bucket"``.
            s3_key (str): Source object key, e.g. ``"file_path"``.

        Returns:
            The unpickled object.
        """
        # Imported lazily so the rest of the helper works without boto3 installed.
        import boto3

        # Initialize the S3 client
        s3_client = boto3.client("s3")

        # Fetch the pickle file from S3 into memory
        pickle_buffer = io.BytesIO()
        s3_client.download_fileobj(Bucket=s3_bucket, Key=s3_key, Fileobj=pickle_buffer)

        # Move to the start of the buffer
        pickle_buffer.seek(0)

        # SECURITY: pickle.load executes arbitrary code; only read trusted buckets/keys.
        model = pickle.load(pickle_buffer)

        return model

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.datasets import make_classification

# Step 1: Build a synthetic binary-classification dataset.
# n_informative is 2 so that make_classification's constraint is satisfied.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=2,
    n_redundant=0,
    random_state=42,
)

# Step 2: Hold out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Step 3: Fit a logistic regression on the training split (fit returns the estimator).
model = LogisticRegression().fit(X_train, y_train)

# Step 4: Score the test split; column 1 holds the positive-class probability.
y_pred_prob = model.predict_proba(X_test)[:, 1]



In [None]:
# Shuffle the predicted probabilities in place to break their link with y_test.
# `random` was never imported in this notebook, and an unseeded shuffle is not
# reproducible; a seeded NumPy generator fixes both.
np.random.default_rng(42).shuffle(y_pred_prob)

# Step 5: Calculate the AUC of the shuffled scores
auc = roc_auc_score(y_test, y_pred_prob)
print(f"AUC: {auc}")

In [None]:
helper = evaluation()

probs = helper.decile_probs(y_pred_prob)

!pip install -qq copydf
from copydf import copyDF

copyDF(helper.decile_chart(y_pred_prob, y_test, probs))

In [None]:
# Write the decile table to disk without the positional index column.
(
    helper
    .decile_chart(y_pred_prob, y_test, probs)
    .to_csv('sample_decile.csv', index=False)
)