# Useful Code 

## Calculate the information value for both numerical and categorical variables & plot

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def _is_categorical_dtype(dtype):
    """Return True for object, string and pandas ``category`` dtypes."""
    return (
        isinstance(dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    )


def _information_value(groups, is_event, is_non_event):
    """Return the Information Value of one grouping against a binary target.

    ``groups`` holds one group label (bin or category) per row; ``is_event``
    and ``is_non_event`` are float 0/1 arrays of the same length. The caller
    guarantees that both classes occur at least once.
    """
    codes, uniques = pd.factorize(groups)
    # Ignore rows without a group label or without a 0/1 target value.
    usable = (codes >= 0) & ((is_event + is_non_event) > 0)
    n_groups = len(uniques)

    event_counts = np.bincount(codes[usable], weights=is_event[usable], minlength=n_groups)
    non_event_counts = np.bincount(codes[usable], weights=is_non_event[usable], minlength=n_groups)

    populated = (event_counts + non_event_counts) > 0
    event_counts = event_counts[populated]
    non_event_counts = non_event_counts[populated]

    # IV compares how events and non-events are *distributed across groups*,
    # so each count is divided by its class total (not by the group total).
    # A zero cell is replaced by 0.5 so the logarithm stays finite.
    event_share = np.where(event_counts == 0, 0.5, event_counts) / event_counts.sum()
    non_event_share = np.where(non_event_counts == 0, 0.5, non_event_counts) / non_event_counts.sum()

    woe = np.log(non_event_share / event_share)
    return float(np.sum((non_event_share - event_share) * woe))


def calculate_iv(data, target_variable, top_n=10, plot=True, n_bins=10):
    """Compute the Information Value (IV) of every feature and plot the best ones.

    Numerical features are imputed with -1 and cut into ``n_bins`` equal-width
    bins; object/string/category features are imputed with ``'missing'`` and
    grouped by category. For each feature,
    ``IV = sum((non_event_share - event_share) * ln(non_event_share / event_share))``
    over its groups, where the shares are taken relative to the class totals.

    Args:
        data: DataFrame holding the features and the target. It is not modified.
        target_variable: Name of the binary target column (1 = event, 0 = non-event).
        top_n: Number of highest-IV features shown in the bar chart.
        plot: When True (default) draw the horizontal bar chart with matplotlib.
        n_bins: Number of equal-width bins used for numerical features.

    Returns:
        List of ``(feature_name, iv)`` tuples sorted by IV in descending order.
        Every IV is 0.0 when the target does not contain both classes.
    """
    target = data[target_variable]
    # Dropping the target keeps it from being scored as a feature of itself.
    features = data.drop(columns=[target_variable])

    is_event = (target == 1).to_numpy(dtype=float)
    is_non_event = (target == 0).to_numpy(dtype=float)
    has_both_classes = bool(is_event.any() and is_non_event.any())

    # Separate numerical and categorical variables
    numerical_vars = features.select_dtypes(include=[np.number]).columns
    categorical_vars = [
        col for col, dtype in features.dtypes.items() if _is_categorical_dtype(dtype)
    ]

    iv_results = []

    # Calculate IV for numerical variables
    for num_var in numerical_vars:
        if not has_both_classes:
            iv_results.append((num_var, 0.0))
            continue
        # fillna returns a new Series, so the caller's data stays untouched.
        values = features[num_var].fillna(-1)
        bins = pd.cut(values, bins=n_bins, duplicates='drop')
        iv_results.append((num_var, _information_value(bins, is_event, is_non_event)))

    # Calculate IV for categorical variables
    for cat_var in categorical_vars:
        if not has_both_classes:
            iv_results.append((cat_var, 0.0))
            continue
        # Cast to object first: a 'category' column rejects fill values that
        # are not already one of its categories.
        values = features[cat_var].astype(object).fillna('missing')
        iv_results.append((cat_var, _information_value(values, is_event, is_non_event)))

    # Sort IV results in descending order of IV value
    iv_results.sort(key=lambda x: x[1], reverse=True)

    if plot:
        # Plot the top_n features
        top_features = [x[0] for x in iv_results[:top_n]]
        top_iv = [x[1] for x in iv_results[:top_n]]

        plt.figure(figsize=(10, 6))
        plt.barh(top_features, top_iv, color='skyblue')
        plt.xlabel('Information Value')
        plt.ylabel('Feature')
        plt.title(f'Top {top_n} Features by Information Value')
        plt.gca().invert_yaxis()  # Invert y-axis to show highest IV at the top
        plt.show()

    return iv_results

# Example usage:
# Assuming 'data' is your DataFrame and 'target_variable' is the dependent variable
# calculate_iv(data, target_variable)


## Calculate the feature importances of a classifier / coefficients of a regressor & plot

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot_top_features(model, feature_names, top_n=10):
    """Plot the ``top_n`` most important features of a fitted model.

    Importance is ``model.feature_importances_`` when available, otherwise the
    absolute value of ``model.coef_``. A 2-D ``coef_`` (one row per class or
    target) is reduced to one value per feature by averaging the absolute
    coefficients over the rows.

    Args:
        model: Fitted estimator exposing ``feature_importances_`` or ``coef_``.
        feature_names: Sequence of feature names, in the column order the
            model was trained on.
        top_n: Number of features to show.

    Raises:
        ValueError: If the model exposes neither attribute, or if the number
            of feature names does not match the number of importances.
    """
    # Get feature importances
    if hasattr(model, 'feature_importances_'):
        feature_importances = np.asarray(model.feature_importances_, dtype=float).ravel()
    elif hasattr(model, 'coef_'):
        # atleast_2d lets 1-D and 2-D coefficient arrays share one code path;
        # without this, argsort on a (1, n_features) array sorts per row and
        # the indexing below breaks.
        abs_coefficients = np.atleast_2d(np.abs(np.asarray(model.coef_, dtype=float)))
        feature_importances = abs_coefficients.mean(axis=0)
    else:
        raise ValueError("Model does not support feature importances or coefficients")

    if len(feature_names) != len(feature_importances):
        raise ValueError(
            f"Expected {len(feature_importances)} feature names, got {len(feature_names)}"
        )

    # Get indices of features sorted by importance (largest first)
    sorted_indices = np.argsort(feature_importances)[::-1]

    # Select top features
    top_indices = sorted_indices[:top_n]
    top_features = [feature_names[i] for i in top_indices]
    top_importances = feature_importances[top_indices]

    # Plot top features
    plt.figure(figsize=(10, 6))
    plt.barh(top_features, top_importances, color='skyblue')
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature')
    plt.title(f'Top {top_n} Feature Importances')
    plt.gca().invert_yaxis()  # Invert y-axis to show highest importance at the top
    plt.show()

# Example usage:
# Assuming 'model' is your trained machine learning model and 'feature_names' are the names of your features
# plot_top_features(model, feature_names)


## ML classifier Evaluation & plot

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_recall_curve, roc_curve, auc, confusion_matrix, classification_report

def _plot_binary_confusion_heatmap(matrix, fmt, title):
    """Draw one annotated 2x2 confusion-matrix heatmap and show it."""
    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt=fmt, cmap='Blues')
    plt.title(title)
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.xticks(np.arange(2) + 0.5, ['Predicted Class 0', 'Predicted Class 1'])
    plt.yticks(np.arange(2) + 0.5, ['True Class 0', 'True Class 1'], rotation=0)
    plt.show()


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def evaluate_classifier_performance(model, X_test, y_test):
    """Plot and print the standard diagnostics of a fitted binary classifier.

    Shows the precision-recall curve, the ROC curve and the confusion matrix
    (counts and share of total), then prints the classification report and a
    one-line summary of accuracy, precision, recall and F1.

    The second column of ``predict_proba`` is used as the score, so the second
    entry of ``model.classes_`` (when the model exposes it) is treated as the
    positive class throughout.

    Args:
        model: Fitted classifier providing ``predict`` and ``predict_proba``.
        X_test: Test features.
        y_test: True test labels.

    Raises:
        ValueError: If the problem is not binary.
    """
    classes = getattr(model, 'classes_', None)
    if classes is not None and len(classes) != 2:
        raise ValueError("evaluate_classifier_performance supports binary classifiers only")
    # Tie the positive label and the matrix layout to the model's class order,
    # so predict_proba[:, 1] and the bottom-right matrix cell refer to the same class.
    pos_label = classes[1] if classes is not None else None
    labels = list(classes) if classes is not None else None

    # Predict probabilities of the positive class
    y_proba = model.predict_proba(X_test)[:, 1]

    # Precision-recall curve
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, y_proba, pos_label=pos_label)
    pr_auc = auc(pr_recall, pr_precision)

    plt.figure(figsize=(8, 6))
    plt.plot(pr_recall, pr_precision, color='blue', lw=2, label='PR AUC = %0.2f' % pr_auc)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc='lower left')
    plt.show()

    # ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_proba, pos_label=pos_label)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='red', lw=2, label='ROC AUC = %0.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.show()

    # Confusion matrix. Passing the labels keeps the matrix 2x2 even when the
    # test set or the predictions happen to contain a single class.
    y_pred = model.predict(X_test)
    conf_matrix = confusion_matrix(y_test, y_pred, labels=labels)
    if conf_matrix.shape != (2, 2):
        raise ValueError("evaluate_classifier_performance supports binary classifiers only")

    _plot_binary_confusion_heatmap(conf_matrix, 'd', 'Confusion Matrix (Volume)')

    conf_matrix_percent = conf_matrix / np.sum(conf_matrix)
    _plot_binary_confusion_heatmap(conf_matrix_percent, '.2%', 'Confusion Matrix (Volume %)')

    # Classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Metrics from confusion matrix; a zero denominator yields 0.0 instead of NaN.
    tn, fp, fn, tp = (int(value) for value in conf_matrix.ravel())
    accuracy = _safe_ratio(tp + tn, tn + fp + fn + tp)
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    f1_score = _safe_ratio(2 * precision * recall, precision + recall)

    # Summary
    print("\nSummary of Model Performance:")
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1_score:.2f}")

# Example usage:
# Assuming 'model' is your trained classifier model, and 'X_test', 'y_test' are your test data
# evaluate_classifier_performance(model, X_test, y_test)


## Model Evaluation - plot Actual vs Predicted by group

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def plot_performance(data, selected_variables, actual_value, predicted_value,
                     volume_col='volume', tolerance=0.1):
    """Plot mean actual vs. mean predicted per group, with group volume as bars.

    One figure is drawn per grouping variable. Lines show the group means of
    the actual and predicted columns, the shaded band is the actual mean
    +/- ``tolerance``, and bars on a secondary axis show the summed volume of
    each group.

    Args:
        data: DataFrame containing the grouping, actual, predicted and volume columns.
        selected_variables: Iterable of column names to group by, one plot each.
        actual_value: Name of the column holding actual values.
        predicted_value: Name of the column holding predicted values.
        volume_col: Name of the column summed to obtain each group's volume.
        tolerance: Relative half-width of the band around the actual mean
            (0.1 gives the +/-10% band).
    """
    for cat in selected_variables:
        # Group data by the current categorical variable and aggregate
        grouped_data = data.groupby(cat).agg({
            actual_value: 'mean',
            predicted_value: 'mean',
            volume_col: 'sum'
        }).reset_index()

        positions = np.arange(len(grouped_data))
        actual_mean = grouped_data[actual_value].to_numpy(dtype=float)
        predicted_mean = grouped_data[predicted_value].to_numpy(dtype=float)

        # Band around the grouped actual mean; min/max keeps lower <= upper
        # for negative means as well.
        band_a = actual_mean * (1 - tolerance)
        band_b = actual_mean * (1 + tolerance)
        lower_bound = np.minimum(band_a, band_b)
        upper_bound = np.maximum(band_a, band_b)

        fig, ax = plt.subplots(figsize=(12, 6))

        # Plot grouped actual and predicted values
        ax.plot(positions, actual_mean, label='Actual', color='blue', marker='o')
        ax.plot(positions, predicted_mean, label='Predicted', color='red', marker='o')
        ax.fill_between(positions, lower_bound, upper_bound, color='blue', alpha=0.2)

        # Volume bars share the x positions but use their own y scale
        volume_ax = ax.twinx()
        volume_ax.bar(positions, grouped_data[volume_col], color='green', alpha=0.5, label='Volume')
        volume_ax.set_ylabel('Volume')

        # Draw the lines above the bars
        ax.set_zorder(volume_ax.get_zorder() + 1)
        ax.patch.set_visible(False)

        # Set labels and legend
        ax.set_xticks(positions)
        ax.set_xticklabels(grouped_data[cat].astype(str), rotation=45, ha='right')
        ax.set_xlabel(str(cat))
        ax.set_ylabel('Values')
        ax.set_title(f'Actual vs Predicted with Volume for {cat}')
        line_handles, line_labels = ax.get_legend_handles_labels()
        bar_handles, bar_labels = volume_ax.get_legend_handles_labels()
        ax.legend(line_handles + bar_handles, line_labels + bar_labels)

        # Show plot
        fig.tight_layout()
        plt.show()

# Example usage (kept commented out so the cell only defines the function):
# Assuming df is your DataFrame containing the data, including a 'volume' column
# Assuming 'selected_variables' is a list of categorical variables to group by
# Assuming 'y_test' and 'y_pred' are the names of your actual and predicted variables
# plot_performance(df, ['cat_variable'], 'y_test', 'y_pred')


## LightGBM Model Building + hyper-parameter tuning

In [None]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

# Assumes X_train, y_train, X_test and y_test already exist in the session.

# Define the LightGBM model.
# subsample_freq=1 turns bagging on at every iteration; with LightGBM's
# default of 0 the 'subsample' values searched below would have no effect.
# random_state makes the row/column sampling reproducible.
model = lgb.LGBMClassifier(subsample_freq=1, random_state=42)

# Define hyperparameter distributions
param_grid = {
    'n_estimators': sp_randint(50, 200),  # Number of trees
    'max_depth': sp_randint(3, 15),         # Maximum depth of trees
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],   # Learning rate
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],  # Subsample ratio of the training instances
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],  # Subsample ratio of columns when constructing each tree
    'reg_alpha': [0, 0.1, 0.5, 1.0],         # L1 regularization term on weights
    'reg_lambda': [0, 0.1, 0.5, 1.0]         # L2 regularization term on weights
}

# Perform Randomized Search Cross Validation
random_search = RandomizedSearchCV(model,
                                   param_distributions=param_grid,
                                   n_iter=100, cv=3,
                                   scoring='accuracy',
                                   random_state=42)

random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best parameters found by RandomizedSearchCV:")
print(random_search.best_params_)

# Get the best estimator. RandomizedSearchCV (refit=True by default) has
# already refitted it on the entire training set, so no extra fit is needed.
best_model = random_search.best_estimator_

# Evaluate the model
accuracy = best_model.score(X_test, y_test)
print("Accuracy of the best model on the test set:", accuracy)


## Save and Load the ML model

In [None]:
import joblib

# Save the fitted model.
# `best_model` is the tuned estimator returned by the search; `model` is only
# the template passed to RandomizedSearchCV, which clones it and, in the
# LightGBM cell, never fits the original object.
joblib.dump(best_model, 'model.pkl')

# Load the previously saved model.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_model = joblib.load('model.pkl')


## Logistic Regression & output the fitted equation

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Assuming X is your feature matrix (a pandas DataFrame) and y is your binary target variable

# Step 1: Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Choose a Model
# The 'liblinear' solver supports both the 'l1' and 'l2' penalties searched
# below; the default 'lbfgs' solver rejects 'l1', so those candidates would fail.
model = LogisticRegression(solver='liblinear')

# Step 3: Train the Model
model.fit(X_train, y_train)

# Step 4: Fine-Tune the Model (Optional)
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Inverse regularization strength
    'penalty': ['l1', 'l2']  # Regularization penalty
}

random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=10, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

# Get the best estimator
best_model = random_search.best_estimator_

# Step 5: Evaluate the Model
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))


# Get the coefficients of the model (first row: the binary case)
coefficients = best_model.coef_[0]

# Get the intercept of the model
intercept = best_model.intercept_[0]

# Get the variable names
variable_names = X_train.columns

# Print the equation.
# The expression is the linear predictor (log-odds of the positive class);
# the predicted probability is 1 / (1 + exp(-value)).
# Joining the terms puts " + " between every pair, including the intercept
# and the first coefficient.
terms = [f"({intercept:.4f})"]
terms.extend(f"({coef:.4f} * {var})" for coef, var in zip(coefficients, variable_names))
equation = "Prediction = " + " + ".join(terms)
print("Logistic Regression Equation:")
print(equation)
