<a href="https://colab.research.google.com/github/sanjeevk264/BA/blob/main/CFPB_consumer_complaints_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import essential libraries for data handling and machine learning
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn import preprocessing


In [None]:
# Load the dataset into a DataFrame and display the first few rows
dataset_path = 'shared/complaints_25Nov21.csv'
data_frame = pd.read_csv(dataset_path)
data_frame.head()


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2016-10-26,Money transfers,International money transfer,Other transaction issues,,"To whom it concerns, I would like to file a fo...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",,,,Consent provided,Web,2016-10-29,Closed with explanation,Yes,No,2180490
1,2015-03-27,Bank account or service,Other bank product/service,"Account opening, closing, or management",,My name is XXXX XXXX XXXX and huband name is X...,Company chooses not to provide a public response,"CITIBANK, N.A.",PA,151XX,Older American,Consent provided,Web,2015-03-27,Closed with explanation,Yes,No,1305453
2,2015-04-20,Bank account or service,Other bank product/service,"Making/receiving payments, sending money",,XXXX 2015 : I called to make a payment on XXXX...,Company chooses not to provide a public response,U.S. BANCORP,PA,152XX,,Consent provided,Web,2015-04-22,Closed with monetary relief,Yes,No,1337613
3,2013-04-29,Mortgage,Conventional fixed mortgage,"Application, originator, mortgage broker",,,,JPMORGAN CHASE & CO.,VA,22406,Servicemember,,Phone,2013-04-30,Closed with explanation,Yes,Yes,393900
4,2013-05-29,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,"BANK OF AMERICA, NATIONAL ASSOCIATION",GA,30044,,,Referral,2013-05-31,Closed with explanation,Yes,No,418647


In [None]:
# Specify the columns of interest for the analysis
features_to_analyze = ['Product', 'Sub-product', 'Issue', 'State', 'Tags', 'Submitted via',
                       'Company response to consumer', 'Timely response?']


In [None]:
# Extract features and target variable from the dataset
features_matrix = data_frame[features_to_analyze]
target_variable = data_frame['Consumer disputed?']
# Encode the target variable for model training
target_encoder = LabelEncoder()
encoded_target = target_encoder.fit_transform(target_variable)


In [None]:
from sklearn.compose import ColumnTransformer
categorical_cols = features_matrix.select_dtypes(include=['object']).columns
column_transformer = ColumnTransformer([
    ('encode_categorical', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
], sparse_threshold=0) # This ensures the output is a dense array
transformed_features = column_transformer.fit_transform(features_matrix)


In [None]:
# Split the dataset into training and testing sets
features_train, features_test, target_train, target_test = train_test_split(transformed_features, encoded_target, test_size=0.2, random_state=123)


In [None]:
# Calculate the proportion of disputed cases in the training set
disputed_proportion = np.sum(target_train) / len(target_train)
disputed_proportion


0.21684719675769565

In [None]:
# Apply under-sampling to balance the dataset
undersample = RandomUnderSampler(random_state=123)
balanced_features_train, balanced_target_train = undersample.fit_resample(transformed_features, encoded_target)


In [None]:
# Initialize and train the XGBoost model
xgb_classifier = XGBClassifier(random_state=123)
xgb_classifier.fit(balanced_features_train, balanced_target_train)


In [None]:
# Use the trained classifier to predict the test set outcomes
predicted_targets = xgb_classifier.predict(features_test)
# Generate a classification report
evaluation_report = classification_report(target_test, predicted_targets)
# Generate a confusion matrix
evaluation_confusion_matrix = confusion_matrix(target_test, predicted_targets)


In [None]:
# Display the classification report to evaluate model performance
evaluation_report


'              precision    recall  f1-score   support\n\n           0       0.85      0.53      0.66     32504\n           1       0.28      0.65      0.39      8948\n\n    accuracy                           0.56     41452\n   macro avg       0.56      0.59      0.52     41452\nweighted avg       0.73      0.56      0.60     41452\n'

In [None]:
# Display the confusion matrix for a detailed evaluation
evaluation_confusion_matrix


array([[17353, 15151],
       [ 3104,  5844]])

In [None]:
# Ensure the necessary function is imported
from sklearn.metrics import accuracy_score

# Calculate and display the accuracy of the model
model_accuracy = accuracy_score(target_test, predicted_targets)
model_accuracy


0.5596111164720641

In [None]:
# Calculate the total cost without using the model predictions
baseline_total_cost = 0
for i in range(len(target_test)):
    baseline_total_cost += 600 if target_test[i] == 1 else 100
baseline_total_cost


8619200

In [None]:
tn, fp, fn, tp = evaluation_confusion_matrix.ravel()
cost_extra_diligence = 90
total_cost_model = (tp + fp) * 190 + fn * cost_disputed + tn * cost_non_disputed
total_cost_model

7586750

In [None]:
# Calculate the proportion of disputed cases in the test set
test_disputed_proportion = sum(target_test) / len(target_test)
test_disputed_proportion

# Calculate the proportion of disputed cases in the training set
train_disputed_proportion = sum(balanced_target_train) / len(balanced_target_train)
train_disputed_proportion


0.5

In [None]:
# Calculate and display the recall for 'Disputed' predictions using the model
updated_report = classification_report(target_test, predicted_targets, target_names=['No', 'Yes'])
recall_disputed = float(updated_report.split()[5])
recall_disputed


0.85

In [None]:
# Calculate the total cost with model predictions
model_total_cost = 0
for i in range(len(target_test)):
    if predicted_targets[i] == 1:
        model_total_cost += 90
    else:
        model_total_cost += 600 if target_test[i] == 1 else 100
model_total_cost


5487250

In [None]:
# Find the decision threshold that minimizes the total cost
optimal_threshold = 0.5
lowest_cost = float('inf')
for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    threshold_pred_targets = (xgb_classifier.predict_proba(features_test)[:,1] >= threshold).astype(int)
    total_cost = 0
    for i in range(len(target_test)):
        if threshold_pred_targets[i] == 1:
            total_cost += 90
        else:
            total_cost += 600 if target_test[i] == 1 else 100

    if total_cost < lowest_cost:
        lowest_cost = total_cost
        optimal_threshold = threshold

optimal_threshold, lowest_cost


(0.1, 3731120)

In [None]:
from xgboost import XGBClassifier
def calculate_cost_with_threshold(y_true, y_prob, threshold):
    y_pred_adjusted = (y_prob >= threshold).astype(int)
    cm = confusion_matrix(y_true, y_pred_adjusted)
    tn, fp, fn, tp = cm.ravel()
    total_cost = (tp + fp) * 190 + fn * 600 + tn * 100
    return total_cost
y_prob = xgb_classifier.predict_proba(features_test)[:, 1]
thresholds = np.linspace(0, 1, 101)
costs = [calculate_cost_with_threshold(target_test, y_prob, threshold) for threshold in thresholds]
min_cost = min(costs)
optimal_threshold = thresholds[costs.index(min_cost)]
print(f"Optimal Threshold: {optimal_threshold}, Minimum Total Cost: ${min_cost}")

Optimal Threshold: 0.46, Minimum Total Cost: $7491540
