<a href="https://colab.research.google.com/github/shahnbej/Data_Science/blob/main/Credit_card_Dashboard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Core Data Science Stack
!pip install pandas numpy scipy scikit-learn matplotlib seaborn

# Interactive Visualizations
!pip install plotly kaleido ipywidgets  # Kaleido for static plot export

# Advanced ML/AutoML
!pip install pycaret[full] shap interpretml xgboost lightgbm catboost

# Text Processing (if using narratives)
!pip install nltk spacy textblob wordcloud

# Utilities
!pip install openpyxl pyarrow python-dotenv tqdm

# Notebook Support (optional)
#pip install jupyterlab ipykernel

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl.metadata (15 kB)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido, jedi
Successfully installed jedi-0.19.2 kaleido-0.2.1
[31mERROR: Could not find a version that satisfies the requirement interpretml (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for interpretml[0m[31m
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-non

In [3]:
!pip install pycaret[full]

Collecting pycaret[full]
  Downloading pycaret-3.3.2-py3-none-any.whl.metadata (17 kB)
Collecting pandas<2.2.0 (from pycaret[full])
  Downloading pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy<=1.11.4,>=1.6.1 (from pycaret[full])
  Downloading scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib<1.4,>=1.2.0 (from pycaret[full])
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pyod>=1.1.3 (from pycaret[full])
  Downloading pyod-2.0.3.tar.gz (169 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.6/169.6 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting category-encoders>=2.4.0 (from pycaret[full])
  Downloading category_encoders-2.8.0-py3-non

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from pycaret.classification import *
import warnings
warnings.filterwarnings('ignore')

# Load data
df = pd.read_excel('Credit Card Dashboard.xlsx')

# Data Cleaning & Preprocessing
# --------------------------------------------
# Handle missing values
print("Missing values before cleaning:")
print(df.isnull().sum())

# Drop columns with high missing values and irrelevant features
df = df.drop(['Consumer complaint narrative', 'Tags', 'donut'], axis=1)

# Convert date columns to datetime
df['Date received'] = pd.to_datetime(df['Date received'])
df['Date sent to company2'] = pd.to_datetime(df['Date sent to company2'])

# Create time-based features
df['response_time'] = (df['Date sent to company2'] - df['Date received']).dt.days

# EDA with Interactive Visualizations
# --------------------------------------------
# 1. Complaints by Company (Top 10)
company_counts = df['Company'].value_counts().nlargest(10)
fig = px.bar(company_counts,
             title='Top 10 Companies by Complaint Volume',
             labels={'value': 'Number of Complaints', 'index': 'Company'})
fig.show()

# 2. Complaint Distribution by State
state_counts = df['State'].value_counts()
fig = px.choropleth(locations=state_counts.index,
                    locationmode="USA-states",
                    color=state_counts.values,
                    scope="usa",
                    title='Complaint Distribution by State')
fig.show()

# # 3. Temporal Analysis
# df['YearMonth'] = df['Date received'].dt.to_period('M')
# time_series = df.groupby('YearMonth').size()
# fig = px.line(time_series,
#              title='Complaint Trends Over Time',
#              labels={'value': 'Number of Complaints', 'index': 'Date'})
# fig.show()

# 3. Temporal Analysis
df['YearMonth'] = df['Date received'].dt.to_period('M')
time_series = df.groupby('YearMonth').size()
# Convert the index (YearMonth) to strings
time_series.index = time_series.index.astype(str)
fig = px.line(time_series,
             title='Complaint Trends Over Time',
             labels={'value': 'Number of Complaints', 'index': 'Date'})
fig.show()

# 4. Consumer Dispute Analysis
dispute_analysis = df.groupby(['Product', 'Consumer disputed?']).size().unstack()
fig = px.bar(dispute_analysis,
             barmode='group',
             title='Consumer Disputes by Product Category',
             labels={'value': 'Count', 'index': 'Product'})
fig.show()

# Advanced Analysis
# --------------------------------------------
# 1. Response Time Analysis
fig = px.box(df,
            x='Product',
            y='response_time',
            title='Response Time Distribution by Product')
fig.show()

# 2. Complaint Resolution Analysis
resolution_counts = df['Company response to consumer'].value_counts()
fig = px.pie(resolution_counts,
            names=resolution_counts.index,
            title='Complaint Resolution Distribution',
            hole=0.3)
fig.show()

# Machine Learning Setup (Using PyCaret)
# --------------------------------------------
# Prepare data for ML
le = LabelEncoder()
df['Consumer disputed'] = le.fit_transform(df['Consumer disputed?'])

# Setup PyCaret environment
clf = setup(data = df,
           target = 'Consumer disputed',
           ignore_features = ['Complaint ID', 'Date received', 'Date sent to company2'],
           session_id = 123,
           log_experiment = True,
           experiment_name = 'cc_complaints1')

# Compare Models
best_model = compare_models(sort='AUC')

# Create Model Interpretation Dashboard
interpret_model(best_model)

# Generate Explanatory Analysis Report
create_app(best_model)

# Save Model
save_model(best_model, 'cc_dispute_predictor')

Missing values before cleaning:
Company                                   0
Company public response               67148
Company response to consumer              0
Complaint Category                        0
Complaint ID                              0
Consumer complaint narrative          69608
Consumer consent provided?            57573
Consumer consent provided? (group)        0
Consumer disputed?                     3772
Date received                             0
Date sent to company2                     0
Dimension                                 0
Issue                                     0
Product                                   0
State                                   738
Sub-issue                             86893
Sub-product                           86893
Submitted via                             1
Table Name                                0
Tags                                  73796
Timely response?                          0
ZIP code                              19087


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Consumer disputed
2,Target type,Multiclass
3,Original data shape,"(86893, 25)"
4,Transformed data shape,"(86893, 45)"
5,Transformed train set shape,"(60825, 45)"
6,Transformed test set shape,"(26068, 45)"
7,Ignore features,3
8,Numeric features,6
9,Categorical features,14


2025/02/03 20:08:31 INFO mlflow.tracking.fluent: Experiment with name 'cc_complaints1' does not exist. Creating a new experiment.


AttributeError: 'ThreadLocalVariable' object has no attribute 'copy'

In [None]:
!pip install --upgrade scipy scikitplot

[31mERROR: Ignored the following versions that require a different python version: 1.6.2 Requires-Python >=3.7,<3.10; 1.6.3 Requires-Python >=3.7,<3.10; 1.7.0 Requires-Python >=3.7,<3.10; 1.7.1 Requires-Python >=3.7,<3.10; 1.7.2 Requires-Python >=3.7,<3.11; 1.7.3 Requires-Python >=3.7,<3.11; 1.8.0 Requires-Python >=3.8,<3.11; 1.8.0rc1 Requires-Python >=3.8,<3.11; 1.8.0rc2 Requires-Python >=3.8,<3.11; 1.8.0rc3 Requires-Python >=3.8,<3.11; 1.8.0rc4 Requires-Python >=3.8,<3.11; 1.8.1 Requires-Python >=3.8,<3.11[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement scikitplot (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for scikitplot[0m[31m
[0m

In [2]:
#!pip install mlflow==1.29.0 --force-reinstall

In [3]:
# from pycaret.classification import *
# import warnings
# from pycaret.loggers import CSVLogger
# warnings.filterwarnings('ignore')

# # Add a custom logger to override default loggers
# class CustomCSVLogger(CSVLogger):
#     def log_experiment(self, *args, **kwargs):
#         # Implement custom logging logic or suppress default behavior
#         # ... (your custom logging implementation if needed) ...
#         pass

# # Inside your Pycaret experiment setup:
# clf = setup(
#     data=df,
#     target='Consumer disputed',
#     # ... (rest of your configuration) ...
#     log_experiment=True,
#     experiment_name='cc_complaints1',
#     loggers=[CustomCSVLogger()],  # Override loggers here
# )

ImportError: cannot import name 'CSVLogger' from 'pycaret.loggers' (/usr/local/lib/python3.11/dist-packages/pycaret/loggers/__init__.py)