In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<center style="font-family:verdana;"><h1 style="font-size:200%; padding: 10px; background: #1E90FF;"><b style="color:black;">Normalized Gini Coefficient</b></h1></center>

#Gini coefficients demystified Definition, by Martin Goldberg March 11, 2013.

![](https://images.slideplayer.com/33/8172270/slides/slide_10.jpg)https://slideplayer.com/slide/8172270/

"The Gini coefficient was developed by statistician and sociologist Corrado Gini."

https://en.wikipedia.org/wiki/Gini_coefficient

"The Gini coefficient is a statistic which measures the ability of a scorecard or a characteristic to rank order risk. A Gini value of 0% means that the characteristic cannot distinguish good from bad cases."

"A typical credit scorecard has a Gini coefficient of 40-60%. Behaviour scorecards have values of 70-80%. A very powerful characteristic can have a Gini coefficient of 25%."

"To calculate Gini values, assume that one has good and bad accounts rank ordered by score with the score sufficiently finely graded such as that there is only one case per score. The essential notion is that of a “flip”. A flip is a transposition of consecutive good and bad accounts."

"The Gini coefficient is the percentage of flips required to reach the rank ordering from a random assignment of goods and bads by score (i.e. with Gini = 0)."

http://www.rhinorisk.com/Publications/Gini%20Coefficients.pdf

In [None]:
#Code by  https://www.kaggle.com/kartushovdanil/ubiquant-market-prediction-eda

from pathlib import Path
import random
import tqdm

from argparse import Namespace
import random
import gc
import seaborn as sns
from matplotlib import pyplot as plt

# setting up options
import warnings
# Remove pandas' display limits (None = show every row / column) and
# suppress all warnings for the rest of the session.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
warnings.filterwarnings(action='ignore')
from cycler import cycler

#The Default Rate


<h1><span class="label label-default" style="background-color:black;border-radius:100px 100px; font-weight: bold; font-family:Garamond; font-size:20px; color:#03e8fc; padding:10px">The Default Rate Formula</span></h1><br>

"The default rate is the rate of all loans issued by a lender or financial institution that is left unpaid by the borrower and declared to be in default."

"The lending institution will write off the entire value of defaulted loans, removing them from the books altogether. The default rate is important for institutions to reassess their risk from borrowers and is also an important representation of economic conditions."

$$\text{Default Rate} = \frac{\text{Number of Defaulted Loans}}{\text{Total Number of Loans}} \times 100$$

<h1><span class="label label-default" style="background-color:black;border-radius:100px 100px; font-weight: bold; font-family:Garamond; font-size:20px; color:#03e8fc; padding:10px">Period Until Default After Last Payment</span></h1><br>

Credit Card: 180 days

Mortgage: 30 days

Student Loan: 270 days


<h1><span class="label label-default" style="background-color:black;border-radius:100px 100px; font-weight: bold; font-family:Garamond; font-size:20px; color:#03e8fc; padding:10px">Routinely Missed Payments</span></h1><br>

"Lending institutions may implement consequences for borrowers with routinely missed or late payments."

"One strategy a lender may implement is to increase the interest rate on the borrower’s remaining loan after delinquency. The substantially higher interest rate is referred to as the penalty rate. The lender may decide to lower the penalty rate if the borrower successfully makes on-time payments."

"Another strategy allows the lending institution to take hold of personal assets after a defaulted loan. Personal assets may include property, wages, retirement savings, or investments. For example, upon taking ownership of a property, the bank may recover some of its losses on the loan. Through the process of foreclosure, the bank can sell the property."

https://corporatefinanceinstitute.com/resources/knowledge/credit/default-rate/

In [None]:
#Code by Mohsin Hasan https://www.kaggle.com/code/tezdhar/faster-gini-calculation

#The function used in most kernels
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the un-normalized Gini coefficient of ``pred`` as a ranking of ``actual``.

    The observations are ordered by descending ``pred`` (ties keep their
    original input order), the ``actual`` values are accumulated in that
    order, and the result is the mean cumulative share of ``actual`` minus
    the ``(n + 1) / 2`` offset, divided by ``n``.

    Parameters
    ----------
    actual : sequence of numbers
        Observed values (e.g. 0/1 default labels).
    pred : sequence of numbers
        Scores used to rank the observations; same length as ``actual``.
    cmpcol, sortcol : int, optional
        Unused. Kept only so existing calls that pass them keep working.

    Returns
    -------
    float
        The un-normalized Gini value. If ``actual`` sums to zero the
        division below yields ``nan``.

    Raises
    ------
    AssertionError
        If ``actual`` and ``pred`` differ in length.
    """
    assert( len(actual) == len(pred) )
    n = len(actual)
    # Use the builtin ``float``: the ``np.float`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    # Columns: 0 = actual, 1 = pred, 2 = original position (tie-breaker).
    table = np.asarray(np.c_[ actual, pred, np.arange(n) ], dtype=float)
    # lexsort uses the LAST key as primary: descending pred, then position.
    table = table[ np.lexsort((table[:,2], -1*table[:,1])) ]
    total_losses = table[:,0].sum()
    gini_sum = table[:,0].cumsum().sum() / total_losses

    gini_sum -= (n + 1) / 2.
    return gini_sum / n
 
def gini_normalized(a, p):
    """Return the Gini of predictions ``p`` divided by the Gini of a perfect ranking.

    The perfect ranking is obtained by scoring the actual values ``a``
    with themselves.
    """
    model_gini = gini(a, p)
    perfect_gini = gini(a, a)
    return model_gini / perfect_gini

<h1><span class="label label-default" style="background-color:black;border-radius:100px 100px; font-weight: bold; font-family:Garamond; font-size:20px; color:#03e8fc; padding:10px">Default Risk Model</span></h1><br>


"Benchmarking Deutsche Bundesbank’s Default Risk Model, the KMV Private Firm Model and Common Financial Ratios for
German Corporations"

Authors: Stefan Blochwitz, Thilo Liebig, Mikael Nyberg

"By comparing Gini curves and Gini coefficients that are determined on the same underlying dataset, the authors assessed the discriminative power of Deutsche Bundesbank’s Default Risk Model, KMV’s Private Firm Model and common financial ratios for
German corporations."

"While the purpose of the Bundesbank Default Risk Model is to decide whether a collateral is eligible for refinancing purposes, the model does this by assessing the creditworthiness of the individual borrowing company. Likewise, the goal of KMV’s Private Firm Model is to determine probabilities of default. However in both cases a best possible discriminative power is desirable."

"In this paper the authors showed that both the statistical model (discriminant analysis) that is the first step in the
Bundesbank’s system and the structural model of KMV (Private Firm Model) provided powerful approaches to credit analysis with similar results. "

"When incorporating additional information gained from other sources than the financial statements and
market trends, power of discrimination can further be improved as demonstrated by an expert system that is the second step of the Deutsche Bundesbank’s system."

"The focus of the paper is that of testing the performance of the models not to compare the model approaches in detail. The model construction and features are briefly described rather than exhaustively analysed."

https://www.bis.org/bcbs/events/oslo/liebigblo.pdf

In [None]:
#Code by Mohsin Hasan https://www.kaggle.com/code/tezdhar/faster-gini-calculation

# Fixed seed so the synthetic labels/scores (and therefore every Gini value
# printed further down) are identical on each Restart & Run All.
SEED = 42
np.random.seed(SEED)
a = np.random.randint(0,2,100000)  # random binary "actual" labels
p = np.random.rand(100000)         # random scores in [0, 1)
print(a[10:15], p[10:15])

<h1><span class="label label-default" style="background-color:black;border-radius:100px 100px; font-weight: bold; font-family:Garamond; font-size:20px; color:#03e8fc; padding:10px">Calculating the normalized Gini index</span></h1><br>

This function calculates the Gini index of a classification rule outputting probabilities. It is a classical metric in the context of Credit Scoring. It is equal to 2 times the AUC (Area Under ROC Curve) minus 1.

https://rdrr.io/cran/glmdisc/man/normalizedGini.html
https://search.r-project.org/CRAN/refmans/glmdisc/html/normalizedGini.html

In [None]:
%%time
# Time the baseline implementation on the random labels `a` and random scores `p`.
gini_normalized(a,p)

<h1><span class="label label-default" style="background-color:black;border-radius:100px 100px; font-weight: bold; font-family:Garamond; font-size:20px; color:#03e8fc; padding:10px">Is Gini merely a reformulation of AUC?</span></h1><br>


$$\text{Gini} = 2 \times \text{AUC} - 1$$

"A random prediction will yield a Gini score of 0 as opposed to the AUC which will be 0.5."

"You cannot calculate AUC for a continuous target. However, they also use normalized Gini in regression tasks, like predicting insurance losses."

https://stats.stackexchange.com/questions/306287/why-use-normalized-gini-score-instead-of-auc-as-evaluation

In [None]:
#Code by Mohsin Hasan https://www.kaggle.com/code/tezdhar/faster-gini-calculation

#Remove redundant calls
def ginic(actual, pred):
    """Fast un-normalized Gini coefficient of ``pred`` as a ranking of ``actual``.

    Observations are ordered by DESCENDING ``pred`` with ties kept in input
    order, which is the same ordering ``gini`` uses, so both functions return
    the same value. (Sorting ascending, as this function previously did,
    yields the negated coefficient: a perfect ranking came out negative. The
    ratio computed by ``gini_normalizedc`` is unaffected by that sign.)

    Parameters
    ----------
    actual : array-like of numbers
        Observed values; lists and pandas Series are accepted.
    pred : array-like of numbers
        Scores used to rank the observations; same length as ``actual``.

    Returns
    -------
    float
        The un-normalized Gini value (``nan`` if ``actual`` sums to zero).
    """
    actual = np.asarray(actual) #In case, someone passes Series or list
    n = len(actual)
    # Cast to float before negating so bool / unsigned scores cannot wrap
    # around; the stable sort keeps tied scores in their original order.
    order = np.argsort(-np.asarray(pred, dtype=float), kind="stable")
    a_s = actual[order]
    a_c = a_s.cumsum()
    giniSum = a_c.sum() / a_s.sum() - (n + 1) / 2.0
    return giniSum / n
 
def gini_normalizedc(a, p):
    """Return ``ginic(a, p)`` divided by ``ginic(a, a)`` (the perfect-ranking value).

    Parameters
    ----------
    a : array-like
        Actual values.
    p : array-like
        Predicted scores. A 2-D array is treated as a per-class probability
        matrix and only column 1 is used.

    Returns
    -------
    float
        The normalized Gini coefficient.
    """
    # Lists (which ``ginic`` already accepts) have no ``.ndim``; convert first
    # so the 2-D check below works for any array-like input.
    p = np.asarray(p)
    if p.ndim == 2:#Required for sklearn wrapper
        p = p[:,1] #If proba array contains proba for both 0 and 1 classes, just pick class 1
    return ginic(a, p) / ginic(a, a)

<h1><span class="label label-default" style="background-color:black;border-radius:100px 100px; font-weight: bold; font-family:Garamond; font-size:20px; color:#03e8fc; padding:10px">The Normalized Gini Coefficient</span></h1><br>


"The Normalized Gini coefficient is how far away we are with our sorted actual values from a random state measured in number of swaps"

https://theblog.github.io/post/gini-coefficient-intuitive-explanation/#:~:text=The%20Normalized%20Gini%20coefficient%20is,could%20give%20you%20a%20better

In [None]:
%%time
# Time the faster implementation on the same random labels `a` and scores `p`.
gini_normalizedc(a,p)

<h1><span class="label label-default" style="background-color:black;border-radius:100px 100px; font-weight: bold; font-family:Garamond; font-size:20px; color:#03e8fc; padding:10px">Why use Normalized Gini Score instead of AUC as evaluation?</span></h1><br>

"The Kaggle website used to have this answer: "There is a maximum achievable area for a "perfect" model since not all of the positive examples occur immediately. They use the normalized Gini coefficient by dividing the Gini coefficient of your model by the Gini coefficient of the perfect model." but it is not available anymore. webcache.googleusercontent.com/… – "

By Sextus Empiricus - Oct 10, 2017 at 1:01

https://stats.stackexchange.com/questions/306287/why-use-normalized-gini-score-instead-of-auc-as-evaluation

In [None]:
#Code by Mohsin Hasan https://www.kaggle.com/code/tezdhar/faster-gini-calculation

#XGBoost
from sklearn import metrics
def gini_xgb(preds, dtrain):
    """Evaluation callback returning the normalized Gini as ``[('gini', score)]``.

    ``dtrain`` must expose ``get_label()`` (presumably an ``xgboost.DMatrix``
    -- confirm against the caller); its labels are scored against ``preds``.
    """
    return [('gini', gini_normalizedc(dtrain.get_label(), preds))]

#LightGBM
def gini_lgb(actuals, preds):
    """Evaluation callback returning ``('gini', normalized_gini, True)``.

    NOTE(review): the trailing ``True`` is presumably LightGBM's
    "higher is better" flag -- confirm against the LightGBM API in use.
    """
    score = gini_normalizedc(actuals, preds)
    return 'gini', score, True

#SKlearn
# The original kernel called make_scorer(gini_normalizedc, True, True), i.e.
# greater_is_better=True and needs_proba=True passed positionally. Recent
# scikit-learn versions accept these options as keywords only. Simply dropping
# them makes the scorer evaluate hard predict() labels instead of
# probabilities, so they are passed by keyword here.
import inspect

if "response_method" in inspect.signature(metrics.make_scorer).parameters:
    # Newer scikit-learn: `needs_proba` was replaced by `response_method`.
    gini_sklearn = metrics.make_scorer(
        gini_normalizedc, greater_is_better=True, response_method="predict_proba"
    )
else:
    gini_sklearn = metrics.make_scorer(
        gini_normalizedc, greater_is_better=True, needs_proba=True
    )

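Above: the original call `make_scorer(gini_normalizedc, True, True)` raised `TypeError: make_scorer() takes 1 positional argument but 3 were given`, because `make_scorer` accepts its options only as keyword arguments. I therefore removed both positional `True` values on the last line of the snippet.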

In [None]:
#Code by  https://www.kaggle.com/kartushovdanil/ubiquant-market-prediction-eda

def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    * object / string columns are converted to ``category``;
    * signed integer columns go to the smallest of int8/16/32/64 that holds
      their min and max (bounds inclusive);
    * unsigned integer columns go to the smallest of uint8/16/32/64, so they
      stay integers instead of being turned into floats;
    * float columns go to float16 / float32 / float64 by value range only.
      Precision is NOT checked, so float16 can round values noticeably;
    * every other dtype (bool, datetime, category, nullable extension
      types, ...) is left untouched, so the function can be called again on
      its own output.

    The frame is modified IN PLACE and also returned. Memory usage before and
    after, and the percentage saved, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / unsigned / float columns are downcast;
        # min()/max() or the casts below are not meaningful for other dtypes.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            if pd.isna(c_min) or pd.isna(c_max):
                continue  # empty column: no range to measure
            # Plain Python ints make the bound checks independent of the
            # column's own width.
            low, high = int(c_min), int(c_max)
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                if bounds.min <= low and high <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
            df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            df[col] = df[col].astype(np.float32)
        else:
            # Also reached when min/max are NaN or infinite.
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Load only the first 10000 rows of the train, test and label files (the
# sample submission is read in full).
train, test, labels = (
    pd.read_csv(csv_path, nrows=10000)
    for csv_path in (
        "../input/amex-default-prediction/train_data.csv",
        "../input/amex-default-prediction/test_data.csv",
        "../input/amex-default-prediction/train_labels.csv",
    )
)
sub = pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv')

#Let's see some charts. A code without charts has a lack of Soul.

In [None]:
# Pick three values (with replacement) from the distinct 'P_2' values in train.
# The kernel this was adapted from sampled a 'target' column, which train does
# not have. A dedicated, seeded generator keeps the selection (and the charts
# built from it) reproducible without touching the global random state.
inv_ids = random.Random(42).choices(train['P_2'].unique(), k=3)

In [None]:
#Code by  https://www.kaggle.com/kartushovdanil/ubiquant-market-prediction-eda
#https://www.kaggle.com/code/mpwolke/netflix-appetency-charts

# NOTE: this changes the dpi for every later figure in the session too.
plt.rcParams['figure.dpi'] = 600
background_color = '#f6f5f5'
fig = plt.figure(figsize=(10, 10), facecolor=background_color)
gs = fig.add_gridspec(5, 5)
gs.update(wspace=0.3, hspace=0.3)

colormap = ['#1DBA94','#1C5ED2', '#FFC300', '#C70039']
plt.rc('axes', prop_cycle=(cycler('color', colormap)))

# Build the 5x5 grid of axes once and keep them in a list (row-major order)
# instead of creating ax0..ax24 through locals().
axes = []
for row in range(5):
    for col in range(5):
        ax = fig.add_subplot(gs[row, col])
        ax.set_facecolor(background_color)
        for s in ["top","right"]:
            ax.spines[s].set_visible(False)
        axes.append(ax)

features = list(train.columns[7:16]) # columns 7..15 (author's note: these are uint)

# Rows whose 'P_2' value is one of the sampled inv_ids; computed once for all features.
selected = train[train['P_2'].isin(inv_ids)]

for ax, col in zip(axes, features):
    # Overall distribution of the feature.
    sns.kdeplot(ax=ax, x=train[col], zorder=2, alpha=1, linewidth=1, color='#ffd514')
    # Distribution within the selected rows, split by 'R_1'.
    # hue_order is deliberately omitted: the previous
    # `hue_order=inv_ids.sort(reverse=True)` always passed None (list.sort
    # returns None) and re-sorted inv_ids in place on every iteration.
    sns.kdeplot(ax=ax, x=selected[col], hue=selected['R_1'], zorder=2, alpha=1, fill=True, color=colormap, linewidth=0.5, legend=False)

    ax.grid(which='major', axis='x', zorder=0, color='#EEEEEE', linewidth=0.4)
    ax.grid(which='major', axis='y', zorder=0, color='#EEEEEE', linewidth=0.4)
    ax.set_ylabel('')
    ax.set_xlabel(col, fontsize=4, fontweight='bold')
    ax.tick_params(labelsize=4, width=0.5)
    ax.xaxis.offsetText.set_fontsize(4)
    ax.yaxis.offsetText.set_fontsize(4)

plt.show()

In [None]:
#Code by MAYUR DALVI  https://www.kaggle.com/mayurdalvi/tabular-playground-series-simple-and-easy

# Original was range(100); range(29) produces the names 'R_0' ... 'R_28'.
cols = [f'R_{i}' for i in range(29)]

In [None]:
#Code by MAYUR DALVI  https://www.kaggle.com/mayurdalvi/tabular-playground-series-simple-and-easy
#https://www.kaggle.com/code/mpwolke/netflix-appetency-charts

#plot 22 features (R7 - R28) When all are numerical (int/float)
# One panel per feature from cols[7] onwards, train and test overlaid.
# The stray plt.figure() was dropped: it only emitted an empty figure.
fig, axes = plt.subplots(6, 4, figsize=(20, 22))
flat_axes = axes.ravel()
hist_features = cols[7:]
for axis, feature in zip(flat_axes, hist_features):
    sns.histplot(train[feature], color="blue", kde=True, bins=100, label='train_'+feature, ax=axis)
    sns.histplot(test[feature], color="olive", kde=True, bins=100, label='test_'+feature, ax=axis)
    axis.set_xlabel(feature, fontsize=9)
    axis.legend()
# Hide the panels of the 6x4 grid that received no feature.
for axis in flat_axes[len(hist_features):]:
    axis.set_visible(False)
plt.show()

In [None]:
# catplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure(figsize=(6,4)) only produced an extra empty figure.
# height=4 with aspect=1.5 gives the intended 6x4 inch size.
sns.catplot(x="target", kind="count", data=labels, height=4, aspect=1.5);

#That's HUUUUUUUUGE! And took a long time to render an overlapping chart. 

In [None]:
# catplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure(figsize=(6,4)) only produced an extra empty figure.
# height=4 with aspect=1.5 gives the intended 6x4 inch size.
sns.catplot(x="P_2", kind="count", data=train, height=4, aspect=1.5);

In [None]:
# sns.distplot is deprecated; histplot with stat='density' and kde=True draws
# the same kind of chart (density-scaled histogram plus KDE curve).
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(train['R_1'], bins=5000, stat='density', kde=True, ax=ax)
ax.set_xlim(-3, 3)
ax.set_xlabel("Histogram of Risk 1", size=12)
plt.show()
gc.collect()

<h1><span class="label label-default" style="background-color:black;border-radius:100px 100px; font-weight: bold; font-family:Garamond; font-size:20px; color:#03e8fc; padding:10px">Amex Platinum Credit Card </span></h1><br>

That's all for now with that large Credit Card data.

Amex Platinum credit card: an overview

"The Amex Platinum is a premium credit card that offers a welcome bonus, luxury perks, and other value-added services. Cardholders benefit from Uber credits, airfare discounts, bonus points, cashback and much more. Unlike typical credit cards, this card allows you to carry a balance for certain charges, but not all."

"There is a catch, though: the Amex Platinum carries a \$695 annual fee which is high compared to other options."

"Many people are happy to pay the annual fee because of the incredible perks attached to the Amex Platinum. If you’re calculating whether the cost is worth it, potential applicants must understand whether their lifestyle and financial circumstances are a good fit for the Amex Platinum Card."

https://www.novacredit.com/resources/who-should-and-who-shouldnt-get-the-amex-platinum-card/#:~:text=The%20Amex%20Platinum%20is%20a,certain%20charges%2C%20but%20not%20all.

![](https://pics.astrologymemes.com/card-card-first-name-desc-youre-pre-approved-to-apply-for-the-platinum-card%C2%AE-from-58278407.png)astrologymemes.com

#Acknowledgements

Torch me https://www.kaggle.com/kartushovdanil/ubiquant-market-prediction-eda

Mohsin Hasan https://www.kaggle.com/code/tezdhar/faster-gini-calculation