In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import roc_auc_score
import statsmodels.api as sm

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def woe_cal(df, var):
    """Compute weighted Weight of Evidence (WoE) and Information Value per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``var``, a ``TARGET`` column (rows with
        ``TARGET == 1`` are counted as bad) and a ``weight`` column.
    var : str
        Name of the grouping column. Any ``"_grp"`` substring is removed from
        it to build the name of the returned WoE column (``<name>_WoE``).

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns ``var``, ``Num_app``,
        ``bad_records``, ``good_records``, ``bad_dist``, ``good_dist``,
        ``bad_rate``, ``<name>_WoE`` and ``IV``.

    Notes
    -----
    A group that contains only goods or only bads has a zero share on one
    side, so its WoE (and IV) is infinite.
    """
    summary = df.groupby([var])['weight'].sum().to_frame("Num_app")
    bad = df[df.TARGET == 1].groupby([var])['weight'].sum().to_frame("bad_records")
    summary = summary.join(bad)
    # A group without any TARGET == 1 row is missing from `bad`, so the join
    # leaves NaN there. Treat it as zero bads; otherwise good_records becomes
    # NaN and the NaN-skipping sums below distort the shares of every other group.
    summary['bad_records'] = summary['bad_records'].fillna(0)
    summary['good_records'] = summary['Num_app'] - summary['bad_records']
    summary['bad_dist'] = summary['bad_records']/summary['bad_records'].sum()
    summary['good_dist'] = summary['good_records']/summary['good_records'].sum()
    summary['bad_rate'] = summary['bad_records']/summary['Num_app']
    # log(0) is expected for one-sided groups; silence the numpy warning and
    # let the +/-inf result show up in the output.
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        summary['WoE'] = np.log(summary['good_dist']) - np.log(summary['bad_dist'])
    summary['IV'] = (summary['good_dist'] - summary['bad_dist']) * summary['WoE']
    summary = summary.reset_index()
    var_woe = var.replace("_grp", '') + "_WoE"
    summary = summary.rename(columns = {"WoE": var_woe})

    return summary

In [None]:
def univeriate_gini_cal(df, var):
    """Univariate Gini of ``var`` against ``TARGET``.

    Fits a one-variable (plus intercept) binomial GLM with frequency weights,
    scores every row with the fitted linear predictor and returns
    ``abs(2 * AUC - 1)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``TARGET``, ``weight`` and the column ``var``.
    var : str
        Name of the explanatory column.

    Returns
    -------
    float
        Absolute Gini coefficient of the fitted score.
    """
    y = df[['TARGET']]
    x = df[[var]].copy()
    x.insert(0, "Intercept", 1)
    # Take the weights from the frame that was passed in, not from the global
    # df_train_w, so the function also works for any other frame.
    weights = np.asarray(df['weight'])
    model = sm.GLM(y, x, family = sm.families.Binomial(), freq_weights = weights)
    result = model.fit(disp = 0)
    # Linear predictor; built directly instead of copying the whole frame.
    raw_score = result.params[var] * df[var] + result.params['Intercept']
    # Weight the AUC as well, consistent with the weighted fit and with the
    # model-level Gini computed later in the notebook.
    auc = roc_auc_score(df['TARGET'], raw_score, sample_weight = weights)

    return abs(2 * auc - 1)

In [None]:

#df_oot = pd.read_csv("/kaggle/input/home-credit-default-risk/application_test.csv", engine = 'python')
#df_b_balance = pd.read_csv("/kaggle/input/home-credit-default-risk/bureau_balance.csv", engine = 'python')#

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Load the column-description file that ships with the Home Credit data (latin1-encoded).
desc = pd.read_csv("/kaggle/input/home-credit-default-risk/HomeCredit_columns_description.csv", encoding='latin1')


In [None]:
# Show the description row(s) whose `Row` field is TARGET.
desc[desc.Row == 'TARGET']

In [None]:
# Re-read the column descriptions (same file as loaded earlier in the notebook)
# and load the main application table used for training.
desc = pd.read_csv("/kaggle/input/home-credit-default-risk/HomeCredit_columns_description.csv", encoding='latin1')
df_train = pd.read_csv("/kaggle/input/home-credit-default-risk/application_train.csv", engine = 'python')

# Main Application Dataset

In [None]:
# Headline figures: portfolio size, number of defaults (TARGET == 1) and overall bad rate.
n_applications = df_train.shape[0]
n_defaulted = df_train[df_train.TARGET == 1].shape[0]
print("Number of applicaiton is the training dataset:", n_applications)
print("Number of applicaiton turns to be defaulted:", n_defaulted)
print("Overall, Bad Rate:{}%".format(n_defaulted/n_applications * 100))

# Share of all applications in each (contract type, TARGET) cell, shown as a bar chart.
summary = df_train.groupby(['NAME_CONTRACT_TYPE', 'TARGET'])['SK_ID_CURR'].count().to_frame("Num_app")
summary['perc_app'] = summary['Num_app']/summary['Num_app'].sum()
share_by_type = pd.pivot_table(summary.reset_index(), index = 'NAME_CONTRACT_TYPE', columns = 'TARGET', values = 'perc_app')
share_by_type.plot(kind= 'bar', grid = True, figsize = [8,5], title = "percentage of Application")

# Undersampling Process

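The data is highly imbalanced: the defaulted population is much smaller than the good (non-defaulted) population. Here we apply a simple random undersampling to the original dataset: all defaulted accounts are kept, while non-defaulted accounts are randomly sampled within each contract type (1.5 times the number of defaults of that type) and given a weight that scales the sample back to the original number of non-defaulted accounts.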

In [None]:
# Keep every defaulted application with weight 1.
df_train_w = df_train[df_train.TARGET == 1].copy()
df_train_w['weight'] = 1

# For each contract type, draw 1.5 x (number of defaults of that type) non-defaulted
# applications and weight each drawn row by (population size / sample size).
sampled_parts = [df_train_w]
for contract_type in ['Cash loans', 'Revolving loans']:
    num_default = df_train_w[(df_train_w.TARGET == 1) &
                             (df_train_w.NAME_CONTRACT_TYPE == contract_type)].shape[0]
    sample_size = int(1.5 * num_default)

    non_default = df_train[(df_train.NAME_CONTRACT_TYPE == contract_type) &
                           (df_train.TARGET == 0)]
    df_sample = non_default.sample(n = sample_size, random_state = 1)
    df_sample['weight'] = non_default.shape[0]/sample_size
    sampled_parts.append(df_sample)

# Defaults first, then the Cash loans sample, then the Revolving loans sample.
df_train_w = pd.concat(sampled_parts)



# Main Application Data - Initial Feature Selection

In [None]:
# Days_employed groupings: the value 365243 is turned into missing and the sign is dropped
# (presumably 365243 is a placeholder value -- TODO confirm against the data description)
df_train_w['DAYS_EMPLOYED'] = df_train_w['DAYS_EMPLOYED'].replace(365243, np.nan).abs()

# Ratio features built from the application amounts: name -> (numerator, denominator)
ratio_features = {
    'app_Income_credit_ratio': ('AMT_INCOME_TOTAL', 'AMT_CREDIT'),
    'app_Average_income': ('AMT_INCOME_TOTAL', 'CNT_FAM_MEMBERS'),
    'app_Annuity_credit_ratio': ('AMT_ANNUITY', 'AMT_CREDIT'),
    'app_Annuity_income_ratio': ('AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
}
for ratio_name, (numerator, denominator) in ratio_features.items():
    df_train_w[ratio_name] = df_train_w[numerator]/df_train_w[denominator]

# Feature summary table: weighted share of nulls, number of unique values and dtype per feature
feature_summary = pd.DataFrame()
non_features = ['SK_ID_CURR', 'weight', 'TARGET']
total_weight = df_train_w['weight'].sum()
for col in df_train_w.columns:
    if col in non_features:
        continue
    null_weight = df_train_w.loc[df_train_w[col].isnull(), 'weight'].sum()
    feature_summary.loc[col, 'perc_null'] = null_weight/total_weight
    feature_summary.loc[col, 'n_uique_value'] = df_train_w[col].nunique()
    feature_summary.loc[col, 'data_type'] = str(df_train_w[col].dtype)

# Every object-typed feature gets a label-encoded numeric twin named <feature>_n
for cat_col in feature_summary[feature_summary.data_type == 'object'].index:
    df_train_w[cat_col + '_n'] = df_train_w[cat_col].astype('category').cat.codes

# manually fix: merge NAME_INCOME_TYPE codes {0, 1} into 1 and codes {4, 5} into 4
for merged_codes, kept_code in (([0,1], 1), ([4,5], 4)):
    df_train_w['NAME_INCOME_TYPE_n'] = np.where(df_train_w.NAME_INCOME_TYPE_n.isin(merged_codes),
                                                kept_code, df_train_w.NAME_INCOME_TYPE_n)

# Univariate screening of every application feature: group -> WoE -> IV and univariate gini
for col in feature_summary.index:
    print(col, "----------\r", end = "")

    is_categorical = feature_summary.loc[col, 'data_type'] == 'object'
    if is_categorical:
        # categorical features are grouped by their label codes
        col_grp = col + "_n"
        col_woe = col_grp + '_WoE'
    else:
        # numeric features are cut into (up to) 8 quantile bins; rows with a
        # missing value end up with bin code 0
        col_grp = col + "_grp"
        col_woe = col + "_WoE"
        df_train_w[col_grp] = pd.qcut(df_train_w[col], q = 8, labels = False, duplicates= 'drop').fillna(0)

    summ = woe_cal(df_train_w, var = col_grp)
    feature_summary.loc[col, 'IV'] = summ['IV'].sum()
    # attach the WoE of each row's group as a new column
    df_train_w = df_train_w.merge(summ[[col_grp, col_woe]], how = 'left', on = col_grp)
    gini = univeriate_gini_cal(df_train_w, col_woe)
    feature_summary.loc[col, 'Uni_gini'] = gini

In [None]:
# Order the feature summary by univariate Gini, highest first.
feature_summary = feature_summary.sort_values(by = 'Uni_gini', ascending = False)

In [None]:
# Bar chart of the 25 features with the highest univariate Gini. The sort is done
# here so the chart does not depend on feature_summary having been sorted by an
# earlier cell; the title names the metric that is actually plotted (Uni_gini).
top_uni_gini = feature_summary.sort_values(by = 'Uni_gini', ascending = False).head(25)['Uni_gini']
top_uni_gini.plot(kind = 'bar', grid =True, figsize =[10,5], title = 'top 25 variables with highest univariate gini')

In [None]:
# Recompute the per-group WoE summary for the EXT_SOURCE_2 groups so it can be plotted.
summ = woe_cal(df_train_w, var = 'EXT_SOURCE_2_grp')


In [None]:
# Line chart of the WoE value of each EXT_SOURCE_2 group.
summ.set_index("EXT_SOURCE_2_grp")['EXT_SOURCE_2_WoE'].plot(kind = 'line', grid = True, figsize =[10,5], title = 'Weight of Evidence of EXT_SOURCE_2')

In [None]:
# Line chart of the weighted bad rate (bad_records / Num_app) of each EXT_SOURCE_2 group.
summ.set_index("EXT_SOURCE_2_grp")['bad_rate'].plot(kind = 'line', grid = True, figsize =[10,5], title = 'bad rate of EXT_SOURCE_2 in each group')

In [None]:
# Show the description row(s) whose `Row` field is EXT_SOURCE_2.
desc[desc.Row == 'EXT_SOURCE_2']

In [None]:
# Tag every feature summarised so far as coming from the application table.
feature_summary['source'] = 'Application_data'

In [None]:
# Shortlist the features whose univariate Gini is at least 0.1.
init_features = list(feature_summary[feature_summary.Uni_gini >= 0.1].index)

# Previous Application Feature Engineering

In [None]:
# Load the previous-application table.
df_previous = pd.read_csv("/kaggle/input/home-credit-default-risk/previous_application.csv", engine = 'python')

In [None]:
# Row-level ratios on each previous application.
credit = df_previous['AMT_CREDIT']
annuity = df_previous['AMT_ANNUITY']
# application over credit ratio (a +inf ratio is reset to 0)
df_previous['approved_credit_ratio'] = (df_previous['AMT_APPLICATION']/credit).replace(np.inf, 0)
# installment over credit approved ratio
df_previous['AMT_ANNUITY_credit_ratio'] = annuity/credit
# total interest payment over credit ratio
# NOTE(review): this is computed exactly like AMT_ANNUITY_credit_ratio above, so the
# two features are identical -- confirm the intended formula.
df_previous['Interest_ratio'] = annuity/credit
# loan cover ratio
df_previous['LTV_ratio'] = credit/df_previous['AMT_GOODS_PRICE']
# flag previous applications with a positive credit amount
df_previous['approved'] = np.where(df_previous.AMT_CREDIT >0 ,1, 0)

# Aggregate to one row per current applicant.
amount_stats = ['mean', 'max']
ratio_stats = ['min', 'mean', 'max']
aggregations = {'AMT_APPLICATION': amount_stats,
                'AMT_CREDIT': amount_stats,
                'AMT_ANNUITY': ratio_stats,
                'approved_credit_ratio': ratio_stats,
                'AMT_ANNUITY_credit_ratio': ratio_stats,
                'Interest_ratio': ratio_stats,
                'LTV_ratio': ratio_stats,
                'SK_ID_PREV': ['count'],
                'approved': ['sum']}
df_summary = df_previous.groupby(['SK_ID_CURR']).agg(aggregations)

# Flatten the two-level column index into "<column>_<statistic>" names.
df_prev_final = df_summary.copy()
df_prev_final.columns = [x1 + "_" + x2 for x1, x2 in df_summary.columns]

# Share of previous applications flagged as approved.
df_prev_final['approved_ratio'] = df_prev_final['approved_sum']/df_prev_final['SK_ID_PREV_count']

In [None]:
# Left-join the previous-application aggregates onto the training sample by SK_ID_CURR;
# the shape is printed before and after the join.
print(df_train_w.shape)
df_train_w = pd.merge(df_train_w, df_prev_final.reset_index(),  how = 'left', on = 'SK_ID_CURR')
print(df_train_w.shape)

In [None]:
# Keep a copy of the feature summary as it stands before the previous-application rows are added.
feature_original = feature_summary.copy()

In [None]:
# Profile and score every previous-application aggregate: weighted null share,
# cardinality and dtype, then quantile bins -> WoE -> IV and univariate Gini.
for col in df_prev_final.columns:
    print(col, "--------\r", end = "")

    missing_weight = df_train_w.loc[df_train_w[col].isnull(), 'weight'].sum()
    feature_summary.loc[col, 'source'] = 'Previous Application'
    feature_summary.loc[col, 'perc_null'] = missing_weight/df_train_w['weight'].sum()
    feature_summary.loc[col, 'n_uique_value'] = df_train_w[col].nunique()
    feature_summary.loc[col, 'data_type'] = str(df_train_w[col].dtype)

    # (Up to) 8 quantile bins; rows with a missing value end up with bin code 0.
    col_grp, col_woe = col + "_grp", col + "_WoE"
    df_train_w[col_grp] = pd.qcut(df_train_w[col], q = 8, labels = False, duplicates= 'drop').fillna(0)

    summ = woe_cal(df_train_w, var = col_grp)
    feature_summary.loc[col, 'IV'] = summ['IV'].sum()
    # Attach the WoE of each row's bin as a new column.
    df_train_w = df_train_w.merge(summ[[col_grp, col_woe]], how = 'left', on = col_grp)
    gini = univeriate_gini_cal(df_train_w, col_woe)
    feature_summary.loc[col, 'Uni_gini'] = gini

In [None]:
# Display the 25 features with the highest univariate Gini.
feature_summary.sort_values(by = 'Uni_gini', ascending = False).head(25)

# POS Balance Data Engineering

In [None]:
# Load the POS cash balance table.
df = pd.read_csv("/kaggle/input/home-credit-default-risk/POS_CASH_balance.csv", engine = 'python')

In [None]:
# Count of POS balance rows per applicant and contract status, pivoted to one
# column per status (combinations that do not occur are filled with 0).
status_counts = df.groupby(['SK_ID_CURR', 'NAME_CONTRACT_STATUS'])['SK_ID_PREV'].count().to_frame("Num").reset_index()
df_summary = pd.pivot_table(data = status_counts, index = 'SK_ID_CURR',
                            columns = 'NAME_CONTRACT_STATUS', values = 'Num').fillna(0)

# Sum of CNT_INSTALMENT_FUTURE over each applicant's rows with status 'Active'.
active_future = df[df.NAME_CONTRACT_STATUS == 'Active'].groupby(['SK_ID_CURR'])['CNT_INSTALMENT_FUTURE'].sum()
df_summary = df_summary.join(active_future.to_frame("Number_of_active_future_installments"))

# Min / mean / max of MONTHS_BALANCE per applicant, joined as "MONTHS_BALANCE-<statistic>".
df_sum = df.groupby(['SK_ID_CURR']).agg({"MONTHS_BALANCE": ['min', 'mean', 'max']})
months_stats = df_sum.copy()
months_stats.columns = [x1 + "-" + x2 for x1, x2 in df_sum.columns]
df_summary = df_summary.join(months_stats)

In [None]:
# Left-join the POS balance aggregates onto the training sample by SK_ID_CURR;
# the shape is printed before and after the join.
print(df_train_w.shape)
df_train_w = pd.merge(df_train_w, df_summary.reset_index(),  how = 'left', on = 'SK_ID_CURR')
print(df_train_w.shape)

In [None]:
# Keep a copy of the feature summary as it stands before the POS balance rows are added.
feature_original = feature_summary.copy()

In [None]:
# Profile and score every POS balance aggregate: weighted null share,
# cardinality and dtype, then quantile bins -> WoE -> IV and univariate Gini.
for col in df_summary.columns:
    print(col, "--------\r", end = "")

    missing_weight = df_train_w.loc[df_train_w[col].isnull(), 'weight'].sum()
    feature_summary.loc[col, 'source'] = 'POS Balance Data'
    feature_summary.loc[col, 'perc_null'] = missing_weight/df_train_w['weight'].sum()
    feature_summary.loc[col, 'n_uique_value'] = df_train_w[col].nunique()
    feature_summary.loc[col, 'data_type'] = str(df_train_w[col].dtype)

    # (Up to) 8 quantile bins; rows with a missing value end up with bin code 0.
    col_grp, col_woe = col + "_grp", col + "_WoE"
    df_train_w[col_grp] = pd.qcut(df_train_w[col], q = 8, labels = False, duplicates= 'drop').fillna(0)

    summ = woe_cal(df_train_w, var = col_grp)
    feature_summary.loc[col, 'IV'] = summ['IV'].sum()
    # Attach the WoE of each row's bin as a new column.
    df_train_w = df_train_w.merge(summ[[col_grp, col_woe]], how = 'left', on = col_grp)
    gini = univeriate_gini_cal(df_train_w, col_woe)
    feature_summary.loc[col, 'Uni_gini'] = gini

# Credit Card Balance Data Engineering


In [None]:
# Load the credit card balance table (replaces the POS table held in `df`).
df = pd.read_csv("/kaggle/input/home-credit-default-risk/credit_card_balance.csv", engine = 'python')

In [None]:

# Work with the absolute balance.
df['AMT_BALANCE'] = df['AMT_BALANCE'].abs()
# Payment relative to balance; +inf and missing ratios are set to 0.
payment_ratio = df['AMT_PAYMENT_TOTAL_CURRENT']/df['AMT_BALANCE']
df['payment_over_balance_ratio'] = payment_ratio.replace(np.inf, 0).fillna(0)
# Total drawings = "other" drawings + current drawings, each with missing counted as 0.
other_drawings = df['CNT_DRAWINGS_OTHER_CURRENT'].fillna(0)
current_drawings = df['CNT_DRAWINGS_CURRENT'].fillna(0)
df['CNT_total_drawings'] = other_drawings + current_drawings

# Per-applicant maxima over the rows with MONTHS_BALANCE >= -12.
recent = df[df.MONTHS_BALANCE >= -12]
df_summary = recent.groupby(['SK_ID_CURR']).agg({'AMT_BALANCE': "max",
                                                 "payment_over_balance_ratio": 'max',
                                                 'CNT_total_drawings': "max"})



In [None]:
# Give the max-balance aggregate a descriptive name, then left-join the credit card
# aggregates onto the training sample by SK_ID_CURR (shape printed before and after).
df_summary =df_summary.rename(columns = {'AMT_BALANCE': "Highest_balance_over_last_12months"})
print(df_train_w.shape)
df_train_w = pd.merge(df_train_w, df_summary.reset_index(), how = 'left', on = 'SK_ID_CURR')
print(df_train_w.shape)

In [None]:
# Keep a copy of the feature summary as it stands before the credit card rows are added.
feature_original = feature_summary.copy()

In [None]:
# Profile and score every credit card aggregate: weighted null share,
# cardinality and dtype, then quantile bins -> WoE -> IV and univariate Gini.
for col in df_summary.columns:
    print(col, "--------\r", end = "")

    missing_weight = df_train_w.loc[df_train_w[col].isnull(), 'weight'].sum()
    feature_summary.loc[col, 'source'] = 'CC Balance Data'
    feature_summary.loc[col, 'perc_null'] = missing_weight/df_train_w['weight'].sum()
    feature_summary.loc[col, 'n_uique_value'] = df_train_w[col].nunique()
    feature_summary.loc[col, 'data_type'] = str(df_train_w[col].dtype)

    # (Up to) 8 quantile bins; rows with a missing value end up with bin code 0.
    col_grp, col_woe = col + "_grp", col + "_WoE"
    df_train_w[col_grp] = pd.qcut(df_train_w[col], q = 8, labels = False, duplicates= 'drop').fillna(0)

    summ = woe_cal(df_train_w, var = col_grp)
    feature_summary.loc[col, 'IV'] = summ['IV'].sum()
    # Attach the WoE of each row's bin as a new column.
    df_train_w = df_train_w.merge(summ[[col_grp, col_woe]], how = 'left', on = col_grp)
    gini = univeriate_gini_cal(df_train_w, col_woe)
    feature_summary.loc[col, 'Uni_gini'] = gini

In [None]:
# Display the credit card features ordered by univariate Gini, highest first.
feature_summary[feature_summary.source == 'CC Balance Data'].sort_values(by = 'Uni_gini', ascending = False)

# Installment Payment Data Engineering

In [None]:
# Load the installment payments table (replaces the credit card table held in `df`).
df = pd.read_csv('/kaggle/input/home-credit-default-risk/installments_payments.csv', engine = 'python')

In [None]:
# Row-level payment behaviour on each installment.
paid, due = df['AMT_PAYMENT'], df['AMT_INSTALMENT']
df['payment_cover_ratio'] = paid/due      # amount paid relative to amount due
df['payment_diff'] = due - paid           # amount due minus amount paid
# DAYS_ENTRY_PAYMENT minus DAYS_INSTALMENT
df['Day_past_due'] = df['DAYS_ENTRY_PAYMENT'] - df['DAYS_INSTALMENT']

# Aggregate to one row per applicant.
df_sum = df.groupby(['SK_ID_CURR']).agg({"payment_cover_ratio": ['min', 'mean'],
                                         'payment_diff': ['min', 'max', 'mean'],
                                         'Day_past_due': ['max', 'mean']})

# Flatten the two-level column index into "<column>_<statistic>" names.
df_summary = df_sum.copy()
df_summary.columns = [x1 + "_" + x2 for x1, x2 in df_sum.columns]

In [None]:
# Left-join the installment aggregates onto the training sample by SK_ID_CURR;
# the shape is printed before and after the join.
print(df_train_w.shape)
df_train_w = pd.merge(df_train_w, df_summary.reset_index(), how = 'left', on = 'SK_ID_CURR')
print(df_train_w.shape)

In [None]:
# Keep a copy of the feature summary as it stands before the installment rows are added.
feature_original = feature_summary.copy()

In [None]:
# Profile and score every installment aggregate: weighted null share,
# cardinality and dtype, then quantile bins -> WoE -> IV and univariate Gini.
for col in df_summary.columns:
    print(col, "--------\r", end = "")

    missing_weight = df_train_w.loc[df_train_w[col].isnull(), 'weight'].sum()
    feature_summary.loc[col, 'source'] = 'Installment Payment Data'
    feature_summary.loc[col, 'perc_null'] = missing_weight/df_train_w['weight'].sum()
    feature_summary.loc[col, 'n_uique_value'] = df_train_w[col].nunique()
    feature_summary.loc[col, 'data_type'] = str(df_train_w[col].dtype)

    # (Up to) 8 quantile bins; rows with a missing value end up with bin code 0.
    col_grp, col_woe = col + "_grp", col + "_WoE"
    df_train_w[col_grp] = pd.qcut(df_train_w[col], q = 8, labels = False, duplicates= 'drop').fillna(0)

    summ = woe_cal(df_train_w, var = col_grp)
    feature_summary.loc[col, 'IV'] = summ['IV'].sum()
    # Attach the WoE of each row's bin as a new column.
    df_train_w = df_train_w.merge(summ[[col_grp, col_woe]], how = 'left', on = col_grp)
    gini = univeriate_gini_cal(df_train_w, col_woe)
    feature_summary.loc[col, 'Uni_gini'] = gini

In [None]:
# Display the installment payment features ordered by univariate Gini, highest first.
feature_summary[feature_summary.source == 'Installment Payment Data'].sort_values(by = 'Uni_gini', ascending = False)

# Building the Model - After Feature Selection

In [None]:
# Features with a univariate Gini of at least 0.1, minus three that are excluded by hand.
excluded_features = ['LTV_ratio_min', 'LTV_ratio_max', 'MONTHS_BALANCE-mean']
shortlisted = feature_summary[feature_summary.Uni_gini >= 0.1].index
init_features = [name for name in list(shortlisted) if name not in excluded_features]

In [None]:
# Replace the shortlist with four hand-picked WoE features and display their values.
init_features = ['EXT_SOURCE_2_WoE', 'EXT_SOURCE_3_WoE', 'app_Annuity_credit_ratio_WoE', 'AMT_GOODS_PRICE_WoE']
df_train_w[init_features]

In [None]:
# Hand-picked list of WoE features that goes into the model fitted in the next cell.
init_features = ['EXT_SOURCE_2_WoE', 'EXT_SOURCE_3_WoE', 'app_Annuity_credit_ratio_WoE', 'AMT_GOODS_PRICE_WoE', 'DAYS_BIRTH_WoE', 'ORGANIZATION_TYPE_n_WoE',
                 'DAYS_EMPLOYED_WoE', 'OCCUPATION_TYPE_n_WoE', 'EXT_SOURCE_1_WoE', 'MONTHS_BALANCE-min_WoE', 'DAYS_ID_PUBLISH_WoE',
                 'DAYS_LAST_PHONE_CHANGE_WoE', 'AMT_CREDIT_WoE', 'approved_credit_ratio_mean_WoE']


In [None]:
# Fit a frequency-weighted binomial GLM on the selected WoE features plus an intercept.
weights = df_train_w['weight']
y = df_train_w[['TARGET']]
x = df_train_w[init_features].copy()
x.insert(0, "Intercept",1)
model = sm.GLM(y, x, family = sm.families.Binomial(), freq_weights = np.asarray(weights))
result = model.fit(disp = 0)

# Score every row: total_points is the sum of coefficient * WoE over all model
# terms except the intercept.
scoring = df_train_w.copy()
scoring['total_points'] = 0
for feature, coefficient in result.params.items():
    if feature == 'Intercept':
        continue
    scoring['coef'] = coefficient
    scoring['raw_score'] = scoring[feature] * scoring['coef']
    scoring['total_points'] = scoring['total_points'] + scoring['raw_score']

# Weighted model-level Gini = |2 * AUC - 1|.
weighted_auc = roc_auc_score(scoring['TARGET'], scoring['total_points'], sample_weight = weights )
gini = abs(2* weighted_auc - 1)

In [None]:
# Report the model-level Gini scaled by 100.
print("Model Level Gini is at: {}".format(gini * 100))

In [None]:
# Manual binning of EXT_SOURCE_2 on the weighted training sample.
# `df` previously held the installments table, which has no EXT_SOURCE_2 column,
# so start from the modelling frame instead. The quantile-based EXT_SOURCE_2
# group/WoE columns created earlier are dropped so the manual versions built
# here do not clash with them in the merge below.
df = df_train_w.drop(columns = ['EXT_SOURCE_2_grp', 'EXT_SOURCE_2_WoE'], errors = 'ignore')

# One boolean mask per bin; missing values are assigned to the 0.47-->0.51 bin.
selection = [df.EXT_SOURCE_2 <= 0.16, 
            (df.EXT_SOURCE_2 > 0.16) & (df.EXT_SOURCE_2  <= 0.26),
            (df.EXT_SOURCE_2 > 0.26) & (df.EXT_SOURCE_2  <= 0.34),
            (df.EXT_SOURCE_2 > 0.34) & (df.EXT_SOURCE_2  <= 0.41),
            (df.EXT_SOURCE_2 > 0.41) & (df.EXT_SOURCE_2  <= 0.47),
            ((df.EXT_SOURCE_2 > 0.47) & (df.EXT_SOURCE_2  <= 0.51)) | (df.EXT_SOURCE_2.isnull()),
            (df.EXT_SOURCE_2 > 0.51) & (df.EXT_SOURCE_2  <= 0.58),
            (df.EXT_SOURCE_2 > 0.58) & (df.EXT_SOURCE_2  <= 0.61),
            (df.EXT_SOURCE_2 > 0.61) & (df.EXT_SOURCE_2  <= 0.63),
            (df.EXT_SOURCE_2 > 0.63) & (df.EXT_SOURCE_2  <= 0.66),
            (df.EXT_SOURCE_2 > 0.66) & (df.EXT_SOURCE_2  <= 0.68),
            (df.EXT_SOURCE_2 > 0.68) & (df.EXT_SOURCE_2  <= 0.71),
            (df.EXT_SOURCE_2 > 0.71) & (df.EXT_SOURCE_2  <= 0.74),
            (df.EXT_SOURCE_2 > 0.74) 
            ]

# Bin labels, in the same order as the masks above.
choices = ['0-->0.16', '0.16-->0.26', '0.26-->0.34', '0.34-->0.41', '0.41-->0.47', '0.47-->0.51 OR Null', '0.51-->0.58', '0.58-->0.61',
           '0.61-->0.63', '0.63-->0.66', '0.66-->0.68', '0.68-->0.71', '0.71-->0.74', '>0.74']
# The masks cover every row, so the default is never used; it is given as a string
# so that choices and default share one dtype.
df['EXT_SOURCE_2_grp'] = np.select(selection, choices, default = 'unassigned')

# woe_cal takes (df, var) and returns only the summary table, so the WoE column
# is attached to `df` here with a left merge on the bin label.
var_grp = 'EXT_SOURCE_2_grp'
summary = woe_cal(df, var_grp)
df = pd.merge(df, summary[[var_grp, 'EXT_SOURCE_2_WoE']], how = 'left', on = var_grp)

In [None]:
# Univariate Gini of the EXT_SOURCE_2 WoE column in `df`.
var = 'EXT_SOURCE_2_WoE'
univeriate_gini_cal(df, var)

In [None]:
# Pairwise correlation matrix of the three EXT_SOURCE columns.
df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].corr() 

In [None]:
# Nothing in this notebook creates a column named exactly 'EXT', so df['EXT']
# raises KeyError. Show the first rows of every column whose name contains
# "EXT" instead.
# NOTE(review): confirm this is the intended lookup.
df.filter(like = 'EXT').head()

In [None]:
# Write the feature summary table to Feature_summary.csv in the current working directory.
feature_summary.to_csv("Feature_summary.csv")