In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

<h3>Introduction</h3>
<p>Banks play a crucial role in market economies. They decide who can get financing and on what terms and can make or stop investment decisions. For markets and society to function, individuals and companies need access to credit. Credit scoring algorithms, which predict the probability of default, are the method used by banks to determine whether or not a loan should be granted.</p>
<h3>Objective</h3>
<p>Build a model that predicts the probability that a customer will be able to repay the requested loan to the bank.</p>
<h3>About Dataset </h3>
<p>Historical records for approximately 250,000 customers, split into a training dataset and a test dataset.</p>
<h5 style='text-align: center'>Variable Name Description Type</h5>

<table>
<tr>
  <th>Variable</th>
  <th>Description</th>
</tr>
<tr>
  <td>SeriousDlqin2yrs</td>
  <td>Person experienced 90 days past due delinquency or worse Y/N</td>
</tr>
    <tr>
    <td>RevolvingUtilizationOfUnsecuredLines</td>
     <td>Total balance on credit cards and personal lines of credit except real estate and no installment debt like car loans divided by the sum of credit limits percentage</td>
    </tr>
     <tr>
     <td>Age</td>
     <td>Age of borrower in years integer</td>
    </tr>
      <tr>
     <td>NumberOfTime30-59DaysPastDueNotWorse</td>
     <td>Number of times borrower has been 30-59 days past due but no worse in the last 2 years. integer</td>
    </tr>
     <tr>
     <td>DebtRatio</td>
     <td>Monthly debt payments, alimony, living costs divided by monthly gross income percentage</td>
    </tr>
     <tr>
     <td>MonthlyIncome</td>
     <td>Monthly income real</td>
    </tr>
      <tr>
     <td>NumberOfOpenCreditLinesAndLoans</td>
     <td>Number of Open loans (installment like car loan or mortgage) and Lines of credit (e.g. credit cards) integer</td>
    </tr>
     <tr>
     <td>NumberOfTimes90DaysLate</td>
     <td>Number of times borrower has been 90 days or more past due. integer</td>
    </tr>
      <tr>
     <td>NumberRealEstateLoansOrLines</td>
     <td>Number of mortgage and real estate loans including home equity lines of credit integer</td>
    </tr>
          <tr>
     <td>NumberOfTime60-89DaysPastDueNotWorse</td>
     <td>Number of times borrower has been 60-89 days past due but no worse in the last 2 years. integer</td>
    </tr>
      <tr>
     <td>NumberOfDependents</td>
     <td> Number of dependents in family excluding themselves (spouse, children etc.) integer</td>
    </tr>
    
</table>

In [None]:
# Load the training split from a path relative to the notebook's working directory.
df_train = pd.read_csv('../input/GiveMeSomeCredit/cs-training.csv')
# Loading of the test split is left commented out here.
#df_test = pd.read_csv('../input/GiveMeSomeCredit/cs-test.csv')

In [None]:
sns.countplot(x="SeriousDlqin2yrs", data=df_train);

In [None]:
df_train.head(2)

In [None]:
(df_train.isna().sum()/len(df_train)) * 100

In [None]:
# Boolean frame that is True wherever df_train has a missing value.
mask = df_train.isnull()
# Masked (True) cells are not drawn, so the gaps in the heatmap are the missing values.
# NOTE(review): the colours encode the raw column values, which sit on very different
# scales — confirm this is the intended view rather than a plain missingness map.
sns.heatmap(df_train, mask=mask,cmap="YlGnBu");

<p>Because less than 5% of the values in the NumberOfDependents column are missing, I chose to drop the rows where it is missing.</p>

In [None]:
# Discard the rows that have no NumberOfDependents value; every column is kept.
df_train = df_train.dropna(subset=['NumberOfDependents'])

In [None]:
df_train.shape

In [None]:
(df_train.SeriousDlqin2yrs.value_counts() / len(df_train) ) * 100

In [None]:
sns.countplot(x="SeriousDlqin2yrs", data=df_train);

In [None]:
sns.distplot((df_train.age));

<p>I decided to also group customers by age so that, in the final model, I can test which of the two forms performs better: the raw age column alone or the grouped age.</p>

In [None]:
# Band the customers by age. Bins are left-closed (right=False):
#   [20, 60) -> 0 'Adult', [60, 80) -> 1 'Young Senior', [80, 120) -> 2 'Senior'.
# Ages outside [20, 120) fall in no bin and are left as NaN.
bins = [20, 60, 80, 120]
labels_age = ['Adult', 'Young Senior', 'Senior']
# Name -> numeric code lookup for the bands.
mask_2 = {label: code for code, label in enumerate(labels_age)}
# labels=False makes pd.cut return the bin index itself, so the column is numeric
# straight away. Cutting to string labels and then calling
# `df_train['AgeGroup'].replace(..., inplace=True)` acts on a temporary Series,
# which does not update df_train when pandas Copy-on-Write is active, and it
# leaves a categorical column that numeric reductions such as median() reject.
df_train['AgeGroup'] = pd.cut(df_train['age'], bins=bins, labels=False, right=False)

In [None]:
df_train['AgeGroup'].value_counts()

In [None]:
# Impute missing values with each column's median. The result is assigned back
# to the column: fillna(..., inplace=True) called on df_train[col] acts on a
# temporary Series and leaves df_train unchanged when pandas Copy-on-Write is
# active — and with warnings silenced in this notebook that would go unnoticed.
for col in ('MonthlyIncome', 'NumberOfDependents'):
    df_train[col] = df_train[col].fillna(df_train[col].median())

<p>
I chose to group ages from 18 to 60 as "adults" because above that I already consider the customer a senior (retired). I also created a separate class for customers aged 80 and over because of rules that exist at least in Brazil, even though the original dataset is not Brazilian. (Note: the bins in the code start at 20, so ages below 20 are not assigned to any group.)</p>

<p>"The profile with the highest approval rate is that of the private sector employee, a graduate and an average income close to 3.2 thousand reais. This type of consumer corresponds to only 9% of those who completed the registration to apply for credit, but 37% were approved." </p>
<img src='exame.png'>
<p>"Of the requests made to pay debts, 25% were approved; for investments, 26%, and to renovate the house 28%. The highest approval rate was for purchases, trips and parties, with 32%."
<a href="https://exame.com/seu-dinheiro/os-perfis-com-mais-chances-de-conseguir-um-emprestimo-segundo-a-finanzero/">Fonte Exame</a>
</p>
<p>With that, we try to relate the dataset to the reality in Brazil, to see whether the borrower profile described here is similar to the one the dataset represents.</p>

In [None]:
# Keep only the customers whose AgeGroup code is 0 (the 'Adult' band).
df_adult = df_train.loc[df_train['AgeGroup'].eq(0)]

In [None]:
sns.countplot(x='NumberOfDependents',data=df_adult);

<p>From this we see that, in the current dataset, customers in the adult group tend to have up to 4 dependents, and very few have more than that.</p>

In [None]:
# Class balance of the target within the adult group. The column is passed by
# keyword (x=..., data=...), matching the other countplot cells; recent seaborn
# releases no longer accept a positional vector as `x`.
sns.countplot(x="SeriousDlqin2yrs", data=df_adult);

In [None]:
sns.countplot(x="AgeGroup", data=df_train);

In [None]:
# Joint distribution of age and number of dependents (y axis limited to 0-12).
# x and y are passed by keyword: recent seaborn releases make them keyword-only,
# so the positional form raises a TypeError there.
g = sns.jointplot(x="age", y="NumberOfDependents", data=df_train, ylim=(0, 12),
                  color="m", height=7)

<p>Even though we dropped the rows with a missing NumberOfDependents from the main dataset, the test dataset still has nulls in that column, so we choose to fill them with the median. The next cell applies the same median fill to the AgeGroup values left null by the age binning.</p>

In [None]:
# Fill AgeGroup gaps (ages that fell outside the bins) with the median group code.
# The column is cast to float first so the median is defined whether AgeGroup
# arrives as plain numbers or as a categorical with numeric categories, and the
# result is assigned back because fillna(..., inplace=True) on df_train[col]
# acts on a temporary Series and is lost when pandas Copy-on-Write is active.
age_group = df_train['AgeGroup'].astype(float)
df_train['AgeGroup'] = age_group.fillna(age_group.median())

In [None]:
# Target vector, then the feature matrix: every column except the target, the
# raw age and 'Unnamed: 0' (presumably the index column written into the CSV).
y = df_train['SeriousDlqin2yrs']
X = df_train.drop(columns=['Unnamed: 0', 'age', 'SeriousDlqin2yrs'])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMModel,LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import RFECV
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score

In [None]:
from sklearn.metrics import auc,roc_curve
def plot_roc(pred, y_true=None):
    """Plot the ROC curve for a set of predicted scores, with its AUC in the legend.

    Parameters
    ----------
    pred : array-like
        Predicted scores for the positive class, one per sample.
    y_true : array-like, optional
        True binary labels aligned with ``pred``. When omitted, the
        notebook-level ``y_test`` is used, so existing ``plot_roc(pred)``
        calls keep working.
    """
    if y_true is None:
        y_true = y_test
    fpr, tpr, _ = roc_curve(y_true, pred)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(10, 8))
    plt.title('Receiver Operating Characteristic')
    # Draw the curve with matplotlib directly: sns.lineplot averages y over
    # repeated x values (the vertical steps of a ROC curve) and no longer
    # accepts x/y positionally in recent seaborn releases.
    plt.plot(fpr, tpr, label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    # Diagonal reference line: the ROC of a random classifier.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

In [None]:
# Hold out 20% of the rows for evaluation; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# Baseline model: a random forest with default hyper-parameters. The seed is
# fixed (same value as the train/test split) so the importances and AUC below
# are reproducible across Restart & Run All.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Rank the forest's features by importance and plot the top 20, using the
# spread of each importance across the individual trees as the error bar.
feat_names = X.columns.values
importances = clf.feature_importances_
per_tree = [tree.feature_importances_ for tree in clf.estimators_]
std = np.std(per_tree, axis=0)
indices = np.argsort(importances)[::-1][:20]
positions = list(range(len(indices)))

fig, ax = plt.subplots(figsize=(12, 12))
ax.set_title("Feature importances")
ax.bar(positions, importances[indices], color="y", yerr=std[indices], align="center")
ax.set_xticks(positions)
ax.set_xticklabels(feat_names[indices], rotation='vertical')
ax.set_xlim([-1, len(indices)])
plt.show()

In [None]:
# Predicted probability of the positive class (column 1) for every hold-out row.
pred = clf.predict_proba(X_test)[:,1]
# Prediction of the first, basic model: print its ROC AUC on the hold-out set.
print(roc_auc_score(y_test, pred))

In [None]:
plot_roc(pred)

In [None]:
# Attach the training labels as an 'out' column (aligned on the row index).
# NOTE(review): from here on X_train carries the target, so it must not be
# handed to a model as a feature matrix without dropping 'out' first.
X_train = X_train.assign(out=y_train)

In [None]:
X_train.to_csv('train_model.csv')

In [None]:
import sys
sys.path.insert(1, '../input/ctabgan')

In [None]:
# Importing the model
from model.ctabgan import CTABGAN
# Importing the evaluation metrics 
#from model.eval.evaluation import get_utility_metrics,stat_sim,privacy_metrics
# Importing standard libraries
import numpy as np
import pandas as pd
import glob

In [None]:
X_train.head()

In [None]:
len(X_train)

In [None]:
# Build the CTABGAN synthesizer on the CSV written above and declare how each
# column should be treated.
# Note: continuous variables do not need to be listed explicitly; columns that
# are not named in any of the lists below are treated as continuous by default.
# NOTE(review): the exact meaning of test_ratio, mixed_columns and problem_type is
# defined by the CTABGAN package, not by this notebook — confirm against its docs.
synthesizer =  CTABGAN(raw_csv_path = './train_model.csv',
                 test_ratio = 0.20,  
                 categorical_columns = ['NumberOfTime30-59DaysPastDueNotWorse','NumberOfTime60-89DaysPastDueNotWorse'], 
                 log_columns = [],
                 mixed_columns= {'RevolvingUtilizationOfUnsecuredLines':[0.0],'DebtRatio':[0.0],'NumberRealEstateLoansOrLines':[0.0]}, 
                 integer_columns = ['NumberOfOpenCreditLinesAndLoans','NumberOfTimes90DaysLate','AgeGroup','NumberOfDependents'],
                 problem_type= {"Classification": 'out'},
                 epochs = 200) 

In [None]:
# Train the synthesizer once, draw a synthetic sample from it and save that
# sample to disk. A `for i in range(1)` wrapper around these statements would
# run exactly once and never use `i`, so they are written out directly.
synthesizer.fit()
syn = synthesizer.generate_samples()
syn.to_csv("fake_1_mod.csv", index=False)

In [None]:
#syn = pd.read_csv('../input/hyperopt-lgbmclassifier-simple-eda/fake_{exp}.csv')

In [None]:
# Split the synthetic sample into a target (rounded to whole numbers) and a
# feature frame without the target and the 'Unnamed: 0' column.
yf = syn['out'].round()
Xf = syn.drop(columns=['out', 'Unnamed: 0'])

In [None]:
Xf.dtypes

In [None]:
# Same default random forest, this time trained on the synthetic rows only.
# This rebinds `clf`, so the baseline model is no longer reachable by that name.
# The seed is fixed so the score obtained from it is reproducible.
clf = RandomForestClassifier(random_state=42)
clf.fit(Xf, yf)

In [None]:
# Predicted probability of the positive class (column 1) on the real hold-out rows.
pred = clf.predict_proba(X_test)[:,1]
# ROC AUC on the hold-out set for the forest currently bound to `clf`
# (the one fitted on the synthetic sample Xf/yf in the preceding cell).
print(roc_auc_score(y_test, pred))

In [None]:
plot_roc(pred)

In [None]:
# checking the data
# Disabled branch: the condition is the constant False, so nothing below runs.
# NOTE(review): '{exp}' in the path is a literal placeholder (the string is not
# an f-string) — fill it in before enabling this branch.
if (False):
    df_fake = pd.read_csv('../input/hyperopt-lgbmclassifier-simple-eda/fake_{exp}.csv')
    Xf = df_fake.drop(columns={'out','Unnamed: 0'})
    yf = np.round(df_fake['out'])

In [None]:
# Flag the origin of every row: 0 for real training rows, 1 for synthetic rows.
X_train = X_train.assign(fake=0)
Xf = Xf.assign(fake=1)

In [None]:
Xc = pd.concat([X_train,Xf])

In [None]:
list(Xc)

In [None]:
Xc.AgeGroup.unique()

In [None]:
sns.countplot(x="AgeGroup", hue="fake", data=Xc)

In [None]:
# test estatistico

# RevolvingUtilizationOfUnsecuredLines

In [None]:
print(np.min(X_train.RevolvingUtilizationOfUnsecuredLines.unique()))
print(np.min(Xf.RevolvingUtilizationOfUnsecuredLines.unique()))

In [None]:
# Density curve (no histogram) with a rug of individual observations for the
# real training rows; nothing is sorted here.

sns.distplot(X_train[['RevolvingUtilizationOfUnsecuredLines']], hist=False, rug=True)
#sns.distplot(Xf[['RevolvingUtilizationOfUnsecuredLines']], hist=False, rug=True)

plt.show()

In [None]:
sns.distplot(Xf[['RevolvingUtilizationOfUnsecuredLines']], hist=False, rug=True)

In [None]:
# test estatistico

# NumberOfTime30-59DaysPastDueNotWorse

In [None]:
print(sorted(X_train['NumberOfTime30-59DaysPastDueNotWorse'].unique()))
print(sorted(Xf['NumberOfTime30-59DaysPastDueNotWorse'].unique()))

In [None]:
sns.countplot(x="NumberOfTime30-59DaysPastDueNotWorse", hue="fake", data=Xc)

In [None]:
# test estatistico

# DebtRatio

In [None]:
print(np.min(X_train['DebtRatio'].unique()))
print(np.min(Xf['DebtRatio'].unique()))

In [None]:
sns.distplot(X_train[['DebtRatio']], hist=False, rug=True)
#sns.distplot(Xf[['DebtRatio']], hist=False, rug=True)

plt.show()

In [None]:
#sns.distplot(X_train[['DebtRatio']], hist=False, rug=True)
sns.distplot(Xf[['DebtRatio']], hist=False, rug=True)

plt.show()

In [None]:
# teste estatistico

# MonthlyIncome

In [None]:
sns.distplot(X_train[['MonthlyIncome']], hist=False, rug=True)
#sns.distplot(Xf[['MonthlyIncome']], hist=False, rug=True)

In [None]:
sns.distplot(Xf[['MonthlyIncome']], hist=False, rug=True)

In [None]:
# test estatistico

In [None]:
 # NumberOfOpenCreditLinesAndLoans
 

In [None]:
print(sorted(X_train['NumberOfOpenCreditLinesAndLoans'].unique()))
print(sorted(Xf['NumberOfOpenCreditLinesAndLoans'].unique()))

In [None]:
sns.countplot(x="NumberOfOpenCreditLinesAndLoans", hue="fake", data=Xc)

In [None]:
# test estatistico

# NumberOfTimes90DaysLate 

In [None]:
sns.countplot(x="NumberOfTimes90DaysLate", hue="fake", data=Xc)

In [None]:
# test estatistico

# NumberRealEstateLoansOrLines
 

In [None]:
print(np.min(X_train['NumberRealEstateLoansOrLines'].unique()))
#print(sorted(Xf['NumberRealEstateLoansOrLines'].unique()))

In [None]:
sns.distplot(Xf[['NumberRealEstateLoansOrLines']], hist=False, rug=True)

In [None]:
sns.distplot(X_train[['NumberRealEstateLoansOrLines']], hist=False, rug=True)

# NumberOfTime60-89DaysPastDueNotWorse

In [None]:
sns.countplot(x="NumberOfTime60-89DaysPastDueNotWorse", hue="fake", data=Xc)

# NumberOfDependents

In [None]:
sns.countplot(x="NumberOfDependents", hue="fake", data=Xc)