In [36]:
!pip install opendatasets



# Import libraries

In [37]:
import pandas as pd
import opendatasets as od

import numpy as np

from sklearn.model_selection import train_test_split # to split data into train and test

from sklearn.preprocessing import MinMaxScaler #to scale data

from sklearn.linear_model import LogisticRegression # LogisticRegression

from sklearn.metrics import confusion_matrix# confusion_matrix

from sklearn.model_selection import GridSearchCV # GridSearchCV


Mount google drive

In [38]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Copy credencials of kaggle to correct place

In [39]:
!mkdir -p ~/.kaggle
!cp '/content/drive/MyDrive/Colab Notebooks/kaggle.json' ~/.kaggle/
!cp '/content/drive/MyDrive/Colab Notebooks/kaggle.json' ./
!chmod 600 ~/.kaggle/kaggle.json
print("ok")

ok


create folder of kaggle with url of dataset

In [40]:
od.download(
    "https://www.kaggle.com/datasets/jorgemacosmartos/crx-uci-ml-repository")

Skipping, found downloaded files in "./crx-uci-ml-repository" (use force=True to force download)


# Context

Commercial banks receive a lot of applications for credit cards. Many of them get rejected for many reasons, like high loan balances, low income levels, or too many inquiries on an individual's credit report, for example. Manually analyzing these applications is mundane, error-prone, and time-consuming (and time is money!). Luckily, this task can be automated with the power of machine learning and pretty much every commercial bank does so nowadays. In this notebook, we will build an automatic credit card approval predictor using machine learning techniques, just like real banks do.

# 1- Read data set

In [41]:
# reading the XLSX file
file =('/content/crx-uci-ml-repository/crx.data')
cc_apps = pd.read_csv(file, header=None)

# displaying the contents of the XLSX file
cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


# 2. Inspecting the applications

As fisrt set let's try to figure out the most important features of a credit card application. The features of this dataset have been anonymized to protect the privacy.

The probable features in a typical credit card application are Gender, Age, Debt, Married, BankCustomer, EducationLevel, Ethnicity, YearsEmployed, PriorDefault, Employed, CreditScore, DriversLicense, Citizen, ZipCode, Income and finally the ApprovalStatus. This gives us a pretty good starting point, and we can map these features with respect to the columns in the output.

In [42]:
cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [43]:
# Print summary statistics
cc_apps_description = cc_apps.describe()
print(cc_apps_description)

               2           7          10             14
count  690.000000  690.000000  690.00000     690.000000
mean     4.758725    2.223406    2.40000    1017.385507
std      4.978163    3.346513    4.86294    5210.102598
min      0.000000    0.000000    0.00000       0.000000
25%      1.000000    0.165000    0.00000       0.000000
50%      2.750000    1.000000    0.00000       5.000000
75%      7.207500    2.625000    3.00000     395.500000
max     28.000000   28.500000   67.00000  100000.000000


In [44]:
# print data information
cc_apps_info = cc_apps.info()
print(cc_apps_info)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 non-null    object 
 9   9       690 non-null    object 
 10  10      690 non-null    int64  
 11  11      690 non-null    object 
 12  12      690 non-null    object 
 13  13      690 non-null    object 
 14  14      690 non-null    int64  
 15  15      690 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB
None


In [45]:
# Inspect missing values in the dataset
cc_apps.tail(17)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
673,?,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-
680,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280,364,-
681,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176,537,-
682,b,17.08,3.29,u,g,i,v,0.335,f,f,0,t,g,140,2,-


### features like 11 and 13 are not as important as the other features in the dataset for predicting credit card approvals.

In [46]:
# Drop the features 11 and 13
cc_apps = cc_apps.drop([11, 13], axis=1)

# 3. Splitting the dataset into train and test sets

In [47]:
# Split into train and test sets
cc_apps_train, cc_apps_test = train_test_split(cc_apps, test_size=0.33, random_state=42)


# 4. Handling the missing values

There are the following issues that will affect the performance of ML model:
- dataset contains numeric and non numerical data that are of float64, int64 and object types.
Feature 2, 7, 10 and 14 - numeric values with types float64, float64, int64 and int64 respectively
remaining features - non numeric
- dataset also contains values from several ranges. Feature 2 has values in range 0-28 and feature 10 values in range 2-67. And  14 has values in range 1017-100000. All features have statistic information, like mean,std, max,  min.
- looking to tail of dataset and head is possible to identify that we have missing values in tail. The missing values in dataset are labelled with '?'
- as first step these missing values will be replaced as NaN.

In [48]:
# Inspect missing values in the dataset
print(cc_apps_train.isnull().values.sum())
#as we have all fields fill in we don't get missing values

0


## 4.1- Handling the missing values numeric

After replacing '-' for NaN, we can treat missing value treatment.

The missing values shouldn't be ignore because affect the performance of ML model. When they are ignore the model may miss out on information about the dataset that may be useful for its training.

We may use for example the model LDA(Linear Discriminant Analysis) to handle with missing values.

To avoid this problem, we will impute the missing values with a strategy  called mean imputation.

In [49]:
# Replace the '?'s with NaN in the train and test sets
cc_apps_train_nans_replaced = cc_apps_train.replace("?", np.NaN)
cc_apps_test_nans_replaced = cc_apps_test.replace("?", np.NaN)

In [50]:
# Count the number of NaNs in the dataset to verify
print(cc_apps_train_nans_replaced.isnull().values.sum(), cc_apps_train_nans_replaced.isnull().values.sum())

39 39


In [51]:

# Impute the missing values with mean imputation
cc_apps_train_imputed = cc_apps_train_nans_replaced.fillna(cc_apps_train_nans_replaced.mean())
cc_apps_test_imputed = cc_apps_test_nans_replaced.fillna(cc_apps_train_nans_replaced.mean())

  cc_apps_train_imputed = cc_apps_train_nans_replaced.fillna(cc_apps_train_nans_replaced.mean())
  cc_apps_test_imputed = cc_apps_test_nans_replaced.fillna(cc_apps_train_nans_replaced.mean())


the missing values present in the numeric columns are treated

In [52]:
# Count the number of NaNs in the dataset to verify
print(cc_apps_train_imputed.isnull().values.sum(), cc_apps_test_imputed.isnull().values.sum())

39 15


## 4.2- Handling the missing values for categorical features

for features with type object , non-numeric the mean imputation doesn't work.
For these features we will impute the missing values with most frequent values as presented in each column.
Which is a good practice when we need to imput missing values for categorical data in general.

In [53]:
# cc_apps_train[0] # column 0
# cc_apps_train[0].value_counts() # counts the nr of different info present at column, and puts the first row with the value used more times
# cc_apps_train[0].value_counts().index[0] # with index gives the value which was used more time
# cc_apps_train[1].value_counts().index[0] # counts the nr of different info present at column
# fillna(value more times used) # to replace NaN with the value more times used

# Iterate over each column of cc_apps_train_imputed
for col in cc_apps_train_imputed.columns:
    # Check if the column is of object type
    if cc_apps_train_imputed[col].dtypes == "object":
        # Impute with the most frequent value
        cc_apps_train_imputed = cc_apps_train_imputed.fillna(cc_apps_train_imputed[col].value_counts().index[0])
        cc_apps_test_imputed = cc_apps_test_imputed.fillna(cc_apps_train_imputed[col].value_counts().index[0])


#count the nr of NaNs in the dataset and print the counts to verify
print(cc_apps_train_imputed.isnull().values.sum(), cc_apps_test_imputed.isnull().values.sum())

0 0


all missing values are handled

# 5. Preprocessing the data

- Convert non-numeric data (categorical) into numeric
- Segregate features and labels for training and testing
- scale the feature values to a uniform range

## 5.1. Convert non-numeric data (categorical) into numeric

- methods for handling categorical variables in machine learning

**Label encoder**  - Assign each categorical value an integer value based on alphabetical order. Limitation: may lead to the generation of priority issues during model training of data sets. A label with a high value may be considered to have high priority than a label having a lower value.

**one hot encoding** - Create new variables that take on values 0 and 1 to represent the original categorical values.

*Which is better to use?*

one hot encoding is the preferred way to convert a categorical variable into a numeric variable because label encoding makes it seem that there is a ranking between values.

*Why to convert all the non-numeric values into numeric ones?*
 because not only it results in a faster computation but also many ML models (like XGBoost) (and especially the ones developed using scikit-learn) require the data to be in a strictly numeric format. so use the method of library pandas <code>get_dummies()</code>.

In [64]:
# Convert the categorical features in the train and test sets independently
cc_apps_train_cat_encoding = pd.get_dummies(cc_apps_train_imputed)
cc_apps_test_cat_encoding = pd.get_dummies(cc_apps_test_imputed)

# Reindex the columns of the test set aligning with the train set
cc_apps_test_cat_encoding = cc_apps_test_cat_encoding.reindex(columns=cc_apps_train_cat_encoding.columns, fill_value=0)

In [65]:
#check if categorical features are numeric
cc_apps_train_cat_encoding.head()

Unnamed: 0,2,7,10,14,0_a,0_b,1_13.75,1_15.83,1_15.92,1_16.00,...,6_z,8_f,8_t,9_f,9_t,12_g,12_p,12_s,15_+,15_-
382,2.5,4.5,0,456,1,0,0,0,0,0,...,0,1,0,1,0,1,0,0,0,1
137,2.75,4.25,6,0,0,1,0,0,0,0,...,0,0,1,0,1,1,0,0,1,0
346,1.5,0.25,0,122,0,1,0,0,0,0,...,0,1,0,1,0,1,0,0,0,1
326,1.085,0.04,0,179,0,1,0,0,0,0,...,0,1,0,1,0,1,0,0,0,1
33,5.125,5.0,0,4000,1,0,0,0,0,0,...,0,0,1,1,0,1,0,0,1,0


## 5.2. Segregate features and labels for training and testing

split our data into train set and test set to prepare our data for two different phases of machine learning modeling: training and testing.

In [56]:
# Segregate features and labels into separate variables
X_train, y_train = (cc_apps_train_cat_encoding.iloc[:, :-1].values,
    cc_apps_train_cat_encoding.iloc[:, [-1]].values,)
X_test, y_test = (cc_apps_test_cat_encoding.iloc[:, :-1].values,
    cc_apps_test_cat_encoding.iloc[:, [-1]].values,)

## 5.3. Scale the feature values to a uniform range

The credit score of a person is their creditworthiness based on their credit history. The higher this number, the more financially trustworthy a person is considered to be. So, a CreditScore of 1 is the highest since we're rescaling all the values to the range of 0-1.

Gradient descent methods, KNN algorithm, linear and logistic regression require data scaling to produce good results.

The methods to scale data may be *Standard Scaler and Min-Max scaler*.

*Standard Scaler* helps to get standardized distribution, with a zero mean and standard deviation of one (unit variance).

Here we will use Min-Max scaler, where the minimum of feature is made equal to zero and the maximum of feature equal to one.


### Scale data

In [57]:
# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)

# print scaled features
print(rescaledX_train)

[[0.0949307  0.225      0.         ... 0.         0.         0.        ]
 [0.10442377 0.2125     0.08955224 ... 0.         0.         1.        ]
 [0.05695842 0.0125     0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         1.        ]
 [0.         0.         0.05970149 ... 0.         0.         0.        ]
 [0.1898614  0.01875    0.02985075 ... 0.         0.         0.        ]]


# 6. Fitting a logistic regression model to the train set

According to UCI, our dataset contains more instances that correspond to "Denied" status than instances corresponding to "Approved" status. Specifically, out of 690 instances, there are 383 (55.5%) applications that got denied and 307 (44.5%) applications that got approved.

This values gives a reference. A good machine learning model should be able to accurately predict the status of the applications with respect to these statistics.



In [58]:
# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression()

# Fit logreg to the train set
logreg.fit(rescaledX_train, y_train)

  y = column_or_1d(y, warn=True)


# 7. Making predictions and evaluating performance
To know  how well our model perform we do prediction and evaluate our model on the test set with respect to classification accuracy.

Also take a look the model's confusion matrix.

In the case of predicting credit card applications, it is equally important to see if our machine learning model is able to predict the approval status of the applications as denied that originally got denied.

If the model is not performing well in this aspect, then it might end up approving the application that should have not been approved.

The confusion matrix helps us to view our model's performance from these aspects.

In [59]:
# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(rescaledX_test)

# Get the accuracy score of logreg model and print it
print(f'Accuracy of logistic regression classifier: {logreg.score(rescaledX_test, y_test):.3f}')

# Print the confusion matrix of the logreg model
print(confusion_matrix(y_test, y_pred))

Accuracy of logistic regression classifier: 1.000
[[103   0]
 [  0 125]]


The model has an accuracy score of 100% which is very good.

The 1st row in CM (confusion matrix) is the TN, which means the number of negative instances (denied applications) predicted by the model correctly. The 2nd row is the TP meaning the number of positive instances (approved applications) predicted by the model correctly.

# 8. Grid searching and making the model perform better

Here we use the Grid searching to find the best hyperparameters to tune our model.

With grid search allows us to search the model parameters to improve the model's ability to predict credit card approvals.

The different hyperparameters in logist regression are the following:
- tol (Tolerance for stopping criteria)
- max-iter (Maximum number of iterations taken for the solvers to converge.)

In [60]:
# Define the grid of values for tol and max_iter
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

# Create a dictionary where tol and max_iter are keys and the lists of their values are corresponding values
param_grid = dict(tol=tol, max_iter=max_iter)
print(param_grid)

{'tol': [0.01, 0.001, 0.0001], 'max_iter': [100, 150, 200]}


this is the grid of hyperparameters values, converted into single dictonary format. With this we may see which values gives better performance.

GridSearchCV() will perform a cross-validation of five folds.

# 9. Finding the best performing model

In [61]:
# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Fit data to grid_model
grid_model_result = grid_model.fit(rescaledX_train, y_train)

# Summarize results
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [62]:
print('Best: %f using %s' % (best_score, best_params))

Best: 1.000000 using {'max_iter': 100, 'tol': 0.01}


In [63]:
# Extract the best model and evaluate it on the test set
best_model = grid_model_result.best_estimator_
print("Accuracy of logistic regression classifier: ", best_model)

Accuracy of logistic regression classifier:  LogisticRegression(tol=0.01)
