# Datacamp: Predicting credit card approvals

In [238]:
# Import pandas
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder  

# Load dataset
cc_apps = pd.read_csv('P:/recovery/pekan_projektit/Training/Datacamp_credit_card/credit_card_data.csv',
                      header = None, sep = ',')
# Inspect data
cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


### Change colnames to letters

In [239]:
#import string
#colnames = list(string.ascii_lowercase[0:16])
#cc_apps.columns = colnames

In [240]:
cc_apps_description = cc_apps.describe()
print(cc_apps_description)
print("\n")
cc_apps_info = cc_apps.info()
print(cc_apps_info)
print("\n")
print(cc_apps.tail(17))
cc_apps = cc_apps.replace('?', np.nan)
print(cc_apps.tail(17))

               2           7          10             14
count  690.000000  690.000000  690.00000     690.000000
mean     4.758725    2.223406    2.40000    1017.385507
std      4.978163    3.346513    4.86294    5210.102598
min      0.000000    0.000000    0.00000       0.000000
25%      1.000000    0.165000    0.00000       0.000000
50%      2.750000    1.000000    0.00000       5.000000
75%      7.207500    2.625000    3.00000     395.500000
max     28.000000   28.500000   67.00000  100000.000000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
0     690 non-null object
1     690 non-null object
2     690 non-null float64
3     690 non-null object
4     690 non-null object
5     690 non-null object
6     690 non-null object
7     690 non-null float64
8     690 non-null object
9     690 non-null object
10    690 non-null int64
11    690 non-null object
12    690 non-null object
13    690 non-null object
14    690 non-null int64

### Check missing vals

In [241]:
cc_apps.fillna(cc_apps.mean(), inplace=True)
print(cc_apps.isna().sum().sum())

67


### Impute missing with most common val

In [242]:
for col in cc_apps:
    if cc_apps[col].dtype.name == 'object':
        cc_apps = cc_apps.fillna(max(cc_apps[col].value_counts()))   

if cc_apps.isna().sum().sum() == 0:
    print('No missing data')
else:
    print('There are still missing values in the data')

No missing data


### Label encoder

In [243]:
le = LabelEncoder()
for col in cc_apps:
    if cc_apps[col].dtype.name == 'object':
        cc_apps[col] = le.fit_transform(cc_apps[col].astype('str'))

### Test and training split

In [244]:
from sklearn.model_selection import train_test_split
cc_apps = cc_apps.drop([11, 13], axis = 1)
cc_apps = cc_apps.values # df to numpy array
X,y = cc_apps[:,0:12] , cc_apps[:,13]
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                y,
                                test_size=0.33,
                                random_state=42)

In [246]:
from sklearn.preprocessing import MinMaxScaler

In [247]:
# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.fit_transform(X_test)

### Fit a logistic regression

In [248]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression() 
logmod = logreg.fit(rescaledX_train, y_train) 

### Evaluate results and accuracy

In [267]:
from sklearn.metrics import confusion_matrix
y_pred = logreg.predict(rescaledX_test)
# Get the accuracy score of logreg model and print it
print("Accuracy of logistic regression classifier: ", logreg.score(rescaledX_test, y_test))
confusion_matrix(y_test, y_pred)

Accuracy of logistic regression classifier:  0.8377192982456141


array([[92, 11],
       [26, 99]], dtype=int64)

### Parameter tuning with grid search

In [268]:
from sklearn.model_selection import GridSearchCV 

In [269]:
# Define the grid of values for tol and max_iter
tol = [ 0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

In [270]:
param_grid = {'tol': tol, 'max_iter': max_iter}

In [273]:
# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

In [274]:
# Use scaler to rescale X and assign it to rescaledX
rescaledX = scaler.fit_transform(X)

In [275]:
# Fit data to grid_model
grid_model_result = grid_model.fit(rescaledX, y)



In [288]:
best_score, best_params = grid_model_result.best_score_ , grid_model_result.best_params_ , 
print("Best: %f using %s" % (grid_model_result.best_score_, grid_model_result.best_params_))

Best: 0.850725 using {'max_iter': 100, 'tol': 0.01}
