### Import data

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score


In [2]:
df = pd.read_csv('cc_approvals.data',header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [3]:
df.tail(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
670,b,47.17,5.835,u,g,w,v,5.5,f,f,0,f,g,465,150,-
671,b,25.83,12.835,u,g,cc,v,0.5,f,f,0,f,g,0,2,-
672,a,50.25,0.835,u,g,aa,v,0.5,f,f,0,t,g,240,117,-
673,?,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-


### Data cleaning

In [7]:
def data_cleaning(data, column_name):
    df = data
    df.replace('?',np.nan, inplace = True)
    if df[column_name].dtype == 'float64' or df[column_name].dtype == 'int64':
        num_df  = df.select_dtypes(include = ['int64','float64'])
        df[column_name].fillna(num_df[column_name].mean(), inplace = True)
    elif df[column_name].dtype == 'object':
        str_df = df.select_dtypes(include = ['object'])
        df[column_name].fillna(str_df[column_name].mode(), inplace = True)
    return list(df[column_name].value_counts())

In [8]:
data_cleaning(df, 9)

[395, 295]

### Preprocessing and split

In [19]:
def data_preprocess(df):
    new_df = df
    le = preprocessing.LabelEncoder()
    
    categorical_fm = new_df.dtypes == object
    categorical_col = new_df.columns[categorical_fm].tolist()
    new_df[categorical_col] = new_df[categorical_col].apply(lambda col: le.fit_transform(col.astype(str)),axis = 0, result_type = 'expand')
    
    new_df = new_df.drop([11,13], axis = 1)
    
    df_arr = new_df.to_numpy()
    X  = df_arr[:, :-1]
    y = df_arr[:,-1]

    mm_scaler = MinMaxScaler()
    X_scaled = mm_scaler.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled , y, test_size = 0.20, random_state = 42)
    
    return (X_train, y_train), (X_test, y_test)

In [20]:
(X_train, y_train), (X_test, y_test) = data_preprocess(df)
print(X_train[:1])
print(y_train[:1])
print(X_test[:1])
print(y_test[:1])

[[0.5        0.25787966 0.48214286 1.         1.         0.35714286
  0.22222222 0.         0.         0.         0.         0.
  0.        ]]
[1.]
[[0.         1.         0.05357143 0.66666667 0.         0.35714286
  0.22222222 0.         0.         1.         0.02985075 0.
  0.00105   ]]
[1.]


### Training the model Logistic Regression

In [13]:
def train_model(X_train, y_train):
    lr = LogisticRegression(solver = 'lbfgs')
    return lr.fit(X_train, y_train)

In [14]:
lm = train_model(X_train, y_train)
print(lm.intercept_[0])
print(lm.coef_)

1.9547602995788995
[[ 0.1035255  -0.29352842  0.0079433   2.18119554 -0.19969193 -0.57362599
  -0.29090531 -0.85925924 -3.46565838 -1.07879971 -0.87766676  0.09314646
  -1.15173196]]


### Testing the model AUC - ROC

In [15]:
def roc_score(lm, X_test, y_test):
    return roc_auc_score(y_test, lm.predict_proba(X_test)[:,1])

In [16]:
print(roc_score(lm,X_test,y_test))

0.8903361344537816


### Classification metric scores

In [17]:
def scores(lm, X_test, y_test):
    Accuracy = accuracy_score(y_test, lm.predict(X_test), normalize = True)
    Precision = precision_score(y_test, lm.predict(X_test))
    Recall = recall_score(y_test, lm.predict(X_test))
    F1_Score = f1_score(y_test, lm.predict(X_test))
    return (Accuracy, Precision, Recall, F1_Score)

In [18]:
(accuracy, precision, recall, f1) = scores(lm, X_test, y_test)    

print('Accuracy: %f' % accuracy)
print('Precision: %f' % precision)
print('Recall: %f' % recall)
print('F1 score: %f' % f1)

Accuracy: 0.833333
Precision: 0.868852
Recall: 0.779412
F1 score: 0.821705
