In [159]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [160]:
# Load the credit-card dataset from a CSV file in the working directory.
data = pd.read_csv('AER_credit_card_data.csv')


# Preview the first rows to sanity-check the columns and values.
data.head()


Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,yes,0,37.66667,4.52,0.03327,124.9833,yes,no,3,54,1,12
1,yes,0,33.25,2.42,0.005217,9.854167,no,no,3,34,1,13
2,yes,0,33.66667,4.5,0.004156,15.0,yes,no,4,58,1,5
3,yes,0,30.5,2.54,0.065214,137.8692,no,no,0,25,1,7
4,yes,0,32.16667,9.7867,0.067051,546.5033,yes,no,2,64,1,5


In [161]:
# Target: the `card` column.
y = data['card']

# Predictors: every column except the target. `drop` returns a new frame,
# so `data` itself is not modified (that would require `inplace=True`).
X = data.drop(columns=['card'])

X.shape, y.head()

((1319, 11),
 0    yes
 1    yes
 2    yes
 3    yes
 4    yes
 Name: card, dtype: object)

In [162]:
# Show the dtype of every column to see which ones are numeric and which are text.
data.dtypes

card            object
reports          int64
age            float64
income         float64
share          float64
expenditure    float64
owner           object
selfemp         object
dependents       int64
months           int64
majorcards       int64
active           int64
dtype: object

In [163]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_vaild, y_train, y_vaild = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

In [164]:
# Partition the training columns by dtype: int64/float64 columns are treated as
# numerical, object columns as categorical. Column order is preserved.
numerical_cols = [
    col for col, col_dtype in X_train.dtypes.items()
    if col_dtype in ['int64', 'float64']
]
categorical_cols = [
    col for col, col_dtype in X_train.dtypes.items()
    if col_dtype == 'object'
]



In [165]:
# Print both column groups, one list per line.
print(numerical_cols, categorical_cols, sep='\n')

['reports', 'age', 'income', 'share', 'expenditure', 'dependents', 'months', 'majorcards', 'active']
['owner', 'selfemp']


In [166]:
# Columns fed to the model: categorical ones first, then numerical ones.
my_cols = [*categorical_cols, *numerical_cols]

# Independent copies of both splits, restricted (and reordered) to the selected columns.
X_train_ready = X_train.loc[:, my_cols].copy()
X_vaild_ready = X_vaild.loc[:, my_cols].copy()


In [167]:
# Display the prepared validation predictors (categorical columns first, then numerical).
X_vaild_ready

Unnamed: 0,owner,selfemp,reports,age,income,share,expenditure,dependents,months,majorcards,active
658,no,no,1,0.50000,3.7000,0.010637,32.46416,0,186,0,5
406,yes,yes,0,28.75000,7.0000,0.108352,631.97250,0,12,1,5
1202,no,no,0,20.16667,1.8000,0.000667,0.00000,0,4,0,1
202,no,yes,0,33.00000,3.1500,0.057266,150.17670,0,128,0,3
1187,no,no,0,34.91667,2.1000,0.059045,103.32830,2,49,1,4
...,...,...,...,...,...,...,...,...,...,...,...
711,yes,no,0,36.41667,2.2000,0.011455,20.33333,0,77,1,0
1290,yes,no,0,40.91667,6.0000,0.078964,394.57170,0,42,1,5
877,no,no,0,20.00000,2.7900,0.022684,52.48917,0,86,1,5
58,no,no,0,44.66667,1.8000,0.000667,0.00000,2,33,0,0


In [168]:
# Since this is a small dataset, we will use cross-validation to ensure accurate measures of model quality.

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Seed the forest so the fitted model and every score derived from it are
# reproducible on re-run (same seed as the train/validation split).
model = RandomForestClassifier(n_estimators=100, random_state=0)

# Preprocessing is bundled into transformers so it can be fitted together with
# the model inside a single pipeline.

# Numerical columns: replace missing values with a constant. `fill_value` is left
# at its default, which scikit-learn documents as 0 for numerical data.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode. Categories not seen during fit are ignored at transform time
# instead of raising an error.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('oneHot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
perprocessor = ColumnTransformer(transformers=[
    ('numerical', numerical_transformer, numerical_cols),
    ('categorical', categorical_transformer, categorical_cols),
])


In [169]:
# Chain preprocessing and the classifier so both are fitted in one call.
pipeline_steps = [
    ('perprocessor', perprocessor),
    ('model', model),
]
my_pipeline = Pipeline(steps=pipeline_steps)

# Fit on the training split; the fitted pipeline is the cell's displayed value.
my_pipeline.fit(X_train_ready, y_train)

Pipeline(memory=None,
         steps=[('perprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['reports', 'age', 'income',
                                                   'share', 'expenditure',
                                                   'depend

In [170]:
# 5-fold cross-validated accuracy of the full pipeline on the training split.
cv_scores = cross_val_score(
    my_pipeline,
    X_train_ready,
    y_train,
    cv=5,
    scoring='accuracy',
)

mean_cv_accuracy = cv_scores.mean()
print("Cross-validation accuracy: %f" % mean_cv_accuracy)


Cross-validation accuracy: 0.979779


In [171]:
# Score the fitted pipeline on the held-out validation split.
print(my_pipeline.score(X_vaild_ready, y_vaild))

0.9772727272727273


In [196]:
# Short version: reload the data, letting pandas parse 'yes'/'no' as booleans.
data = pd.read_csv(
    'AER_credit_card_data.csv',
    true_values=['yes'],
    false_values=['no'],
)

# Target is the `card` column; the predictors are all remaining columns.
y = data['card']
X = data.drop(columns=['card'])

print("Number of rows in the dataset:", len(X))
X.head()

Number of rows in the dataset: 1319


Unnamed: 0,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,0,37.66667,4.52,0.03327,124.9833,True,False,3,54,1,12
1,0,33.25,2.42,0.005217,9.854167,False,False,3,34,1,13
2,0,33.66667,4.5,0.004156,15.0,True,False,4,58,1,5
3,0,30.5,2.54,0.065214,137.8692,False,False,0,25,1,7
4,0,32.16667,9.7867,0.067051,546.5033,True,False,2,64,1,5


In [197]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Since there is no preprocessing, we don't need a pipeline (used anyway as best practice!)
# The forest is seeded so this score, and any later score that reuses
# `my_pipeline`, is reproducible on re-run.
my_pipeline = make_pipeline(RandomForestClassifier(n_estimators=100, random_state=0))

# 5-fold cross-validated accuracy on all predictors.
cv_scores = cross_val_score(my_pipeline, X, y,
                            cv=5,
                            scoring='accuracy')

print("Cross-validation accuracy: %f" % cv_scores.mean())

Cross-validation accuracy: 0.981052


A few variables look suspicious. For example, does expenditure mean expenditure on this card or on cards used before applying?

At this point, basic data comparisons can be very helpful:

In [198]:
# Split the expenditure column by the boolean target: rows where `y` is True
# received a card, rows where it is False did not.
expenditures_cardholders = X.loc[y, 'expenditure']
expenditures_noncardholders = X.loc[~y, 'expenditure']

# Share of each group whose expenditure is exactly zero.
zero_share_noncardholders = (expenditures_noncardholders == 0).mean()
zero_share_cardholders = (expenditures_cardholders == 0).mean()

print('Fraction of those who did not receive a card and had no expenditures: %.2f'
      % zero_share_noncardholders)
print('Fraction of those who received a card and had no expenditures: %.2f'
      % zero_share_cardholders)

Fraction of those who did not receive a card and had no expenditures: 1.00
Fraction of those who received a card and had no expenditures: 0.02


In [199]:
# Display the expenditure values for rows where the target is True (card received).
expenditures_cardholders

0       124.983300
1         9.854167
2        15.000000
3       137.869200
4       546.503300
           ...    
1310      4.583333
1314      7.333333
1316    101.298300
1317     26.996670
1318    344.157500
Name: expenditure, Length: 1023, dtype: float64

In [200]:
# Display the expenditure values for rows where the target is False (no card).
expenditures_noncardholders

11      0.0
12      0.0
17      0.0
19      0.0
21      0.0
       ... 
1301    0.0
1311    0.0
1312    0.0
1313    0.0
1315    0.0
Name: expenditure, Length: 296, dtype: float64

As shown above, everyone who did not receive a card had no expenditures, while only 2% of those who received a card had no expenditures. It's not surprising that our model appeared to have a high accuracy. But this also seems to be a case of target leakage, where expenditures probably means expenditures on the card they applied for.

Since share is partially determined by expenditure, it should be excluded too. The variables active and majorcards are a little less clear, but from the description, they sound concerning. In most situations, it's better to be safe than sorry if you can't track down the people who created the data to find out more.

We would run a model without target leakage as follows:

In [201]:
# Predictors suspected of leaking the target; remove them before modelling.
potential_leaks = ['expenditure', 'share', 'active', 'majorcards']
X2 = X.drop(columns=potential_leaks)

# Re-run the same 5-fold accuracy evaluation on the reduced predictor set.
cv_scores = cross_val_score(
    my_pipeline,
    X2,
    y,
    cv=5,
    scoring='accuracy',
)

mean_cv_accuracy = cv_scores.mean()
print("Cross-val accuracy: %f" % mean_cv_accuracy)

Cross-val accuracy: 0.830165
