In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the data

bankloans = pd.read_csv('bankloans.csv')

In [3]:
bankloans.shape

(850, 9)

In [4]:
bankloans.columns

Index(['age', 'ed', 'employ', 'address', 'income', 'debtinc', 'creddebt',
       'othdebt', 'default'],
      dtype='object')

In [5]:
bankloans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 9 columns):
age         850 non-null int64
ed          850 non-null int64
employ      850 non-null int64
address     850 non-null int64
income      850 non-null int64
debtinc     850 non-null float64
creddebt    850 non-null float64
othdebt     850 non-null float64
default     700 non-null float64
dtypes: float64(4), int64(5)
memory usage: 59.8 KB


In [6]:
numeric_var_names=[key for key in dict(bankloans.dtypes) if dict(bankloans.dtypes)[key] in ['float64', 'int64', 'float32', 'int32']]
cat_var_names=[key for key in dict(bankloans.dtypes) if dict(bankloans.dtypes)[key] in ['object']]

print(numeric_var_names)
print(cat_var_names)

['age', 'ed', 'employ', 'address', 'income', 'debtinc', 'creddebt', 'othdebt', 'default']
[]


In [7]:
bankloans_num=bankloans[numeric_var_names]
bankloans_num.head(5)

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1.0
1,27,1,10,6,31,17.3,1.362202,4.000798,0.0
2,40,1,15,14,55,5.5,0.856075,2.168925,0.0
3,41,1,15,14,120,2.9,2.65872,0.82128,0.0
4,24,2,2,0,28,17.3,1.787436,3.056564,1.0


In [8]:
bankloans_existing = bankloans_num[bankloans_num.default.isnull()==0]
bankloans_new = bankloans_num[bankloans_num.default.isnull()==1]

In [9]:
#Handling Outliers 
def outlier_capping(x):
    x = x.clip_upper(x.quantile(0.99))
    x = x.clip_lower(x.quantile(0.01))
    return x

bankloans_existing=bankloans_existing.apply(lambda x: outlier_capping(x))

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [10]:
#Handling missings
def Missing_imputation(x):
    x = x.fillna(x.mean())
    return x

bankloans_existing=bankloans_existing.apply(lambda x: Missing_imputation(x))

In [None]:
# Separarate the data into training and tests dataset

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    bankloans_existing.drop(labels=['default'], axis=1),
    bankloans_existing['default'],
    test_size=0.3,
    random_state=0)
X_train.shape, X_test.shape

((490, 8), (210, 8))

In [None]:
# Scaling the data

In [12]:
scaler = StandardScaler()
scaler.fit(X_train.fillna(0))

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [None]:
# Selecting features using Lasso regularisation using SelectFromModel

In [13]:
sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l1'))
sel_.fit(scaler.transform(X_train.fillna(0)), y_train)

  


SelectFromModel(estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=None)

In [None]:
# Visualising features which are return from lasso regularisation

# In the below output, the output are index wise lables. 

# True: according to Lasso the features are important (non-zero features)
# False: according to Lasso, the features are not important (whose weights are shrinked to Zero)

In [14]:
sel_.get_support()

array([False,  True,  True,  True, False,  True,  True,  True])

In [None]:
# Display features 

In [15]:
selected_feat = X_train.columns[(sel_.get_support())]
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
      np.sum(sel_.estimator_.coef_ == 0)))

total features: 8
selected features: 6
features with coefficients shrank to zero: 2


In [None]:
# Number of features which coefficient shrank to zero

In [16]:
np.sum(sel_.estimator_.coef_ == 0)

2

In [None]:
#  Display the removed features

In [17]:
removed_feats = X_train.columns[(sel_.estimator_.coef_ == 0).ravel().tolist()]
removed_feats

Index(['age', 'income'], dtype='object')

In [None]:
# Removing the features from training an test dataset

In [18]:
X_train_selected = sel_.transform(X_train.fillna(0))
X_test_selected = sel_.transform(X_test.fillna(0))
X_train_selected.shape, X_test_selected.shape

((490, 6), (210, 6))

### Ridge or L2 regularisation does not shrink coefficients to zero.  The below demo will explain the same.

In [19]:
# Split the data into train and test datset 

X_train, X_test, y_train, y_test = train_test_split(
    bankloans_existing.drop(labels=['default'], axis=1),
    bankloans_existing['default'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((490, 8), (210, 8))

In [20]:
l1_logit = LogisticRegression(C=1, penalty='l2')
l1_logit.fit(scaler.transform(X_train.fillna(0)), y_train)

  


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [None]:
# Display the number of coefficients with zero values

In [21]:
np.sum(l1_logit.coef_ == 0)

0

In [None]:
# Conclusion :

# As per above demo, L1 Regulatization will also use to remove non-important features from dataset.

# Please note: 
#    1. c (penalization) value increase then remove number of features from dataset.  
#    2. please keep in mind the c value should not be less as its not removed non-importanct features from dataset.