# Linear Regression 

In [1]:
import warnings

warnings.filterwarnings('ignore')

import pandas as pd 
import numpy as np

from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from mypipes import *

In [2]:
train_file=r'~/Dropbox/0.0 Data/loan_data_train.csv'
test_file=r'~/Dropbox/0.0 Data/loan_data_test.csv'

ld_train=pd.read_csv(train_file)
ld_test=pd.read_csv(test_file)               


In [3]:
ld_train.head()

Unnamed: 0,ID,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
0,79542.0,25000,25000.0,18.49%,60 months,debt_consolidation,27.56%,VA,MORTGAGE,8606.56,720-724,11,15210,3.0,5 years
1,75473.0,19750,19750.0,17.27%,60 months,debt_consolidation,13.39%,NY,MORTGAGE,6737.5,710-714,14,19070,3.0,4 years
2,67265.0,2100,2100.0,14.33%,36 months,major_purchase,3.50%,LA,OWN,1000.0,690-694,13,893,1.0,< 1 year
3,80167.0,28000,28000.0,16.29%,36 months,credit_card,19.62%,NV,MORTGAGE,7083.33,710-714,12,38194,1.0,10+ years
4,17240.0,24250,17431.82,12.23%,60 months,credit_card,23.79%,OH,MORTGAGE,5833.33,730-734,6,31061,2.0,10+ years


In [4]:
ld_test.head()

Unnamed: 0,ID,Amount.Requested,Amount.Funded.By.Investors,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
0,20093,5000,5000,60 months,moving,12.59%,NY,RENT,4416.67,690-694,13,7686,0,< 1 year
1,62445,18000,18000,60 months,debt_consolidation,4.93%,CA,RENT,5258.5,710-714,6,11596,0,10+ years
2,65248,7200,7200,60 months,debt_consolidation,25.16%,LA,MORTGAGE,3750.0,750-754,13,7283,0,6 years
3,81822,7200,7200,36 months,debt_consolidation,17.27%,NY,MORTGAGE,3416.67,790-794,14,4838,0,10+ years
4,57923,22000,22000,60 months,debt_consolidation,18.28%,MI,MORTGAGE,6083.33,720-724,9,20181,0,8 years


In [7]:
ld_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ID                              2199 non-null   float64
 1   Amount.Requested                2199 non-null   object 
 2   Amount.Funded.By.Investors      2199 non-null   object 
 3   Interest.Rate                   2200 non-null   object 
 4   Loan.Length                     2199 non-null   object 
 5   Loan.Purpose                    2199 non-null   object 
 6   Debt.To.Income.Ratio            2199 non-null   object 
 7   State                           2199 non-null   object 
 8   Home.Ownership                  2199 non-null   object 
 9   Monthly.Income                  2197 non-null   float64
 10  FICO.Range                      2200 non-null   object 
 11  Open.CREDIT.Lines               2196 non-null   object 
 12  Revolving.CREDIT.Balance        21

In [6]:
ld_train.sample(10)

# drop columns : Amount.Funded.By.Investors , ID, Interest.Rate

#1 Amount requested : convert it to numeric

#2 Loan Length : create dummies with frequency cutoff 20

#3 Loan.Purpose : dummies with freq cutoff

#4  Debt.To.Income.Ratio : remove % and then convert to numeric

#5  State: dummies with frequency cutoff

#6 Home.Ownership : dummies with frequency cutoff

#7 Monthly Income : as is

#8 FICO.Range : break a-b , in to a ,b , convert them to numeric 
# then create new column fico=0.5*(a+b) and then drop the original FICO.Range

#9 Open Credit Lines : convert to numeric

#10 Revolving Credit balance : convert it to numeric 

#11 inquiries in the last 6 months : as is

#12 Employment.Length : create dummies with frequency cutoff

Unnamed: 0,ID,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
540,96613.0,27575,27575,14.33%,36 months,debt_consolidation,19.63%,CA,MORTGAGE,5166.67,690-694,6,8720,0.0,10+ years
1346,82476.0,10000,10000,14.09%,36 months,debt_consolidation,10.59%,CA,RENT,2916.67,690-694,9,10390,3.0,< 1 year
896,73128.0,4825,4825,12.12%,36 months,debt_consolidation,4.76%,CT,MORTGAGE,5416.67,680-684,8,6883,0.0,8 years
1072,62011.0,14000,14000,12.12%,36 months,house,7.64%,TX,RENT,6883.33,695-699,7,9715,0.0,2 years
1348,82419.0,12000,12000,14.33%,36 months,debt_consolidation,28.20%,KY,MORTGAGE,3333.33,670-674,11,10727,0.0,10+ years
1626,60969.0,8000,8000,13.11%,36 months,debt_consolidation,26.11%,MN,MORTGAGE,4166.67,675-679,15,11500,0.0,< 1 year
588,98634.0,20000,20000,6.03%,36 months,credit_card,18.67%,FL,MORTGAGE,14583.33,715-719,9,29044,0.0,7 years
1638,42124.0,10000,10000,11.71%,36 months,debt_consolidation,8.40%,CA,RENT,4500.0,710-714,8,8404,1.0,3 years
699,94971.0,2000,2000,19.72%,36 months,moving,10.29%,FL,RENT,3575.0,670-674,10,12036,0.0,6 years
1664,13996.0,15000,14975,15.21%,36 months,debt_consolidation,19.10%,TX,RENT,5132.0,680-684,4,34044,1.0,3 years


In [8]:
p1=pdPipeline([
    ('var_select',VarSelector(['Amount.Requested','Open.CREDIT.Lines','Revolving.CREDIT.Balance'])),
    ('convert_to_numeric',convert_to_numeric()),
    ('missing_trt',DataFrameImputer())
])

p2=pdPipeline([
    ('var_select',VarSelector(['Debt.To.Income.Ratio'])),
    ('string_clean',string_clean(replace_it='%',replace_with='')),
    ('convert_to_numeric',convert_to_numeric()),
    ('missing_trt',DataFrameImputer())
])

p3=pdPipeline([
    ('var_select',VarSelector(['Loan.Length', 'Loan.Purpose','State','Home.Ownership','Employment.Length'])),
    ('missing_trt',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(20))
])

p4=pdPipeline([
    ('var_select',VarSelector(['Monthly.Income','Inquiries.in.the.Last.6.Months'])),
    ('missing_trt',DataFrameImputer())
])

p5=pdPipeline([
    ('var_select',VarSelector(['FICO.Range'])),
    ('custom_fico',custom_fico()),
    ('missing_trt',DataFrameImputer())
])

data_pipe=FeatureUnion([
    ('obj_to_num',p1),
    ('dtir',p2),
    ('obj_to_dum',p3),
    ('num',p4),
    ('fico',p5)
])

In [9]:
x_train=pd.DataFrame(data=data_pipe.fit_transform(ld_train),
                     columns=data_pipe.get_feature_names())

In [10]:
x_train.shape

(2200, 60)

In [11]:
x_test=pd.DataFrame(data=data_pipe.transform(ld_test),
                     columns=data_pipe.get_feature_names())

In [12]:
x_test.shape

(300, 60)

In [13]:
x_train.head()

Unnamed: 0,obj_to_num__Amount.Requested,obj_to_num__Open.CREDIT.Lines,obj_to_num__Revolving.CREDIT.Balance,dtir__Debt.To.Income.Ratio,obj_to_dum__Loan.Length_36 months,obj_to_dum__Loan.Length_60 months,obj_to_dum__Loan.Purpose_debt_consolidation,obj_to_dum__Loan.Purpose_credit_card,obj_to_dum__Loan.Purpose_other,obj_to_dum__Loan.Purpose_home_improvement,...,obj_to_dum__Employment.Length_4 years,obj_to_dum__Employment.Length_1 year,obj_to_dum__Employment.Length_6 years,obj_to_dum__Employment.Length_7 years,obj_to_dum__Employment.Length_8 years,obj_to_dum__Employment.Length_missing,obj_to_dum__Employment.Length_9 years,num__Monthly.Income,num__Inquiries.in.the.Last.6.Months,fico__fico
0,25000.0,11.0,15210.0,27.56,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8606.56,3.0,722.0
1,19750.0,14.0,19070.0,13.39,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6737.5,3.0,712.0
2,2100.0,13.0,893.0,3.5,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1.0,692.0
3,28000.0,12.0,38194.0,19.62,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7083.33,1.0,712.0
4,24250.0,6.0,31061.0,23.79,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5833.33,2.0,732.0


In [14]:
x_test.head()

Unnamed: 0,obj_to_num__Amount.Requested,obj_to_num__Open.CREDIT.Lines,obj_to_num__Revolving.CREDIT.Balance,dtir__Debt.To.Income.Ratio,obj_to_dum__Loan.Length_36 months,obj_to_dum__Loan.Length_60 months,obj_to_dum__Loan.Purpose_debt_consolidation,obj_to_dum__Loan.Purpose_credit_card,obj_to_dum__Loan.Purpose_other,obj_to_dum__Loan.Purpose_home_improvement,...,obj_to_dum__Employment.Length_4 years,obj_to_dum__Employment.Length_1 year,obj_to_dum__Employment.Length_6 years,obj_to_dum__Employment.Length_7 years,obj_to_dum__Employment.Length_8 years,obj_to_dum__Employment.Length_missing,obj_to_dum__Employment.Length_9 years,num__Monthly.Income,num__Inquiries.in.the.Last.6.Months,fico__fico
0,5000.0,13.0,7686.0,12.59,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4416.67,0.0,692.0
1,18000.0,6.0,11596.0,4.93,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5258.5,0.0,712.0
2,7200.0,13.0,7283.0,25.16,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3750.0,0.0,752.0
3,7200.0,14.0,4838.0,17.27,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3416.67,0.0,792.0
4,22000.0,9.0,20181.0,18.28,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6083.33,0.0,722.0


In [15]:
y_train=ld_train['Interest.Rate'].str.replace('%','').astype(float)

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [17]:
lm=LinearRegression()

In [18]:
cv_mae=-cross_val_score(lm,
                        x_train,y_train,
                        cv=10,
                        scoring='neg_mean_absolute_error')

In [19]:
cv_mae

array([1.74273801, 1.76502624, 1.77011354, 1.65406589, 1.43720296,
       1.63540955, 1.44255408, 1.58043799, 1.52220714, 1.65823135])

In [20]:
cv_mae.mean()

1.6207986760156974

In [21]:
cv_mae.std()

0.11726675023079812

In [22]:
lm.fit(x_train,y_train)

LinearRegression()

In [23]:
lm.intercept_

73.1643230281316

In [24]:
lm.coef_

array([ 1.60913878e-04, -3.72164153e-02, -3.05844966e-06,  1.88427404e-04,
        1.30627284e+00,  4.47755390e+00, -7.84079703e-01, -8.91888331e-01,
        3.20936459e-02, -6.95275775e-01, -4.22744797e-01, -2.64224120e-01,
       -5.41274155e-01, -9.69555836e-01, -5.98574359e-01,  7.21470875e-01,
       -1.99766458e-01, -1.88645568e-01, -7.45527597e-02,  4.73923022e-01,
       -4.68212871e-01, -5.71694597e-01, -1.79076702e-01, -3.78950003e-01,
       -1.57191036e-01, -1.08257898e-01, -7.78964606e-02, -4.96139885e-01,
       -1.99177682e-01,  4.10456351e-02, -2.67193758e-01,  3.80179542e-01,
        4.08575782e-01, -9.81649463e-02, -6.21920499e-02, -1.77735228e-01,
       -2.33745724e-01, -3.50604056e-01, -3.92591988e-02,  1.87183988e-02,
        1.10048502e-02,  1.85782841e-01, -2.39881640e+00, -2.17035525e+00,
       -2.06526514e+00,  5.26184321e-01,  3.52218686e-01,  2.87554381e-01,
        1.51007910e-01,  5.83377246e-01,  4.22222352e-01,  2.41210867e-01,
        3.88741640e-01,  

In [25]:
list(zip(x_train.columns,lm.coef_))

[('obj_to_num__Amount.Requested', 0.00016091387809662639),
 ('obj_to_num__Open.CREDIT.Lines', -0.037216415293150194),
 ('obj_to_num__Revolving.CREDIT.Balance', -3.058449663623865e-06),
 ('dtir__Debt.To.Income.Ratio', 0.0001884274038689293),
 ('obj_to_dum__Loan.Length_36 months', 1.306272836499463),
 ('obj_to_dum__Loan.Length_60 months', 4.477553903292299),
 ('obj_to_dum__Loan.Purpose_debt_consolidation', -0.784079702740472),
 ('obj_to_dum__Loan.Purpose_credit_card', -0.8918883314514419),
 ('obj_to_dum__Loan.Purpose_other', 0.032093645935432846),
 ('obj_to_dum__Loan.Purpose_home_improvement', -0.6952757746475863),
 ('obj_to_dum__Loan.Purpose_major_purchase', -0.4227447967589989),
 ('obj_to_dum__Loan.Purpose_small_business', -0.2642241204509055),
 ('obj_to_dum__Loan.Purpose_car', -0.5412741545847766),
 ('obj_to_dum__Loan.Purpose_wedding', -0.9695558362076655),
 ('obj_to_dum__Loan.Purpose_medical', -0.5985743591784637),
 ('obj_to_dum__Loan.Purpose_moving', 0.7214708748501236),
 ('obj_to_d

In [26]:
test_pred=lm.predict(x_test)

In [27]:
test_pred

array([16.73641244, 15.9822577 , 10.41724451,  3.71534168, 15.21108834,
        7.0351977 , 15.57246453, 10.58352597, 15.94955183, 12.38355726,
        9.9366698 , 15.17209087, 11.53730446, 13.67752736, 13.40134753,
       18.33363766, 10.18969518, 15.79040256, 13.37967692, 14.03563066,
       22.50430604, 17.50946104, 12.12105201, 14.6034318 ,  9.68060525,
       11.39039568, 13.33566534, 19.0210008 , 11.80641539, 16.88384891,
       15.0680717 , 15.29064393, 12.35705772, 15.07255624, 13.78562695,
       14.07528779, 19.52106736, 11.3554855 , 12.00458203, 16.82962478,
       14.15907615, 11.12807774, 14.8967374 , 13.17699098, 15.55418833,
       17.04873439, 14.90876743, 19.10551152, 17.0897673 , 10.33940429,
       13.57931315, 19.75300247, 10.02009378, 19.49728229, 15.57850152,
       15.06801067, 17.10315371, 14.53133523, 10.93874049, 14.48472924,
       13.05230477, 17.09882657,  8.84706185, 14.56313892, 10.50916007,
       11.43247185, 12.79734112, 14.90266567, 12.05952277, 14.46

We can write these to a csv file for submission like this :

In [28]:
pd.DataFrame(test_pred).to_csv("mysubmission.csv",index=False)

In [29]:
import os
os.getcwd()

'/Users/lalitsachan/Dropbox/PDSV4/4. Linear Models'

# Ridge  Regression

In [30]:
from sklearn.linear_model import Ridge,Lasso

from sklearn.model_selection import GridSearchCV



In [31]:
lambdas=np.linspace(1,100,100)

In [32]:
lambdas

array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,
        23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,
        34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,  44.,
        45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,  55.,
        56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,  66.,
        67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,  77.,
        78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,
        89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99.,
       100.])

In [33]:
params={'alpha':lambdas}

In [34]:
model=Ridge(fit_intercept=True)

In [35]:
grid_search=GridSearchCV(model,
                         param_grid=params,
                         cv=10,
                         scoring='neg_mean_absolute_error',
                        verbose=20,n_jobs=-1)

In [36]:
grid_search.fit(x_train,y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  

[Parallel(n_jobs=-1)]: Done 224 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 228 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 236 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 244 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 252 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 260 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 300 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 308 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 316 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 324 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 332 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 340 tasks      | elapsed:    4.0s
[Paralle

GridSearchCV(cv=10, estimator=Ridge(), n_jobs=-1,
             param_grid={'alpha': array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,
        23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,
        34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,  44.,
        45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,  55.,
        56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,  66.,
        67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,  77.,
        78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,
        89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99.,
       100.])},
             scoring='neg_mean_absolute_error', verbose=20)

In [37]:
grid_search.best_estimator_

Ridge(alpha=59.0)

In [38]:
grid_search.cv_results_

{'mean_fit_time': array([0.02080505, 0.01340723, 0.00979197, 0.0126797 , 0.01054056,
        0.00737388, 0.00844619, 0.00862796, 0.00761032, 0.00620432,
        0.00751073, 0.00843022, 0.0076647 , 0.01024601, 0.01133761,
        0.01142952, 0.01060519, 0.00890357, 0.00860355, 0.00861967,
        0.01250184, 0.01078854, 0.01050711, 0.01079125, 0.00909185,
        0.01080275, 0.01054797, 0.01052039, 0.01018674, 0.01089597,
        0.01013143, 0.01044507, 0.00967023, 0.00966225, 0.00895381,
        0.00887899, 0.01007471, 0.00821609, 0.00876031, 0.00863402,
        0.00849521, 0.00818677, 0.00783422, 0.00821266, 0.00768554,
        0.0089251 , 0.00891144, 0.00932629, 0.00935843, 0.00967307,
        0.00914938, 0.01112092, 0.00955741, 0.00913661, 0.00890002,
        0.00917711, 0.00945182, 0.01010337, 0.00963256, 0.00939746,
        0.00909579, 0.00900018, 0.00868242, 0.00883646, 0.00949547,
        0.00940862, 0.00937567, 0.01001439, 0.00817556, 0.00842292,
        0.0090652 , 0.00776532,

 if you want you can now fit a ridge regression model with obtained value of alpha , although there is no need, grid search automatically fits the best estimator on the entire data, you can directly use this to make predictions on test_data. But if you want to look at coefficients , its much more convenient to fit the model with direct function

Using the report function given below you can see the cv performance of top few models as well, that will the tentative performance

In [39]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.6f} (std: {1:.6f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [40]:
report(grid_search.cv_results_,5)

Model with rank: 1
Mean validation score: -1.606905 (std: 0.117674)
Parameters: {'alpha': 59.0}

Model with rank: 2
Mean validation score: -1.606905 (std: 0.117597)
Parameters: {'alpha': 58.0}

Model with rank: 3
Mean validation score: -1.606906 (std: 0.117749)
Parameters: {'alpha': 60.0}

Model with rank: 4
Mean validation score: -1.606909 (std: 0.117518)
Parameters: {'alpha': 57.0}

Model with rank: 5
Mean validation score: -1.606911 (std: 0.117820)
Parameters: {'alpha': 61.0}



In [41]:
test_pred=grid_search.predict(x_test)

In [42]:
pd.DataFrame(test_pred).to_csv("mysubmission.csv",index=False)

## For looking at coefficients

In [43]:
grid_search.best_estimator_

Ridge(alpha=59.0)

In [44]:
ridge_model=grid_search.best_estimator_

In [45]:
ridge_model.fit(x_train,y_train)

Ridge(alpha=59.0)

In [46]:
list(zip(data_pipe.get_feature_names(),ridge_model.coef_))

[('obj_to_num__Amount.Requested', 0.00016590565671764556),
 ('obj_to_num__Open.CREDIT.Lines', -0.037908063378881016),
 ('obj_to_num__Revolving.CREDIT.Balance', -3.1637569333467597e-06),
 ('dtir__Debt.To.Income.Ratio', -0.0008409375022167025),
 ('obj_to_dum__Loan.Length_36 months', -1.3957035074401472),
 ('obj_to_dum__Loan.Length_60 months', 1.4879897186327875),
 ('obj_to_dum__Loan.Purpose_debt_consolidation', -0.32395293779591827),
 ('obj_to_dum__Loan.Purpose_credit_card', -0.39987781135513006),
 ('obj_to_dum__Loan.Purpose_other', 0.36089399406799655),
 ('obj_to_dum__Loan.Purpose_home_improvement', -0.17515395397224037),
 ('obj_to_dum__Loan.Purpose_major_purchase', 0.004410987700190506),
 ('obj_to_dum__Loan.Purpose_small_business', 0.10286757148899979),
 ('obj_to_dum__Loan.Purpose_car', -0.018641873174348116),
 ('obj_to_dum__Loan.Purpose_wedding', -0.21984698028510624),
 ('obj_to_dum__Loan.Purpose_medical', -0.05407603428506935),
 ('obj_to_dum__Loan.Purpose_moving', 0.3315352644892979)

In [47]:
lm.coef_/ridge_model.coef_

array([ 9.69911944e-01,  9.81754592e-01,  9.66714488e-01, -2.24068261e-01,
       -9.35924306e-01,  3.00912960e+00,  2.42035065e+00,  2.23040215e+00,
        8.89281796e-02,  3.96951230e+00, -9.58390332e+01, -2.56858519e+00,
        2.90353952e+01,  4.41013943e+00,  1.10691246e+01,  2.17615123e+00,
        2.12437158e+00,  6.18614703e+00, -4.73160592e+00,  1.22902469e+00,
        2.06149061e+00,  2.01208718e+00,  3.26002016e+00,  2.25164303e+00,
        1.72056498e+01,  5.82457716e+00, -3.13174013e+00,  2.56508150e+00,
        2.48868477e+00,  5.59185867e-01,  4.02365760e+00,  1.89664509e+00,
        2.11403463e+00,  1.90090348e+01, -3.24127331e+00,  6.13615270e+00,
        3.25241033e+01,  4.07951368e+00, -6.76125999e+00,  6.07010219e-01,
        2.89372797e-01,  2.26635930e+00,  1.00887444e+01,  7.63428021e+01,
       -3.71155295e+01,  3.79515587e+00, -1.72319034e+02, -1.16329695e+01,
       -1.06550764e+00,  3.27926770e+00,  7.38888277e+00, -2.16931393e+00,
        1.33033061e+01, -

## Lasso Regression

In [48]:
lambdas=np.linspace(1,10,100)

model=Lasso(fit_intercept=True)

params={'alpha':lambdas}

In [49]:
grid_search=GridSearchCV(model,
                         param_grid=params,
                         cv=10,
                         scoring='neg_mean_absolute_error',
                        verbose=20,n_jobs=-1)

In [50]:
grid_search.fit(x_train,y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0492s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  14 ta

[Parallel(n_jobs=-1)]: Done 420 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 424 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 428 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 432 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 436 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 440 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 444 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 448 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 452 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 456 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 460 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 464 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 468 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 472 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 476 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed:    2.0s
[Paralle

[Parallel(n_jobs=-1)]: Done 977 out of 1000 | elapsed:    3.7s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    3.7s finished


GridSearchCV(cv=10, estimator=Lasso(), n_jobs=-1,
             param_grid={'alpha': array([ 1.        ,  1.09090909,  1.18181818,  1.27272727,  1.36363636,
        1.45454545,  1.54545455,  1.63636364,  1.72727273,  1.81818182,
        1.90909091,  2.        ,  2.09090909,  2.18181818,  2.27272727,
        2.36363636,  2.45454545,  2.54545455,  2.63636364,  2.72727273,
        2.81818182,  2.90909091,  3.        ,  3.09090909,  3.18181818,
        3.27272727,  3.36363636,  3.4...
        7.36363636,  7.45454545,  7.54545455,  7.63636364,  7.72727273,
        7.81818182,  7.90909091,  8.        ,  8.09090909,  8.18181818,
        8.27272727,  8.36363636,  8.45454545,  8.54545455,  8.63636364,
        8.72727273,  8.81818182,  8.90909091,  9.        ,  9.09090909,
        9.18181818,  9.27272727,  9.36363636,  9.45454545,  9.54545455,
        9.63636364,  9.72727273,  9.81818182,  9.90909091, 10.        ])},
             scoring='neg_mean_absolute_error', verbose=20)

In [51]:
grid_search.best_estimator_

Lasso()

you can see that, the best value of alpha comes at the edge of the range that we tried , we should expand the trial range on that side and run this again

In [52]:
lambdas=np.linspace(.001,2,100)

params={'alpha':lambdas}

In [53]:
grid_search=GridSearchCV(model,param_grid=params,cv=10,scoring='neg_mean_absolute_error')
grid_search.fit(x_train,y_train)

GridSearchCV(cv=10, estimator=Lasso(),
             param_grid={'alpha': array([1.00000000e-03, 2.11919192e-02, 4.13838384e-02, 6.15757576e-02,
       8.17676768e-02, 1.01959596e-01, 1.22151515e-01, 1.42343434e-01,
       1.62535354e-01, 1.82727273e-01, 2.02919192e-01, 2.23111111e-01,
       2.43303030e-01, 2.63494949e-01, 2.83686869e-01, 3.03878788e-01,
       3.24070707e-01, 3.44262626e-01, 3.64454545e-01, 3...
       1.53558586e+00, 1.55577778e+00, 1.57596970e+00, 1.59616162e+00,
       1.61635354e+00, 1.63654545e+00, 1.65673737e+00, 1.67692929e+00,
       1.69712121e+00, 1.71731313e+00, 1.73750505e+00, 1.75769697e+00,
       1.77788889e+00, 1.79808081e+00, 1.81827273e+00, 1.83846465e+00,
       1.85865657e+00, 1.87884848e+00, 1.89904040e+00, 1.91923232e+00,
       1.93942424e+00, 1.95961616e+00, 1.97980808e+00, 2.00000000e+00])},
             scoring='neg_mean_absolute_error')

In [54]:
grid_search.best_estimator_

Lasso(alpha=0.021191919191919192)

In [55]:
report(grid_search.cv_results_,5)

Model with rank: 1
Mean validation score: -1.600706 (std: 0.123574)
Parameters: {'alpha': 0.021191919191919192}

Model with rank: 2
Mean validation score: -1.608777 (std: 0.129103)
Parameters: {'alpha': 0.041383838383838384}

Model with rank: 3
Mean validation score: -1.615683 (std: 0.129650)
Parameters: {'alpha': 0.061575757575757575}

Model with rank: 4
Mean validation score: -1.616717 (std: 0.115096)
Parameters: {'alpha': 0.001}

Model with rank: 5
Mean validation score: -1.620303 (std: 0.130280)
Parameters: {'alpha': 0.08176767676767677}



In [56]:
lasso_model=grid_search.best_estimator_

In [57]:
lasso_model.fit(x_train,y_train)

Lasso(alpha=0.021191919191919192)

In [58]:
lasso_model.intercept_

72.05485736242387

In [59]:
list(zip(data_pipe.get_feature_names(),lasso_model.coef_))


[('obj_to_num__Amount.Requested', 0.0001600282375452624),
 ('obj_to_num__Open.CREDIT.Lines', -0.03808075165998493),
 ('obj_to_num__Revolving.CREDIT.Balance', -3.1909390631943006e-06),
 ('dtir__Debt.To.Income.Ratio', -0.0003564706373253825),
 ('obj_to_dum__Loan.Length_36 months', -0.0),
 ('obj_to_dum__Loan.Length_60 months', 3.03420127536005),
 ('obj_to_dum__Loan.Purpose_debt_consolidation', -0.17001965888491866),
 ('obj_to_dum__Loan.Purpose_credit_card', -0.21368849950898278),
 ('obj_to_dum__Loan.Purpose_other', 0.31957133590263687),
 ('obj_to_dum__Loan.Purpose_home_improvement', -0.0),
 ('obj_to_dum__Loan.Purpose_major_purchase', 0.0),
 ('obj_to_dum__Loan.Purpose_small_business', 0.0),
 ('obj_to_dum__Loan.Purpose_car', 0.0),
 ('obj_to_dum__Loan.Purpose_wedding', -0.0),
 ('obj_to_dum__Loan.Purpose_medical', -0.0),
 ('obj_to_dum__Loan.Purpose_moving', 0.0),
 ('obj_to_dum__State_CA', -0.0),
 ('obj_to_dum__State_NY', 0.0),
 ('obj_to_dum__State_FL', 0.0),
 ('obj_to_dum__State_TX', 0.248364

In [60]:
(lasso_model.coef_==0).sum()


46

# Logistic Regression

In [61]:
import warnings

warnings.filterwarnings('ignore')

import pandas as pd 
import numpy as np

from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from mypipes import *

In [62]:
train_file=r'~/Dropbox/0.0 Data/rg_train.csv'
test_file=r'~/Dropbox/0.0 Data/rg_test.csv'
bd_train=pd.read_csv(train_file)

bd_test=pd.read_csv(test_file)

In [63]:
bd_train.head()

Unnamed: 0,REF_NO,children,age_band,status,occupation,occupation_partner,home_status,family_income,self_employed,self_employed_partner,...,Investment.Tax.Saving.Bond,Home.Loan,Online.Purchase.Amount,Revenue.Grid,gender,region,Investment.in.Commudity,Investment.in.Equity,Investment.in.Derivative,Portfolio.Balance
0,2148,1,45-50,Partner,Professional,Professional,Rent Privately,">=35,000",Yes,Yes,...,7.49,2.48,0.0,2,Female,South West,65.87,9.27,30.93,87.48
1,8099,1,61-65,Partner,Retired,Retired,Own Home,"<12,500, >=10,000",No,No,...,0.0,3.99,0.0,2,Female,Unknown,42.46,4.49,26.23,110.73
2,6611,3,31-35,Partner,Professional,Professional,Own Home,">=35,000",No,No,...,0.0,0.0,0.0,2,Male,East Anglia,75.38,0.0,26.66,127.57
3,1950,Zero,55-60,Partner,Professional,Professional,Own Home,">=35,000",No,No,...,2.0,0.0,0.0,2,Female,North West,34.78,6.91,29.24,33.79
4,10857,2,51-55,Partner,Manual Worker,Manual Worker,Own Home,"<27,500, >=25,000",Yes,Yes,...,0.0,0.0,0.0,2,Female,South West,48.58,9.58,20.65,56.17


In [64]:
bd_train['family_income'].value_counts(dropna=False)

>=35,000             2036
<27,500, >=25,000     978
<30,000, >=27,500     788
<25,000, >=22,500     676
<20,000, >=17,500     542
<12,500, >=10,000     536
<17,500, >=15,000     511
<15,000, >=12,500     497
<22,500, >=20,000     461
<10,000, >= 8,000     454
< 8,000, >= 4,000     322
< 4,000               222
Unknown               101
Name: family_income, dtype: int64

In [65]:
# drop : REF_NO, post_area , post_code,Revenue.Grid 
# children : convert zero:0 and 4+: 4 and then convert to numeric 
# age_band : 71+ : 71, Unknown: NA, rest: split and average
# status, occupation, occupation_partner,home_status: create dummies with freq cutoff
# family_income : remove [,>=], 35000:35000, 4000: 4000, unknown:NA, rest : split then avg
# self_employed, self_employed_partner : dummies 
# year_last_moved : keep as is 
# TVarea : dummies 
# 'Average.Credit.Card.Transaction', 'Balance.Transfer',
#       'Term.Deposit', 'Life.Insurance', 'Medical.Insurance',
#       'Average.A.C.Balance', 'Personal.Loan', 'Investment.in.Mutual.Fund',
#       'Investment.Tax.Saving.Bond', 'Home.Loan', 'Online.Purchase.Amount'
# 'Investment.in.Commudity',
#       'Investment.in.Equity', 'Investment.in.Derivative',
#      'Portfolio.Balance' : as is 
# gender , region : dummies 
    

In [66]:
bd_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 32 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   REF_NO                           8124 non-null   int64  
 1   children                         8124 non-null   object 
 2   age_band                         8124 non-null   object 
 3   status                           8124 non-null   object 
 4   occupation                       8124 non-null   object 
 5   occupation_partner               8124 non-null   object 
 6   home_status                      8124 non-null   object 
 7   family_income                    8124 non-null   object 
 8   self_employed                    8124 non-null   object 
 9   self_employed_partner            8124 non-null   object 
 10  year_last_moved                  8124 non-null   int64  
 11  TVarea                           8124 non-null   object 
 12  post_code           

In [67]:
num_vars=list(bd_train.select_dtypes(exclude=['object']).columns)

In [68]:
num_vars

['REF_NO',
 'year_last_moved',
 'Average.Credit.Card.Transaction',
 'Balance.Transfer',
 'Term.Deposit',
 'Life.Insurance',
 'Medical.Insurance',
 'Average.A.C.Balance',
 'Personal.Loan',
 'Investment.in.Mutual.Fund',
 'Investment.Tax.Saving.Bond',
 'Home.Loan',
 'Online.Purchase.Amount',
 'Revenue.Grid',
 'Investment.in.Commudity',
 'Investment.in.Equity',
 'Investment.in.Derivative',
 'Portfolio.Balance']

In [69]:
num_vars=[_ for _ in num_vars if _ not in ['REF_NO','Revenue.Grid']]

In [70]:
num_vars

['year_last_moved',
 'Average.Credit.Card.Transaction',
 'Balance.Transfer',
 'Term.Deposit',
 'Life.Insurance',
 'Medical.Insurance',
 'Average.A.C.Balance',
 'Personal.Loan',
 'Investment.in.Mutual.Fund',
 'Investment.Tax.Saving.Bond',
 'Home.Loan',
 'Online.Purchase.Amount',
 'Investment.in.Commudity',
 'Investment.in.Equity',
 'Investment.in.Derivative',
 'Portfolio.Balance']

In [71]:
cat_vars=list(bd_train.select_dtypes(include=['object']).columns)

In [72]:
cat_vars

['children',
 'age_band',
 'status',
 'occupation',
 'occupation_partner',
 'home_status',
 'family_income',
 'self_employed',
 'self_employed_partner',
 'TVarea',
 'post_code',
 'post_area',
 'gender',
 'region']

In [73]:
cat_vars=[_ for _ in cat_vars if _ not in 
          ['children','age_band', 'post_code','post_area','family_income']]

In [74]:
cat_vars

['status',
 'occupation',
 'occupation_partner',
 'home_status',
 'self_employed',
 'self_employed_partner',
 'TVarea',
 'gender',
 'region']

In [75]:
p1=pdPipeline([
    ('var_select',VarSelector(num_vars)),
    ('missing_trt',DataFrameImputer())
])

In [76]:
p2=pdPipeline([
    ('var_select',VarSelector(cat_vars)),
    ('missing_trt',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(70))
])

In [77]:
p3=pdPipeline([
    ('var_select',VarSelector(['age_band'])),
    ('custom_fico',custom_age_band()),
    ('missing_trt',DataFrameImputer())
])

p4=pdPipeline([
    ('var_select',VarSelector(['family_income'])),
    ('custom_fico',custom_family_income()),
    ('missing_trt',DataFrameImputer())
])

p5=pdPipeline([
    ('var_select',VarSelector(['children'])),
    ('string_clean1',string_clean(replace_it='Zero',replace_with='0')),
    ('string_clean2',string_clean(replace_it='4+',replace_with='4')),
    ('convert_to_numeric',convert_to_numeric()),
    ('missing_trt',DataFrameImputer())
])

In [78]:
data_pipe=FeatureUnion([
    ('num',p1),
    ('obj_to_dum',p2),
    ('age_band',p3),
    ('family_income',p4),
    ('children',p5)
])

In [79]:
x_train=pd.DataFrame(data=data_pipe.fit_transform(bd_train),
                     columns=data_pipe.get_feature_names())


In [80]:
x_test=pd.DataFrame(data=data_pipe.transform(bd_test),
                     columns=data_pipe.get_feature_names())

In [81]:
bd_train['Revenue.Grid'].value_counts(dropna=False)

2    7261
1     863
Name: Revenue.Grid, dtype: int64

In [82]:
y_train=(bd_train['Revenue.Grid']==1).astype(int)

In [83]:
x_train.shape

(8124, 71)

In [84]:
x_test.shape

(2031, 71)

In [85]:
x_train.head()

Unnamed: 0,num__year_last_moved,num__Average.Credit.Card.Transaction,num__Balance.Transfer,num__Term.Deposit,num__Life.Insurance,num__Medical.Insurance,num__Average.A.C.Balance,num__Personal.Loan,num__Investment.in.Mutual.Fund,num__Investment.Tax.Saving.Bond,...,obj_to_dum__region_West Midlands,obj_to_dum__region_Scotland,obj_to_dum__region_East Midlands,obj_to_dum__region_North,obj_to_dum__region_Wales,obj_to_dum__region_East Anglia,obj_to_dum__region_Northern Ireland,age_band__age_band,family_income__fi,children__children
0,1999.0,0.0,0.0,196.95,132.42,0.0,0.0,21.47,24.18,7.49,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,47.5,35000.0,1.0
1,1959.0,0.0,77.89,0.0,134.39,0.0,7.99,14.98,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,11250.0,1.0
2,1992.0,119.98,0.0,96.94,0.0,159.97,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,33.0,35000.0,3.0
3,1990.0,0.0,39.99,0.0,133.93,0.0,39.48,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.5,35000.0,0.0
4,1994.0,0.0,161.47,14.99,58.97,7.49,57.46,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0,26250.0,2.0


In [86]:
from sklearn.linear_model import LogisticRegression

In [87]:
params={'class_weight':['balanced',None],
        'penalty':['l1','l2'],
        'C':[.0001,.0005,.001,.005,.01,.05,.1,1,2,5]}

In [88]:
model=LogisticRegression(fit_intercept=True)

In [89]:
from sklearn.model_selection import GridSearchCV

In [90]:
grid_search=GridSearchCV(model,
                         param_grid=params,
                         cv=10,
                         scoring="roc_auc",
                         n_jobs=-1,
                         verbose=20)

In [91]:
grid_search.fit(x_train,y_train)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0647s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  14 ta

[Parallel(n_jobs=-1)]: Done 238 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 239 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 240 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 241 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 242 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 243 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 244 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 245 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 246 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 247 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 249 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 251 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 252 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 253 tasks      | elapsed:    8.6s
[Paralle

[Parallel(n_jobs=-1)]: Done 372 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 373 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 374 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 375 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 377 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 398 out of 400 | elapsed:   13.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:   13.1s finished


GridSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 1,
                               2, 5],
                         'class_weight': ['balanced', None],
                         'penalty': ['l1', 'l2']},
             scoring='roc_auc', verbose=20)

In [92]:
grid_search.best_estimator_

LogisticRegression(C=0.0005, class_weight='balanced')

In [93]:
logr=grid_search.best_estimator_

In [94]:
report(grid_search.cv_results_,5)

Model with rank: 1
Mean validation score: 0.952192 (std: 0.014226)
Parameters: {'C': 0.0005, 'class_weight': 'balanced', 'penalty': 'l2'}

Model with rank: 2
Mean validation score: 0.951468 (std: 0.013715)
Parameters: {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}

Model with rank: 3
Mean validation score: 0.950636 (std: 0.013360)
Parameters: {'C': 0.005, 'class_weight': 'balanced', 'penalty': 'l2'}

Model with rank: 4
Mean validation score: 0.950591 (std: 0.013675)
Parameters: {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l2'}

Model with rank: 5
Mean validation score: 0.950474 (std: 0.011833)
Parameters: {'C': 2, 'class_weight': 'balanced', 'penalty': 'l2'}



In [98]:
LogisticRegression?

In [99]:
logr=LogisticRegression(fit_intercept=True,
                        **{'C': 0.005, 'class_weight': 'balanced', 'penalty': 'l1'},solver='liblinear')
# default solver lbfgs does not support l1 penalty for some versions of sklearn
# if you get an error like that , simply use solver='liblinear', it supports both l1 & l2 penalty


In [100]:
logr.fit(x_train,y_train)

LogisticRegression(C=0.005, class_weight='balanced', penalty='l1',
                   solver='liblinear')

In [101]:
(logr.coef_[0]==0).sum()

57

In [102]:
list(zip(x_train.columns,logr.coef_[0]))

[('num__year_last_moved', -0.001000449419092302),
 ('num__Average.Credit.Card.Transaction', 0.02237231080795568),
 ('num__Balance.Transfer', -0.005119914223360931),
 ('num__Term.Deposit', -0.019319936192262296),
 ('num__Life.Insurance', 0.013909586173218467),
 ('num__Medical.Insurance', -0.008239761551533694),
 ('num__Average.A.C.Balance', -0.002845888026235375),
 ('num__Personal.Loan', -0.029914255109158315),
 ('num__Investment.in.Mutual.Fund', 0.0007555780978209122),
 ('num__Investment.Tax.Saving.Bond', 0.09044183084157585),
 ('num__Home.Loan', -0.06544439673911343),
 ('num__Online.Purchase.Amount', 0.0525417869880393),
 ('num__Investment.in.Commudity', 0.0),
 ('num__Investment.in.Equity', 0.0),
 ('num__Investment.in.Derivative', 0.0),
 ('num__Portfolio.Balance', 0.0),
 ('obj_to_dum__status_Partner', 0.0),
 ('obj_to_dum__status_Single/Never Married', 0.0),
 ('obj_to_dum__status_Divorced/Separated', 0.0),
 ('obj_to_dum__status_Widowed', 0.0),
 ('obj_to_dum__occupation_Professional', 0

In [103]:
logr.predict_proba(x_test)

array([[0.99670483, 0.00329517],
       [0.95988765, 0.04011235],
       [0.99214981, 0.00785019],
       ...,
       [0.97533129, 0.02466871],
       [0.79131487, 0.20868513],
       [0.81697355, 0.18302645]])

In [104]:
logr.classes_

array([0, 1])

In [105]:
cutoffs=np.linspace(0.01,0.99,99)

cutoffs

array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
       0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22,
       0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32, 0.33,
       0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43, 0.44,
       0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54, 0.55,
       0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66,
       0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77,
       0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88,
       0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99])

In [106]:
logr.predict_proba(x_train)

array([[0.99068455, 0.00931545],
       [0.86859799, 0.13140201],
       [0.95025468, 0.04974532],
       ...,
       [0.44772029, 0.55227971],
       [0.93800946, 0.06199054],
       [0.98048754, 0.01951246]])

In [108]:
logr.classes_

array([0, 1])

In [109]:
train_score=logr.predict_proba(x_train)[:,1]
real=y_train
 # In  order to find the probability of which column is for outcome 1 and which for outcome 0


In [110]:
(train_score>0.2).astype(int)

array([0, 0, 0, ..., 1, 0, 0])

In [111]:
KS_all=[]

for cutoff in cutoffs:
    
    predicted=(train_score>cutoff).astype(int)

    TP=((predicted==1) & (real==1)).sum()
    TN=((predicted==0) & (real==0)).sum()
    FP=((predicted==1) & (real==0)).sum()
    FN=((predicted==0) & (real==1)).sum()
    
    P=TP+FN
    N=TN+FP
    
      
    KS=(TP/P)-(FP/N)
       
    KS_all.append(KS)
    


In [112]:
list(zip(cutoffs,KS_all))

[(0.01, 0.09953986782829838),
 (0.02, 0.15699375207760058),
 (0.03, 0.2090764753297949),
 (0.04, 0.2612161705187622),
 (0.05, 0.31034975822035626),
 (0.060000000000000005, 0.36970797334224026),
 (0.06999999999999999, 0.4378472076489852),
 (0.08, 0.499432754203755),
 (0.09, 0.5505276447147038),
 (0.09999999999999999, 0.5828353608374268),
 (0.11, 0.6095202500126471),
 (0.12, 0.6355497225370929),
 (0.13, 0.6549685353727903),
 (0.14, 0.6704171861831723),
 (0.15000000000000002, 0.6843508941482161),
 (0.16, 0.6983985459868058),
 (0.17, 0.7096347843516442),
 (0.18000000000000002, 0.7202631624723139),
 (0.19, 0.728631015426628),
 (0.2, 0.7360110356396967),
 (0.21000000000000002, 0.7448252485580276),
 (0.22, 0.7513219643732296),
 (0.23, 0.7570493196641113),
 (0.24000000000000002, 0.7638784515697843),
 (0.25, 0.7701566951680616),
 (0.26, 0.7753901340883206),
 (0.27, 0.779740268610713),
 (0.28, 0.7860754841457632),
 (0.29000000000000004, 0.7845511257702582),
 (0.3, 0.7861705969589752),
 (0.31, 0.

In [113]:
mycutoff=cutoffs[KS_all==max(KS_all)]
mycutoff

array([0.48])

In [116]:
logr.intercept_

array([0.])

In [117]:
list(zip(x_train.columns,logr.coef_[0]))

[('num__year_last_moved', -0.001000449419092302),
 ('num__Average.Credit.Card.Transaction', 0.02237231080795568),
 ('num__Balance.Transfer', -0.005119914223360931),
 ('num__Term.Deposit', -0.019319936192262296),
 ('num__Life.Insurance', 0.013909586173218467),
 ('num__Medical.Insurance', -0.008239761551533694),
 ('num__Average.A.C.Balance', -0.002845888026235375),
 ('num__Personal.Loan', -0.029914255109158315),
 ('num__Investment.in.Mutual.Fund', 0.0007555780978209122),
 ('num__Investment.Tax.Saving.Bond', 0.09044183084157585),
 ('num__Home.Loan', -0.06544439673911343),
 ('num__Online.Purchase.Amount', 0.0525417869880393),
 ('num__Investment.in.Commudity', 0.0),
 ('num__Investment.in.Equity', 0.0),
 ('num__Investment.in.Derivative', 0.0),
 ('num__Portfolio.Balance', 0.0),
 ('obj_to_dum__status_Partner', 0.0),
 ('obj_to_dum__status_Single/Never Married', 0.0),
 ('obj_to_dum__status_Divorced/Separated', 0.0),
 ('obj_to_dum__status_Widowed', 0.0),
 ('obj_to_dum__occupation_Professional', 0

if you simply had to submit probability scores , you could do this 

In [118]:
logr.predict_proba(x_test)

array([[0.99670483, 0.00329517],
       [0.95988765, 0.04011235],
       [0.99214981, 0.00785019],
       ...,
       [0.97533129, 0.02466871],
       [0.79131487, 0.20868513],
       [0.81697355, 0.18302645]])

In [119]:
test_score=logr.predict_proba(x_test)[:,1]
test_score

array([0.00329517, 0.04011235, 0.00785019, ..., 0.02466871, 0.20868513,
       0.18302645])

In [120]:
(test_score>mycutoff).astype(int)

array([0, 0, 0, ..., 0, 0, 0])

In [121]:
pd.DataFrame(test_score).to_csv("mysubmission.csv",index=False)

if you had to submit hardclasses , you can apply the cutoff obtained above and then submit

In [122]:
test_classes=(test_score>mycutoff).astype(int)

In [123]:
pd.DataFrame(test_classes).to_csv("mysubmission.csv",index=False)