# Linear Regression 

In [None]:
import pandas as pd 
import numpy as np

In [None]:
train_file='/home/dell/GIT_REPO/Python_AI_ML/intro-to-ml-with-python-master/week4/data/loan_data_train.csv'
test_file='/home/dell/GIT_REPO/Python_AI_ML/intro-to-ml-with-python-master/week4/data/loan_data_test.csv'

ld_train=pd.read_csv(train_file)
ld_test=pd.read_csv(test_file)               

In [None]:
ld_test.head()

In [None]:
ld_train.head()

In [None]:
# lets combine the data for data prep

ld_test['Interest.Rate']=np.nan
ld_train['data']='train'
ld_test['data']='test'
ld_test=ld_test[ld_train.columns]
ld_all=pd.concat([ld_train,ld_test],axis=0)
ld_all

In [None]:
ld_all.head()

In [None]:
ld_all.dtypes

In [None]:
# ID,Amount.Funded.By.Investors : drop 
# Interest Rate , Debt to income ratio : remove % and then to numeric
# Amount.Requested , 'Open.CREDIT.Lines','Revolving.CREDIT.Balance': convert it to numeric 
# FICO.Range : replace it by a numeric column which is average of the range
# Employment Length : convert to number
# Loan Lenth, Loan Purpose , State , Home ownership: dummies for categories with good occurence rate

In [None]:
ld_all.drop(['ID','Amount.Funded.By.Investors'],axis=1,inplace=True)

In [None]:
for col in ['Interest.Rate','Debt.To.Income.Ratio']:
    ld_all[col]=ld_all[col].str.replace("%","")

In [None]:
for col in ['Amount.Requested', 'Interest.Rate','Debt.To.Income.Ratio',
            'Open.CREDIT.Lines','Revolving.CREDIT.Balance']:
    ld_all[col]=pd.to_numeric(ld_all[col],errors='coerce')
    

In [None]:
k=ld_all['FICO.Range'].str.split("-",expand=True).astype(float)

ld_all['fico']=0.5*(k[0]+k[1])

del ld_all['FICO.Range']



In [None]:
ld_all['Employment.Length'].value_counts()

In [None]:
ld_all['Employment.Length']=ld_all['Employment.Length'].str.replace('years',"")

ld_all['Employment.Length']=ld_all['Employment.Length'].str.replace('year',"")

ld_all['Employment.Length']=np.where(ld_all['Employment.Length'].str[:2]=="10",10,ld_all['Employment.Length'])

ld_all['Employment.Length']=np.where(ld_all['Employment.Length'].str[0]=="<",0,ld_all['Employment.Length'])

ld_all['Employment.Length']=pd.to_numeric(ld_all['Employment.Length'],errors='coerce')

In [None]:
# Notice that to apply string function on pandas data frame columns you need to str attribute
cat_cols=ld_all.select_dtypes(['object']).columns

In [None]:
cat_cols

In [None]:
cat_cols=cat_cols[:-1]

In [None]:
cat_cols

In [None]:
# you can use following method if you want to ignore categories with too low frequencies ,
#in next section for logistic regression we will be using  pandas' get dummies function. 
# you can work with either of these . 
#ignoring categories with low frequencies however will result in fewer columns without 
# affecting model performance too much .

for col in cat_cols:
    freqs=ld_all[col].value_counts()
    k=freqs.index[freqs>20][:-1]
    for cat in k:
        name=col+'_'+cat
        ld_all[name]=(ld_all[col]==cat).astype(int)
    del ld_all[col]
    print(col)
    
    

In [None]:
ld_all.shape

In [None]:
ld_all.isnull().sum()

In [None]:
for col in ld_all.columns:
    if (col not in ['Interest.Rate','data'])& (ld_all[col].isnull().sum()>0):
        ld_all.loc[ld_all[col].isnull(),col]=ld_all.loc[ld_all['data']=='train',col].mean()

In [None]:
ld_all.isnull().sum()

In [None]:
ld_train=ld_all[ld_all['data']=='train']
del ld_train['data']
ld_test=ld_all[ld_all['data']=='test']
ld_test.drop(['Interest.Rate','data'],axis=1,inplace=True)

In [None]:
del ld_all

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
ld_train1,ld_train2=train_test_split(ld_train,test_size=0.2,random_state=2)

In [None]:
# Notice that only train data is used for imputing missing values in both train and test 

x_train1=ld_train1.drop('Interest.Rate',axis=1)
y_train1=ld_train1['Interest.Rate']



In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lm=LinearRegression()

In [None]:
lm.fit(x_train1,y_train1)

In [None]:
x_train1.shape

In [None]:
lm.intercept_

In [None]:
list(zip(x_train1.columns,lm.coef_))

In [None]:
x_train2=ld_train2.drop('Interest.Rate',axis=1)

In [None]:
predicted_ir=lm.predict(x_train2)

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
mean_absolute_error(ld_train2['Interest.Rate'],predicted_ir)

We know the tentative performance now, lets build the model on entire training to make prediction on test/production

In [None]:
x_train=ld_train.drop('Interest.Rate',axis=1)
y_train=ld_train['Interest.Rate']

In [None]:
lm.fit(x_train,y_train)

In [None]:
test_pred=lm.predict(ld_test)

We can write these to a csv file for submission like this :

In [None]:
pd.DataFrame(test_pred).to_csv("mysubmission.csv",index=False)

# Ridge  Regression

In [None]:
from sklearn.linear_model import Ridge,Lasso
from sklearn.model_selection import GridSearchCV

In [None]:
lambdas=np.linspace(1,100,100)

In [None]:
params={'alpha':lambdas}

In [None]:
model=Ridge(fit_intercept=True)

In [None]:
grid_search=GridSearchCV(model,param_grid=params,cv=10,scoring='neg_mean_absolute_error')

In [None]:
grid_search.fit(x_train,y_train)

In [None]:
grid_search.best_estimator_

In [None]:
grid_search.cv_results_

 if you want you can now fit a ridge regression model with obtained value of alpha , although there is no need, grid search automatically fits the best estimator on the entire data, you can directly use this to make predictions on test_data. But if you want to look at coefficients , its much more convenient to fit the model with direct function

Using the report function given below you can see the cv performance of top few models as well, that will the tentative performance

In [None]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [None]:
report(grid_search.cv_results_,100)

In [None]:
test_pred=grid_search.predict(ld_test)

In [None]:
test_pred

In [None]:
pd.DataFrame(test_pred).to_csv("mysubmission.csv",index=False)

## For looking at coefficients

In [None]:
ridge_model=grid_search.best_estimator_

In [None]:
ridge_model.fit(x_train,y_train)

In [None]:
list(zip(x_train1.columns,ridge_model.coef_))

## Lasso Regression

In [None]:
lambdas=np.linspace(1,10,100)
model=Lasso(fit_intercept=True)
params={'alpha':lambdas}

In [None]:
grid_search=GridSearchCV(model,param_grid=params,cv=10,scoring='neg_mean_absolute_error')

In [None]:
grid_search.fit(x_train,y_train)

In [None]:
grid_search.best_estimator_

you can see that, the best value of alpha comes at the edge of the range that we tried , we should expand the trial range on that side and run this again

In [None]:
lambdas=np.linspace(.001,2,100)
params={'alpha':lambdas}

In [None]:
grid_search=GridSearchCV(model,param_grid=params,cv=10,scoring='neg_mean_absolute_error')
grid_search.fit(x_train,y_train)

In [None]:
grid_search.best_estimator_

In [None]:
report(grid_search.cv_results_,5)

In [None]:
lasso_model=grid_search.best_estimator_

In [None]:
lasso_model.fit(x_train,y_train)

In [None]:
list(zip(x_train.columns,lasso_model.coef_))

# Logistic Regression

In [22]:
import numpy as np
import pandas as pd

In [2]:
train_file='/home/dell/GIT_REPO/DS_R_Language/Edvancer_Class/DataScienceWithR/Data/rg_train.csv'
test_file ='/home/dell/GIT_REPO/DS_R_Language/Edvancer_Class/DataScienceWithR/Data/rg_test.csv'

bd_train=pd.read_csv(train_file)
bd_test=pd.read_csv(test_file)


bd_test['Revenue.Grid']=np.nan
bd_train['data']='train'
bd_test['data']='test'
bd_test=bd_test[bd_train.columns]
bd_all=pd.concat([bd_train,bd_test],axis=0)

In [3]:
bd_all

Unnamed: 0,REF_NO,children,age_band,status,occupation,occupation_partner,home_status,family_income,self_employed,self_employed_partner,...,Home.Loan,Online.Purchase.Amount,Revenue.Grid,gender,region,Investment.in.Commudity,Investment.in.Equity,Investment.in.Derivative,Portfolio.Balance,data
0,2148,1,45-50,Partner,Professional,Professional,Rent Privately,">=35,000",Yes,Yes,...,2.48,0.00,2.0,Female,South West,65.87,9.27,30.93,87.48,train
1,8099,1,61-65,Partner,Retired,Retired,Own Home,"<12,500, >=10,000",No,No,...,3.99,0.00,2.0,Female,Unknown,42.46,4.49,26.23,110.73,train
2,6611,3,31-35,Partner,Professional,Professional,Own Home,">=35,000",No,No,...,0.00,0.00,2.0,Male,East Anglia,75.38,0.00,26.66,127.57,train
3,1950,Zero,55-60,Partner,Professional,Professional,Own Home,">=35,000",No,No,...,0.00,0.00,2.0,Female,North West,34.78,6.91,29.24,33.79,train
4,10857,2,51-55,Partner,Manual Worker,Manual Worker,Own Home,"<27,500, >=25,000",Yes,Yes,...,0.00,0.00,2.0,Female,South West,48.58,9.58,20.65,56.17,train
5,10853,2,31-35,Partner,Professional,Professional,Own Home,"<25,000, >=22,500",No,No,...,7.46,7.98,1.0,Male,North,77.47,38.37,85.35,200.23,train
6,1495,3,36-40,Partner,Secretarial/Admin,Unknown,Own Home,"<25,000, >=22,500",Yes,Yes,...,37.44,31.97,2.0,Female,Scotland,14.49,109.42,109.93,177.31,train
7,9602,1,41-45,Partner,Other,Professional,Own Home,"<15,000, >=12,500",No,No,...,2.99,0.00,2.0,Female,Unknown,161.69,54.87,98.65,306.17,train
8,5402,2,26-30,Partner,Secretarial/Admin,Manual Worker,Own Home,"<17,500, >=15,000",No,No,...,7.48,0.00,2.0,Female,West Midlands,2.00,1.25,0.00,46.31,train
9,6343,3,31-35,Partner,Business Manager,Manual Worker,Own Home,"<27,500, >=25,000",No,No,...,0.00,0.00,2.0,Female,South West,55.19,10.24,24.41,53.09,train


In [4]:
bd_all['Revenue.Grid'].value_counts()

2.0    7261
1.0     863
Name: Revenue.Grid, dtype: int64

In [5]:
list(zip(bd_all.columns,bd_all.dtypes,bd_all.nunique()))

[('REF_NO', dtype('int64'), 10155),
 ('children', dtype('O'), 5),
 ('age_band', dtype('O'), 13),
 ('status', dtype('O'), 5),
 ('occupation', dtype('O'), 9),
 ('occupation_partner', dtype('O'), 9),
 ('home_status', dtype('O'), 5),
 ('family_income', dtype('O'), 13),
 ('self_employed', dtype('O'), 2),
 ('self_employed_partner', dtype('O'), 2),
 ('year_last_moved', dtype('int64'), 95),
 ('TVarea', dtype('O'), 14),
 ('post_code', dtype('O'), 10040),
 ('post_area', dtype('O'), 2039),
 ('Average.Credit.Card.Transaction', dtype('float64'), 1411),
 ('Balance.Transfer', dtype('float64'), 2183),
 ('Term.Deposit', dtype('float64'), 1419),
 ('Life.Insurance', dtype('float64'), 3111),
 ('Medical.Insurance', dtype('float64'), 1589),
 ('Average.A.C.Balance', dtype('float64'), 2223),
 ('Personal.Loan', dtype('float64'), 1760),
 ('Investment.in.Mutual.Fund', dtype('float64'), 2470),
 ('Investment.Tax.Saving.Bond', dtype('float64'), 832),
 ('Home.Loan', dtype('float64'), 884),
 ('Online.Purchase.Amount'

In [6]:
# REF_NO,post_code , post_area  : drop 
# children : Zero : 0 , 4+ : 4 and then convert to numeric
# age_band : dummies 
# status , occupation , occupation_partner , home_status,family_income : dummies
# self_employed, ` : dummies
# TVArea , Region , gender : dummies
# Revenue Grid : 1,2 : 1,0

In [7]:
bd_all.drop(['REF_NO','post_code','post_area'],axis=1,inplace=True)

In [8]:
bd_all['children']=np.where(bd_all['children']=='Zero',0,bd_all['children'])
bd_all['children']=np.where(bd_all['children'][:1]=='4',4,bd_all['children'])
bd_all['children']=pd.to_numeric(bd_all['children'],errors='coerce')

In [9]:
bd_all['Revenue.Grid']=(bd_all['Revenue.Grid']==1).astype(int)

In [10]:
cat_vars=bd_all.select_dtypes(['object']).columns

cat_vars

Index(['age_band', 'status', 'occupation', 'occupation_partner', 'home_status',
       'family_income', 'self_employed', 'self_employed_partner', 'TVarea',
       'gender', 'region', 'data'],
      dtype='object')

In [11]:
for col in cat_vars[:-1]:
    dummy=pd.get_dummies(bd_all[col],drop_first=True,prefix=col)
    bd_all=pd.concat([bd_all,dummy],axis=1)
    del bd_all[col]
    print(col)
del dummy

age_band
status
occupation
occupation_partner
home_status
family_income
self_employed
self_employed_partner
TVarea
gender
region


In [12]:
bd_all.shape

(10155, 96)

In [13]:
bd_all.isnull().sum()

children                           19
year_last_moved                     0
Average.Credit.Card.Transaction     0
Balance.Transfer                    0
Term.Deposit                        0
Life.Insurance                      0
Medical.Insurance                   0
Average.A.C.Balance                 0
Personal.Loan                       0
Investment.in.Mutual.Fund           0
Investment.Tax.Saving.Bond          0
Home.Loan                           0
Online.Purchase.Amount              0
Revenue.Grid                        0
Investment.in.Commudity             0
Investment.in.Equity                0
Investment.in.Derivative            0
Portfolio.Balance                   0
data                                0
age_band_22-25                      0
age_band_26-30                      0
age_band_31-35                      0
age_band_36-40                      0
age_band_41-45                      0
age_band_45-50                      0
age_band_51-55                      0
age_band_55-

In [14]:
bd_all.loc[bd_all['children'].isnull(),'children']=bd_all.loc[bd_all['data']=='train','children'].mean()


In [15]:
bd_train=bd_all[bd_all['data']=='train']
del bd_train['data']
bd_test=bd_all[bd_all['data']=='test']
bd_test.drop(['Revenue.Grid','data'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

In [17]:
params={'class_weight':['balanced',None],
        'penalty':['l1','l2'],
        'C':np.linspace(0.01,1000,10)}

In [18]:
model=LogisticRegression(fit_intercept=True)

In [19]:
grid_search=GridSearchCV(model,param_grid=params,cv=5,scoring="roc_auc")

NameError: name 'GridSearchCV' is not defined

In [None]:
x_train=bd_train.drop('Revenue.Grid',axis=1)
y_train=bd_train['Revenue.Grid']

In [None]:
grid_search.fit(x_train,y_train)

In [None]:
grid_search.best_estimator_

In [None]:
logr=grid_search.best_estimator_

In [None]:
report(grid_search.cv_results_,5)

In [None]:
logr.fit(x_train,y_train)

In [None]:
cutoffs=np.linspace(0.01,0.99,99)

cutoffs

In [None]:
train_score=logr.predict_proba(x_train)[:,1]

real=y_train

In [None]:
train_score>0.2

In [None]:
KS_all=[]

for cutoff in cutoffs:
    
    predicted=(train_score>cutoff).astype(int)

    TP=((predicted==1) & (real==1)).sum()
    TN=((predicted==0) & (real==0)).sum()
    FP=((predicted==1) & (real==0)).sum()
    FN=((predicted==0) & (real==1)).sum()
    
    P=TP+FN
    N=TN+FP
      
    KS=(TP/P)-(FP/N)
    
    
    KS_all.append(KS)

# try out what cutoffs you get when you use F_beta scores with different values of betas [0.5 , 5]
# beta < 1 : you will get cutoff , which is high ( favours precision)
# beta > 1 : you will get cutoff , which is low (favours precision )

In [None]:
mycutoff=cutoffs[KS_all==max(KS_all)][0]
mycutoff

In [None]:
logr.intercept_

In [None]:
list(zip(x_train.columns,logr.coef_[0]))

if you simply had to submit probability scores , you could do this 

In [None]:
test_score=logr.predict_proba(bd_test)[:,1]
pd.DataFrame(test_score).to_csv("mysubmission.csv",index=False)

if you had to submit hardclasses , you can apply the cutoff obtained above and then submit

In [None]:
test_classes=(test_score>mycutoff).astype(int)

In [None]:
pd.DataFrame(test_classes).to_csv("mysubmission.csv",index=False)