# LendingClub - Logistic Regression 

In [None]:
# Load the raw loan records from disk.
loans <- read.csv("../Csv_Files/loans.csv")

In [None]:
# Column types and a preview of the first values.
str(loans)

In [None]:
# Per-column summary statistics (also reports NA counts).
summary(loans)

In [None]:
library(mice)

#### Fill the missing values for all the attributes except class

In [None]:
# Every column except the class label is a candidate for imputation.
fill_feature <- setdiff(names(loans), "not.fully.paid")

#### Fill the missing values using the other, non-missing variables as predictors. Example: if it is raining, a missing temperature value is likely to be low. (Multivariate Imputation by Chained Equations in R)

In [None]:
# Impute the missing predictor values with mice and keep the first completed
# data set. mice() draws random numbers, so a seed is passed explicitly to make
# the imputation reproducible from run to run.
complete_feature <- complete(mice(loans[fill_feature], seed = 144))

In [None]:
# Overwrite the predictor columns with their imputed versions; the class
# label column is left untouched.
loans[fill_feature] <- complete_feature

#### Features with different scales are rescaled to a common range, so that features with larger values do not dominate those with smaller values

In [None]:
# Columns to standardise: everything except the class label and `purpose`.
scale_features <- setdiff(names(loans), c("not.fully.paid", "purpose"))

In [None]:
# Centre and scale each selected column.
# NOTE(review): scaled_features is not written back into `loans` and is not
# referenced again in the visible code - confirm whether that is intended.
scaled_features <- scale(loans[scale_features])

In [None]:
# Fix the RNG state so the split below is reproducible.
set.seed(144)

In [None]:
library(caTools)

#### Split the data into training and test data

In [None]:
# Logical mask over the rows of `loans`; TRUE marks a training row
# (SplitRatio = 0.75).
split <- sample.split(loans$not.fully.paid, SplitRatio = 0.75)

In [None]:
# Rows flagged TRUE form the training set.
Train <- subset(loans, split)

In [None]:
# The remaining rows form the test set.
Test <- subset(loans, !split)

In [None]:
# Load the train/test partitions stored on disk; the models in the visible
# code are fit on loanTrain and evaluated on loanTest.
loanTrain <- read.csv("../Csv_Files/loanTrain.csv")

In [None]:
loanTest <- read.csv("../Csv_Files/loanTest.csv")

#### Create logistic regression model

In [None]:
# Binomial GLM (logistic regression) of not.fully.paid on every other column
# of loanTrain.
Model <- glm(not.fully.paid ~ ., data = loanTrain, family = binomial)

In [None]:
# Coefficient table and fit statistics.
summary(Model)

In [None]:
# type = "response" yields predicted probabilities instead of log-odds.
loanTest$predict.test <- predict(Model, newdata = loanTest, type = "response")

#### Predict the output of loanTest

In [None]:
# Confusion matrix at a 0.5 cut-off: rows are the actual labels, columns the
# predicted class.
table(loanTest$not.fully.paid, as.numeric(loanTest$predict.test > 0.5))

###                        Feature understanding

#### In the output of summary(), int.rate is not influential, so some features may be correlated with each other.

In [None]:
# Pairwise correlations between the columns listed in scale_features.
correlationMatrix <- cor(loans[scale_features])

#### Find the highly correlated feature

In [None]:
library(caret)

In [None]:
# Column indices of correlationMatrix that findCorrelation() flags at a
# pairwise-correlation cut-off of 0.5.
highlyCorrelated <- findCorrelation(
  correlationMatrix,
  cutoff = 0.5,
  verbose = TRUE
)

In [None]:
# Translate the flagged indices into column names instead of relying on a
# hard-coded position, so the result stays correct if the columns change.
colnames(correlationMatrix)[highlyCorrelated]

In [None]:
 # Refit the logistic regression with an explicit list of predictors.
Model <- glm(
  not.fully.paid ~ credit.policy + purpose + int.rate + installment +
    log.annual.inc + dti + days.with.cr.line + revol.bal + revol.util +
    inq.last.6mths + delinq.2yrs + pub.rec,
  data = loanTrain,
  family = binomial
)

In [None]:
# Predicted probabilities on the test frame from the refitted model.
loanTest$predict.test <- predict(Model, newdata = loanTest, type = "response")

In [None]:
# Confusion matrix at a 0.5 cut-off (rows: actual, columns: predicted).
table(loanTest$not.fully.paid, as.numeric(loanTest$predict.test > 0.5))

#### Slight improvement in the model

#### Selecting features carefully can improve your predictions

### Understand response function

 ####    Change the G function manually

#### Find the coefficient of the model

In [None]:
# Fitted coefficients of the current Model (intercept included).
betas <- coef(Model)

In [None]:
# Display the coefficients.
betas

#### Convert the named coefficient vector into a plain numeric vector (for matrix multiplication)

In [None]:
# Plain (unnamed) copy of the coefficient vector, used as the right-hand side
# of the matrix product with the design matrix. unname() replaces the
# element-by-element copy loop and also behaves correctly for an empty vector.
para <- unname(betas)

In [None]:
# Design matrix for the test frame, built from the same formula as the
# refitted model so its columns line up with the coefficients.
X <- model.matrix(
  not.fully.paid ~ credit.policy + purpose + int.rate + installment +
    log.annual.inc + dti + days.with.cr.line + revol.bal + revol.util +
    inq.last.6mths + delinq.2yrs + pub.rec,
  data = loanTest
)

#### Extract the probability of the class

In [None]:
# Standard logistic (sigmoid) response applied to the linear predictor
# X %*% para.
prob_class <- 1 / (1 + exp(-(X %*% para)))

In [None]:
# Confusion matrix at a 0.5 cut-off (rows: actual, columns: predicted).
table(loanTest$not.fully.paid, as.numeric(prob_class > 0.5))

In [None]:
# Modified response: the exponential term is halved. With the 0.5 cut-off
# used below, this is equivalent to cutting the standard sigmoid at 1/3.
prob_class <- 1 / (1 + exp(-(X %*% para)) / 2)

In [None]:
table(loanTest$not.fully.paid, as.numeric(prob_class > 0.5))

#### Understanding g function may improve the prediction

In [None]:
library(gplots)

In [None]:
library(ROCR)

#### Plotting precision against sensitivity (recall) helps in choosing a better prediction threshold

In [None]:
# Pair the predicted probabilities with the true labels for ROCR.
pred <- prediction(predictions = prob_class, labels = loanTest$not.fully.paid)

In [None]:
# Precision (ppv) on the y-axis against true positive rate (tpr) on the x-axis.
ROCRperf <- performance(pred, measure = "ppv", x.measure = "tpr")

In [None]:
plot(ROCRperf)

In [None]:
# Same curve, coloured by cut-off and annotated at every 0.1 step.
plot(
  ROCRperf,
  colorize = TRUE,
  print.cutoffs.at = seq(0, 1, by = 0.1),
  text.adj = c(-0.2, 1.7)
)