# 1. Getting the system ready and loading the data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from collections import Counter


In [2]:
import pickle

# Reading data

In [3]:
# Load the training data from its CSV export.
train_csv_path = 'E:/Project/train.csv'
train = pd.read_csv(train_csv_path)

In [4]:
# Load the test data from its CSV export.
test_csv_path = 'E:/Project/test.csv'
test = pd.read_csv(test_csv_path)

# Missing value imputation

In [5]:
# Missing data per column: absolute count and fraction of rows,
# both sorted from the most to the least affected column.
null_mask = train.isnull()
missing_per_column = null_mask.sum()
rows_per_column = null_mask.count()
total = missing_per_column.sort_values(ascending=False)
percent = (missing_per_column / rows_per_column).sort_values(ascending=False)


Fill missing values with the mode for Gender, Married, Dependents, Credit_History, and Self_Employed

In [6]:
# Impute these columns with their most frequent value (the mode).
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on train[col]: that form works on a temporary
# Series, is deprecated, and leaves `train` unchanged under pandas
# Copy-on-Write.
for column in ['Gender', 'Married', 'Self_Employed', 'Credit_History']:
    train[column] = train[column].fillna(train[column].mode()[0])

In [7]:
# find  missing values in Loan_Amount_Term.

In [8]:
# Fill missing Dependents with the column's mode. The result is assigned back
# because an in-place fillna on train['Dependents'] is not guaranteed to
# update `train` (it is a no-op under pandas Copy-on-Write).
train['Dependents'] = train['Dependents'].fillna(train['Dependents'].mode()[0])

In [9]:
# Display the most frequent value of the Dependents column.
train['Dependents'].mode().iloc[0]

'0'

In [10]:
# The value 360 occurs 512 times in Loan_Amount_Term, so we can fill the missing values with the mode.

In [11]:
# Fill missing Loan_Amount_Term with the column's mode. The result is assigned
# back because an in-place fillna on the selected column is not guaranteed to
# update `train` (it is a no-op under pandas Copy-on-Write).
train['Loan_Amount_Term'] = train['Loan_Amount_Term'].fillna(train['Loan_Amount_Term'].mode()[0])

In [12]:
# LoanAmount is a numerical variable, so we can use the mean or the median to fill its missing values.
# LoanAmount has outliers, so we use the median instead of the mean.

In [13]:
# Fill missing LoanAmount with the column's median. The result is assigned
# back because an in-place fillna on the selected column is not guaranteed to
# update `train` (it is a no-op under pandas Copy-on-Write).
train['LoanAmount'] = train['LoanAmount'].fillna(train['LoanAmount'].median())

In [14]:
# Check that the missing values in the dataset have been filled.

In [15]:
# Count the remaining missing values in every column of the training data.
train.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [16]:
# Recompute the per-column missing-value count and fraction of rows,
# each sorted in descending order.
is_missing = train.isnull()
missing_counts = is_missing.sum()
total = missing_counts.sort_values(ascending=False)
percent = (missing_counts / is_missing.count()).sort_values(ascending=False)

In [17]:
# Fill the missing values in the test dataset the same way as in the train
# dataset. The fill values are computed on the TRAINING data, so both
# datasets are imputed with identical statistics.
test_fill_values = {
    'Gender': train['Gender'].mode()[0],
    'Married': train['Married'].mode()[0],
    'Self_Employed': train['Self_Employed'].mode()[0],
    'Credit_History': train['Credit_History'].mode()[0],
    'Loan_Amount_Term': train['Loan_Amount_Term'].mode()[0],
    'LoanAmount': train['LoanAmount'].median(),
}

# Assign the filled Series back to the column: fillna(..., inplace=True) on
# test[col] works on a temporary Series and leaves `test` unchanged under
# pandas Copy-on-Write.
for column, fill_value in test_fill_values.items():
    test[column] = test[column].fillna(fill_value)

In [18]:
# Fill missing Dependents in the test data with the mode of the TRAINING data.
# The result is assigned back because an in-place fillna on the selected
# column is not guaranteed to update `test` (it is a no-op under pandas
# Copy-on-Write).
test['Dependents'] = test['Dependents'].fillna(train['Dependents'].mode()[0])

# Outlier Treatment

In [19]:
# Let's first consider how to treat the outliers in LoanAmount.
# The bulk of the loan amounts lies on the left and the right tail is longer; this is called right skewness.
# One way to reduce the skewness is to apply a log transformation.

In [20]:
# Replace LoanAmount with its natural logarithm in both datasets.
for frame in (train, test):
    frame['LoanAmount'] = np.log(frame['LoanAmount'])

In [21]:
# Here the distribution looks much closer to normal, and the effect of extreme values has subsided significantly.

#Skewness and kurtosis of Train Data

# Skewness and kurtosis of test data

# Remove skewness by power transformation

In [22]:
#********************* Remove skewness from LoanAmount (train and test) *******************
from scipy.stats import boxcox

# Fit the Box-Cox lambda on the training data only and keep it, so the exact
# same transformation can be applied to any other dataset.
loan_amount_boxcox, loan_amount_lambda = boxcox(train['LoanAmount'])
train['LoanAmount'] = loan_amount_boxcox

# Transform the test data with the lambda fitted on train. Calling boxcox()
# on test without `lmbda` would fit a second, different lambda and leave the
# train and test LoanAmount features on different scales.
test['LoanAmount'] = boxcox(test['LoanAmount'], lmbda=loan_amount_lambda)


In [24]:
# Logistic regression is a classification algorithm.
# It is used to predict a binary outcome (1 / 0, Yes / No, True / False) given a set of independent variables.
# Logistic regression is an estimation of the logit function, which is simply the log of the odds in favor of the event.
# This function creates an S-shaped curve with the probability estimate, which is very similar to the required stepwise function.

In [25]:
# The Loan_ID variable does not have any effect on the loan status, so remove it from the train and test datasets.

In [26]:
# Drop the Loan_ID column from both datasets.
train = train.drop(columns='Loan_ID')
test = test.drop(columns='Loan_ID')

In [27]:
# Drop the target variable from the training dataset and save it in a separate variable,
# as sklearn requires the target to be passed separately from the features.

In [28]:
# Separate the features (X) from the target column (y).
# `columns=` is used instead of the positional axis argument
# (train.drop('Loan_Status', 1)), which raises TypeError on pandas >= 2.0
# because every argument of drop() after `labels` is keyword-only.
X = train.drop(columns='Loan_Status')
y = train['Loan_Status']

In [29]:
# Encode the two-valued categorical columns as 0/1 integers.
# One shared mapping table is applied to BOTH datasets so that test gets
# exactly the same encoding as train, like every earlier preprocessing step
# in this notebook.
# NOTE: this cell is not idempotent. Running it twice maps the already
# encoded 0/1 values to NaN, so re-run it only after reloading the data.
binary_encodings = {
    'Gender': {'Male': 0, 'Female': 1},
    'Married': {'No': 0, 'Yes': 1},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed': {'No': 0, 'Yes': 1},
}
for column, encoding in binary_encodings.items():
    train[column] = train[column].map(encoding)
    test[column] = test[column].map(encoding)

In [30]:
# Dependents and Property_Area are left un-encoded here (no mapping is
# applied to them). Preview the first 50 rows of the training data.
train.iloc[:50]


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,0,0,1,0,5849,0.0,6.039031,360.0,1.0,Urban,Y
1,0,1,1,1,0,4583,1508.0,6.039031,360.0,1.0,Rural,N
2,0,1,0,1,1,3000,0.0,4.758102,360.0,1.0,Urban,Y
3,0,1,0,0,0,2583,2358.0,5.910652,360.0,1.0,Urban,Y
4,0,0,0,1,0,6000,0.0,6.232844,360.0,1.0,Urban,Y
5,0,1,2,1,1,5417,4196.0,7.55308,360.0,1.0,Urban,Y
6,0,1,0,0,0,2333,1516.0,5.452301,360.0,1.0,Urban,Y
7,0,1,3+,1,0,3036,2504.0,6.463061,360.0,0.0,Semiurban,N
8,0,1,2,1,0,4006,1526.0,6.588117,360.0,1.0,Urban,Y
9,0,1,1,1,0,12841,10968.0,8.127293,360.0,1.0,Semiurban,N


In [31]:
# Columns used as model inputs; Loan_Status is the target.
feature_columns = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]
X = train[feature_columns]
y = train['Loan_Status']

In [32]:
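# The “Gender” variable has two classes, Male and Female.
# Logistic regression takes only numerical values as input.
# One option is to turn “Gender” into two dummy variables (Gender_Male and Gender_Female),
# where Gender_Male = 0 if gender = Female and Gender_Male = 1 if gender = Male.
# (This notebook encodes Gender with an explicit 0/1 map instead; the get_dummies calls in the next cell are commented out.)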

In [33]:
#X = pd.get_dummies(X)
#train=pd.get_dummies(train)
#test=pd.get_dummies(test)

In [34]:
# Divide the train dataset into two parts: train and validation.
# We train the model on the training part and use it to make predictions for the validation part.

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

# Hold out 30% of the rows for validation. A fixed random_state makes the
# split, and therefore the fitted model and its score, reproducible across
# kernel restarts.
x_train, x_cv, y_train, y_cv = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the logistic regression model. max_iter is raised above the default
# because the features are not scaled, and the convergence warning that would
# flag an unconverged solver is silenced by warnings.filterwarnings("ignore").
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

# Predict Loan_Status for the validation set and show its accuracy.
pred_cv = model.predict(x_cv)
accuracy_score(y_cv, pred_cv)

In [36]:
# Serialise the fitted model to disk. The context manager closes the file
# even if pickle.dump raises, so the handle is never leaked.
with open('E:/Project/c1.pkl', mode='wb') as pickle_out:
    pickle.dump(model, pickle_out)