## Importing Libraries

In [5]:
import os                                #importing os (environment lookup for the data path)
import pickle                            #importing pickle (model serialisation)
from collections import Counter as c     #importing collections

import matplotlib.pyplot as plt         #importing matplotlib library
import numpy as np      #importing numpy library
import pandas as pd     #importing pandas library
import seaborn as sns                   #importing seaborn library
from sklearn import preprocessing       #importing preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

## Loading the dataset

In [6]:
# Location of the training CSV. Set the CREDIT_TRAIN_CSV environment variable to
# point at your own copy; when it is unset the original author's local path is used,
# so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "CREDIT_TRAIN_CSV",
    r"C:\Users\yashs\Python36\Loan-Status-Prediction-main\Loan-Status-Prediction-main\Dataset\credit_train.csv",
)
# Load the dataset into a DataFrame.
data = pd.read_csv(DATA_PATH)
# Show the number of rows and columns.
data.shape

FileNotFoundError: [Errno 2] File C:\Users\yashs\Loan-Status-Prediction-main\Dataset.csv does not exist: 'C:\\Users\\yashs\\Loan-Status-Prediction-main\\Dataset.csv'

In [None]:
# List the column names of the loaded DataFrame.
data.columns

In [None]:
# Display the first five rows of the dataset.
data.head()

## Null Values

In [None]:
# Count the missing values in every column of the dataset.
data.isna().sum()

## Categorical Columns

In [None]:
# Keep only the object-typed (categorical/text) columns and show their names.
object_train_df = data.select_dtypes(include='object')
object_train_df.columns

## Numerical Columns

In [None]:
# Keep only the numeric columns and show their names.
# 'number' selects every numeric dtype, whereas the 'int'/'float' aliases resolve
# to specific widths and can miss columns stored with another width.
num_train_df = data.select_dtypes(include='number')
num_train_df.columns

## Dropping Loan Status Null Values and Labeling it

In [None]:
# Drop every row whose target ('Loan Status') is missing.
# inplace=True mutates `data` itself rather than returning a new frame.
data.dropna(subset=['Loan Status'], inplace = True)

In [None]:
# One shared LabelEncoder instance; later cells re-fit this same object on other columns.
le = preprocessing.LabelEncoder()
# Learn the target classes, then replace the text labels with their integer codes.
le.fit(data['Loan Status'])
data['Loan Status'] = le.transform(data['Loan Status'])

## Target Column Visualization

In [None]:
# 'Loan Status' is the encoded target column.
# Rows equal to 0 are counted as charged-off loans ...
coffvalue = data.loc[data['Loan Status'] == 0, 'Loan Status'].count()
# ... and rows equal to 1 as fully paid loans.
fpaidvalue = data.loc[data['Loan Status'] == 1, 'Loan Status'].count()

# Two-row summary frame, drawn as a bar chart.
data1 = {"Counts": [coffvalue, fpaidvalue]}
statusDF = pd.DataFrame(data1, index=["Charged Off", "Fully Paid"])
statusDF.plot.bar(title="Status of the Loan")

## Term column Labeling

In [None]:
# Encode the Term column: "Short Term" -> 0, "Long Term" -> 1.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on data['Term']: an in-place call on a column
# selection is a chained assignment, which warns on pandas 2.x and does not
# update `data` at all once Copy-on-Write is enabled.
data['Term'] = data['Term'].replace({"Short Term": 0, "Long Term": 1})
data.head()

In [None]:
# Count the loans per encoded term (0 = short term, 1 = long term).
scount = data.loc[data['Term'] == 0, 'Term'].count()
lcount = data.loc[data['Term'] == 1, 'Term'].count()

# Summary frame holding the short-term and long-term counts.
data1 = {"Counts": [scount, lcount]}
termDF = pd.DataFrame(data1, index=["Short Term", "Long Term"])
termDF.head()

In [None]:
# Report how many Credit Score entries are missing.
missing_credit_scores = data['Credit Score'].isna().sum()
print("There are ", missing_credit_scores, "null values for Credit score.")

## Scaling Credit Score Column

In [None]:
def _rescale_credit_score(val):
    """Divide a score above 850 by 10; return any other value unchanged."""
    # NaN compares False against 850, so missing scores pass through untouched.
    if val > 850:
        return val / 10
    return val


# Bring scores above 850 back into range (presumably entries with a stray trailing digit — confirm).
data['Credit Score'] = data['Credit Score'].map(_rescale_credit_score)

## Handling Null values of Credit Score Column

In [None]:

# NOTE(review): do_nothing is not referenced anywhere in the visible notebook.
do_nothing = lambda: None

# Row masks for the two encoded loan terms.
is_short_term = data['Term'] == 0
is_long_term = data['Term'] == 1

# Mean credit score within each term group (NaNs are skipped by mean()).
cscoredf = data[is_short_term]
lscoredf = data[is_long_term]
stermAVG = cscoredf['Credit Score'].mean()
ltermAVG = lscoredf['Credit Score'].mean()

# Fill each missing score with the mean of its own term group.
# The two term masks are disjoint, so one missing-mask serves both assignments.
score_missing = data['Credit Score'].isnull()
data.loc[is_short_term & score_missing, 'Credit Score'] = stermAVG
data.loc[is_long_term & score_missing, 'Credit Score'] = ltermAVG

In [None]:
def credit_score_band(val):
    """Map a numeric credit score to its rating band.

    Bands: below 580 "Poor"; 580 up to (not including) 670 "Average";
    670 up to 740 "Good"; 740 up to 800 "Very Good"; 800 through 850
    "Exceptional". Values that are already strings, missing values, and
    scores above 850 are returned unchanged.
    """
    # Explicit pass-through for labels and NaN. The previous five-pass version
    # relied on np.isreal() being false for the strings written by earlier
    # passes, which NumPy documents as unreliable for string input.
    if isinstance(val, str) or pd.isna(val):
        return val
    if val < 580:
        return "Poor"
    if val < 670:
        return "Average"
    if val < 740:
        return "Good"
    if val < 800:
        return "Very Good"
    if val <= 850:
        return "Exceptional"
    return val


# Replace each numeric credit score with its band label in a single pass.
data['Credit Score'] = data['Credit Score'].apply(credit_score_band)

In [None]:
# Bar chart of loans per credit-score category, smallest count first.
credit_band_counts = data['Credit Score'].value_counts().sort_values()
credit_band_counts.plot.bar(title ='Number of loans in terms of Credit Score category')

## Annual Income Column

In [None]:
# Report how many Annual Income entries are missing.
missing_incomes = data['Annual Income'].isna().sum()
print("There are", missing_incomes, "Missing Annual Income Values.")

In [None]:
# Fill missing Annual Income values with the column mean.
# The filled column is assigned back instead of calling fillna(..., inplace=True)
# on data['Annual Income']: that chained in-place call warns on pandas 2.x and
# leaves `data` unmodified once Copy-on-Write is enabled.
data['Annual Income'] = data['Annual Income'].fillna(data['Annual Income'].mean())

In [None]:
# Current number of rows and columns.
data.shape

In [None]:
# Print how many rows fall into each Credit Score class.
# `c` is collections.Counter, already imported in the notebook's first cell, so
# the repeated import that used to sit here has been removed.
print(c(data['Credit Score']))

In [None]:
# Label-encode the Credit Score categories; this re-fits the shared encoder `le`.
le.fit(data['Credit Score'])
data['Credit Score'] = le.transform(data['Credit Score'])
# Class counts after encoding.
c(data['Credit Score'])

## Home Ownership Column

In [None]:
# Bar chart of loans per Home Ownership value, smallest count first.
ownership_counts = data['Home Ownership'].value_counts().sort_values()
ownership_counts.plot.bar(title="Number of Loan based on Home ownership")

In [None]:
# Show the class counts before and after label-encoding Home Ownership.
ownership_col = 'Home Ownership'
print(c(data[ownership_col]))
data[ownership_col] = le.fit(data[ownership_col]).transform(data[ownership_col])
print(c(data[ownership_col]))

## Years in current job

In [None]:
# Pull the first run of digits out of each 'Years in current job' string and
# store it as a float; entries with no digits (or missing) become NaN.
data['Years in current job'] = (
    data['Years in current job']
    .str.extract(r"(\d+)", expand=False)
    .astype(float)
)

In [None]:
# Mean of the parsed 'Years in current job' values; used in the following cells to fill the gaps.
expmean = data['Years in current job'].mean()

In [None]:
# Fill missing 'Years in current job' values with the column mean (expmean).
# Assigned back rather than using fillna(..., inplace=True) on the column
# selection, which is a chained in-place call: it warns on pandas 2.x and does
# not modify `data` once Copy-on-Write is enabled.
data['Years in current job'] = data['Years in current job'].fillna(expmean)

In [None]:
# NOTE(review): this cell repeats the fill from the previous cell; it is kept so
# the notebook's cell layout is unchanged.
# Assigned back rather than using the chained fillna(..., inplace=True), which
# does not modify `data` once pandas Copy-on-Write is enabled.
data['Years in current job'] = data['Years in current job'].fillna(expmean)

## Dropping unwanted columns

In [None]:
# Remove the two identifier columns and 'Purpose' from the frame.
unwanted_columns = ['Loan ID', 'Customer ID', 'Purpose']
data = data.drop(columns=unwanted_columns)

## Credit Problems

In [None]:
def credit_problem_band(x):
    """Bucket a credit-problem count into one of three text categories.

    0 (or a missing count) -> "No Credit Problem"; strictly between 0 and 5 ->
    "Some Credit promblem"; anything else -> "Major Credit Problems".
    """
    # A missing count used to fall through every comparison and land in the
    # "Major" bucket. It is treated as 0 here — assumption, confirm this is the
    # intended imputation for the dataset.
    if pd.isna(x) or x == 0:
        return "No Credit Problem"
    if 0 < x < 5:
        # Label spelling kept exactly as before so printed counts stay comparable.
        return "Some Credit promblem"
    return "Major Credit Problems"


# Derive the categorical 'Credit Problems' column from the raw count.
data['Credit Problems'] = data['Number of Credit Problems'].apply(credit_problem_band)

In [None]:
# Class counts for 'Credit Problems' before encoding ...
print(c(data['Credit Problems']))
# ... re-fit the shared encoder on this column and swap the labels for integer codes ...
le.fit(data['Credit Problems'])
data['Credit Problems'] = le.transform(data['Credit Problems'])
# ... and the counts again after encoding.
print(c(data['Credit Problems']))

## Credit Age

In [None]:
def credit_age_band(x):
    """Bucket years of credit history into one of three text categories.

    Below 5 -> "Short Credit Age"; from 5 up to (not including) 17 ->
    "Good Credit Age"; anything else -> "Exceptional Credit Age".
    """
    if x < 5:
        return "Short Credit Age"
    # The lower bound is inclusive: previously exactly 5 years matched neither
    # `x<5` nor `x>5` and fell through to "Exceptional Credit Age".
    if 5 <= x < 17:
        return "Good Credit Age"
    # NOTE(review): a missing value also ends up here, as it did before.
    return "Exceptional Credit Age"


# Derive the categorical 'Credit Age' column from 'Years of Credit History'.
data['Credit Age'] = data['Years of Credit History'].apply(credit_age_band)

In [None]:
# Show the class counts before and after label-encoding 'Credit Age'.
credit_age_col = 'Credit Age'
print(c(data[credit_age_col]))
data[credit_age_col] = le.fit(data[credit_age_col]).transform(data[credit_age_col])
print(c(data[credit_age_col]))

In [None]:
# Remove five numeric columns that are not used as model features.
unused_numeric_columns = [
    'Months since last delinquent',
    'Number of Open Accounts',
    'Maximum Open Credit',
    'Current Credit Balance',
    'Monthly Debt',
]
data = data.drop(columns=unused_numeric_columns)

## Tax Liens

In [None]:
def tax_lien_band(x):
    """Bucket a tax-lien count into one of three text categories.

    0 (or a missing count) -> "No Tax Lien"; strictly between 0 and 3 ->
    "Some Tax Liens"; anything else -> "Many Tax Liens".
    """
    # A missing count used to fall through every comparison and be labelled
    # "Many Tax Liens". It is treated as 0 here — assumption, confirm this is
    # the intended imputation for the dataset.
    if pd.isna(x) or x == 0:
        return "No Tax Lien"
    if 0 < x < 3:
        return "Some Tax Liens"
    return "Many Tax Liens"


# Replace the raw tax-lien count with its category label.
data['Tax Liens'] = data['Tax Liens'].apply(tax_lien_band)

In [None]:
# Class counts for 'Tax Liens' before encoding ...
print(c(data['Tax Liens']))
# ... re-fit the shared encoder and replace the labels with integer codes ...
le.fit(data['Tax Liens'])
data['Tax Liens'] = le.transform(data['Tax Liens'])
# ... and the counts again after encoding.
print(c(data['Tax Liens']))

## Bankruptcies

In [None]:
def bankruptcy_band(x):
    """Bucket a bankruptcy count into one of three text categories.

    0 (or a missing count) -> "No bankruptcies"; strictly between 0 and 3 ->
    "Some Bankruptcies"; anything else -> "Many Bankruptcies".
    """
    # A missing count used to fall through every comparison and be labelled
    # "Many Bankruptcies". It is treated as 0 here — assumption, confirm this
    # is the intended imputation for the dataset.
    if pd.isna(x) or x == 0:
        return "No bankruptcies"
    if 0 < x < 3:
        return "Some Bankruptcies"
    return "Many Bankruptcies"


# Replace the raw bankruptcy count with its category label.
data['Bankruptcies'] = data['Bankruptcies'].apply(bankruptcy_band)

In [None]:
# Show the class counts before and after label-encoding 'Bankruptcies'.
bankruptcies_col = 'Bankruptcies'
print(c(data[bankruptcies_col]))
data[bankruptcies_col] = le.fit(data[bankruptcies_col]).transform(data[bankruptcies_col])
print(c(data[bankruptcies_col]))

## Annual Income

In [None]:
# Income statistics computed only over rows below the 99999999 cut-off.
income_below_cutoff = data.loc[data['Annual Income'] < 99999999.00, 'Annual Income']
meanxoutlier = income_below_cutoff.mean()
stddevxoutlier = income_below_cutoff.std()

# Bucket boundaries: one standard deviation either side of the mean.
poorline = meanxoutlier - stddevxoutlier
richline = meanxoutlier + stddevxoutlier

In [None]:
def _income_band(x):
    """Label an income relative to the poorline / richline thresholds."""
    if x <= poorline:
        return "Low Income"
    if poorline < x < richline:
        return "Average Income"
    # Everything else, including values at or above richline.
    return "High Income"


# Replace the numeric income with its category label.
data['Annual Income'] = data['Annual Income'].apply(_income_band)

In [None]:
# Class counts for 'Annual Income' before encoding ...
print(c(data['Annual Income']))
# ... re-fit the shared encoder and replace the labels with integer codes ...
le.fit(data['Annual Income'])
data['Annual Income'] = le.transform(data['Annual Income'])
# ... and the counts again after encoding.
print(c(data['Annual Income']))

## Current Loan Amount

In [None]:
# Loan-amount statistics computed only over rows below the 99999999 cut-off
# (presumably a placeholder value in the data — confirm).
loan_below_cutoff = data.loc[data['Current Loan Amount'] < 99999999.00, 'Current Loan Amount']
lmeanxoutlier = loan_below_cutoff.mean()
lstddevxoutlier = loan_below_cutoff.std()

# Bucket boundaries: one standard deviation either side of the mean.
lowrange = lmeanxoutlier - lstddevxoutlier
highrange = lmeanxoutlier + lstddevxoutlier
print(lowrange, highrange)

In [None]:
def _loan_size_band(x):
    """Label a loan amount relative to the lowrange / highrange thresholds."""
    if x <= lowrange:
        return "Small Loan"
    if lowrange < x < highrange:
        return "Medium Loan"
    # Everything else, including amounts at or above highrange.
    return "Big Loan"


# Replace the numeric loan amount with its category label.
data['Current Loan Amount'] = data['Current Loan Amount'].apply(_loan_size_band)

In [None]:
# Show the class counts before and after label-encoding 'Current Loan Amount'.
loan_amount_col = 'Current Loan Amount'
print(c(data[loan_amount_col]))
data[loan_amount_col] = le.fit(data[loan_amount_col]).transform(data[loan_amount_col])
print(c(data[loan_amount_col]))

In [None]:
# Rows and columns remaining after the preprocessing cells above.
data.shape

## Separating Dependent and Independent Columns

In [None]:
# Split the frame into the feature matrix X and the target vector y.
target_col = 'Loan Status'
X = data.drop(columns=[target_col])
y = data[target_col]

In [None]:
# Preview the first five rows of the fully preprocessed frame (target column still included).
data.head()

## Performing Train and test split

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Fit a decision-tree classifier on the training split.
# random_state is fixed (same seed as the train/test split) so the fitted tree,
# and therefore the pickled model, is identical on every Restart & Run All.
# DecisionTreeClassifier is imported in the notebook's first cell.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)


In [None]:
# Predict the loan status for the held-out test rows.
y_pred_dt =dt.predict(X_test)  #prediction
# Count how many test rows were predicted for each class.
c(y_pred_dt)

## Creating a pickle file and dumping the model into it

In [None]:
# Serialise the trained decision tree to loan.pkl.
# The context manager closes the file handle once the dump completes; the bare
# open() used before left it open. pickle is imported in the notebook's first cell.
with open('loan.pkl', 'wb') as model_file:
    pickle.dump(dt, model_file)

In [None]:
# Shell escape: runs the external `jt` command with the -tmonokai flag
# (presumably jupyterthemes, switching the notebook UI theme — confirm).
# It does not touch the data or the model.
! jt -tmonokai