# IMPORTING PACKAGES

In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn import feature_selection
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings("ignore")




# Both CSVs live in the same input directory; df is the training frame, df1 the test frame.
input_dir = "../input/analytics-vidhya-loan-prediction"
df = pd.read_csv(f"{input_dir}/train.csv")
df1 = pd.read_csv(f"{input_dir}/test.csv")

# READING THE DATA

In [None]:
# Keep untouched copies: later cells overwrite and drop columns in df / df1.
train_original, test_original = df.copy(), df1.copy()

In [None]:
# Preview the first ten training rows.
df.head(n=10)

In [None]:
#COUNT THE NUMBER OF ROWS AND COLUMNS IN THE DATA SET.

In [None]:
# (rows, columns) of the training frame read from train.csv.
df.shape

In [None]:
# (rows, columns) of the test frame read from test.csv.
df1.shape

In [None]:
#GET SOME STATISTICS

In [None]:
# Summary statistics of the training frame (pandas describes numeric columns by default).
df.describe()

In [None]:
# Column dtypes and non-null counts of the training frame.
df.info()

In [None]:
#COUNT THE EMPTY VALUES IN EACH COLUMN

# isna() is the same check as isnull(); summing the mask counts gaps per column.
df.isna().sum()

In [None]:
#VIEWING THE TEST DATASET

# First five rows (head's default) of the frame read from test.csv.
df1.head(n=5)

In [None]:
# Column dtypes and non-null counts of the test frame.
df1.info()

In [None]:
#COUNT THE EMPTY VALUES IN EACH COLUMN

# Missing-value count per column of the test frame.
df1.isna().sum()

# EXPLORING AND PREPARING THE DATA

In [None]:
data=[df,df1]
# Print the category frequencies of every text column in each frame.
# The report loop is nested inside the per-frame loop and reads from
# `dataset`, so both frames are covered and each uses its own column list
# (previously only df was printed, using the column list computed for df1).
for position, dataset in enumerate(data, start=1):
    print('\n===== Frame %d of %d =====' % (position, len(data)))

    #FILTER CATEGORICAL VARIABLES
    categorical_columns=[x for x in dataset.dtypes.index if dataset.dtypes[x]=='object']
    #EXCLUDE ID COLS AND SOURCE:
    categorical_columns =[x for x in categorical_columns if x not in['Loan_ID']]

    #PRINT FREQUENCY OF CATEGORIES
    for col in categorical_columns:
        print('\nFrequency of categories for variable %s'%col)
        print(dataset[col].value_counts())
    
    

In [None]:
#GENDER

# Name the column through x=: seaborn's first positional parameter is `data`
# since 0.12, so a positionally passed Series is no longer treated as x.
sns.countplot(data=df, x='Gender')

In [None]:
# Gender vs. loan outcome, with row/column totals.
pd.crosstab(index=df["Gender"], columns=df["Loan_Status"], margins=True)

In [None]:
#DEPENDENTS

# Slice sizes are read from the data instead of hand-copied counts, so the
# chart cannot drift from df.
dependents_counts = df['Dependents'].value_counts().sort_index()
labels = [str(category) for category in dependents_counts.index]
size = dependents_counts.tolist()
# Pull out only the first slice, as the hand-written explode tuple did.
explode = [0.05 if slice_no == 0 else 0 for slice_no in range(len(size))]

plt.figure(figsize=(6,6))
plt.pie(size,explode=explode, labels=labels, autopct='%1.1f%%',shadow=True, startangle=90)
plt.axis('equal')
plt.show()

In [None]:
#GET THE COUNT OF THE NUMBER OF DEPENDENTS

# Rows per Dependents category.
df["Dependents"].value_counts()

In [None]:
# Dependents vs. loan outcome, with row/column totals.
pd.crosstab(index=df["Dependents"], columns=df["Loan_Status"], margins=True)

In [None]:
#CREDIT HISTORY

# Name the column through x=: seaborn's first positional parameter is `data`
# since 0.12, so a positionally passed Series is no longer treated as x.
sns.countplot(data=df, x='Credit_History')

In [None]:
# Credit history vs. loan outcome, with row/column totals.
pd.crosstab(index=df["Credit_History"], columns=df["Loan_Status"], margins=True)

In [None]:
#SELF EMPLOYED

# Name the column through x=: seaborn's first positional parameter is `data`
# since 0.12, so a positionally passed Series is no longer treated as x.
sns.countplot(data=df, x='Self_Employed')

In [None]:
# Self-employment vs. loan outcome, with row/column totals.
pd.crosstab(index=df["Self_Employed"], columns=df["Loan_Status"], margins=True)

In [None]:
#MARRIED

# Name the column through x=: seaborn's first positional parameter is `data`
# since 0.12, so a positionally passed Series is no longer treated as x.
sns.countplot(data=df, x='Married')

In [None]:
# Marital status vs. loan outcome, with row/column totals.
pd.crosstab(index=df["Married"], columns=df["Loan_Status"], margins=True)

In [None]:
#LOAN AMOUNT 

# Distribution of the requested loan amount.
sns.displot(data=df["LoanAmount"])

In [None]:
#EDUCATION

# Name the column through x=: seaborn's first positional parameter is `data`
# since 0.12, so a positionally passed Series is no longer treated as x.
sns.countplot(data=df, x='Education')

In [None]:
#PROPERTY AREA

# Name the column through x=: seaborn's first positional parameter is `data`
# since 0.12, so a positionally passed Series is no longer treated as x.
sns.countplot(data=df, x='Property_Area')

In [None]:
# APPLICANT INCOME

# 50-bin histogram of applicant income.
applicant_income = df['ApplicantIncome']
applicant_income.hist(bins=50)

In [None]:
# Box plot of applicant income over the whole training frame.
df.boxplot(column='ApplicantIncome')

In [None]:
# APPLICANT INCOME AND EDUCATION

# One income box per Education category.
df.boxplot(by='Education', column='ApplicantIncome')


In [None]:
# Loan_Status still holds 'Y'/'N' at this point; turn it into 1/0 so the mean
# of each group is the share of approved loans.
status_to_flag = {'Y': 1, 'N': 0}


def approval_rate(status):
    """Return the mean of the Y/N labels after mapping them to 1/0."""
    return status.map(status_to_flag).mean()


# Applicants per credit-history class, smallest class first.
temp1 = df['Credit_History'].value_counts(ascending=True)
# Approval rate per credit-history class.
temp2 = df.pivot_table(values='Loan_Status', index=['Credit_History'], aggfunc=approval_rate)

print('Frequency Table for Credit History:')
print(temp1)
print('\n probability of getting loan for each credit history class:')
print(temp2)

In [None]:
# Two panels side by side: applicant counts (temp1) and approval rate (temp2)
# per credit-history class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))

# Pass the target axes explicitly. temp2 is a DataFrame, and DataFrame.plot
# opens a brand-new figure when no ax is given, which left the right-hand
# panel empty.
temp1.plot(kind='bar', ax=ax1)
temp2.plot(kind='bar', ax=ax2)

# Label after plotting so the plot call cannot overwrite the axis labels.
ax1.set_xlabel('Credit_History')
ax1.set_ylabel('Count of applicants')
ax1.set_title("Applicants by credit_History")
ax2.set_xlabel('Credit_History')
ax2.set_ylabel('Probability of getting loan')
ax2.set_title("Probability of getting loan by credit history")

# Keep the two panels' labels from overlapping.
fig.tight_layout()

In [None]:
# Stacked bars: loan outcomes within each credit-history class.
temp3 = pd.crosstab(index=df['Credit_History'], columns=df['Loan_Status'])
temp3.plot.bar(stacked=True, color=['red', 'blue'], grid=False)

In [None]:
# Missing values per column (vectorised form of summing each column's null mask).
df.isnull().sum(axis=0)

In [None]:
# Impute missing loan amounts with the median, the statistic the later
# LoanAmount imputation cells name for both the train and the test frame
# (this cell previously used the mean, which made the later median fill on df
# a no-op). The filled Series is assigned back because
# fillna(inplace=True) on a column selection does not update df when pandas
# copy-on-write is active.
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())

In [None]:
# Rows per Self_Employed category.
df.Self_Employed.value_counts()

# CORRELATION BETWEEN ALL THE NUMERICAL VARIABLES


In [None]:
# HEATMAP REPRESENTATION OF THE CORRELATION

# Correlate numeric columns only: df still contains text columns here, and
# DataFrame.corr() raises on them from pandas 2.0 (older versions dropped
# them silently, so the result is unchanged there).
matrix = df.select_dtypes(include="number").corr()
f, ax = plt.subplots(figsize=(9,6))
# Draw on the axes created above rather than on whatever is current.
sns.heatmap(matrix, vmax=.8, square=True, cmap="BuPu", ax=ax)

In [None]:
#COUNT THE EMPTY VALUES IN EACH COLUMN

# Missing-value count per column of the training frame.
df.isna().sum()

In [None]:
# Preview the training frame before the categorical columns are encoded.
df.head()

# REPLACING THE CATEGORICAL VALUES

In [None]:
#CONVERTING STRING VALUES(CATEGORICAL VALUES) TO INTEGER

# Series.map turns every label that is missing from the dict into NaN, so each
# dict must list every raw label of its column.
# Two dicts were incomplete: Education used the key "Not" and Property_Area had
# no entry for the third area, so those rows became NaN and were later
# overwritten by the mode fill.
# NOTE(review): the labels "Not Graduate" and "Semiurban" are assumed from the
# Analytics Vidhya loan dataset — confirm with df[col].unique() on the raw data.
category_codes = {
    "Gender": {"Female": 0, "Male": 1},
    "Married": {"No": 0, "Yes": 1},
    "Self_Employed": {"No": 0, "Yes": 1},
    "Education": {"Not Graduate": 1, "Graduate": 0},
    "Property_Area": {"Urban": 0, "Rural": 1, "Semiurban": 2},
    "Loan_Status": {"N": 0, "Y": 1},
    "Dependents": {"3+": 3, "0": 0, "1": 1, "2": 2},
}

# Run once per kernel: mapping already-encoded integers again yields NaN.
for column, codes in category_codes.items():
    df[column] = df[column].map(codes)



In [None]:
# Preview the training frame after the categorical columns were mapped to integer codes.
df.head()

In [None]:
#CONVERTING STRING VALUES(CATEGORICAL VALUES)TO INTEGER

# Series.map turns every label that is missing from the dict into NaN, so each
# dict must list every raw label of its column.
# Two dicts were incomplete: Education used the key "Not" and Property_Area had
# no entry for the third area, so those rows became NaN and were later
# overwritten by the mode fill.
# NOTE(review): the labels "Not Graduate" and "Semiurban" are assumed from the
# Analytics Vidhya loan dataset — confirm with df1[col].unique() on the raw data.
test_category_codes = {
    "Gender": {"Female": 0, "Male": 1},
    "Married": {"No": 0, "Yes": 1},
    "Self_Employed": {"No": 0, "Yes": 1},
    "Education": {"Not Graduate": 1, "Graduate": 0},
    "Property_Area": {"Urban": 0, "Rural": 1, "Semiurban": 2},
    "Dependents": {"3+": 3, "0": 0, "1": 1, "2": 2},
}

# Run once per kernel: mapping already-encoded integers again yields NaN.
for column, codes in test_category_codes.items():
    df1[column] = df1[column].map(codes)



In [None]:
# Preview the test frame after the categorical columns were mapped to integer codes.
df1.head()

# FILLING MISSING VALUES


In [None]:
# Impute each listed column with its most frequent value.
# The filled Series is assigned back because fillna(inplace=True) on a
# df[col] selection does not update df when pandas copy-on-write is active.
mode_filled_columns = [
    "Gender",
    "Married",
    "Dependents",
    "Self_Employed",
    "Credit_History",
    "Education",
    "Property_Area",
]
for column in mode_filled_columns:
    df[column] = df[column].fillna(df[column].mode()[0])




In [None]:
#FIND THE MISSING VALUES IN LOAN AMOUNT TERM TO FILL

# Rows per loan term; the most frequent term is what the next cell fills with.
df.Loan_Amount_Term.value_counts()

In [None]:
# Impute the loan term with its most frequent value. Assign the result back:
# fillna(inplace=True) on a column selection does not update df when pandas
# copy-on-write is active.
df["Loan_Amount_Term"] = df["Loan_Amount_Term"].fillna(df["Loan_Amount_Term"].mode()[0])

In [None]:
#FILLING THE NULL VALUES USING THE MEDIAN IN LOANAMOUNT
# Assign the result back: fillna(inplace=True) on a column selection does not
# update df when pandas copy-on-write is active. If an earlier cell already
# imputed LoanAmount, there is nothing left to fill and the column is unchanged.
df["LoanAmount"] = df["LoanAmount"].fillna(df["LoanAmount"].median())

In [None]:
# Missing values left per column in the training frame after imputation.
df.isna().sum()

# FILLING MISSING VALUES IN TEST DATA

In [None]:
#COUNT THE EMPTY VALUES IN EACH COLUMN

# Missing-value count per column of the test frame before imputation.
df1.isna().sum()

In [None]:
# Impute each listed test column with its own most frequent value.
# The filled Series is assigned back because fillna(inplace=True) on a
# df1[col] selection does not update df1 when pandas copy-on-write is active.
test_mode_filled_columns = [
    "Gender",
    "Married",
    "Dependents",
    "Self_Employed",
    "Credit_History",
    "Education",
    "Property_Area",
]
for column in test_mode_filled_columns:
    df1[column] = df1[column].fillna(df1[column].mode()[0])




In [None]:
#FIND THE MISSING VALUES IN LOAN AMOUNT TERM TO FILL
# Rows per loan term in the test frame.
df1.Loan_Amount_Term.value_counts()

In [None]:
# Impute the test loan term with its most frequent value. Assign the result
# back: fillna(inplace=True) on a column selection does not update df1 when
# pandas copy-on-write is active.
df1["Loan_Amount_Term"] = df1["Loan_Amount_Term"].fillna(df1["Loan_Amount_Term"].mode()[0])

In [None]:
#FILLING THE NULL VALUES USING THE MEDIAN IN LOANAMOUNT
# Assign the result back: fillna(inplace=True) on a column selection does not
# update df1 when pandas copy-on-write is active.
df1["LoanAmount"] = df1["LoanAmount"].fillna(df1["LoanAmount"].median())

In [None]:


# Missing values left per column in the test frame after imputation.
df1.isna().sum()

# FEATURE ENGINEERING

In [None]:
# Combined applicant + co-applicant income, added to both frames.
for frame in (df, df1):
    frame['Total_Income'] = frame["ApplicantIncome"] + frame['CoapplicantIncome']

In [None]:
# Distribution of total income in the training frame.
sns.displot(data=df["Total_Income"])

In [None]:
# Natural log of total income, added to both frames.
for frame in (df, df1):
    frame['Total_Income_log'] = np.log(frame["Total_Income"])

In [None]:
# Distribution of log total income in the test frame.
sns.displot(data=df1["Total_Income_log"])

In [None]:
# EMI feature: loan amount divided by loan term, added to both frames.
for frame in (df, df1):
    frame['EMI'] = frame["LoanAmount"] / frame['Loan_Amount_Term']

In [None]:
# Distribution of EMI in the training frame.
sns.displot(data=df["EMI"])

In [None]:
# NOTE(review): this repeats the preceding cell's plot of df["EMI"] exactly;
# possibly the test frame (df1) was intended here — confirm or delete.
sns.displot(df["EMI"])

In [None]:
#MULTIPLYING BY 1000 TO MAKE THE UNITS EQUAL

# Income left after the EMI (scaled by 1000) is subtracted, added to both frames.
for frame in (df, df1):
    frame['Balance Income'] = frame["Total_Income"] - (frame['EMI'] * 1000)

In [None]:
# Distribution of balance income in the training frame.
sns.displot(data=df["Balance Income"])

In [None]:
# NOTE(review): this repeats the preceding cell's plot of df["Balance Income"]
# exactly; possibly the test frame (df1) was intended here — confirm or delete.
sns.displot(df["Balance Income"])

# DROPPING THE VARIABLES

In [None]:
# Remove the ID and the raw columns that the engineered features were built from.
columns_to_discard = ["Loan_ID","ApplicantIncome","CoapplicantIncome","LoanAmount","Loan_Amount_Term"]
df = df.drop(columns=columns_to_discard)
df

In [None]:
#LOOK AT THE DATA TYPES

# dtype of each column remaining in the training frame after the drop.
df.dtypes

In [None]:
# Remove from the test frame the raw columns that the engineered features were built from.
test_columns_to_discard = ["ApplicantIncome","CoapplicantIncome","LoanAmount","Loan_Amount_Term"]
df1 = df1.drop(columns=test_columns_to_discard)


In [None]:
# Drop the identifier from the test frame (test_original still holds it for the submission).
df1 = df1.drop(columns="Loan_ID")
df1

In [None]:
#LOOK AT THE DATA TYPES
# dtype of each column remaining in the test frame after the drops.
df1.dtypes

#  GRADIENT BOOSTING CLASSIFIER

In [None]:
#LETS PREPARE THE DATA FOR FEEDING IN TO THE MODELS
#SAVE THE TARGET VARIABLE IN SEPARATE

# columns= replaces the positional axis argument (df.drop("Loan_Status", 1)),
# which pandas 2.0 no longer accepts.
x = df.drop(columns="Loan_Status")
y = df["Loan_Status"]

In [None]:
# One-hot encode the feature matrix and the full training frame.
x, df = pd.get_dummies(x), pd.get_dummies(df)

In [None]:
# Hold out 30% of the labelled rows for validation; the seed fixes the split.
split_parts = train_test_split(x, y, test_size=0.3, random_state=102)
x_train, x_test, y_train, y_test = split_parts

In [None]:
#USE GRADIENT BOOSTING CLASSIFIER

# GradientBoostingClassifier is already imported in the notebook's import cell.
# Seed the estimator (same seed as the train/validation split) so the fitted
# model and the accuracy below are reproducible across runs.
gbk = GradientBoostingClassifier(random_state=102)
gbk.fit(x_train, y_train)

# Accuracy on the held-out rows, as a percentage.
pred_gbc = gbk.predict(x_test)
acc_gbc = accuracy_score(y_test, pred_gbc) * 100
acc_gbc



In [None]:
# Score the prepared test frame with the fitted classifier.
pred_test = gbk.predict(X=df1)

In [None]:
# Pair each original test Loan_ID with its predicted status.
submission_columns = {"Loan_ID": test_original["Loan_ID"], "Loan_Status": pred_test}
loancsv = pd.DataFrame(submission_columns)

In [None]:
# Display the submission frame as built from the raw predictions.
loancsv

In [None]:
# Translate the 0/1 predictions into N/Y labels.
# Assign the result back: replace(inplace=True) on a column selection does not
# update loancsv when pandas copy-on-write is active. A dict replace is used so
# re-running the cell on already-decoded labels leaves them untouched.
loancsv["Loan_Status"] = loancsv["Loan_Status"].replace({0: "N", 1: "Y"})

In [None]:
# Display the submission frame after the Loan_Status replacement above.
loancsv