In [None]:
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the labelled training file and the unlabelled test file from the input folder.
train_data, test_data = (
    pd.read_csv("../input/train_u6lujuX_CVtuZ9i.csv"),
    pd.read_csv("../input/test_Y3wMUE5_7gLdaTN.csv"),
)

In [None]:
# Preview the first ten rows of the training data.
train_data.head(n=10)

In [None]:
# List the column labels of the training frame.
train_data.columns

In [None]:
# Show the dtype pandas inferred for each column.
train_data.dtypes

In [None]:
# Summary statistics of the training frame (pandas describes numeric columns by default).
train_data.describe()

In [None]:
# Box plot of applicant income to eyeball spread and outliers.
train_data['ApplicantIncome'].plot(kind='box')

In [None]:
# Applicant income distribution, one box per education level.
train_data.boxplot(by='Education', column='ApplicantIncome')
# Replace the figure-level title with a blank string (pandas adds its own when `by` is used).
plt.suptitle(" ")

In [None]:
# Applicant income distribution, one box per self-employment category.
train_data.boxplot(by='Self_Employed', column='ApplicantIncome')
# Replace the figure-level title with a blank string (pandas adds its own when `by` is used).
plt.suptitle(" ")

In [None]:
# Work on an independent copy: a plain assignment would only alias `train_data`,
# so every cleaning step below would silently rewrite the raw frame that the
# exploration cells above displayed.
train = train_data.copy()

In [None]:
# Count missing values per column before imputation.
train.isna().sum()

In [None]:
# Fill gaps in these columns with each column's most frequent value.
# The result is assigned back to the frame: calling
# `train[col].fillna(..., inplace=True)` operates on a column selection, which
# pandas with Copy-on-Write treats as a temporary object, leaving `train` unchanged.
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']:
    train[col] = train[col].fillna(train[col].mode()[0])

In [None]:
# Re-check missing values after the mode imputation.
train.isna().sum()

In [None]:
# Fill missing loan amounts with the column median.
# Assigned back rather than using `inplace=True` on a column selection, which
# does not update `train` when pandas Copy-on-Write is active.
train['LoanAmount'] = train['LoanAmount'].fillna(train['LoanAmount'].median())

In [None]:
# Frequency of each loan term, to choose a sensible fill value.
train.Loan_Amount_Term.value_counts()

In [None]:
# Fill missing loan terms with the most frequent term.
# Assigned back rather than using `inplace=True` on a column selection, which
# does not update `train` when pandas Copy-on-Write is active.
train['Loan_Amount_Term'] = train['Loan_Amount_Term'].fillna(train['Loan_Amount_Term'].mode()[0])

In [None]:
# Final check: count remaining missing values per column.
train.isna().sum()

In [None]:
# List the columns of the cleaned training frame.
train.columns

In [None]:
# Remove the Loan_ID column from the training frame.
train = train.drop(columns='Loan_ID')

In [None]:
# Separate the target column from the predictor columns.
y = train['Loan_Status']
X = train.drop(columns='Loan_Status')

In [None]:
# One-hot encode the categorical predictors.
X = pd.get_dummies(data=X)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

Finding the best model for our dataset by comparing candidate classifiers with 10-fold cross-validation on the training split

In [None]:
# Candidate models as (label, estimator) pairs, compared by cross-validation below.
# - `multi_class='ovr'` is omitted: it is deprecated in recent scikit-learn and
#   liblinear already uses one-vs-rest, so the model is unchanged.
# - The tree and the forest are randomised; fixing `random_state` makes their
#   cross-validation scores repeatable from run to run.
classifier = [
    ("LogisticReg", LogisticRegression(solver='liblinear')),
    ("CART", DecisionTreeClassifier(criterion='entropy', random_state=0)),
    ("KNN", KNeighborsClassifier()),
    ("KernelSVM", SVC(gamma='auto')),
    ("NaiveBayes", GaussianNB()),
    ("RandomForest", RandomForestClassifier(random_state=0)),
]

In [None]:
# Score every candidate with the same 10-fold split of the training data and
# print "<name>: <mean accuracy> (<std across folds>)".
seed = 0
results = []
names = []
# `random_state` only has an effect when shuffling is enabled; scikit-learn >= 0.24
# raises ValueError if it is set while shuffle=False. With an integer seed the
# splitter yields the same folds on every call, so one instance serves all models.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
for name, model in classifier:
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

As per the output, Logistic Regression seems to be the best, with about 80% mean cross-validated accuracy and a standard deviation of about 5% across folds, so let's apply it to the dataset

In [None]:
# Fit the model that was actually cross-validated: the liblinear solver.
# A bare LogisticRegression() uses a different default solver in current
# scikit-learn, so it would not be the model whose score was reported, and any
# convergence warning from it would be hidden by the global warnings filter.
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split.
y_pred = logreg.predict(X=X_test)

In [None]:
# Confusion matrix of true vs. predicted labels on the held-out split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
# Store the score under its own name. Assigning it to `accuracy_score` would
# replace the imported sklearn function with a float, so running this cell a
# second time (or calling the function anywhere later) would raise TypeError.
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Per-class precision, recall and F1 on the held-out split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

Now it's time to apply our model to `test_data` and generate the predicted values

In [None]:
# (rows, columns) of the raw test frame.
test_data.shape

In [None]:
# Count missing values per column in the raw test data.
test_data.isna().sum()

In [None]:
# Feature frame for scoring: the raw test data without the Loan_ID column.
test = test_data.drop(columns='Loan_ID')

In [None]:
# Impute the test frame with the same recipe used for the training frame:
# most frequent value for these columns, median for LoanAmount.
# Results are assigned back: `test[col].fillna(..., inplace=True)` acts on a
# column selection and does not update `test` under pandas Copy-on-Write.
# NOTE(review): fill values are computed from the test set itself, as before;
# consider reusing the training-set statistics instead.
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed',
            'Credit_History', 'Loan_Amount_Term']:
    test[col] = test[col].fillna(test[col].mode()[0])
test['LoanAmount'] = test['LoanAmount'].fillna(test['LoanAmount'].median())

In [None]:
# Count remaining missing values per column in the test features.
test.isna().sum()

In [None]:
# One-hot encode, then align to the training design matrix `X`. Encoding the
# two frames independently yields different columns whenever a category occurs
# in only one of them, and the fitted model expects exactly the training
# columns in the training order. Missing dummies become 0; extras are dropped.
test = pd.get_dummies(test).reindex(columns=X.columns, fill_value=0)

In [None]:
# Predict Loan_Status for the encoded test features.
test_pred = logreg.predict(X=test)

In [None]:
# Submission table: the original loan IDs paired with the flattened predictions.
Submission = pd.DataFrame({
    'Loan_ID': test_data['Loan_ID'],
    'Loan_Status': test_pred.reshape(-1),
})

In [None]:
# Preview the first ten rows of the submission table.
Submission.head(n=10)

In [None]:
# Export is disabled; uncomment the next line to write the submission CSV (without the index column).
#Submission.to_csv('sample_submission.csv', index=False)