In [None]:
import numpy as np 
import pandas as pd  
import seaborn as sns 
import matplotlib.pyplot as plt  

#to draw the plots immediately after the current cell
%matplotlib inline 

In [None]:
#Reads a locally stored CSV file and returns a Pandas DataFrame
trainData = pd.read_csv("dataset/train_u6lujuX_CVtuZ9i.csv")

#Returns the first 5(default) lines of the DataFrame
#(a bare `trainData` would render the whole frame instead of a preview)
trainData.head()

In [None]:
#Returns a (rows, columns) tuple with the dimensions of the DataFrame
trainData.shape

In [None]:
#Outputs general descriptive statistics of the DataFrame
#By default only the numerical series are summarised
#Passing include='all' would also summarise the object type series (not used here)
trainData.describe()

In [None]:
#Prints a general summary of the DataFrame including the index and column datatypes, non-null counts and memory usage
trainData.info()

In [None]:
#isnull() checks whether each entry is a null or non-null value and returns bool True or False respectively
#sum() then adds up the True instances, giving the number of nulls per column
trainData.isnull().sum()

#Alternate method
#trainData.isnull()[trainData.isnull()==True].count(axis=0)

In [None]:
#Removing specified columns from the DataFrame
#columns=[...] is equivalent to passing the labels together with axis=1
#Reassigning the result (instead of inplace=True) with errors="ignore" keeps this cell
#safe to re-run: once the columns are gone a second run is a no-op rather than a KeyError
trainData = trainData.drop(columns=["Loan_ID","Dependents"], errors="ignore")

#ignore
#trainData.drop(0, axis=0, inplace=True)

In [None]:
#Preview the first rows of the frame after the column drop
trainData.head()

In [None]:
#Dealing with null values (categorical)
#Iterating over this slice yields its column labels, which is all the loop below needs
cols = trainData[["Gender", "Married", "Self_Employed"]]

#Replacing null values for each categorical series with the mode value
#By default mode() ignores null values in calculation of mode
#The filled series is assigned back to the frame: trainData[i].fillna(..., inplace=True)
#is chained in-place assignment, which pandas warns about and which no longer updates
#trainData once Copy-on-Write is enabled
for i in cols:
    trainData[i] = trainData[i].fillna(trainData[i].mode().iloc[0])

In [None]:
#Re-count the nulls per column after the categorical fill
trainData.isnull().sum()

In [None]:
#Dealing with null values (numerical)
#Iterating over this slice yields its column labels, which is all the loop below needs
n_cols = trainData[["LoanAmount", "Loan_Amount_Term", "Credit_History"]]

#Replacing null values for each numerical series with the mean value
#The filled series is assigned back to the frame: trainData[i].fillna(..., inplace=True)
#is chained in-place assignment, which pandas warns about and which no longer updates
#trainData once Copy-on-Write is enabled
#NOTE(review): Credit_History looks like a flag rather than a measurement; if so the mean
#produces a fractional value and the mode may be the better fill - confirm against the data
for i in n_cols:
    trainData[i] = trainData[i].fillna(trainData[i].mean())

In [None]:
#Re-count the nulls per column after the numerical fill
trainData.isnull().sum()

In [None]:
#Alternate method to replace null values

#values={
#    "Gender":trainData["Gender"].mode().iloc[0],
#    "Married":trainData["Married"].mode().iloc[0],
#    "Self_Employed":trainData["Self_Employed"].mode().iloc[0],
#    "LoanAmount":trainData["LoanAmount"].mean(),
#    "Loan_Amount_Term":trainData["Loan_Amount_Term"].mean(),
#    "Credit_History":trainData["Credit_History"].mean()
#}

#trainData.fillna(value=values, inplace=True)

In [None]:
#Visualization
#Defining a function to draw graph of loan status w.r.t. a specified column
def bar_chart(col):
    """Draw a grouped bar chart of loan status counts broken down by ``col``.

    Reads the notebook-level ``trainData`` frame, counts the values of ``col``
    separately for approved (``Loan_Status == "Y"``) and disapproved
    (``Loan_Status == "N"``) loans, and plots one group of bars per status.

    Parameters
    ----------
    col : hashable
        Label of the ``trainData`` column to break the counts down by.

    Returns
    -------
    None
        The chart is drawn as a side effect.
    """
    #Counting number of approved loans w.r.t. the specific column
    #(.loc with a row mask and a column label avoids chained indexing)
    Approved = trainData.loc[trainData["Loan_Status"]=="Y", col].value_counts()

    #Counting number of disapproved loans w.r.t. the specific column
    Disapproved = trainData.loc[trainData["Loan_Status"]=="N", col].value_counts()

    #One row per loan status, one column per distinct value of col
    df1 = pd.DataFrame([Approved, Disapproved])
    df1.index = ["Approved", "Disapproved"]

    #Title, axis labels and legend title so the figure can be read on its own
    ax = df1.plot(kind="bar", rot=0)
    ax.set_title("Loan status by {}".format(col))
    ax.set_xlabel("Loan status")
    ax.set_ylabel("Count")
    ax.legend(title=str(col))

In [None]:
#Approved vs. disapproved loan counts split by the Gender column
bar_chart("Gender")

In [None]:
# Converting categorical values to Integers
from sklearn.preprocessing import OrdinalEncoder

#Columns whose categorical labels are replaced by integer codes
categoricalCols = ["Gender",'Married','Education','Self_Employed','Property_Area','Loan_Status']

#OrdinalEncoder maps each category of a column to an ordinal code following alphabetical
#order (e.g. Female --> 0 and Male --> 1); dtype=np.int64 replaces the default numpy.float64 output
ordinalEncoder = OrdinalEncoder(dtype=np.int64)

#fit_transform() learns the categories and encodes the very same data in one step
encodedValues = ordinalEncoder.fit_transform(trainData[categoricalCols])
trainData[categoricalCols] = encodedValues

#Alternate: fit and transform as two separate steps
#ordinalEncoder.fit(trainData[categoricalCols])
#trainData[categoricalCols] = ordinalEncoder.transform(trainData[categoricalCols])

trainData.head()

In [None]:
from sklearn.model_selection import train_test_split
X = trainData.drop("Loan_Status", axis=1)
y = trainData["Loan_Status"]

#Splits arrays into random train and test subsets
#test_size, if float, is the fraction of the dataset to include in the test split, if int, is the absolute number of test samples
#random_state parameter to produce reproducible results(can be any int)
#X and y go through a single call so both are cut by the same row partition; this gives
#the same subsets as two separate calls with identical arguments, without relying on
#those arguments being kept in sync by hand
#NOTE(review): stratify=X["Gender"] balances the Gender column across the subsets; for a
#classifier it is more usual to stratify on the target y - confirm which one is intended
XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.2, random_state=42, stratify=X["Gender"])

#Features and labels must describe the same rows
assert XTrain.index.equals(yTrain.index)
assert XTest.index.equals(yTest.index)

#Alternate (no stratification)
#XTrain, XTest, yTrain, yTest = train_test_split(X,y,test_size=0.2, random_state=2)

print(XTrain.shape)
print(yTrain.shape)
print(XTest.shape)
print(yTest.shape)

In [None]:
from sklearn.naive_bayes import GaussianNB

#Gaussian Naive Bayes classifier
#fit() hands back the fitted estimator itself, so training on the train subset and
#predicting for the held-out features can be chained
gfc = GaussianNB()
pred1 = gfc.fit(XTrain, yTrain).predict(XTest)

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

def loss(y_true, y_pred):
    """Print precision, recall, accuracy and F1 score for a set of predictions.

    Despite the name, no loss value is computed: the function only reports
    four classification metrics, one labelled line each, in the order
    precision, recall, accuracy, F1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    #All four scores are computed before anything is printed, so a failing metric
    #does not leave a partial report behind
    scores = [
        ("Precision", precision_score(y_true, y_pred)),
        ("Recall", recall_score(y_true, y_pred)),
        ("Accuracy", accuracy_score(y_true, y_pred)),
        # f1 = HM(precision, recall)
        ("F1", f1_score(y_true, y_pred)),
    ]

    #Label every value so the output can be read without knowing the print order
    for label, value in scores:
        print(label + ":", value)


In [None]:
#Metrics of the Gaussian Naive Bayes predictions on the test subset
loss(yTest, pred1)

In [None]:
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV 

# C : penalty on training errors; the smaller C is, the more error is allowed
# gamma : for rbf kernel; in layman terms, measure of curvature of the decision boundary
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

# no scoring argument is given, so the classifier's default scorer (accuracy) ranks the candidates
# refit=True retrains the best candidate on the whole training subset once the search is done
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=3)
grid.fit(XTrain, yTrain)

In [None]:
#Parameter combination that scored best in the grid search
grid.best_params_

In [None]:
#Build the classifier from whatever the grid search selected instead of copying the
#values by hand, so this cell cannot drift out of sync with grid.best_params_
svc = SVC(**grid.best_params_)
#svc = SVC(gamma='auto')
svc.fit(XTrain, yTrain)
pred2 = svc.predict(XTest)

#Metrics of the SVC predictions on the test subset
loss(yTest,pred2)