In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

In [2]:
df=pd.read_csv("train_preprocessed2.csv")

In [3]:
train, test = train_test_split(df,train_size=0.75, random_state=42)

In [4]:
y_train = train['Loan_Status']
X_train = train.drop(columns='Loan_Status')
X_train.head()

Unnamed: 0,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,property_Area_Rural,property_Area_Semiurban,property_Area_Urban,Gender_Female,Gender_Male,Total_Income,Loan_Income_Ratio,EMI,Balance_Income
92,1.0,2.0,1.0,0.0,3273.0,1820.0,81.0,360.0,1.0,0,1,0,0,1,5093.0,0.015904,0.225,5092.775
304,0.0,0.0,0.0,0.0,4000.0,2500.0,140.0,360.0,1.0,0,0,1,0,1,6500.0,0.021538,0.388889,6499.611111
68,1.0,3.0,1.0,1.0,7100.0,0.0,125.0,360.0,1.0,0,1,0,0,1,7100.0,0.017606,0.347222,7099.652778
15,0.0,0.0,0.0,0.0,3806.0,0.0,125.0,360.0,1.0,0,1,0,0,1,3806.0,0.032843,0.347222,3805.652778
211,1.0,3.0,0.0,0.0,3430.0,1250.0,128.0,360.0,0.0,1,0,0,0,1,4680.0,0.02735,0.355556,4679.644444


In [5]:
y_test = test['Loan_Status']
X_test = test.drop(columns='Loan_Status')

In [6]:
def showStatistcs(Y_pred,Y,methodname):
    print(f'the performance metrics for test data using {methodname}:\n')
    print(f'Confusion matrix:\n{confusion_matrix(Y_pred,Y)}\n')
    print(f'Accuracy Score :{accuracy_score(Y_pred,Y)*100 :.4f}\n')
    print(f'Precision Score :{precision_score(Y_pred,Y)*100 :.4f}\n')
    print(f'Recall Score :{recall_score(Y_pred,Y)*100 :.4f}\n')
    print(f'f1 Score :{f1_score(Y_pred,Y)*100 :.4f}\n')

In [7]:
clf = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=0)
clf.fit(X_train, y_train)

y_test_pred = clf.predict(X_test)

showStatistcs(y_test_pred, y_test, 'Random Forest')

the performance metrics for test data using Random Forest:

Confusion matrix:
[[28  6]
 [26 94]]

Accuracy Score :79.2208

Precision Score :94.0000

Recall Score :78.3333

f1 Score :85.4545



In [8]:
y_train = train[train['Loan_Status']==1]['LoanAmount']
X_train = train[train['Loan_Status']==1].drop(columns=['LoanAmount','Loan_Status'])

y_test = test[test['Loan_Status']==1]['LoanAmount']
X_test = test[test['Loan_Status']==1].drop(columns=['LoanAmount','Loan_Status'])

model = LinearRegression()
model.fit(X_train, y_train)

y_test_predict = model.predict(X_test)

rmse_train = sqrt(mean_squared_error(y_test_predict, y_test))
print(f'RMSE: {rmse_train:.3f}')
mae_train = mean_absolute_error(y_test_predict, y_test)
print(f'MAE: {mae_train:.3f}')

epsilon = 1e-3
comparision = (y_test_predict <= y_test + epsilon)
accuracy_type = comparision.sum()/len(y_test_predict)
print(f'Accuracy: {accuracy_type:.3f}')

RMSE: 0.000
MAE: 0.000
Accuracy: 1.000


In [9]:
y_train = train[train['Loan_Status']==1]['LoanAmount']
X_train = train[train['Loan_Status']==1].drop(columns=['LoanAmount','Loan_Status'])

y_test = test[test['Loan_Status']==1]['LoanAmount']
X_test = test[test['Loan_Status']==1].drop(columns=['LoanAmount','Loan_Status'])

model = RandomForestRegressor()
model.fit(X_train, y_train)

y_test_predict = model.predict(X_test)

rmse_train = sqrt(mean_squared_error(y_test_predict, y_test))
print(f'RMSE: {rmse_train:.3f}')
mae_train = mean_absolute_error(y_test_predict, y_test)
print(f'MAE: {mae_train:.3f}')

epsilon = 1e-3
comparision = (y_test_predict <= y_test + epsilon)
accuracy_type = comparision.sum()/len(y_test_predict)
print(f'Accuracy: {accuracy_type:.3f}')

RMSE: 1.258
MAE: 0.638
Accuracy: 0.530


In [10]:
y_train = train[train['Loan_Status']==1]['LoanAmount']
X_train = train[train['Loan_Status']==1].drop(columns=['LoanAmount','Loan_Status'])

y_test = test[test['Loan_Status']==1]['LoanAmount']
X_test = test[test['Loan_Status']==1].drop(columns=['LoanAmount','Loan_Status'])

transfomer = PolynomialFeatures(degree=2)
X_train_transf = transfomer.fit_transform(X_train)
X_test_transf = transfomer.fit_transform(X_test)

model = LinearRegression()
model.fit(X_train_transf, y_train)

y_test_predict = model.predict(X_test_transf)

rmse_train = sqrt(mean_squared_error(y_test_predict, y_test))
print(f'RMSE: {rmse_train:.3f}')
mae_train = mean_absolute_error(y_test_predict, y_test)
print(f'MAE: {mae_train:.3f}')

epsilon = 1e-3
comparision = (y_test_predict <= y_test + epsilon)
accuracy_type = comparision.sum()/len(y_test_predict)
print(f'Accuracy: {accuracy_type:.3f}')

print("First 5 actual:", y_test.values[:10])
print("First 5 preds:", y_test_predict[:10])

RMSE: 0.000
MAE: 0.000
Accuracy: 1.000
First 5 actual: [228. 130. 158.  71.  70. 107. 120.  66. 153.  94.]
First 5 preds: [228. 130. 158.  71.  70. 107. 120.  66. 153.  94.]


In [11]:
df = pd.read_csv('submissions.csv')

In [12]:
X_test = df[df['Loan_Status']==0].drop(columns=['LoanAmount','Loan_Status'])

y_testing = df[df['Loan_Status']==0]['LoanAmount']
print(f'Loan amount rejected : {y_testing}')

# X_test_transf = transfomer.transform(X_test)
# y_test_predict = model.predict(X_test_transf)

# model = RandomForestRegressor()
# model.fit(X_train, y_train)
# y_test_predict = model.predict(X_test)

model = LinearRegression()
model.fit(X_train, y_train)

y_test_predict = model.predict(X_test)

print(f'Loan amount should get accepted(after regression) : {y_test_predict}')
print(y_test_predict)


mask = df['Loan_Status']==0
df.loc[mask, 'Loan_Amount'] = y_test_predict

df.to_csv('submissions2.csv', index=False)

Loan amount rejected : 7      147.00
13     166.00
25     148.00
35     176.00
55     130.00
58     176.00
63     108.00
66     135.00
67     130.00
69     188.00
80     163.00
82     149.00
84     131.00
94     123.00
101    125.00
106    119.00
117     80.00
118    104.00
119    213.00
123    187.00
124    242.25
126     71.00
140    150.00
142    139.00
147    199.00
153    117.00
161     84.00
165    170.00
166    120.00
168     94.00
173    159.00
174    110.00
192    180.00
193    128.00
196    114.00
198    104.00
211    200.00
224    187.00
229    125.00
235    176.00
236    117.00
241    105.00
243    125.00
245    150.00
250    142.00
255    123.00
266    112.00
268     49.00
273    130.00
274     94.00
278    176.00
293     88.00
301    125.00
311    153.00
317     67.00
325     95.00
339    162.00
346    133.00
354    158.00
Name: LoanAmount, dtype: float64
Loan amount should get accepted(after regression) : [147.   166.   148.   176.   130.   176.   108.   135.   130.   18

PermissionError: [Errno 13] Permission denied: 'submissions2.csv'