# **Data Wrangling**

Data Gathering

In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
train_data = '/content/drive/MyDrive/Dataset/loan_sanction_train.csv'

In [26]:
import pandas as pd
df = pd.read_csv(train_data)

Data Assessing

In [27]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [29]:
duplicated_data = df.duplicated().sum()

if duplicated_data == 0:
  print('No duplicated data')
else :
  print('There are duplicated data')

No duplicated data


In [30]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Data Cleaning

In [31]:
df.dropna(inplace=True)

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 480 entries, 1 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            480 non-null    object 
 1   Gender             480 non-null    object 
 2   Married            480 non-null    object 
 3   Dependents         480 non-null    object 
 4   Education          480 non-null    object 
 5   Self_Employed      480 non-null    object 
 6   ApplicantIncome    480 non-null    int64  
 7   CoapplicantIncome  480 non-null    float64
 8   LoanAmount         480 non-null    float64
 9   Loan_Amount_Term   480 non-null    float64
 10  Credit_History     480 non-null    float64
 11  Property_Area      480 non-null    object 
 12  Loan_Status        480 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 52.5+ KB


In [33]:
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

# **Data Handle and Adjustment**

convert gender, married, education, self employed, property area, loan status become boolean data

In [34]:
df['Gender'] = df['Gender'].map({'Male':1, 'Female':0})
df['Married'] = df['Married'].map({'Yes':1, 'No':0})
df['Education'] = df['Education'].map({'Graduate':1, 'Not Graduate':0})
df['Self_Employed'] = df['Self_Employed'].map({'Yes':1, 'No':0})
df['Property_Area'] = df['Property_Area'].map({'Rural':0, 'Semiurban':1, 'Urban':2})
df['Loan_Status'] = df['Loan_Status'].map({'Y':1, 'N':0})

In [35]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,2,1
5,LP001011,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2,1


In [36]:
df['Dependents'] = df['Dependents'].map(({'0':0,'1':1, '2':2, '3':3, '3+':4}))

In [37]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,2,1
5,LP001011,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2,1


# **Random Forest Algoritm for Feature Selection**

data splitting

In [40]:
X = df.drop(columns=['Loan_ID', 'Loan_Status'])
y = df['Loan_Status']


machine learning training

In [41]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [44]:
rf_model = RandomForestClassifier(random_state=42)

In [45]:
rf_model.fit(X_train, y_train)

Analisis komponen penting

In [46]:
important_feature = pd.Series(rf_model.feature_importances_, index=X.columns)

In [51]:
top_5_features = important_feature.sort_values(ascending=False).head(5)
print('Top 5 komponen paling mempengaruhi status pinjaman ', (top_5_features*100))

Top 5 komponen paling mempengaruhi status pinjaman  Credit_History       23.684621
ApplicantIncome      20.902121
LoanAmount           19.192234
CoapplicantIncome    11.740279
Loan_Amount_Term      5.910750
dtype: float64


# **Logistic Regression Algoritm for Classification**

data splitting

In [201]:
A = df[['Credit_History', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']]
B = df['Loan_Status']

In [202]:
A_train, A_test, B_train, B_test = train_test_split(A, B, test_size=0.2, random_state=42)

machine learning training

In [203]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
A_train = scaler.fit_transform(A_train)
A_test = scaler.transform(A_test)

In [206]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(A_train, B_train)

Analisis Hasil Prediksi

In [209]:
B_pred = model.predict(A_test)

In [210]:
from sklearn.metrics import accuracy_score

In [211]:
print('akurasi prediksi: ', accuracy_score(B_test, B_pred)*100)

akurasi prediksi:  82.29166666666666


# **Mengisi Dataset test**

In [212]:
test_data = '/content/drive/MyDrive/Dataset/loan_sanction_test.csv'

In [217]:
df_test = pd.read_csv(test_data)
df_test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [218]:
df_test['Gender'] = df_test['Gender'].map({'Male':1, 'Female':0})
df_test['Married'] = df_test['Married'].map({'Yes':1, 'No':0})
df_test['Education'] = df_test['Education'].map({'Graduate':1, 'Not Graduate':0})
df_test['Self_Employed'] = df_test['Self_Employed'].map({'Yes':1, 'No':0})
df_test['Property_Area'] = df_test['Property_Area'].map({'Rural':0, 'Semiurban':1, 'Urban':2})
df_test['Dependents'] = df_test['Dependents'].map(({'0':0,'1':1, '2':2, '3':3, '3+':4}))

df_test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,1.0,1,0.0,1,0.0,5720,0,110.0,360.0,1.0,2
1,LP001022,1.0,1,1.0,1,0.0,3076,1500,126.0,360.0,1.0,2
2,LP001031,1.0,1,2.0,1,0.0,5000,1800,208.0,360.0,1.0,2
3,LP001035,1.0,1,2.0,1,0.0,2340,2546,100.0,360.0,,2
4,LP001051,1.0,0,0.0,0,0.0,3276,0,78.0,360.0,1.0,2


In [227]:
df_test.dropna(inplace=True)

In [228]:
C = df_test[['Credit_History', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']]

In [229]:
C_pred = model.predict(C)




In [230]:
new_df = df_test.copy()
new_df['Loan_Status'] = C_pred
new_df.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001015,1.0,1,0.0,1,0.0,5720,0,110.0,360.0,1.0,2,1
1,LP001022,1.0,1,1.0,1,0.0,3076,1500,126.0,360.0,1.0,2,1
2,LP001031,1.0,1,2.0,1,0.0,5000,1800,208.0,360.0,1.0,2,1
4,LP001051,1.0,0,0.0,0,0.0,3276,0,78.0,360.0,1.0,2,1
5,LP001054,1.0,1,0.0,0,1.0,2165,3422,152.0,360.0,1.0,2,0
