<a href="https://colab.research.google.com/github/soeun727/AI-Project-loan_payment/blob/main/Loan_Data_Classification_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **데이터 학습 - SVM**
- StandardScaler()를 사용하여 스케일링
- SVM 모델 학습
- 오차행렬 결과 출력

**▼ SVM(Support Vector Machine)의 개념**
- 분류 문제에 사용하는 기계학습 알고리즘
- 결정 경계(Decision Boundary), 즉 분류를 위한 기준 선을 정의하는 모델

① past_due_days 제거했을 때 정확도



In [243]:
# 원본 데이터 불러오기
import numpy as np
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/AI test/loan_payment analysis/Loan payments data_PreProcessed.csv")
df = df.drop(['Unnamed: 0'],axis=1)
df.head()

# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# %matplotlib inline
# import seaborn as sns

Unnamed: 0,loan_status,Principal,terms,age,education,Gender
0,0,1000,30,45,0,1
1,0,1000,30,50,2,0
2,0,1000,30,33,2,0
3,0,1000,15,27,1,1
4,0,1000,30,28,1,0


In [244]:
# y, 즉 예측값이 loan_status일 때
from sklearn.model_selection import train_test_split

X = df.iloc[1:501,1:].values
y = df.iloc[1:501,0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)
print(X_test)
print(X_train)
print(y_test)

[[1000   30   38    0    1]
 [1000   15   25    1    1]
 [1000   30   37    1    1]
 [1000   30   38    0    1]
 [1000   15   29    1    1]
 [1000   30   33    1    1]
 [1000   15   26    2    1]
 [1000   30   27    0    1]
 [ 800   15   32    1    1]
 [ 300    7   29    1    1]
 [1000   30   30    1    1]
 [ 800   30   23    1    0]
 [1000   30   41    0    1]
 [1000   30   27    0    0]
 [1000   30   30    1    1]
 [1000   30   28    0    1]
 [1000   15   37    1    1]
 [1000   30   37    1    1]
 [1000   15   47    0    1]
 [1000   30   38    0    1]
 [1000   15   24    0    1]
 [1000   15   30    1    0]
 [1000   30   30    0    1]
 [1000   15   34    1    1]
 [ 800   15   24    0    1]
 [1000   30   33    0    1]
 [1000   30   18    1    1]
 [1000   30   30    1    1]
 [1000   30   37    1    0]
 [1000   30   43    2    1]
 [1000   30   26    1    0]
 [1000   30   35    1    1]
 [ 800   15   35    1    1]
 [1000   15   27    1    1]
 [1000   30   30    0    0]
 [1000   30   28    

In [245]:
# Scaling을 수행하지 않고 학습
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score, f1_score

SVM_model_before_scale = svm.SVC()
SVM_model_before_scale.fit(X_train,y_train)

svc_prediction_before_scale = SVM_model_before_scale.predict(X_test)



In [246]:
# Scaling을 수행한 후 학습
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()     # 변형 객체 생성
std_scaler.fit(X_train)           # 훈련데이터의 모수 분포 저장
X_train_scaled = std_scaler.transform(X_train)    # 훈련 데이터 스케일링
X_test_scaled = std_scaler.transform(X_test)      # 테스트 데이터의 스케일링

# 학습
SVM_model_after_scale = svm.SVC()
SVM_model_after_scale.fit(X_train_scaled,y_train)
svc_prediction_after_scaled = SVM_model_after_scale.predict(X_test_scaled)

In [247]:
# Scaling 수행 전 결과
print('SVM accuracy = ', metrics.accuracy_score(svc_prediction_before_scale,y_test))
before_scaling_report = classification_report(y_test,svc_prediction_before_scale)
print(before_scaling_report)

# Scaling 수행 후 결과
print('SVM accuracy = ', metrics.accuracy_score(svc_prediction_after_scaled ,y_test))
after_scaling_report = classification_report(y_test,svc_prediction_after_scaled)
print(after_scaling_report)


SVM accuracy =  0.6066666666666667
              precision    recall  f1-score   support

           0       0.61      1.00      0.76        91
           1       0.00      0.00      0.00        22
           2       0.00      0.00      0.00        37

    accuracy                           0.61       150
   macro avg       0.20      0.33      0.25       150
weighted avg       0.37      0.61      0.46       150

SVM accuracy =  0.6066666666666667
              precision    recall  f1-score   support

           0       0.61      1.00      0.76        91
           1       0.00      0.00      0.00        22
           2       0.00      0.00      0.00        37

    accuracy                           0.61       150
   macro avg       0.20      0.33      0.25       150
weighted avg       0.37      0.61      0.46       150



  _warn_prf(average, modifier, msg_start, len(result))


# → 결과 : **60%**의 정확도, Scaling 수행 여부가 정확도에 영향을 미치지 않음

② 정확도를 높이고자 **past_due_days** 열을 추가하여 학습



In [248]:
# 원본 데이터 불러오기
import numpy as np
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/AI test/loan_payment analysis/Loan payments data_원본.csv")
#df = df.drop(['Unnamed: 0'],axis=1)
df.head()

Unnamed: 0,Loan_ID,loan_status,Principal,terms,effective_date,due_date,paid_off_time,past_due_days,age,education,Gender
0,xqd20166231,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/14/2016 19:31,,45,High School or Below,male
1,xqd20168902,PAIDOFF,1000,30,9/8/2016,10/7/2016,10/7/2016 9:00,,50,Bechalor,female
2,xqd20160003,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/25/2016 16:58,,33,Bechalor,female
3,xqd20160004,PAIDOFF,1000,15,9/8/2016,9/22/2016,9/22/2016 20:00,,27,college,male
4,xqd20160005,PAIDOFF,1000,30,9/9/2016,10/8/2016,9/23/2016 21:36,,28,college,female


In [249]:
# 사용하지 않을 열 삭제(past_due_days 제외)
df.drop(['Loan_ID', 'effective_date', 'due_date' , 'paid_off_time'] ,axis = 1, inplace = True)
df.head()

Unnamed: 0,loan_status,Principal,terms,past_due_days,age,education,Gender
0,PAIDOFF,1000,30,,45,High School or Below,male
1,PAIDOFF,1000,30,,50,Bechalor,female
2,PAIDOFF,1000,30,,33,Bechalor,female
3,PAIDOFF,1000,15,,27,college,male
4,PAIDOFF,1000,30,,28,college,female


In [250]:
#문자열 변환
df["loan_status"]= df["loan_status"].replace(['PAIDOFF', 'COLLECTION', 'COLLECTION_PAIDOFF'],["0","1","2"])
df["education"]= df["education"].replace(['High School or Below', 'college', 'Bechalor', 'Master or Above'],["0","1","2","3"])
df["Gender"]= df["Gender"].replace(['female', 'male'],["0","1"])
df

Unnamed: 0,loan_status,Principal,terms,past_due_days,age,education,Gender
0,0,1000,30,,45,0,1
1,0,1000,30,,50,2,0
2,0,1000,30,,33,2,0
3,0,1000,15,,27,1,1
4,0,1000,30,,28,1,0
...,...,...,...,...,...,...,...
495,2,1000,30,3.0,28,0,1
496,2,1000,15,14.0,26,0,1
497,2,800,15,3.0,30,1,1
498,2,1000,30,1.0,38,1,0


In [251]:
#past_due_days 처리
df['past_due_days'].unique()
df["past_due_days"]= df["past_due_days"].fillna(0)
df["past_due_days"]= df["past_due_days"].replace([  76., 61., 75., 60., 59., 74., 29., 44., 58., 73., 28.,  2.,
        4., 13.,  5.,  3., 12., 27.,  1., 25., 24., 36.,  7., 14., 23.,
        6., 38., 19., 52., 51.,  9., 56.,  8.], "1")

In [252]:
df.dropna(inplace = True)
df

Unnamed: 0,loan_status,Principal,terms,past_due_days,age,education,Gender
0,0,1000,30,0,45,0,1
1,0,1000,30,0,50,2,0
2,0,1000,30,0,33,2,0
3,0,1000,15,0,27,1,1
4,0,1000,30,0,28,1,0
...,...,...,...,...,...,...,...
495,2,1000,30,1,28,0,1
496,2,1000,15,1,26,0,1
497,2,800,15,1,30,1,1
498,2,1000,30,1,38,1,0


In [253]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   loan_status    500 non-null    object
 1   Principal      500 non-null    int64 
 2   terms          500 non-null    int64 
 3   past_due_days  500 non-null    object
 4   age            500 non-null    int64 
 5   education      500 non-null    object
 6   Gender         500 non-null    object
dtypes: int64(3), object(4)
memory usage: 31.2+ KB


In [254]:
# y, 즉 예측값이 loan_status일 때
from sklearn.model_selection import train_test_split

X = df.iloc[1:501,1:].values
y = df.iloc[1:501,0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)
print(X_test)
print(X_train)
print(y_test)

[[1000 30 '1' 38 '0' '1']
 [1000 15 0.0 25 '1' '1']
 [1000 30 0.0 37 '1' '1']
 [1000 30 0.0 38 '0' '1']
 [1000 15 0.0 29 '1' '1']
 [1000 30 '1' 33 '1' '1']
 [1000 15 0.0 26 '2' '1']
 [1000 30 0.0 27 '0' '1']
 [800 15 '1' 32 '1' '1']
 [300 7 0.0 29 '1' '1']
 [1000 30 0.0 30 '1' '1']
 [800 30 0.0 23 '1' '0']
 [1000 30 '1' 41 '0' '1']
 [1000 30 0.0 27 '0' '0']
 [1000 30 0.0 30 '1' '1']
 [1000 30 '1' 28 '0' '1']
 [1000 15 '1' 37 '1' '1']
 [1000 30 0.0 37 '1' '1']
 [1000 15 0.0 47 '0' '1']
 [1000 30 '1' 38 '0' '1']
 [1000 15 0.0 24 '0' '1']
 [1000 15 0.0 30 '1' '0']
 [1000 30 '1' 30 '0' '1']
 [1000 15 0.0 34 '1' '1']
 [800 15 0.0 24 '0' '1']
 [1000 30 0.0 33 '0' '1']
 [1000 30 '1' 18 '1' '1']
 [1000 30 '1' 30 '1' '1']
 [1000 30 0.0 37 '1' '0']
 [1000 30 0.0 43 '2' '1']
 [1000 30 '1' 26 '1' '0']
 [1000 30 0.0 35 '1' '1']
 [800 15 0.0 35 '1' '1']
 [1000 15 0.0 27 '1' '1']
 [1000 30 '1' 30 '0' '0']
 [1000 30 0.0 28 '0' '1']
 [1000 15 0.0 27 '0' '1']
 [800 15 0.0 30 '0' '1']
 [1000 30 '1' 24 '1

In [255]:
# Scaling을 수행하지 않고 학습
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score, f1_score

SVM_model_before_scale2 = svm.SVC()
SVM_model_before_scale2.fit(X_train,y_train)

svc_prediction_before_scale2 = SVM_model_before_scale2.predict(X_test)


In [256]:
# Scaling을 수행한 후 학습
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()     # 변형 객체 생성
std_scaler.fit(X_train)           # 훈련데이터의 모수 분포 저장
X_train_scaled = std_scaler.transform(X_train)    # 훈련 데이터 스케일링
X_test_scaled = std_scaler.transform(X_test)      # 테스트 데이터의 스케일링

# 학습
SVM_model_after_scale2 = svm.SVC()
SVM_model_after_scale2.fit(X_train_scaled,y_train)
svc_prediction_after_scaled2 = SVM_model_after_scale2.predict(X_test_scaled)

In [257]:
# Scaling 수행 전 결과
print('SVM accuracy = ', metrics.accuracy_score(svc_prediction_before_scale2,y_test))
before_scaling_report2 = classification_report(y_test,svc_prediction_before_scale2)
print(before_scaling_report2)

# Scaling 수행 후 결과
print('SVM accuracy = ', metrics.accuracy_score(svc_prediction_after_scaled2 ,y_test))
after_scaling_report2 = classification_report(y_test,svc_prediction_after_scaled2)
print(after_scaling_report2)


SVM accuracy =  0.6066666666666667
              precision    recall  f1-score   support

           0       0.61      1.00      0.76        91
           1       0.00      0.00      0.00        22
           2       0.00      0.00      0.00        37

    accuracy                           0.61       150
   macro avg       0.20      0.33      0.25       150
weighted avg       0.37      0.61      0.46       150

SVM accuracy =  0.7466666666666667
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        91
           1       0.34      0.77      0.47        22
           2       0.44      0.11      0.17        37

    accuracy                           0.75       150
   macro avg       0.59      0.63      0.55       150
weighted avg       0.77      0.75      0.72       150



  _warn_prf(average, modifier, msg_start, len(result))


# → 결과 : **약 61% ~ 약 75 %** 의 정확도, Scaling 수행 여부가 정확도에 영향을 미침

③ one-hot encoding을 활용하여 training data 변환

In [258]:
# 원본 데이터 불러오기
import numpy as np
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/AI test/loan_payment analysis/Loan payments data_원본.csv")
#df = df.drop(['Unnamed: 0'],axis=1)
df.head()

Unnamed: 0,Loan_ID,loan_status,Principal,terms,effective_date,due_date,paid_off_time,past_due_days,age,education,Gender
0,xqd20166231,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/14/2016 19:31,,45,High School or Below,male
1,xqd20168902,PAIDOFF,1000,30,9/8/2016,10/7/2016,10/7/2016 9:00,,50,Bechalor,female
2,xqd20160003,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/25/2016 16:58,,33,Bechalor,female
3,xqd20160004,PAIDOFF,1000,15,9/8/2016,9/22/2016,9/22/2016 20:00,,27,college,male
4,xqd20160005,PAIDOFF,1000,30,9/9/2016,10/8/2016,9/23/2016 21:36,,28,college,female


In [259]:
# 사용하지 않을 열 삭제(past_due_days, Loan_ID 제외)
df.drop(['Loan_ID', 'effective_date', 'due_date' , 'paid_off_time'] ,axis = 1, inplace = True)
df.head()

Unnamed: 0,loan_status,Principal,terms,past_due_days,age,education,Gender
0,PAIDOFF,1000,30,,45,High School or Below,male
1,PAIDOFF,1000,30,,50,Bechalor,female
2,PAIDOFF,1000,30,,33,Bechalor,female
3,PAIDOFF,1000,15,,27,college,male
4,PAIDOFF,1000,30,,28,college,female


In [260]:
#past_due_days 처리
df['past_due_days'].unique()
df["past_due_days"]= df["past_due_days"].fillna(0)
df["past_due_days"]= df["past_due_days"].replace([  76., 61., 75., 60., 59., 74., 29., 44., 58., 73., 28.,  2.,
        4., 13.,  5.,  3., 12., 27.,  1., 25., 24., 36.,  7., 14., 23.,
        6., 38., 19., 52., 51.,  9., 56.,  8.], "1")
df

Unnamed: 0,loan_status,Principal,terms,past_due_days,age,education,Gender
0,PAIDOFF,1000,30,0,45,High School or Below,male
1,PAIDOFF,1000,30,0,50,Bechalor,female
2,PAIDOFF,1000,30,0,33,Bechalor,female
3,PAIDOFF,1000,15,0,27,college,male
4,PAIDOFF,1000,30,0,28,college,female
...,...,...,...,...,...,...,...
495,COLLECTION_PAIDOFF,1000,30,1,28,High School or Below,male
496,COLLECTION_PAIDOFF,1000,15,1,26,High School or Below,male
497,COLLECTION_PAIDOFF,800,15,1,30,college,male
498,COLLECTION_PAIDOFF,1000,30,1,38,college,female


In [261]:
# df = pd.get_dummies(df, columns = ['loan_status'])
df = pd.get_dummies(df, columns = ['education'])
df = pd.get_dummies(df, columns = ['Gender'])

In [263]:
df["loan_status"]= df["loan_status"].replace(['PAIDOFF', 'COLLECTION', 'COLLECTION_PAIDOFF'],["0","1","2"])

In [264]:
df

Unnamed: 0,loan_status,Principal,terms,past_due_days,age,education_Bechalor,education_High School or Below,education_Master or Above,education_college,Gender_female,Gender_male
0,0,1000,30,0,45,0,1,0,0,0,1
1,0,1000,30,0,50,1,0,0,0,1,0
2,0,1000,30,0,33,1,0,0,0,1,0
3,0,1000,15,0,27,0,0,0,1,0,1
4,0,1000,30,0,28,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
495,2,1000,30,1,28,0,1,0,0,0,1
496,2,1000,15,1,26,0,1,0,0,0,1
497,2,800,15,1,30,0,0,0,1,0,1
498,2,1000,30,1,38,0,0,0,1,1,0


In [237]:
df.dropna(inplace = True)

In [267]:
# y, 즉 예측값이 loan_status일 때
from sklearn.model_selection import train_test_split

X = df.iloc[1:501,1:].values
y = df.iloc[1:501,0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)
print(X_test)
print(y_train)

[[1000 30 '1' ... 0 0 1]
 [1000 15 0.0 ... 1 0 1]
 [1000 30 0.0 ... 1 0 1]
 ...
 [1000 30 0.0 ... 0 0 1]
 [1000 30 0.0 ... 0 0 1]
 [1000 15 0.0 ... 0 0 1]]
['2' '1' '0' '2' '1' '0' '0' '1' '0' '0' '0' '0' '0' '0' '2' '0' '0' '0'
 '2' '0' '0' '0' '0' '0' '1' '1' '1' '0' '0' '2' '0' '1' '0' '0' '0' '1'
 '1' '1' '0' '2' '0' '0' '0' '1' '1' '0' '2' '0' '0' '0' '0' '0' '1' '0'
 '0' '0' '0' '0' '0' '1' '1' '1' '0' '0' '1' '2' '0' '0' '0' '2' '0' '0'
 '1' '0' '0' '2' '2' '0' '2' '1' '1' '0' '0' '0' '1' '0' '2' '0' '2' '0'
 '0' '0' '2' '0' '2' '2' '1' '0' '0' '2' '0' '0' '0' '0' '0' '2' '1' '0'
 '1' '0' '0' '1' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '1' '0' '2' '1'
 '1' '1' '0' '1' '0' '2' '0' '0' '2' '1' '1' '1' '0' '1' '2' '1' '2' '0'
 '0' '2' '1' '1' '2' '1' '0' '1' '0' '0' '0' '0' '0' '0' '0' '2' '0' '2'
 '1' '0' '0' '2' '0' '1' '1' '0' '1' '0' '2' '0' '0' '0' '0' '0' '1' '0'
 '0' '0' '0' '0' '0' '1' '1' '0' '0' '0' '0' '1' '1' '0' '0' '2' '1' '0'
 '0' '0' '2' '0' '0' '2' '0' '0' '2' '0' 

In [268]:
# Scaling을 수행하지 않고 학습
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score, f1_score

SVM_model_before_scale3 = svm.SVC()
SVM_model_before_scale3.fit(X_train,y_train)

svc_prediction_before_scale3 = SVM_model_before_scale3.predict(X_test)


In [269]:
# Scaling을 수행한 후 학습
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()     # 변형 객체 생성
std_scaler.fit(X_train)           # 훈련데이터의 모수 분포 저장
X_train_scaled = std_scaler.transform(X_train)    # 훈련 데이터 스케일링
X_test_scaled = std_scaler.transform(X_test)      # 테스트 데이터의 스케일링

# 학습
SVM_model_after_scale3 = svm.SVC()
SVM_model_after_scale3.fit(X_train_scaled,y_train)
svc_prediction_after_scaled3 = SVM_model_after_scale3.predict(X_test_scaled)

In [270]:
# Scaling 수행 전 결과
print('SVM accuracy = ', metrics.accuracy_score(svc_prediction_before_scale3,y_test))
before_scaling_report3 = classification_report(y_test,svc_prediction_before_scale3)
print(before_scaling_report3)

# Scaling 수행 후 결과
print('SVM accuracy = ', metrics.accuracy_score(svc_prediction_after_scaled3 ,y_test))
after_scaling_report3 = classification_report(y_test,svc_prediction_after_scaled3)
print(after_scaling_report3)


SVM accuracy =  0.6066666666666667
              precision    recall  f1-score   support

           0       0.61      1.00      0.76        91
           1       0.00      0.00      0.00        22
           2       0.00      0.00      0.00        37

    accuracy                           0.61       150
   macro avg       0.20      0.33      0.25       150
weighted avg       0.37      0.61      0.46       150

SVM accuracy =  0.7866666666666666
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        91
           1       0.40      0.86      0.54        22
           2       0.73      0.22      0.33        37

    accuracy                           0.79       150
   macro avg       0.71      0.69      0.63       150
weighted avg       0.84      0.79      0.77       150



  _warn_prf(average, modifier, msg_start, len(result))


# → 결과 : 약 61% ~ 약 79 % 의 정확도, Scaling 후 정확도가 **약 80%**까지 도달함