In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# train = pd.read_csv("/content/drive/MyDrive/ubion/Datasets/train_data.csv")
# test = pd.read_csv("/content/drive/MyDrive/ubion/Datasets/test_data.csv")

In [3]:
train = pd.read_csv("./train_data.csv")
test = pd.read_csv("./test_data.csv")

In [4]:
y_train = train["TARGET"]
X_train = train.drop("TARGET", axis=1)

In [5]:
y_test = test["TARGET"]
X_test = test.drop("TARGET", axis=1)

In [6]:
from imblearn.over_sampling import SMOTE
# 검증 데이터나 테스트 데이터가 아닌 학습데이터에서만 오버샘플링 사용할 것
smote = SMOTE(random_state=11)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [7]:
print('SMOTE 적용 전 학습용 피처/레이블 데이터 세트: ', X_train.shape, y_train.shape)
print('SMOTE 적용 후 학습용 피처/레이블 데이터 세트: ', X_train_smote.shape, y_train_smote.shape)
print('SMOTE 적용 후 레이블 값 분포: \n', pd.Series(y_train_smote).value_counts())

SMOTE 적용 전 학습용 피처/레이블 데이터 세트:  (203604, 73) (203604,)
SMOTE 적용 후 학습용 피처/레이블 데이터 세트:  (374010, 73) (374010,)
SMOTE 적용 후 레이블 값 분포: 
 0    187005
1    187005
Name: TARGET, dtype: int64


In [8]:
# ## 혼동행렬, 평가
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

## 모델의 예측력 확인
from sklearn import svm
clf = svm.SVC(random_state=11)

from sklearn.model_selection import cross_val_score
clf.fit(X_train_smote, y_train_smote)
cv_scores = cross_val_score(clf, X_train_smote, y_train_smote, cv=5, scoring='recall', n_jobs=-1) # 예측값 반환 # 지정한 평가 지표에 따라 교차 검증 점수 반환

print("Cross-validation scores:", cv_scores)
print("Average cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.53100184 0.93147242 0.93128526 0.93406593 0.93171306]
Average cross-validation score: 0.8519077030025934


In [10]:
import joblib
joblib.dump(clf, './svm_default.pkl')

['./svm_default.pkl']

In [None]:
# 저장한 모델을 불러와 사용하기
# import joblib
# loaded_model = joblib.load('./svm_default.pkl')

In [9]:
pred = clf.predict(X_test)

test_cm = confusion_matrix(y_test, pred)
test_acc = accuracy_score(y_test, pred)
test_prc = precision_score(y_test, pred)
test_rcll = recall_score(y_test, pred)
test_f1 = f1_score(y_test, pred)

print(test_cm)
print('정확도', round(test_acc*100,2))
print('정밀도', round(test_prc*100,2))
print('재현율', round(test_rcll*100,2))
print('F1',  round(test_f1*100,2))

[[46625    45]
 [ 4258    11]]
정확도 91.55
정밀도 19.64
재현율 0.26
F1 0.51


c 적어지면 과적합 방지 outlier 를 좀 유하게 봐주는 것. 선이 단순해짐
기본 개념 c 커지면 선이 복잡해짐
감마 커지면 가우시안이 점점 좁아짐
감마 작아지면 양쪽 퍼져있음, 선이 단순해짐

In [11]:
from sklearn.model_selection import GridSearchCV
parameters = {'gamma': [0.1,0.05,0.001], 'C':[1, 10, 100]}

grid_svc = GridSearchCV(clf, param_grid=parameters, cv = 5)
grid_svc.fit(X_train, y_train)

print(grid_svc.best_estimator_)
print(grid_svc.best_params_)

In [None]:
clf_gd = grid_svc.best_estimator_
pred = clf_gd.predict(X_test)

In [None]:
test_cm = confusion_matrix(y_test, pred)
test_acc = accuracy_score(y_test, pred)
test_prc = precision_score(y_test, pred)
test_rcll = recall_score(y_test, pred)
test_f1 = f1_score(y_test, pred)

print(test_cm)
print('정확도', round(test_acc*100,2))
print('정밀도', round(test_prc*100,2))
print('재현율', round(test_rcll*100,2))
print('F1',  round(test_f1*100,2))

In [None]:
import joblib
joblib.dump(grid_svc, './grid_svc.pkl')

In [None]:
import pickle 
with open('grid_svc_rf.pickle','wb') as fw:
    pickle.dump(grid_svc, fw)