In [1]:
import os, sys
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
os.getcwd()
PATH = os.getcwd() + '/drive/MyDrive/Colab Notebooks/AI_Hackathon'
os.chdir(PATH)
os.getcwd()

'/content/drive/MyDrive/Colab Notebooks/AI_Hackathon'

In [3]:
import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder


#models - knn, naivebays, linear SVC
from sklearn.neighbors import KNeighborsClassifier #knn 모델
from sklearn.naive_bayes import MultinomialNB  # 다항분포 나이브 베이즈 모델
from sklearn.svm import LinearSVC #Linear SVC 모델

%matplotlib inline

DATA_PATH = 'data'
DATA_SET_CSV = 'text_df_f.csv'

#csv파일 읽기
dataset = pd.read_csv(os.path.join(DATA_PATH, DATA_SET_CSV), sep='\t', index_col=0)
text = sklearn.utils.shuffle(dataset)
X = list(text['content'])
y = np.array(text['target'])

In [4]:
type(X[0])

str

In [5]:
feature = text['content']
feature.shape

(443,)

In [6]:
#count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

def count_vectorizer(features):
  vector = CountVectorizer(max_features=features, ngram_range=(1,2))
  vector_dataset = vector.fit_transform(X)

  return vector, vector_dataset

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

#최적 하이퍼 파라미터 구하기 - naivebays

#모든 모델 하이퍼파라미터 조절할예정****
#Valid parameters are: ['alpha', 'class_prior', 'fit_prior', 'force_alpha'].
from sklearn.model_selection import GridSearchCV

params = {
    'alpha' : [0.1, 0.2, 0.3, 0.4, 0.5, 1.0]
}
#n_jobs = -1 -> 모든 CPU 코어 사용 random_state=0


# count vectorizer 사용해보기
# vector, vector_dataset = count_vectorizer(300)
# X_train, X_test, y_train, y_test = train_test_split(vector_dataset, y, test_size=0.2, random_state=1, stratify=y)


# tfidf 사용해보기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
tfidf = TfidfVectorizer(lowercase=False)

X_train_str = [str(x) for x in X_train]
X_train_tfidf = tfidf.fit_transform(X_train_str)

In [8]:
#교차 검증
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
naive = MultinomialNB()
grid = GridSearchCV(naive, param_grid = params, cv = skf)
grid.fit(X_train_tfidf, y_train)

print("최적 :", grid.best_params_)
print("최고 정확도 :", grid.best_score_)

최적 : {'alpha': 0.2}
최고 정확도 : 0.963276836158192


In [9]:
type(X_train_str)

list

In [10]:
#최적 하이퍼 파라미터 구하기 - linear SVC
#Valid parameters are: ['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'loss', 'max_iter', 'multi_class', 'penalty', 'random_state', 'tol', 'verbose'].
params = {
    'C' : [0.1, 0.2, 0.3, 0.4, 0.5, 1.0],
    'penalty' : ['l1', 'l2'],
    'multi_class' : ['ovr', 'crammer_singer'],
    'class_weight' : ['dict', 'balanced']
}

svc = LinearSVC()
# grid search를 이용해 최적의 하이퍼 파라미터 찾기
cv = GridSearchCV(svc, param_grid = params, cv = skf)
cv.fit(X_train_tfidf, y_train)

C = cv.best_estimator_.C
print("최적 :", cv.best_estimator_.C)
print("최적 :", cv.best_params_)
print("최고 정확도 :", cv.best_score_)



최적 : 0.1
최적 : {'C': 0.1, 'class_weight': 'balanced', 'multi_class': 'crammer_singer', 'penalty': 'l1'}
최고 정확도 : 0.9774011299435029


90 fits failed out of a total of 144.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_classes.py", line 261, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(
sklear

In [11]:
#최적 하이퍼 파라미터 구하기 - knn
#Valid parameters are: ['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights']
params = {
    # 'leaf_size' : list(range(30,40)),
    'metric' : ['euclidean', 'manhattan', 'minkowski'],
    'n_neighbors' : list(range(1,10)),
    'weights' : ["uniform", "distance"]
}

knn = KNeighborsClassifier()
k_cv = GridSearchCV(knn, param_grid = params, cv = skf)
k_cv.fit(X_train_tfidf, y_train)

print("최적 :", k_cv.best_params_)
print("최고 정확도 :", k_cv.best_score_)


최적 : {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
최고 정확도 : 0.963276836158192


In [12]:
def draw_table(name, accuracy, precision, recall, F1,auc):
    raw_data = {'accuracy' : accuracy, 'precision':precision, 'recall':recall,'F1':F1,'auc':auc}
    table = pd.DataFrame(raw_data)
    table.index = name
    print(table)

def draw_table1(name, accuracy, precision, recall, F1):
    raw_data = {'accuracy' : accuracy, 'precision':precision, 'recall':recall,'F1':F1}
    table = pd.DataFrame(raw_data)
    table.index = name
    print(table)

In [13]:
from sklearn.metrics import classification_report

accuracy = []
precision = []
recall = []
F1 = []
name = []
prob = []

# vector, vector_dataset = count_vectorizer(200)

# X_train, X_test, y_train, y_test = train_test_split(vector_dataset, y, test_size=0.2, random_state=1, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

tfidf = TfidfVectorizer(lowercase=False)

X_train_str = [str(x) for x in X_train]
X_train_tfidf = tfidf.fit_transform(X_train_str)

X_test_str = [str(x) for x in X_test]
X_test_tfidf = tfidf.transform(X_test_str)


naive = MultinomialNB(alpha=0.1)
svc = LinearSVC(C=0.1, class_weight='balanced', multi_class='crammer_singer', penalty='l1')
knn = KNeighborsClassifier(leaf_size=30,metric='euclidean', n_neighbors=5, weights='distance')

#multiclass 분류를 위해 micro average 사용

for model in (naive, svc, knn):
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    if model == svc:
      prob.append(svc.decision_function(X_test_tfidf))
    else:
      prob.append(model.predict_proba(X_test_tfidf))
    name.append(model.__class__.__name__)
    accuracy.append(accuracy_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred, average='micro'))
    recall.append(recall_score(y_test, y_pred, average='micro'))
    F1.append(f1_score(y_test, y_pred, average='micro'))
    print(model.__class__.__name__)
    print(classification_report(y_test,y_pred))

# support : 각 라벨의 실제 샘플 개수

draw_table1(name, accuracy, precision, recall, F1)

MultinomialNB
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        40
           1       1.00      1.00      1.00        38
           2       1.00      0.91      0.95        11

    accuracy                           0.99        89
   macro avg       0.99      0.97      0.98        89
weighted avg       0.99      0.99      0.99        89

LinearSVC
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        40
           1       1.00      0.97      0.99        38
           2       1.00      1.00      1.00        11

    accuracy                           0.99        89
   macro avg       0.99      0.99      0.99        89
weighted avg       0.99      0.99      0.99        89

KNeighborsClassifier
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        38
           2       1.00      1



In [15]:
text = ['김서영 들다 오다 축하 드리다 내일 오전 급등 관련 중요 소식 알다 드리다 텔레 방 입장 부탁드리다']
text2 = ['광고 하나은행 드리다 선물 광고 하나은행 김영 드리다 깜짝 선물 이벤트 페이지 참여 완료 신라면 크다 사발 득템 참여 대상 문자 하나은행 으로부터 수신 손님 참여 방법 문자 링크 통해 이벤트 페이지 참여 완료 쿠폰 지다 증정 이벤트 기간 수화 쿠폰 농심 신라면 크다 사발 교환 쿠폰 적립 푸시 쿠폰 적립 알림 드리다 쿠폰 확인 하나원 쿠폰 선물 확인 가기 하나원 하나은행 대표 스마트폰 뱅킹 브랜드 이벤트 늘다 은행 사정 따르다 변경 중단 이벤트 혜택 동일 금액 대의 상품 변경 단기 조건 충족 손님 제외 이벤트 경품 교환 환불 환가 연장 재발 불가하다 쿠폰 사용 관련 유의사항 발급 모바일 쿠폰 하단 명시 되어다 이벤트 문의 하나은행 고객 센터 평일 주말 공휴일 제외 연락 바라다 홍보물 법령 내부통제 기준 따르다 절차 거치다 제공 홍보물 년월 유효하다 준법 감시인 심의 광고 무료 수신 거부']
inputdata = tfidf.transform(text)
# x = knn.predict_proba(inputdata)

x = svc.decision_function(inputdata)
# print(np.argmax(x))
y = svc.predict(inputdata)
print(x)
print("predict : ", int(y))

[[-0.2313948  -0.30422625  0.53562105]]
predict :  2


In [16]:
type(X_test_str[0])

str

micro를 적용했기 때문에 각 모델의 accuracy, precision, recall은 모두 같은 값을 가진다.

In [17]:
# from sklearn.metrics import classification_report
# from sklearn.ensemble import VotingClassifier


# accuracy = []
# precision = []
# recall = []
# F1 = []
# name = []
# prob = []

# vector, vector_dataset = count_vectorizer(200)
# X_train, X_test, y_train, y_test = train_test_split(vector_dataset, y, test_size=0.2, random_state=1, stratify=y)

# naive = MultinomialNB(alpha=0.1)
# svc = LinearSVC(C=1.0)
# knn = KNeighborsClassifier(leaf_size=30, metric='euclidean', n_neighbors=2, weights='uniform')

# from sklearn.svm import SVC
# vc = SVC(kernel='linear', probability=True)

# # 보팅에 참여한 single models 지정
# single_models = [
#     ('naive', naive),
#     ('svc', vc),
#     ('knn', knn)
# ]
# ensemble = VotingClassifier(single_models, voting='soft')


# #multiclass 분류를 위해 micro average 사용
# for model in (naive, svc, knn):
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     if model == svc:
#       prob.append(svc.decision_function(X_test))
#     else:
#       prob.append(model.predict_proba(X_test))
#     name.append(model.__class__.__name__)
#     accuracy.append(accuracy_score(y_test, y_pred))
#     precision.append(precision_score(y_test, y_pred, average='macro'))
#     recall.append(recall_score(y_test, y_pred, average='macro'))
#     F1.append(f1_score(y_test, y_pred, average='macro'))
#     print(model.__class__.__name__)
#     print(classification_report(y_test,y_pred))

# # support : 각 라벨의 실제 샘플 개수

# draw_table1(name, accuracy, precision, recall, F1)

In [18]:
F1

[0.9887640449438202, 0.9887640449438202, 1.0]

In [19]:
#머신러닝 모델 저장
import joblib
joblib.dump(naive, './naive_model_real.pkl')
joblib.dump(svc, './svc_model_real.pkl')
joblib.dump(knn, './knn_model_real.pkl')

joblib.dump(tfidf,'./tfidf_vector_real.pkl')

['./tfidf_vector_real.pkl']