In [2]:

import sklearn
sklearn.__version__

'1.0.2'

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
def get_font_family():
    """
    시스템 환경에 따른 기본 폰트명을 반환하는 함수
    """
    import platform
    system_name = platform.system()

    if system_name == "Darwin" :
        font_family = "AppleGothic"
    elif system_name == "Windows":
        font_family = "Malgun Gothic"
    else:
        # Linux(Colab)
        !apt-get install fonts-nanum -qq  > /dev/null
        !fc-cache -fv

        import matplotlib as mpl
        mpl.font_manager._rebuild()
        findfont = mpl.font_manager.fontManager.findfont
        mpl.font_manager.findfont = findfont
        mpl.backends.backend_agg.findfont = findfont
        
        font_family = "NanumBarunGothic"
    return font_family

plt.style.use("seaborn")
plt.rc("font", family=get_font_family())
plt.rc("axes", unicode_minus=False)

%config InlineBackend.figure_format = 'retina'

In [6]:
corpus = ["코로나 거리두기와 코로나 상생지원금 문의입니다.",
          "지하철 운행시간과 지하철 요금 문의입니다.",
          "지하철 승강장 문의입니다.",
          "코로나 선별진료소 문의입니다.",
          "버스 운행시간 문의입니다.", 
          "버스 터미널 위치 안내입니다.",
          "코로나 거리두기 안내입니다.",
          "택시 승강장 문의입니다."
         ]

In [14]:
df = pd.DataFrame(columns=['문서','분류'])
df['문서'] = corpus

In [18]:

df['분류'] = df['문서'].apply(lambda x: '보건' if '코로나' in x else '교통')
df

Unnamed: 0,문서,분류
0,코로나 거리두기와 코로나 상생지원금 문의입니다.,보건
1,지하철 운행시간과 지하철 요금 문의입니다.,교통
2,지하철 승강장 문의입니다.,교통
3,코로나 선별진료소 문의입니다.,보건
4,버스 운행시간 문의입니다.,교통
5,버스 터미널 위치 안내입니다.,교통
6,코로나 거리두기 안내입니다.,보건
7,택시 승강장 문의입니다.,교통


In [24]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import CountVectorizer

cvect = CountVectorizer()
dtm = cvect.fit_transform(df['문서'])
df_dtm = pd.DataFrame(dtm.toarray(), columns=cvect.get_feature_names())
df_dtm

Unnamed: 0,거리두기,거리두기와,문의입니다,버스,상생지원금,선별진료소,승강장,안내입니다,요금,운행시간,운행시간과,위치,지하철,코로나,택시,터미널
0,0,1,1,0,1,0,0,0,0,0,0,0,0,2,0,0
1,0,0,1,0,0,0,0,0,1,0,1,0,2,0,0,0
2,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0
3,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0
4,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0
5,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1
6,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
7,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0


In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

stopwords = ['문의입니다', '안내입니다']
tfidfvect = TfidfVectorizer(min_df=.1, max_df=.8, max_features=50, stop_words=stopwords)
tfidf_dtm = tfidfvect.fit_transform(df['문서'])
df_tfidf_dtm = pd.DataFrame(tfidf_dtm.toarray(), columns=tfidfvect.get_feature_names())
df_tfidf_dtm.style.background_gradient()

Unnamed: 0,거리두기,거리두기와,버스,상생지원금,선별진료소,승강장,요금,운행시간,운행시간과,위치,지하철,코로나,택시,터미널
0,0.0,0.494346,0.0,0.494346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.715014,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.455984,0.0,0.455984,0.0,0.764301,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.810306,0.0,0.0,0.0,0.0,0.0,0.0,0.586007,0.0,0.0
4,0.0,0.0,0.642328,0.0,0.0,0.0,0.0,0.76643,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.509814,0.0,0.0,0.0,0.0,0.0,0.0,0.608313,0.0,0.0,0.0,0.608313
6,0.810306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.586007,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.642328,0.0,0.0,0.0,0.0,0.0,0.0,0.76643,0.0


In [40]:

X = tfidf_dtm.toarray()

In [41]:

y = df['분류']

In [42]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:

X_train.shape, y_train.shape

((6, 14), (6,))

In [49]:

X_test.shape, y_test.shape


((2, 14), (2,))

In [45]:
from sklearn.tree import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier(random_state=42, 
                                       max_depth=3, 
                                       min_samples_leaf=5, 
                                       min_samples_split=5,
                                       max_features=None, 
                                       class_weight='balanced', 
                                       criterion='gini',
                                       )

In [46]:
dt_classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight='balanced', max_depth=3, min_samples_leaf=5,
                       min_samples_split=5, random_state=42)

In [50]:
y_predict = dt_classifier.predict(X_test)

In [54]:
(y_test == y_predict).mean()*100

100.0

In [56]:

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)*100

100.0

In [60]:
pd.crosstab()

분류,교통
row_0,Unnamed: 1_level_1
교통,2


In [61]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_predict)

array([[2]])