In [66]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import os
train = pd.read_csv('./df_data.csv')

In [67]:
head =train.head()
categories = train.loc[:,'category'].unique()

categories

array(['문화예술', '바닷가', '쇼핑', '트레킹', '동네 구경', '자연경관'], dtype=object)

In [126]:
#Seed 고정
class CFG:
    SEED = 30

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG.SEED) # Seed 고정

In [69]:
# 데이터 전처리
# ! 장소 이름기반으로 훈련
df = pd.concat([train["name"], train["category"]], axis=1)
df

Unnamed: 0,name,category
0,테디베어뮤지엄 제주,문화예술
1,남원 큰엉해안,바닷가
2,JDC 면세점 (제주공항점),쇼핑
3,제주세계성문화박물관,문화예술
4,제주 특산품전시판매장,문화예술
...,...,...
250,김녕성세기해변 (김녕해수욕장),바닷가
251,조랑말체험공원,자연경관
252,제주문학관,문화예술
253,제주 센트럴파크,문화예술


In [70]:
# 정답 빈도수 확인
df["category"].value_counts()

category
문화예술     134
자연경관      44
트레킹       28
바닷가       23
동네 구경     23
쇼핑         3
Name: count, dtype: int64

In [71]:
from sklearn.feature_extraction.text import CountVectorizer
cvect = CountVectorizer()

dtm = cvect.fit_transform(df["name"])
dtm

<255x348 sparse matrix of type '<class 'numpy.int64'>'
	with 428 stored elements in Compressed Sparse Row format>

In [72]:
feature_names = cvect.get_feature_names_out()
feature_names

array(['0일', '10', '10코스', '1100고지습지', '12코스', '17코스', '1코스', '3평화공원',
       '4코스', '5일', '5일시장', '5코스', '6일', '6코스', '7일', '8일', '8코스', '9일',
       'art', 'flash', 'jdc', 'light', 'sos박물관', '가파도', '감귤박물관', '갓전시관',
       '갤러리', '건강과', '걸매생태공원', '고성5일시장', '고흐의정원', '골든비치', '골목시장', '골프클럽',
       '공장투어', '곽지과물해변', '곽지해수욕장', '관덕정', '광령', '광치기', '국가지질공원',
       '국립제주박물관', '그리스신화박물관', '금능석물원', '금능해수욕장', '금룡사', '금산공원', '금호리조트',
       '김녕미로공원', '김녕성세기해변', '김녕요트투어', '김녕해수욕장', '김만덕기념관', '김영갑', '김정희',
       '김창열미술관', '나라의', '남원', '납읍난대림지대', '넥슨컴퓨터박물관', '노루생태관찰원', '노리매공원',
       '노형수퍼마켙', '뉴파라다이스호', '다이나믹', '다화원휴게소', '다희연', '닥종이인형박물관', '대유랜드',
       '대정5일시장', '대평', '대포동지삿개', '대합실', '더마파크', '더플래닛', '돈내코', '돌하르방공원',
       '동문공설시장', '동문수산시장', '동문재래시장', '동백마을', '두모악', '드라마월드', '라마다프라자제주호텔',
       '라온명품관', '리솜', '마라도', '마라도가는', '마린파크', '만장굴', '메이즈', '메이즈랜드',
       '면세점', '모슬포', '모슬포중앙시장', '모충사', '무릉', '문화거리', '미술관', '박물관', '박물관은',
       '방림원', '방주교회', '번개과학체험관', '법환동', '별빛축제', '보성시장', '본태박물

In [73]:
cvect.vocabulary_

{'테디베어뮤지엄': 303,
 '제주': 227,
 '남원': 57,
 '큰엉해안': 300,
 'jdc': 20,
 '면세점': 92,
 '제주공항점': 230,
 '제주세계성문화박물관': 251,
 '특산품전시판매장': 306,
 '제주올레': 255,
 '4코스': 8,
 '표선': 313,
 '올레': 199,
 '대정5일시장': 69,
 '6일': 12,
 '번개과학체험관': 103,
 '노루생태관찰원': 60,
 '탑동해변공연장': 302,
 '한담해변': 320,
 '제주난타전용관': 235,
 '섭지코지': 150,
 '다이나믹': 64,
 '메이즈': 90,
 '성읍점': 155,
 '천제연폭포': 284,
 '포레스트사파리': 312,
 '천지연': 285,
 '걸매생태공원': 28,
 '마라도': 86,
 '가파도': 23,
 '정기여객선': 224,
 '그리스신화박물관': 42,
 '용두암': 201,
 '우도': 205,
 '신양': 173,
 '섭지해수욕장': 151,
 '용머리해안': 203,
 '삼성혈': 126,
 '성산포유람선': 153,
 '하고수동해변': 317,
 '북촌': 108,
 '돌하르방공원': 76,
 '제주통일관': 267,
 '국립제주박물관': 41,
 '퍼시픽': 309,
 '리솜': 85,
 '조안베어뮤지엄': 276,
 '해비치': 335,
 '해변': 334,
 '본태박물관': 107,
 '김녕요트투어': 50,
 '에코랜드테마파크': 190,
 '법환동': 104,
 '청소년문화의집': 289,
 '선녀와나무꾼': 149,
 '테마공원': 304,
 '스누피가든': 169,
 '생각하는': 132,
 '정원': 226,
 '노형수퍼마켙': 62,
 '고흐의정원': 30,
 '초콜릿랜드': 291,
 '이중섭': 220,
 '미술관': 98,
 '비자림청소년수련원': 113,
 '석부작박물관': 148,
 '제주러브랜드': 242,
 '지속가능환경교육센터': 283,
 '중문': 280,
 '색달': 

In [74]:
# 단어 등장 빈도수 확인
pd.DataFrame(dtm.toarray(),columns=feature_names)

Unnamed: 0,0일,10,10코스,1100고지습지,12코스,17코스,1코스,3평화공원,4코스,5일,...,헬로키티아일랜드,협재굴,협재해변,혼인지,화순,화순금모래,황우지해안,훈데르트바서파크,휴애리,휴애리자연생활공원
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
251,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
252,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
253,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
x = dtm.toarray()
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [76]:
y = df["category"]
y

0      문화예술
1       바닷가
2        쇼핑
3      문화예술
4      문화예술
       ... 
250     바닷가
251    자연경관
252    문화예술
253    문화예술
254    문화예술
Name: category, Length: 255, dtype: object

In [94]:
split_count = int(df.shape[0] * 0.9)
split_count

229

In [95]:
x_train = x[:split_count]
y_train = y[:split_count]
print(x_train.shape, y_train.shape)
x_train, y_train

(229, 348) (229,)


(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64),
 0       문화예술
 1        바닷가
 2         쇼핑
 3       문화예술
 4       문화예술
        ...  
 224     문화예술
 225      트레킹
 226    동네 구경
 227     문화예술
 228      트레킹
 Name: category, Length: 229, dtype: object)

In [96]:
# 테스트 세트 만들기 예) 실전 시험문제
# 테스트 세트의 정답 만들기 예) 실전 문제의 정답
# x_text, y_test
x_test = x[split_count:]
y_test = y[split_count:]
print(x_test.shape, y_test.shape)
x_test,y_test

(26, 348) (26,)


(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64),
 229     문화예술
 230     문화예술
 231      바닷가
 232    동네 구경
 233     문화예술
 234     자연경관
 235     문화예술
 236     문화예술
 237     문화예술
 238     문화예술
 239      트레킹
 240     문화예술
 241     문화예술
 242     문화예술
 243      트레킹
 244      트레킹
 245     문화예술
 246      트레킹
 247     자연경관
 248     문화예술
 249     문화예술
 250      바닷가
 251     자연경관
 252     문화예술
 253     문화예술
 254     문화예술
 Name: category, dtype: object)

In [127]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, VotingClassifier,RandomForestClassifier
models = [
  ('bag', BaggingClassifier(random_state=CFG.SEED)),
   ('dt' , DecisionTreeClassifier(random_state=CFG.SEED)),
   ('gb', GradientBoostingClassifier(random_state=CFG.SEED)),
   ('rf' , RandomForestClassifier(random_state=CFG.SEED))
]
# model = DecisionTreeClassifier()
model = VotingClassifier(models,voting='soft',weights=[1,1,3,1])
model

In [133]:

# fit(기출문제, 정답)
model.fit(x_train,y_train)
print(model.get_feature_names_out())

['votingclassifier_bag0' 'votingclassifier_bag1' 'votingclassifier_bag2'
 'votingclassifier_bag3' 'votingclassifier_bag4' 'votingclassifier_bag5'
 'votingclassifier_dt0' 'votingclassifier_dt1' 'votingclassifier_dt2'
 'votingclassifier_dt3' 'votingclassifier_dt4' 'votingclassifier_dt5'
 'votingclassifier_gb0' 'votingclassifier_gb1' 'votingclassifier_gb2'
 'votingclassifier_gb3' 'votingclassifier_gb4' 'votingclassifier_gb5'
 'votingclassifier_rf0' 'votingclassifier_rf1' 'votingclassifier_rf2'
 'votingclassifier_rf3' 'votingclassifier_rf4' 'votingclassifier_rf5']


In [129]:
y_predict = model.predict(x_test)
y_predict

array(['문화예술', '문화예술', '바닷가', '문화예술', '문화예술', '문화예술', '문화예술', '문화예술',
       '문화예술', '문화예술', '문화예술', '문화예술', '문화예술', '문화예술', '문화예술', '트레킹',
       '문화예술', '트레킹', '문화예술', '문화예술', '문화예술', '문화예술', '문화예술', '문화예술',
       '문화예술', '문화예술'], dtype=object)

In [130]:
# 예측의 정확도 구하기
(y_test == y_predict).mean() * 100

73.07692307692307

In [112]:
# crosstab을 활용한 confusion matrix
pd.crosstab(y_test,y_predict)

col_0,문화예술,바닷가,트레킹
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
동네 구경,1,0,0
문화예술,16,0,0
바닷가,1,1,0
자연경관,3,0,0
트레킹,2,0,2


In [109]:
# from sklearn.tree import plot_tree
# plot_tree(model, feature_names=feature_names)

x_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [113]:
# feature_importances_ 로 피처 중요도 확인
sns.barplot(x=model.feature_importances_,y=feature_names)

AttributeError: 'VotingClassifier' object has no attribute 'feature_importances_'