In [30]:
#!pip install konlpy

In [31]:
import pandas as pd
import re
from konlpy.tag import Okt,Mecab
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score,f1_score
from lightgbm import LGBMClassifier

## EDA

In [32]:
# Load the labelled training set from the working directory.
train = pd.read_csv(filepath_or_buffer='train_data.csv')

In [33]:
# Peek at the last five rows.
train.tail(n=5)

Unnamed: 0,index,title,topic_idx
45649,45649,KB금융 미국 IB 스티펠과 제휴…선진국 시장 공략,1
45650,45650,1보 서울시교육청 신종코로나 확산에 개학 연기·휴업 검토,2
45651,45651,게시판 키움증권 2020 키움 영웅전 실전투자대회,1
45652,45652,답변하는 배기동 국립중앙박물관장,2
45653,45653,2020 한국인터넷기자상 시상식 내달 1일 개최…특별상 김성후,2


In [34]:
# Count missing values per column.
train.isna().sum()

index        0
title        0
topic_idx    0
dtype: int64

In [35]:
# Check the class balance of the target labels.
train['topic_idx'].value_counts()

4    7629
2    7362
5    6933
6    6751
1    6222
3    5933
0    4824
Name: topic_idx, dtype: int64

## 데이터 전처리


In [36]:
# Morphological analyzer (Okt) shared by the preprocessing cells.
okt = Okt()

In [37]:
# Remove particles (Josa), endings (Eomi) and punctuation from a text.
def func(text, tagger=None, exclude_tags=('Josa', 'Eomi', 'Punctuation')):
    """Return `text` as a space-joined string of stemmed tokens, minus unwanted tags.

    Parameters
    ----------
    text : str
        Text handed to the analyzer's ``pos`` method.
    tagger : object, optional
        Any object exposing ``pos(text, stem=True)`` that yields
        ``(token, tag)`` pairs. When omitted, the notebook-level ``okt``
        instance is used, which is what the original implementation did.
    exclude_tags : iterable of str, optional
        Tags whose tokens are dropped. Defaults to particles, endings and
        punctuation.

    Returns
    -------
    str
        The kept tokens joined by single spaces; an empty string when no
        token survives.
    """
    analyzer = okt if tagger is None else tagger
    # Build the lookup once per call instead of a fresh list per token.
    excluded = frozenset(exclude_tags)
    return " ".join(
        token
        for token, tag in analyzer.pos(text, stem=True)  # stem=True: stemmed forms
        if tag not in excluded
    )

# Replace every title with its cleaned token string.
# NOTE: this overwrites the raw column, so re-running the cell feeds
# already-cleaned text back through `func`.
train['title'] = train['title'].apply(func)

In [38]:
# tf-idf -> 벡터화
def split(text):
    """Tokenize `text` on runs of whitespace and return the list of pieces."""
    return text.split()

# `split` already produces the final tokens, so the default token regex is
# never consulted. Passing token_pattern=None states that explicitly and
# silences the "The parameter 'token_pattern' will not be used" warning.
tfidf_vect = TfidfVectorizer(tokenizer=split, token_pattern=None)
# Learn the vocabulary/IDF weights and vectorize the titles in a single pass.
# NOTE(review): the vectorizer is fit on all of `train` before the hold-out
# split, so the IDF statistics also reflect the validation rows.
tfidf_matrix_train = tfidf_vect.fit_transform(train['title'])

  "The parameter 'token_pattern' will not be used"


In [39]:
# train/valid dataset split.
def split_dataset(tfidf, df, test_size=0.2, random_state=3, label_col='topic_idx'):
    """Split features and labels into stratified train and hold-out parts.

    Parameters
    ----------
    tfidf : array-like or sparse matrix
        Feature rows, passed to ``train_test_split`` unchanged.
    df : pandas.DataFrame
        Frame holding the label column; its rows are expected to line up with
        the rows of `tfidf`.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to 0.2.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Defaults to 3.
    label_col : str, optional
        Name of the label column in `df`. Defaults to ``'topic_idx'``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    labels = df[label_col]
    # Stratify on the labels so both parts keep the same class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        tfidf,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )
    return (X_train, X_test, y_train, y_test)

# Split the TF-IDF features and their labels into fitting and hold-out parts.
X_train, X_test, y_train, y_test = split_dataset(tfidf=tfidf_matrix_train, df=train)

## 모델 학습 

In [40]:
# Gradient-boosted classifier with a fixed seed; fit on the training part.
lgbm = LGBMClassifier(random_state=3)
lgbm.fit(X=X_train, y=y_train)

LGBMClassifier(random_state=3)

## 모델 평가


In [41]:
# Predict the hold-out rows and report plain accuracy.
pred = lgbm.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)

print('정확도', accuracy)

정확도 0.8076880954988501


## test 데이터 예측


In [42]:
# Load the unlabelled test set from the working directory.
test = pd.read_csv(filepath_or_buffer='test_data.csv')

In [43]:
# Clean the test titles with the same routine used for the training titles.
test['title'] = test['title'].apply(func)

In [44]:
# Vectorize with the already-fitted vectorizer (transform only, no refit).
tfidf_matrix_test = tfidf_vect.transform(raw_documents=test['title'])

In [45]:
# Predict topic labels for the test rows (rebinds `pred`).
pred = lgbm.predict(X=tfidf_matrix_test)

## 제출 파일 생성


In [46]:
# Load the submission template.
submission = pd.read_csv(filepath_or_buffer='sample_submission.csv')

In [47]:
# Write the predictions into the template by row position, then preview it.
submission = submission.assign(topic_idx=pred)
submission.head(n=5)

Unnamed: 0,index,topic_idx
0,45654,2
1,45655,3
2,45656,2
3,45657,2
4,45658,3


In [49]:
# Save the submission without the DataFrame index column.
submission.to_csv(path_or_buf='fileNewsData.csv', index=False)