__Prerequisites__

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
import time

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from eunjeon import Mecab
import re

nltk.download('punkt')
#nltk.download('stopwords')


import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


from keras_preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def get_scores(y_test, predicted):
    print('-------------------------')
    print('Accuracy_score = ', accuracy_score(y_test, predicted))
    print('precision_score = ', precision_score(y_test, predicted))
    print('recall_score = ', recall_score(y_test, predicted))
    print('f1_score = ', f1_score(y_test, predicted))
    print('-------------------------\n')
    
    confusion_mat = confusion_matrix(y_test, predicted)
    print(confusion_mat)
    

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vaiv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


__Preprocessing__

In [26]:
class DataPreprocessing:
    # Get writer_score(ratio of tag_1)
    def get_writer_score(self, data, train=False):
        if train == True:
            df_writer = data[['writerName', 'tag']].groupby('writerName', as_index=False).sum() # 신문사별 tag==1 개수 추출
            df_writer.rename(columns={'tag':'tag_1'}, inplace=True)

            total_cnt = data[['writerName', 'tag']].groupby(['writerName'], as_index=False).count()['tag'] # 신문사별 기사 개수 추출
            df_writer['total_cnt'] = total_cnt

            df_writer['writer_score'] = df_writer['tag_1']/df_writer['total_cnt'] # 신문사별 tag==1인 비율 계산
            
            df_writer.to_csv('../data/df_writer.csv', index=False)
        else:
            return


    def merge_writer_score(self, data):
        df_writer = pd.read_csv('../data/df_writer.csv')
        data = pd.merge(data, df_writer[['writerName', 'writer_score']], how='left', on='writerName') # writerName 기준으로 merge
        new_data = data[['date', 'writer_score', 'title', 'content', 'tag']]
        return new_data
    
    def title_tf_idf(self, data, target):
        # Mecab 객체 선언
        mecab = Mecab()
        
        # Get nouns from df['title']
        x_data = data['title'].apply(lambda x: ' '.join(mecab.nouns(x)))
        
        # Get vector count
        # count_vect = CountVectorizer()
        #X_counts = count_vect.fit_transform(x_data)
        tfidf_vect = TfidfVectorizer()
        X_counts = tfidf_vect.fit_transform(x_data)
        # Save word vector
        #pickle.dump(count_vect.vocabulary_, open("count_vector.pkl","wb"))

        # Transform word vector to ti-idf
        tfidf_transformer = TfidfTransformer()
        X_tfidf = tfidf_transformer.fit_transform(X_counts)
        df_tfidf = pd.DataFrame(X_tfidf.toarray())
        df_tfidf['writer_score'] = data['writer_score']
        df_tfidf[target] = data[target]
        # df_tfidf['date'] = data['date']
        df_tfidf.columns = list(map(str, list(df_tfidf.columns)))

        # save tf-idf
        #pickle.dump(tfidf_transformer, open("tfidf.pkl","wb"))
        return df_tfidf

    def data_transformation(self, data, target):
        # transform NA to 0
        data.fillna(0, inplace=True)
        
        # make date 0-1
        # data['date'] = data['date']/30000000
        
        # create a feature matrix
        X = data.drop(target, axis=1)

        # create a target vector
        y = data[target]
        
        # return the feature matrix and target vector
        return X, y

In [27]:
# # write a function to perform data exploration
# def perform_data_exploration(file_with_path):
    
#     # create an object of DataExploration class
#     data_exploration = DataExploration()

#     # load data HR_comma_sep.csv
#     data = data_exploration.load_data(file_with_path)

#     # Perform exploration
#     data_exploration.data_exploration(data)
    
#     # explore data
#     data_exploration.data_visualization(data)
    
#     return data

In [28]:
# write a function to perform data preprocessing
def perform_data_preprocessing(data, target, train=False):
    # use DataPreprocessing class to perform data preprocessing
    # create an object of DataPreprocessing class
    data_preprocessing = DataPreprocessing()

    data_preprocessing.get_writer_score(data, train)
    new_data = data_preprocessing.merge_writer_score(data)
    new_data = data_preprocessing.title_tf_idf(new_data, target)

    # perform data_transformation
    #data_preprocessing.title_transformation(data)
    X, y = data_preprocessing.data_transformation(new_data, target)

    return  X, y

def data_splitting(X, y):
    # split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1004)

    # return the training and testing sets
    return X_train, X_test, y_train, y_test

In [43]:
## TRAIN & TEST SET

train = True  # train set으로만 결과 확인 시 True

df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')
df = pd.concat([df_train, df_test])
X, y = perform_data_preprocessing(df, 'tag', train=False)

if train:
    X_train, X_test, y_train, y_test = data_splitting(X[:8000], y[:8000])
else:
    X_train = X[:len(df_train)]; X_test = X[len(df_train):]
    y_train = y[:len(df_train)]; y_test = y[len(df_train):]

In [44]:
from sklearn.linear_model import LogisticRegression

# Train logistic regression model
classifier = LogisticRegression(random_state=42)
classifier.fit(X_train, y_train)

predicted = classifier.predict(X_test)
predicted_prob = classifier.predict_proba(X_test)

if train :
    result_nb = pd.DataFrame({'true_labels':y_test, 'predicted_labels':predicted})
    print('Logistic Regression')
    get_scores(y_test, predicted)

Logistic Regression
-------------------------
Accuracy_score =  0.91125
precision_score =  0.8532763532763533
recall_score =  0.9388714733542319
f1_score =  0.8940298507462687
-------------------------

[[859 103]
 [ 39 599]]


In [45]:
# Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train, y_train)
#pickle.dump(clf, open("svm.pkl", "wb"))

#SAVE MODEL
#pickle.dump(clf, open("nb_model.pkl", "wb"))

predicted = clf.predict(X_test)
predicted_prob = clf.predict_proba(X_test)

if train:
    result_nb = pd.DataFrame({'true_labels':y_test, 'predicted_labels':predicted})

    print('Naive_Bayes')
    get_scores(y_test, predicted)

Naive_Bayes
-------------------------
Accuracy_score =  0.94
precision_score =  0.9234375
recall_score =  0.9263322884012539
f1_score =  0.9248826291079814
-------------------------

[[913  49]
 [ 47 591]]


In [46]:
from sklearn.neural_network import MLPClassifier

clf_neural = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(128,), max_iter=10000, random_state=1)
clf_neural.fit(X_train, y_train)
pickle.dump(clf_neural, open("softmax.pkl", "wb"))

predicted = clf_neural.predict(X_test)
predicted_prob = clf_neural.predict_proba(X_test)



if train:
    result_svm = pd.DataFrame({'true_labels':y_test, 'predicted_labels':predicted})

    print('Softmax')
    get_scores(y_test, predicted)
else:
    df_proba = pd.DataFrame(predicted_prob).rename(columns={0:'prob_0', 1:'prob_1'})
    df_test['tag'] = predicted
    df_test['prob_0'] = df_proba['prob_0']
    df_test['prob_1'] = df_proba['prob_1']
    
    df_test.to_csv('../data/result_ksh_2.csv', index=False)

Softmax
-------------------------
Accuracy_score =  0.951875
precision_score =  0.9560975609756097
recall_score =  0.9216300940438872
f1_score =  0.9385474860335195
-------------------------

[[935  27]
 [ 50 588]]
SVM
-------------------------
Accuracy_score =  0.951875
precision_score =  0.9560975609756097
recall_score =  0.9216300940438872
f1_score =  0.9385474860335195
-------------------------

[[935  27]
 [ 50 588]]


In [47]:
from sklearn import svm

clf_svm = svm.LinearSVC()
clf_svm.fit(X_train, y_train)

#pickle.dump(clf_svm, open("svm.pkl", "wb"))

predicted = clf_svm.predict(X_test)

if train:
    result_svm = pd.DataFrame({'true_labels':y_test, 'predicted_labels':predicted})

    print('SVM')
    get_scores(y_test, predicted)

SVM
-------------------------
Accuracy_score =  0.9475
precision_score =  0.9196969696969697
recall_score =  0.95141065830721
f1_score =  0.9352850539291218
-------------------------

[[909  53]
 [ 31 607]]


In [48]:
from sklearn.calibration import CalibratedClassifierCV

clf_svm = svm.LinearSVC()
clf = CalibratedClassifierCV(clf_svm, method='sigmoid') 
clf.fit(X_train, y_train)
y_proba = clf.predict_proba(X_test)

#pickle.dump(clf_svm, open("svm.pkl", "wb"))

predicted = clf.predict(X_test)

if train:
    result_svm = pd.DataFrame({'true_labels':y_test, 'predicted_labels':predicted})

    print('SVM')
    get_scores(y_test, predicted)
else:
    df_proba = pd.DataFrame(y_proba).rename(columns={0:'prob_0', 1:'prob_1'})
    df_test['tag'] = predicted
    df_test['prob_0'] = df_proba['prob_0']
    df_test['prob_1'] = df_proba['prob_1']
    
    # df_test.to_csv('../data/result_ksh_1.csv', index=False)

SVM
-------------------------
Accuracy_score =  0.94875
precision_score =  0.9276923076923077
recall_score =  0.945141065830721
f1_score =  0.9363354037267081
-------------------------

[[915  47]
 [ 35 603]]


In [49]:
# from sklearn import svm

# clf_svm = svm.SVC(kernel='linear', probability=True)
# clf_svm.fit(X_train, y_train)

# #pickle.dump(clf_svm, open("svm.pkl", "wb"))

# predicted = clf_svm.predict(X_test)
# predicted_prob = clf_svm.predict_proba(X_test)
# result_svm = pd.DataFrame({'true_labels':y_test, 'predicted_labels':predicted})

# print('SVM')
# get_scores(y_test, predicted)

using Doc2Vec

In [50]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from keras.utils import to_categorical
import sys
import multiprocessing
import re

# write a function to perform data preprocessing
def perform_data_preprocessing_for_doc2vec(data, target, train=False):
    # use DataPreprocessing class to perform data preprocessing
    # create an object of DataPreprocessing class
    data_preprocessing = DataPreprocessing()

    data_preprocessing.get_writer_score(data, train)
    new_data = data_preprocessing.merge_writer_score(data)
    new_data['content'].fillna('', inplace=True)

    return  new_data

def text_preprocessing(text_list):
    
    stopwords = ['을', '를', '이', '가', '은', '는', 'null'] #불용어 설정
    tokenizer = Mecab() #형태소 분석기 
    token_list = []
    
    for text in text_list:
        txt = re.sub('[^가-힣a-z]', ' ', text) #한글과 영어 소문자만 남기고 다른 글자 모두 제거
        token = tokenizer.nouns(txt) #형태소 분석
        token = [t for t in token if t not in stopwords or type(t) != float] #형태소 분석 결과 중 stopwords에 해당하지 않는 것만 추출
        #token_list.append(' '.join(token))
        token_list.append(token)
        
    return token_list, tokenizer


In [51]:
df = pd.read_csv('../data/train.csv')
df = perform_data_preprocessing_for_doc2vec(df, target='tag', train=True)
df['content'] = df[['title', 'content']].apply(lambda x: str(x['title'] + '. ' + x['content']), axis=1)

In [52]:
df

Unnamed: 0,date,writer_score,title,content,tag
0,20230214,0.000000,"정부, AI반도체 석·박사 집중 육성… 대학당 '6년간 164억원' 지원","정부, AI반도체 석·박사 집중 육성… 대학당 '6년간 164억원' 지원. 정부가 ...",0
1,20230215,0.021552,인사 청탁 대가 금품수수 의혹 전 소방청장 영장 기각,"인사 청탁 대가 금품수수 의혹 전 소방청장 영장 기각. 기사내용 요약 법원 ""피의 ...",0
2,20230214,0.151515,튀르키예 강진에 우리나라 지하수가 출렁였다,튀르키예 강진에 우리나라 지하수가 출렁였다. 튀르키예에서 발생한 강진에 우리나라의 ...,0
3,20230215,0.010638,"멸치쇼핑, 2023년 신입 및 경력 사원 대규모 공채 진행","멸치쇼핑, 2023년 신입 및 경력 사원 대규모 공채 진행. [데일리안 = 박영민 ...",0
4,20230111,0.699872,"美국방부, 추모의 벽 전사자 명단 오류에 ""유감스러운 실수""","美국방부, 추모의 벽 전사자 명단 오류에 ""유감스러운 실수"". 국방부 대변인 ""실수...",1
...,...,...,...,...,...
7995,20230104,0.544474,"외교부, 오는 12일 日강제징용 해법 토론회 연다","외교부, 오는 12일 日강제징용 해법 토론회 연다. 국회서 한일의원연맹과 공동 개최...",1
7996,20230215,0.667622,"교원단체, '유치원' 명칭은 일제 잔재…'유아학교'로 변경해야","교원단체, '유치원' 명칭은 일제 잔재…'유아학교'로 변경해야. 교육부가 2025년...",0
7997,20230212,0.699872,내일 한일외교차관회담… '강제동원 해법' 이견 좁힐까,내일 한일외교차관회담… '강제동원 해법' 이견 좁힐까. 13일 워싱턴 한미일 차관협...,1
7998,20230214,0.029167,최상호 국립오페라단 단장,최상호 국립오페라단 단장. 문화체육관광부가 재단법인 국립오페라단 단장 겸 예술감독에...,0


In [53]:
cores = multiprocessing.cpu_count()

#doc2vec parameters
vector_size = 1000
window_size = 15
word_min_count = 2
sampling_threshold = 1e-5
negative_size = 5
train_epoch = 100
dm = 1 #0 = dbow; 1 = dmpv
worker_count = cores #number of parallel processes

tokened_content, mecab = text_preprocessing(df['content'])
onehot_y = pd.DataFrame(to_categorical(df['tag']))

X_train, X_test, y_train, y_test = train_test_split(tokened_content, onehot_y, test_size=0.2, random_state=1004)

In [54]:
# Create TaggedDocuments
train_docs = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(X_train)]
test_docs = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(X_test)]

In [55]:
# Train Doc2Vec model
model = Doc2Vec(min_count=word_min_count, size=vector_size, alpha=0.025, min_alpha=0.025, seed=1234, workers=worker_count)
model.build_vocab(train_docs)
model.train(train_docs, total_examples=model.corpus_count, epochs=model.epochs)



In [56]:
# Infer vectors for training and testing sets
train_vectors = [model.infer_vector(doc.words) for doc in train_docs]
test_vectors = [model.infer_vector(doc.words) for doc in test_docs]

In [57]:
df_train = pd.DataFrame(train_vectors)
df_train.index = y_train.index
df_train.index

df_train = df_train.join(df['writer_score'], how='left')
df_train.columns = list(map(str, list(df_train.columns)))

df_test = pd.DataFrame(test_vectors)
df_test.index = y_test.index
df_test.index

df_test = df_test.join(df['writer_score'], how='left')
df_test.columns = list(map(str, list(df_test.columns)))

In [58]:
df_y_train = df[['tag']].loc[y_train.index]['tag']
df_y_test = df[['tag']].loc[y_test.index]['tag']

In [59]:
from sklearn.neural_network import MLPClassifier

clf_neural = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(128,), max_iter=10000, random_state=1)
clf_neural.fit(df_train, df_y_train)
pickle.dump(clf_neural, open("softmax.pkl", "wb"))

predicted = clf_neural.predict(df_test)
predicted_prob = clf_neural.predict_proba(df_test)
result_softmax = pd.DataFrame({'true_labels':df_y_test, 'predicted_labels':predicted})

print('Softmax')
get_scores(df_y_test, predicted)

Softmax
-------------------------
Accuracy_score =  0.93375
precision_score =  0.9130434782608695
recall_score =  0.9216300940438872
f1_score =  0.9173166926677067
-------------------------

[[906  56]
 [ 50 588]]


In [60]:
from sklearn import svm
from sklearn.calibration import CalibratedClassifierCV

clf_svm = svm.LinearSVC()
clf_svm.fit(df_train, df_y_train)

#pickle.dump(clf_svm, open("svm.pkl", "wb"))

predicted = clf_svm.predict(df_test)
result_svm = pd.DataFrame({'true_labels':df_y_test, 'predicted_labels':predicted})

print('SVM')
get_scores(df_y_test, predicted)

SVM
-------------------------
Accuracy_score =  0.921875
precision_score =  0.8952234206471494
recall_score =  0.9106583072100314
f1_score =  0.9028749028749028
-------------------------

[[894  68]
 [ 57 581]]




In [61]:
max_len = 0
for i in range(len(tokened_content)):             # tokened_X의 길이만큼 돌면서 제일 긴문장의 길이 알아내야 => max_20, wordsize_11919
    if max_len < len(tokened_content[i]):
        max_len = len(tokened_content[i])
print(max_len)

1113


In [62]:
print(df_train.shape, y_train.shape)
print(df_test.shape, y_test.shape)

(6400, 1001) (6400, 2)
(1600, 1001) (1600, 2)


In [63]:
import numpy as np
import matplotlib.pyplot as plt
from keras.models import *
from keras.layers import *

#X_train, X_test, Y_train, Y_test = np.load(                                 # .npy 파일 로드
#    '../model/news_classification_ksh-main/news_classification_ksh-main/models/news_data_max_26_wordsize_12256.npy', allow_pickle=True)

print(df_train.shape, y_train.shape)
print(df_test.shape, y_test.shape)

model = Sequential()
model.add(Embedding(10000, 300, input_length=1113))                           # 11919 => 300으로 줄여주는 # 본인단어 개수로 (난 11919)

# 임베딩 레이어의 역할 : 수치적으로 계산하여 안되는 명목척도
# 의미공간상의 배치, 11919차원 => 300차원으로 줄일 것. => 각각의 형태소가 의미를 계산할 수 있게됨

model.add(Conv1D(32, kernel_size=5, padding='same', activation='relu'))     # Relu
model.add(MaxPool1D(pool_size=1))                                           # 1써주면 달라지는 거 없지만 그래도 습관적으로
#model.add(GRU(128, activation='tanh', return_sequences=True))               # 리턴시퀀스는 나중에 설명
model.add(LSTM(128, activation='relu', return_sequences=True))
model.add(Dropout(0.2))                                                     # 과적합 막기위해
#model.add(GRU(64, activation='tanh', return_sequences=True))
model.add(LSTM(128, activation='relu', return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64, activation='relu', return_sequences=True))                                       # GRU는 여기까지
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))                                    # 좀 딥러닝같다
model.add(Dense(2, activation='softmax'))                                   # 카테고리 여섯개. 다중카테고리.
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam',            # 옵티마이저 adam
              metrics=['accuracy'])
# fit_hist = model.fit(df_train, y_train, batch_size=128,
#                      epochs=10, validation_data=(df_test, y_test))
# model.save('./models/news_category_classfication_model_{}.h5'.format(       # 모델저장하기. (한참 돌렸는데 저장안하면 말짱꽝, 중간에 팅겨도!)
#     np.round(fit_hist.history['val_accuracy'][-1], 3)))                     # 소수점 아래 3째까지 val_accuracy

(6400, 1001) (6400, 2)
(1600, 1001) (1600, 2)
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1113, 300)         3000000   
                                                                 
 conv1d_1 (Conv1D)           (None, 1113, 32)          48032     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 1113, 32)         0         
 1D)                                                             
                                                                 
 lstm_3 (LSTM)               (None, 1113, 128)         82432     
                                                                 
 dropout_3 (Dropout)         (None, 1113, 128)         0         
                                                                 
 lstm_4 (LSTM)               (None, 1113, 128)         131584    
        