In [None]:
!pip install konlpy
!pip install --upgrade gensim
!pip install catboost
!pip install pyLDAvis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import missingno as msno
from konlpy.tag import Kkma
from tqdm import tqdm
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_selection import SelectPercentile

from gensim.models.fasttext import FastText

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.decomposition import TruncatedSVD, NMF, PCA

In [None]:
# glob() yields paths in arbitrary, filesystem-dependent order. Sort them so
# that file0, file1, ... are bound to the same files on every run.
files = sorted(glob('/content/drive/MyDrive/공모전/data/*.txt'))
for i, file in enumerate(files):
    # Each pipe-delimited, cp949-encoded file becomes a module-level DataFrame
    # named file<i>; later cells read `file0`.
    globals()[f'file{i}'] = pd.read_table(file, sep='|', encoding='cp949')

In [None]:
# For each label column of file0, build an id -> label table from the sorted
# distinct labels, then invert it to get the matching label -> id table.
id2label_1 = dict(enumerate(sorted(file0.digit_1.unique())))
label2id_1 = {label: idx for idx, label in id2label_1.items()}

id2label_2 = dict(enumerate(sorted(file0.digit_2.unique())))
label2id_2 = {label: idx for idx, label in id2label_2.items()}

id2label_3 = dict(enumerate(sorted(file0.digit_3.unique())))
label2id_3 = {label: idx for idx, label in id2label_3.items()}

In [None]:
# Replace the raw labels in file0 with their integer ids, column by column.
# Indexing the dict (rather than .get) keeps the KeyError for unknown labels.
for column, mapping in (
    ('digit_1', label2id_1),
    ('digit_2', label2id_2),
    ('digit_3', label2id_3),
):
    file0[column] = file0[column].map(lambda label, mapping=mapping: mapping[label])

In [None]:
# One document per row: the three text columns (missing values as empty
# strings) joined with single spaces, with outer whitespace stripped.
text_columns = ['text_obj', 'text_mthd', 'text_deal']
documents = (
    file0[text_columns]
    .fillna('')
    .apply(lambda row: ' '.join(row).strip(), axis=1)
)

In [None]:
kkma = Kkma()
def extract_n(x):
    """Lazily yield the words of ``x`` whose Kkma POS tag starts with 'N'.

    ``x`` is passed to ``kkma.pos``; only the word of each (word, tag) pair
    is yielded. Tags starting with 'N' are presumably the nominal tags --
    confirm against the Kkma tag set.
    """
    yield from (word for word, pos in kkma.pos(x) if pos.startswith('N'))

In [None]:
# Fit a TF-IDF model over the documents, tokenized by extract_n and capped at
# 2000 features.
tfidf = TfidfVectorizer(tokenizer=extract_n, max_features=2000)
tdm = tfidf.fit_transform(documents)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name only on releases that predate it.
words = list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)

# Per-term column sums of the TF-IDF matrix, flattened to a plain 1-D array
# so the DataFrame constructor receives an ordinary sequence.
word_count = pd.DataFrame({'단어':words,'빈도':np.asarray(tdm.sum(axis=0)).ravel()})

In [None]:
# Densify the TF-IDF matrix and export it as CSV with one column per term.
# .toarray() is the portable spelling of the `.A` shorthand.
# NOTE(review): this materializes the full dense matrix in memory -- confirm
# it fits for the complete dataset.
tdm_A = tdm.toarray()

# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# when the replacement is unavailable.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
tdm_df = pd.DataFrame(tdm_A, columns=feature_names)
tdm_df.to_csv('/content/drive/MyDrive/공모전/data/X_tfidf.csv', index=False)

In [None]:
# Persist the TF-IDF artifacts: the per-term weight table as CSV, the
# term-document matrix via np.save, and the fitted vectorizer as a pickle.
word_count.to_csv('/content/drive/MyDrive/공모전/data/word_count.csv', index=False)
np.save('/content/drive/MyDrive/공모전/data/tdm_tfidf.npy', tdm)
with open('/content/drive/MyDrive/공모전/models/tfidf.pkl', mode='wb') as f:
    pickle.dump(tfidf, f)

In [None]:
# Reload the artifacts written by the previous cell.
# Both the pickle and the allow_pickle=True load can execute arbitrary code;
# only use them on files this notebook produced itself.
word_count = pd.read_csv('/content/drive/MyDrive/공모전/data/word_count.csv')
# .tolist() unwraps the object array returned by np.load back into the
# stored matrix object.
tdm = np.load(
    '/content/drive/MyDrive/공모전/data/tdm_tfidf.npy',
    allow_pickle=True,
).tolist()
with open('/content/drive/MyDrive/공모전/models/tfidf.pkl', mode='rb') as f:
    tfidf = pickle.load(f)

### Modeling

In [None]:
# 70/30 hold-out split of the TF-IDF matrix together with the three label
# columns, then one label array per column for train and test.
label_columns = ['digit_1', 'digit_2', 'digit_3']
X_train, X_test, y_train, y_test = train_test_split(
    tdm,
    file0[label_columns],
    test_size=.3,
    random_state=0,
)

y_train1, y_train2, y_train3 = (y_train[col].values for col in label_columns)
y_test1, y_test2, y_test3 = (y_test[col].values for col in label_columns)

In [None]:
# Train a GPU CatBoost classifier on the digit_1 labels and score it on the
# hold-out split.
model1 = CatBoostClassifier(random_state=0, task_type = "GPU")
model1.fit(X_train, y_train1)

# Run inference once and reuse the predictions for both metrics.
pred1 = model1.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test1, pred1)}')
print(f'F1 Score: {f1_score(y_test1, pred1, average="macro")}')

# Output recorded from an earlier run:
# Accuracy: 0.95655
# F1 Score: 0.888203052904787

In [None]:
# Train a GPU CatBoost classifier on the digit_2 labels and score it on the
# hold-out split.
model2 = CatBoostClassifier(random_state=0, task_type = "GPU")
model2.fit(X_train, y_train2)

# Run inference once and reuse the predictions for both metrics.
pred2 = model2.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test2, pred2)}')
print(f'F1 Score: {f1_score(y_test2, pred2, average="macro")}')

# Output recorded from an earlier run:
# Accuracy: 0.8953133333333333
# F1 Score: 0.6648169846488206

In [None]:
# Train a GPU CatBoost classifier on the digit_3 labels and score it on the
# hold-out split.
model3 = CatBoostClassifier(random_state=0, task_type = "GPU")
model3.fit(X_train, y_train3)

# Run inference once and reuse the predictions for both metrics.
pred3 = model3.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test3, pred3)}')
print(f'F1 Score: {f1_score(y_test3, pred3, average="macro")}')

# Output recorded from an earlier run:
# Accuracy: 0.36547666666666667
# F1 Score: 0.06678287247944568

In [None]:
# Persist the three trained classifiers. CatBoost exposes save_model();
# there is no .save() method, so the previous calls raised AttributeError.
model1.save_model('/content/drive/MyDrive/공모전/models/cab1_tfidf')
model2.save_model('/content/drive/MyDrive/공모전/models/cab2_tfidf')
model3.save_model('/content/drive/MyDrive/공모전/models/cab3_tfidf')

### Merge with FastText

In [None]:
# Load the two precomputed feature tables.
# NOTE(review): X_svd.csv is written by the SVD section further down this
# notebook, and X_ft.csv is not produced anywhere in the visible cells; both
# must already exist on disk for a top-to-bottom run.
X_ft, X_svd = (
    pd.read_csv(path)
    for path in (
        '/content/drive/MyDrive/공모전/data/X_ft.csv',
        '/content/drive/MyDrive/공모전/data/X_svd.csv',
    )
)

In [None]:
# Place the X_ft and X_svd feature columns side by side in a single frame.
X_merged = pd.concat((X_ft, X_svd), axis='columns')

In [None]:
# 70/30 hold-out split of the merged features together with the three label
# columns, then one label array per column for train and test.
label_columns = ['digit_1', 'digit_2', 'digit_3']
X_train, X_test, y_train, y_test = train_test_split(
    X_merged,
    file0[label_columns],
    test_size=.3,
    random_state=0,
)

y_train1, y_train2, y_train3 = (y_train[col].values for col in label_columns)
y_test1, y_test2, y_test3 = (y_test[col].values for col in label_columns)

In [None]:
# Train a GPU CatBoost classifier on the digit_1 labels using the merged
# features and score it on the hold-out split.
model1 = CatBoostClassifier(random_state=0, task_type = "GPU")
model1.fit(X_train, y_train1)

# Run inference once and reuse the predictions for both metrics.
pred1 = model1.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test1, pred1)}')
print(f'F1 Score: {f1_score(y_test1, pred1, average="macro")}')

# Output recorded from an earlier run:
# Accuracy: 0.9586733333333334
# F1 Score: 0.8881235275458895

In [None]:
# Train a GPU CatBoost classifier on the digit_2 labels using the merged
# features and score it on the hold-out split.
model2 = CatBoostClassifier(random_state=0, task_type = "GPU")
model2.fit(X_train, y_train2)

# Run inference once and reuse the predictions for both metrics.
pred2 = model2.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test2, pred2)}')
print(f'F1 Score: {f1_score(y_test2, pred2, average="macro")}')

# Output recorded from an earlier run:
# Accuracy: 0.8922333333333333
# F1 Score: 0.6513235439275279

In [None]:
# Train a GPU CatBoost classifier on the digit_3 labels using the merged
# features and score it on the hold-out split.
model3 = CatBoostClassifier(random_state=0, task_type = "GPU")
model3.fit(X_train, y_train3)

# Run inference once and reuse the predictions for both metrics.
pred3 = model3.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test3, pred3)}')
print(f'F1 Score: {f1_score(y_test3, pred3, average="macro")}')

# Output recorded from an earlier run:
# Accuracy: 0.7698333333333334
# F1 Score: 0.3848150262950447

In [None]:
# Keep a handle on the unfiltered training frame; the next cell reads its
# column names after X_train has been overwritten.
features = X_train
# Estimator used to score each candidate subset (a fast model is recommended).
model = LogisticRegression(random_state=0, n_jobs=-1)

# SelectPercentile keeps features according to a statistical test of their
# relationship with the target (class). Sweep the kept percentage and record
# the mean 3-fold CV accuracy for each setting.
# NOTE(review): the selector is fitted on all of X_train before
# cross-validation, so the CV scores may be optimistic.
cv_scores = []
for p in range(5, 100):
    X_new = SelectPercentile(percentile=p).fit_transform(X_train, y_train1)
    cv_score = cross_val_score(model, X_new, y_train1, cv=3, scoring='accuracy').mean()
    cv_scores.append((p, cv_score))
    print(f'{p}%: {cv_score}')

# Split the (percentile, score) pairs into two parallel sequences.
percentiles, mean_accuracies = zip(*cv_scores)

# Print the best percentile (first entry with the highest score)
best_score = cv_scores[np.argmax(mean_accuracies)]
print(best_score)

# Plot the performance change with p
plt.plot(percentiles, mean_accuracies)
plt.xlabel('Percent of features')
plt.grid()

In [None]:
# To avoid overfitting, choosing a value near the optimal p (rather than the
# exact optimum) may give better results.
# NOTE: this cell overwrites X_train / X_test in place, so it must be run
# exactly once after the split cell.
fs = SelectPercentile(percentile=best_score[0]).fit(X_train, y_train1)
X_train = fs.transform(X_train)
X_test = fs.transform(X_test)

# Python names are case-sensitive: the variable is X_train, not x_train
# (the old spelling raised NameError).
print(X_train.shape)
# `features` still refers to the unfiltered frame, so its columns line up
# with the selector's support mask.
print(features.columns[fs.get_support()].tolist())

### SVD

In [None]:
# Reload the TF-IDF feature table from the X_tfidf.csv file on Drive.
tdm_df = pd.read_csv('/content/drive/MyDrive/공모전/data/X_tfidf.csv')

In [None]:
# Project the TF-IDF features onto 100 truncated-SVD components and wrap
# the result in a frame with columns svd_0 ... svd_99.
svd = TruncatedSVD(n_components=100, random_state=0)
svd_columns = [f'svd_{i}' for i in range(100)]
X_svd = pd.DataFrame(svd.fit_transform(tdm_df), columns=svd_columns)

In [None]:
# Serialize the fitted SVD transformer to Drive.
with open('/content/drive/MyDrive/공모전/models/svd.pkl', 'wb') as f:
    f.write(pickle.dumps(svd))

In [None]:
# Write the SVD features to CSV without the index column.
X_svd.to_csv('/content/drive/MyDrive/공모전/data/X_svd.csv', index=False)

In [None]:
# Reload the SVD features from the CSV written by the previous cell.
X_svd = pd.read_csv('/content/drive/MyDrive/공모전/data/X_svd.csv')

In [None]:
# 70/30 hold-out split of the SVD features together with the three label
# columns, then one label array per column for train and test.
label_columns = ['digit_1', 'digit_2', 'digit_3']
X_train, X_test, y_train, y_test = train_test_split(
    X_svd,
    file0[label_columns],
    test_size=.3,
    random_state=0,
)
y_train1, y_train2, y_train3 = (y_train[col].values for col in label_columns)

y_test1, y_test2, y_test3 = (y_test[col].values for col in label_columns)

In [None]:
# Train a GPU CatBoost classifier on the digit_1 labels using the SVD
# features and score it on the hold-out split.
model1 = CatBoostClassifier(random_state=0, task_type = "GPU")
model1.fit(X_train, y_train1)

# Run inference once and reuse the predictions for both metrics.
pred1 = model1.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test1, pred1)}')
print(f'F1 Score: {f1_score(y_test1, pred1, average="macro")}')

# Output recorded from an earlier run:
# Accuracy: 0.9370433333333333
# F1 Score: 0.8326697975147066

In [None]:
# Train a GPU CatBoost classifier on the digit_2 labels using the SVD
# features and score it on the hold-out split.
model2 = CatBoostClassifier(random_state=0, task_type = "GPU")
model2.fit(X_train, y_train2)

# Run inference once and reuse the predictions for both metrics.
pred2 = model2.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test2, pred2)}')
print(f'F1 Score: {f1_score(y_test2, pred2, average="macro")}')

# Output recorded from an earlier run:
# Accuracy: 0.8602066666666667
# F1 Score: 0.581634298872744

In [None]:
# Train a GPU CatBoost classifier on the digit_3 labels using the SVD
# features and score it on the hold-out split.
model3 = CatBoostClassifier(random_state=0, task_type = "GPU")
model3.fit(X_train, y_train3)

# Run inference once and reuse the predictions for both metrics.
pred3 = model3.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test3, pred3)}')
print(f'F1 Score: {f1_score(y_test3, pred3, average="macro")}')

# Output recorded from an earlier run:
# Accuracy: 0.79601
# F1 Score: 0.43979936535409775

### LDA

In [None]:
from gensim.matutils import Sparse2Corpus
import re
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel

In [None]:
# Sparse2Corpus treats columns as documents by default, so the
# (documents x terms) TF-IDF matrix is transposed first.
corpus = Sparse2Corpus(tdm.T)

# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# when the replacement is unavailable. Dictionary expects lists of tokens,
# hence the list() conversion.
vocabulary = list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
# NOTE(review): this relies on Dictionary assigning ids in the same order as
# the vectorizer's feature indices -- confirm the two orderings agree.
id2token = Dictionary([vocabulary])

In [None]:
# Hold out 30% of the documents to monitor training.
train_corpus, valid_corpus = train_test_split(corpus, test_size=0.3, random_state=0)
# Seed the model like every other estimator in this notebook so repeated
# runs produce the same topics.
model = LdaModel(corpus=train_corpus, id2word=id2token, num_topics=225, random_state=0)

# log_perplexity() is used as the monitored score: keep running update passes
# while it rises by more than 0.01 between passes. Despite the name `loss`,
# the loop treats a larger value as better.
loss = model.log_perplexity(valid_corpus)
old_loss = -np.inf
while loss > old_loss + 0.01:
    model.update(train_corpus)
    old_loss = loss
    loss = model.log_perplexity(valid_corpus)
    print(loss)

In [None]:
# Display the model's word list for topic id 5.
model.show_topic(5)

In [None]:
# Compute topic coherence.
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# when the replacement is unavailable.
vocabulary = list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
# NOTE(review): `texts` is a single pseudo-document holding the whole
# vocabulary, not the tokenized documents. Confirm that this is intended for
# the 'c_v' measure; otherwise pass the tokenized texts.
coh = CoherenceModel(model=model, corpus=corpus, texts=[vocabulary], dictionary=id2token, coherence='c_v')
coh.get_coherence()

In [None]:
# Compute topic diversity: the share of distinct words among the top-`topn`
# words of every topic.
topn = 25
top_words = set()
for topic in range(model.num_topics):
    for word, prob in model.show_topic(topic, topn=topn):
        top_words.add(word)
# Normalize by the number of word slots actually inspected. The previous
# hard-coded 25*100 did not match the model's topic count.
len(top_words) / (topn * model.num_topics)

In [None]:
# Display topics with the method's default arguments.
model.show_topics()

In [None]:
import pyLDAvis
from pyLDAvis import gensim_models

In [None]:
# Build the pyLDAvis data from the model, corpus and dictionary, then render
# it inline.
p = gensim_models.prepare(topic_model=model, corpus=corpus, dictionary=id2token)
pyLDAvis.display(p)