In [1]:
import os
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout

In [2]:
# Read the four CSV inputs; the names on the left are bound in the same
# order as the paths listed here.
csv_paths = (
    "data/train_data.csv",
    "data/test_data.csv",
    "data/sample_submission.csv",
    "data/topic_dict.csv",
)
train, test, submission, topic_dict = (pd.read_csv(path) for path in csv_paths)

In [3]:
# Matches every character that is neither a Hangul syllable (가-힣), a Hangul
# compatibility jamo (ㄱ-ㅎ, ㅏ-ㅣ), nor whitespace. Compiled once and reused.
_NON_HANGUL_RE = re.compile(r"[^가-힣ㄱ-ㅎㅏ-ㅣ\s]")


def clean_text(sent):
  """Replace every non-Hangul, non-whitespace character with a single space.

  Args:
    sent: The raw text. Non-string values (e.g. ``None`` or the float NaN that
      pandas uses for a missing cell) are treated as empty text.

  Returns:
    The text with each disallowed character replaced by one space, so the
    result has the same length as the input. ``""`` for non-string input.
  """
  # re.sub raises TypeError on non-strings; a missing value has no text to keep.
  if not isinstance(sent, str):
    return ""
  return _NON_HANGUL_RE.sub(" ", sent)

# Add a cleaned copy of each title (see clean_text) next to the raw column.
train["cleaned_title"] = train["title"].apply(clean_text)
test["cleaned_title"]  = test["title"].apply(clean_text)

train_text = train["cleaned_title"].tolist()
test_text = test["cleaned_title"].tolist()
train_label = np.asarray(train.topic_idx)

# Word-level unigram + bigram TF-IDF with sublinear term-frequency scaling.
# max_features is an upper bound: the fitted vocabulary (and therefore the
# number of feature columns) can be smaller if the corpus has fewer n-grams.
tfidf = TfidfVectorizer(analyzer='word', sublinear_tf=True, ngram_range=(1, 2), max_features=150000, binary=False)

# fit_transform learns the vocabulary/idf and vectorises the training text in
# one pass instead of tokenising the training corpus twice (fit + transform).
train_tf_text = tfidf.fit_transform(train_text).astype('float32')
# The test text is only transformed, using the vocabulary learned from train.
test_tf_text  = tfidf.transform(test_text).astype('float32')

In [4]:
def dnn_model(input_dim = 150000, hidden_units = 128, dropout_rate = 0.8, n_classes = 7):
  """Build an uncompiled one-hidden-layer dense softmax classifier.

  The defaults reproduce the original fixed architecture, so ``dnn_model()``
  still returns the same network as before.

  Args:
    input_dim: Number of input features. It has to equal the number of columns
      of the feature matrix passed to ``fit``/``predict``.
    hidden_units: Width of the ReLU hidden layer.
    dropout_rate: Fraction of hidden activations dropped during training.
    n_classes: Number of output classes (size of the softmax layer).

  Returns:
    A ``Sequential`` model: Dense(relu) -> Dropout -> Dense(softmax).
  """
  model = Sequential()
  model.add(Dense(hidden_units, input_dim = input_dim, activation = "relu"))
  model.add(Dropout(dropout_rate))
  model.add(Dense(n_classes, activation = "softmax"))
  return model

In [5]:
# Set TensorFlow's global seed before the layers are created so that weight
# initialisation and dropout masks are repeatable from one run to the next.
SEED = 42
tf.random.set_seed(SEED)

model = dnn_model()
# Labels are integer class ids, hence the sparse variant of cross-entropy.
model.compile(loss = 'sparse_categorical_crossentropy', optimizer = tf.optimizers.Adam(learning_rate = 0.001), metrics = ['accuracy'])

In [6]:
# The first 40,000 rows (in file order) are used for fitting; every row after
# that is held out as the validation set.
split_at = 40000
fit_features, fit_labels = train_tf_text[:split_at], train_label[:split_at]
holdout = (train_tf_text[split_at:], train_label[split_at:])

history = model.fit(x = fit_features, y = fit_labels, validation_data = holdout, epochs = 4)

Epoch 1/4


  "shape. This may consume a large amount of memory." % value)


Epoch 2/4
Epoch 3/4
Epoch 4/4


In [7]:
# Per-class scores for every test row, then the column index of the largest
# score in each row as the predicted class.
tmp_pred = model.predict(test_tf_text)
pred = tmp_pred.argmax(axis = 1)

In [10]:
# Display the predicted per-class scores for the first test row.
tmp_pred[0]

array([0.11681657, 0.0302223 , 0.611529  , 0.2268661 , 0.00416492,
       0.00250056, 0.00790059], dtype=float32)

In [12]:
# Wrap the raw prediction array in a DataFrame (default integer row and column labels).
test_y_df = pd.DataFrame(tmp_pred)

In [13]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a fresh checkout may not have it).
os.makedirs('ensemble', exist_ok = True)
test_y_df.to_csv('ensemble/simple-dense.csv')

In [8]:
# Item assignment always writes a DataFrame column. Attribute-style assignment
# (submission.topic_idx = ...) only does so when the column already exists and
# otherwise just sets a plain Python attribute on the object.
submission["topic_idx"] = pred
# Show three random rows as a quick sanity check of the filled-in predictions.
submission.sample(3)

Unnamed: 0,index,topic_idx
2818,48472,6
1302,46956,1
5169,50823,5
