In [None]:
# Mount Google Drive into the Colab filesystem; the CSV paths used later live under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras import backend as K

# Read the training and test splits from the mounted Drive folder (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/GBT_hackerton/train_df_1009.csv',
        '/content/drive/MyDrive/GBT_hackerton/test_df_1009.csv',
    )
)

# Map every category string in the '분류' column to an integer id, stored in a new 'label' column.
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['분류'])

# The fitted encoder keeps one entry per distinct category, which is the number of output classes.
num_classes = len(label_encoder.classes_)

# Build the word index from the training keywords only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['키워드'])

# Integer-encode the '키워드' column of both splits with that index.
X_train, X_test = (
    tokenizer.texts_to_sequences(frame['키워드']) for frame in (train_df, test_df)
)

# Bring every sequence to exactly max_len entries; padding='post' puts the fill values after the tokens.
max_len = 200
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_len, padding='post')
    for sequences in (X_train, X_test)
)

# One-hot encode the integer class ids (one column per class) for the categorical cross-entropy loss.
y_train = to_categorical(train_df['label'].to_numpy(), num_classes)

# Seed the Python, NumPy and TensorFlow RNGs before any weights are created so that
# initialisation, dropout masks and batch shuffling are repeatable across runs.
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(42)

# Define the LSTM classifier: embedding -> spatial dropout -> LSTM -> dense softmax head.
model = Sequential([
    # input_dim is vocabulary size + 1 because index 0 is reserved (never assigned to a word)
    # and is the value pad_sequences writes as padding.
    # mask_zero=True makes the downstream LSTM skip those padded timesteps; without it the
    # post-padded zeros are processed as real tokens right before the final state is read.
    # The deprecated `input_length` argument is omitted; the sequence length is inferred
    # from the data on the first call.
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, mask_zero=True),
    # Drops whole embedding channels rather than individual elements.
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    # One probability per encoded category.
    Dense(num_classes, activation='softmax')
])

# Compile: Adam optimizer, categorical cross-entropy against the one-hot targets, accuracy as the metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train for 5 epochs with batches of 32, holding out 20% of the training rows for validation.
# NOTE(review): validation_split is believed to take the trailing rows without shuffling;
# confirm the CSV is not ordered by category.
model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.2,
)

# Score the padded test sequences, pick the highest-scoring class per row,
# and turn the class ids back into the original category strings.
predictions = model.predict(X_test_padded)
predicted_labels = label_encoder.inverse_transform(predictions.argmax(axis=-1))

# Attach the decoded categories to the test frame and write ID + prediction as the submission CSV.
test_df['predicted_label'] = predicted_labels
test_df.loc[:, ['ID', 'predicted_label']].to_csv(
    '/content/drive/MyDrive/GBT_hackerton/yk_submission_lstm.csv',
    index=False,
)




Epoch 1/5
[1m1358/1358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m778s[0m 569ms/step - accuracy: 0.4907 - loss: 2.5216 - val_accuracy: 0.5382 - val_loss: 2.0779
Epoch 2/5
[1m1358/1358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m786s[0m 579ms/step - accuracy: 0.5578 - loss: 1.9300 - val_accuracy: 0.6234 - val_loss: 1.6132
Epoch 3/5
[1m1358/1358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m788s[0m 580ms/step - accuracy: 0.6364 - loss: 1.4942 - val_accuracy: 0.6617 - val_loss: 1.3151
Epoch 4/5
[1m1358/1358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m774s[0m 560ms/step - accuracy: 0.6988 - loss: 1.1174 - val_accuracy: 0.7266 - val_loss: 1.0964
Epoch 5/5
[1m1358/1358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m753s[0m 555ms/step - accuracy: 0.7848 - loss: 0.7872 - val_accuracy: 0.7526 - val_loss: 1.0043
[1m732/732[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 41ms/step


In [None]:
# Reload the saved LSTM submission and show how many test rows were assigned to each predicted category.
result = pd.read_csv('/content/drive/MyDrive/GBT_hackerton/yk_submission_lstm.csv')
result.loc[:, 'predicted_label'].value_counts()

Unnamed: 0_level_0,count
predicted_label,Unnamed: 1_level_1
지역,12304
경제:부동산,1394
사회:사건_사고,1201
경제:반도체,1015
사회:사회일반,691
문화:전시_공연,539
사회:의료_건강,496
사회:교육_시험,490
정치:국회_정당,479
경제:취업_창업,380


In [None]:
# Rename the prediction column to '분류', the same column name the training target uses.
result = result.rename({'predicted_label': '분류'}, axis='columns')

In [None]:
# Display the submission frame as this cell's output.
result

Unnamed: 0,ID,분류
0,TEST_00000,지역
1,TEST_00001,사회:사회일반
2,TEST_00002,정치:행정_자치
3,TEST_00003,경제:취업_창업
4,TEST_00004,지역
...,...,...
23400,TEST_23400,지역
23401,TEST_23401,사회:사회일반
23402,TEST_23402,경제:부동산
23403,TEST_23403,지역


In [None]:
# Write the submission frame to Drive as a CSV without the index column.
result.to_csv(
    path_or_buf='/content/drive/MyDrive/GBT_hackerton/yk_submission3.csv',
    index=False,
)