# 전처리된 데이터 EDA

In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
from konlpy.tag import Okt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Locations of the preprocessed train/test data and the submission template.
train_file_path = "Data/processed_train2.csv"
test_file_path = "Data/processed_test2.csv"
submission_file_path = 'Data/submission.csv'

# Training data.
train_data = pd.read_csv(train_file_path)
print("Train Data Sample:")
print(train_data.head())

print("\nTrain Data Columns:")
print(train_data.columns)

# Test data is parsed as JSON and transposed.
# NOTE(review): the path ends in ".csv" but is read with read_json -- confirm
# that the file really contains JSON, otherwise this call raises.
test_data = pd.read_json(test_file_path).T

print("\nTest Data Sample:")
print(test_data.head())
print(test_data.columns)

# Submission template, read once (the original cell loaded it twice).
submission_data = pd.read_csv(submission_file_path)
print("\nSubmission Data Sample:")
print(submission_data.head())
print(submission_data.columns)

In [None]:
# Structure summary (dtypes, non-null counts) for each frame.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly instead of being wrapped in print(), which would emit a stray "None".
for title, frame in (
    ("Train", train_data),
    ("Test", test_data),
    ("Submission", submission_data),
):
    print(f"\n{title} Data Info:")
    frame.info()

In [None]:
# Report the number of missing values per column for every frame.
missing_reports = (
    ("\nTrain Data Missing Values:", train_data),
    ("\nTest Data Missing Values:", test_data),
    ("\nSubmission Data Missing Values:", submission_data),
)
for heading, frame in missing_reports:
    print(heading)
    print(frame.isnull().sum())

In [None]:
# Missing-value counts per column (this cell repeats the check of the previous cell).
for dataset_name, frame in {
    "Train": train_data,
    "Test": test_data,
    "Submission": submission_data,
}.items():
    null_counts = frame.isnull().sum()
    print(f"\n{dataset_name} Data Missing Values:")
    print(null_counts)

In [None]:
# Use the 'class' column as the label when it exists; otherwise fall back to
# the last column of the training frame.
if 'class' in train_data.columns:
    label_col = 'class'
else:
    label_col = train_data.columns[-1]

label_counts = train_data[label_col].value_counts()
print("\nTrain Data Label Distribution:")
print(label_counts)

In [None]:
# Text length analysis (number of characters per entry).
# Prefer the 'conversation' column; otherwise fall back to the first column.
if 'conversation' in train_data.columns:
    text_col = 'conversation'
else:
    text_col = train_data.columns[0]


def _char_count(value):
    """Return the character count of *value*, or 0 when it is not a string."""
    if isinstance(value, str):
        return len(value)
    return 0


train_data['text_length'] = train_data[text_col].apply(_char_count)
test_data['text_length'] = test_data['text'].apply(_char_count)

for heading, frame in (
    ("\nTrain Data Text Length Stats:", train_data),
    ("\nTest Data Text Length Stats:", test_data),
):
    print(heading)
    print(frame['text_length'].describe())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Overlay the train and test text-length histograms on a single figure.
plt.figure(figsize=(12, 6))
for frame, colour, legend_name in (
    (train_data, 'blue', 'Train Data'),
    (test_data, 'green', 'Test Data'),
):
    sns.histplot(frame['text_length'], bins=50, kde=True, color=colour, label=legend_name)
plt.title('Text Length Distribution')
plt.xlabel('Text Length')
plt.ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
!apt-get update      
!apt-get install fonts-nanum* #나눔글꼴 설치

In [None]:
import matplotlib.font_manager as fm


# Register the NanumGothic TrueType file with Matplotlib and make it the default family.
nanum_gothic_ttf = r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
fe = fm.FontEntry(fname=nanum_gothic_ttf, name='NanumGothic')
fm.fontManager.ttflist.insert(0, fe)  # placed at the front of Matplotlib's font list
plt.rcParams.update({
    'font.size': 10,
    'font.family': 'NanumGothic',
})

In [None]:
from wordcloud import WordCloud
from konlpy.tag import Okt
import matplotlib.pyplot as plt
import pandas as pd

# Morphological analyzer used to extract nouns.
okt = Okt()

# Font file passed to WordCloud (presumably needed so Korean glyphs render).
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'

# Text column: 'conversation' when present, otherwise the first column.
text_col = 'conversation' if 'conversation' in train_data.columns else train_data.columns[0]

# The plot font does not change between classes, so set it once before the loop.
plt.rcParams['font.family'] = 'NanumGothic'

# One word cloud per class.
for label in train_data['class'].unique():
    # Concatenate every non-missing text of this class into one string.
    class_texts = train_data.loc[train_data['class'] == label, text_col]
    text = " ".join(class_texts.dropna().astype(str))

    # Keep nouns only.
    tokens = okt.nouns(text)
    if not tokens:
        # WordCloud raises ValueError when it receives no words; skip instead
        # of aborting the remaining classes.
        print(f'No nouns found for label {label}; skipping word cloud.')
        continue

    tokens_str = " ".join(tokens)

    wordcloud = WordCloud(
        font_path=font_path, width=800, height=400, background_color='white'
    ).generate(tokens_str)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for Label {label}')
    plt.axis('off')
    plt.show()



In [None]:
# Bar chart of the number of training rows per class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='class', data=train_data, ax=ax)
ax.set_title('Class Distribution in Train Data')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Text length as a whitespace-separated word count. Non-string cells (for
# example NaN) count as 0 words instead of raising AttributeError on .split().
# Note: this overwrites the 'text_length' column with word counts.
train_data['text_length'] = train_data['conversation'].apply(
    lambda x: len(x.split()) if isinstance(x, str) else 0
)

# Box plot of the text length per class.
plt.figure(figsize=(12, 6))
sns.boxplot(x='class', y='text_length', data=train_data)
plt.title('Text Length Distribution by Class')
plt.xlabel('Class')
plt.ylabel('Text Length')
plt.xticks(rotation=45)
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# TF-IDF over the whole training corpus. Missing conversations become empty
# documents, because the vectorizer rejects NaN entries.
corpus = train_data['conversation'].fillna('').astype(str)
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(corpus)
feature_names = np.array(vectorizer.get_feature_names_out())

# Top-weighted words for each class.
classes = train_data['class'].unique()

for cls in classes:
    # Reuse the rows of the already-fitted matrix rather than transforming the
    # class subset again, and average on the sparse matrix so the class block
    # is never densified.
    row_mask = (train_data['class'] == cls).to_numpy()
    tfidf_means = np.asarray(X_tfidf[row_mask].mean(axis=0)).ravel()
    tfidf_indices = np.argsort(tfidf_means)[::-1][:10]  # ten highest mean scores

    words = feature_names[tfidf_indices]
    scores = tfidf_means[tfidf_indices]

    plt.figure(figsize=(10, 5))
    sns.barplot(x=scores, y=words)
    plt.title(f'Top 10 TF-IDF Words for Class: {cls}')
    plt.xlabel('TF-IDF Score')
    plt.ylabel('Word')
    plt.show()

In [None]:
from collections import Counter
import seaborn as sns

# Morphological analyzer for noun extraction.
okt = Okt()

# Join every non-missing conversation into one string and pull out its nouns.
all_text = " ".join(train_data['conversation'].dropna().astype(str))
nouns = okt.nouns(all_text)

# Frequency table of the nouns and its 20 most common entries.
noun_counts = Counter(nouns)
top_nouns = noun_counts.most_common(20)

# Split the (word, count) pairs into two parallel lists.
words, counts = (list(column) for column in zip(*top_nouns))

# Horizontal bar chart of the most frequent nouns.
plt.figure(figsize=(12, 6))
sns.barplot(x=counts, y=words)
plt.title('Top 20 Words by Frequency')
plt.xlabel('Frequency')
plt.ylabel('Words')
plt.show()

In [None]:
# Summary statistics of the current 'text_length' column in both frames.
for heading, frame in (
    ("\nTrain Data Text Length Stats:", train_data),
    ("\nTest Data Text Length Stats:", test_data),
):
    length_summary = frame['text_length'].describe()
    print(heading)
    print(length_summary)

# 모델 선택 및 학습
## 모델 선택

In [None]:
# Training settings shared by the model-fitting cells.
validation_split: float = 0.2  # passed to fit() as validation_split
batch_size: int = 32           # passed to fit() as batch_size
epochs: int = 20               # passed to fit() as epochs

In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# File locations.
# NOTE(review): train_file_path is rebound here, but no read of it follows in
# this part of the notebook; train_data still holds the earlier load.
export_path: str = 'Output/'
train_file_path: str = 'Data/train.csv'

# Text cleaning helper.
def clean_text(text):
    """Strip punctuation and normalise whitespace in *text*.

    Every character that is neither a word character nor whitespace is
    removed, runs of whitespace are collapsed to one space, and leading and
    trailing whitespace is trimmed.

    Args:
        text: The raw string. Any non-string value (for example NaN from a
            missing cell) is treated as empty.

    Returns:
        The cleaned string; '' for non-string input.
    """
    if not isinstance(text, str):
        return ''
    # Remove punctuation first: deleting a symbol that sits between two spaces
    # would otherwise leave a double space behind after the collapse step.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Clean every conversation and store the character length of the cleaned text.
cleaned_series = train_data['conversation'].apply(clean_text)
train_data['cleaned_conversation'] = cleaned_series
train_data['text_length'] = train_data['cleaned_conversation'].apply(len)

# Fit the Keras tokenizer on the cleaned text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['cleaned_conversation'])

# Convert the text to integer sequences and pad them at the end to max_length.
max_length = 300
sequences = tokenizer.texts_to_sequences(train_data['cleaned_conversation'])
X_data = pad_sequences(sequences, padding='post', maxlen=max_length)

# Encode the class labels as integers.
label_encoder = LabelEncoder()
y_data = label_encoder.fit_transform(train_data['class'])

# Report the vocabulary size and the array shapes.
vocabulary_size = len(tokenizer.word_index)
print('토크나이저 단어 개수:', vocabulary_size)
print('X_data 크기:', X_data.shape)
print('y_data 크기:', y_data.shape)

In [None]:
# Aliases used by the training cells: model inputs and integer labels.
padded_sequences, labels = X_data, y_data

## 1. 기본 LSTM 모델

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

# Baseline classifier: embedding -> LSTM -> softmax.

# Input: integer token ids, max_length per sample.
input_layer = Input(shape=(max_length,), name='input')

# Embedding layer sized from the fitted tokenizer's vocabulary.
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128)(input_layer)

# LSTM layer.
lstm_layer = LSTM(64)(embedding_layer)

# Output layer. The class count comes from the fitted label encoder instead of
# a hard-coded 4, so the softmax width always matches the labels in y_data.
num_classes = len(label_encoder.classes_)
output_layer = Dense(num_classes, activation='softmax', name='output')(lstm_layer)

# Build and compile; the sparse loss takes the integer labels directly.
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary.
model.summary()

In [None]:
# Train the LSTM model and keep the returned History object.
history = model.fit(
    x=padded_sequences,
    y=labels,
    validation_split=validation_split,
    batch_size=batch_size,
    epochs=epochs,
)

## 2. CNN 구조를 엮은 모델

In [None]:
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense

# Convolutional classifier: embedding -> Conv1D -> global max pooling -> softmax.
vocab_size = len(tokenizer.word_index) + 1

input_layer = Input(shape=(max_length,), name='input')
embedding_layer = Embedding(input_dim=vocab_size, output_dim=128)(input_layer)

conv_layer = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)
pooled_features = GlobalMaxPooling1D()(conv_layer)

output_layer = Dense(num_classes, activation='softmax', name='output')(pooled_features)

# Build, compile and summarise the model.
model_conv = Model(inputs=input_layer, outputs=output_layer)
model_conv.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model_conv.summary()

In [None]:
# Train the CNN model. The original cell called model.fit(), which trained the
# LSTM a second time and left model_conv untrained.
con_history = model_conv.fit(
    padded_sequences,
    labels,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=validation_split
)

## 3. Bidirectional LSTM

In [None]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense

# Bidirectional LSTM classifier: embedding -> Bidirectional(LSTM) -> softmax.
vocab_size = len(tokenizer.word_index) + 1

# Token-id input and embedding.
input_layer = Input(shape=(max_length,), name='input')
embedding_layer = Embedding(input_dim=vocab_size, output_dim=128)(input_layer)

# LSTM wrapped in a Bidirectional layer.
bi_lstm_layer = Bidirectional(LSTM(64))(embedding_layer)

# Softmax over the classes.
output_layer = Dense(num_classes, activation='softmax', name='output')(bi_lstm_layer)

# Build, compile and summarise the model.
model_bi = Model(inputs=input_layer, outputs=output_layer)
model_bi.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model_bi.summary()

In [None]:
# Train the bidirectional LSTM and keep the returned History object.
model_bi_history = model_bi.fit(
    x=padded_sequences,
    y=labels,
    validation_split=validation_split,
    batch_size=batch_size,
    epochs=epochs,
)

## 4. BERT 모델

In [None]:
from transformers import AutoTokenizer, TFAutoModel
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling1D

# NOTE(review): this looks like an English, uncased checkpoint, while other
# cells process the text with the Korean analyzer Okt -- confirm it is the
# intended model.
bert_model_name = 'distilbert-base-uncased'

# The checkpoint is a DistilBERT one, so it is loaded through the Auto* classes,
# which pick the matching architecture and tokenizer. TFBertModel and
# BertTokenizer expect a BERT checkpoint.
bert_model = TFAutoModel.from_pretrained(bert_model_name)

# Tokenizer for the same checkpoint. This rebinds `tokenizer`, which earlier
# cells used for the Keras tokenizer.
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

# Symbolic inputs: token ids and attention mask, max_length each.
input_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
attention_mask = Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

# First element of the encoder output, averaged over the sequence axis.
bert_output = bert_model(input_ids, attention_mask=attention_mask)[0]

pooling_layer = GlobalAveragePooling1D()(bert_output)

# Softmax over the classes.
output_layer = Dense(num_classes, activation='softmax', name='output')(pooling_layer)

# Build the model.
model_bert = Model(inputs=[input_ids, attention_mask], outputs=output_layer)

# Compile the model.
model_bert.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary.
model_bert.summary()

In [None]:
# Build the BERT input tensors.
def encode_textx(texts, max_length):
    """Tokenize *texts* with the module-level `tokenizer`.

    Sequences are truncated and padded to *max_length* and returned as
    TensorFlow tensors.

    Returns:
        A tuple ``(input_ids, attention_mask)``.
    """
    encoded = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='tf',
    )
    return encoded['input_ids'], encoded['attention_mask']

# Encode the cleaned conversations. From here on `input_ids` and
# `attention_mask` hold the encoded data, not the symbolic model inputs.
texts = train_data['cleaned_conversation'].tolist()
input_ids, attention_mask = encode_textx(texts, max_length)

# Train the BERT-based model; this call uses its own batch size of 8.
history_bert = model_bert.fit(
    x=[input_ids, attention_mask],
    y=labels,
    validation_split=validation_split,
    batch_size=8,
    epochs=epochs,
)

## 5. 사전 학습 모델 [BERT]

import numpy as np
from transformers import AutoTokenizer, TFAutoModel
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Load the pretrained encoder and its tokenizer. The checkpoint is a DistilBERT
# one, so the Auto* classes are used to select the matching architecture.
bert_model_name = 'distilbert-base-multilingual-cased'
bert_model = TFAutoModel.from_pretrained(bert_model_name)
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)


# Tokenization helper.
def encode_texts(texts, max_length=128):
    """Tokenize *texts*, truncated and padded to *max_length*, as TF tensors.

    Returns:
        A tuple ``(input_ids, attention_mask)``.
    """
    inputs = tokenizer(texts, max_length=max_length, padding='max_length', truncation=True, return_tensors='tf')
    return inputs['input_ids'], inputs['attention_mask']


def _extract_cls_embeddings(ids, mask, chunk_size=32):
    """Run the encoder over the data in chunks and return first-token vectors.

    Args:
        ids: Token-id tensor from `encode_texts`.
        mask: Attention-mask tensor from `encode_texts`.
        chunk_size: Number of rows sent through the encoder per call.

    Returns:
        A NumPy array with one embedding row per input row.
    """
    pieces = []
    for start in range(0, int(ids.shape[0]), chunk_size):
        stop = start + chunk_size
        hidden = bert_model(ids[start:stop], attention_mask=mask[start:stop], training=False)[0]
        # Keep only the vector at sequence position 0 of every row.
        pieces.append(hidden[:, 0, :].numpy())
    return np.concatenate(pieces, axis=0)


# Encode the cleaned conversations.
texts = train_data['cleaned_conversation'].tolist()
input_ids, attention_mask = encode_texts(texts)

# Compute concrete embeddings from the encoded data. The original cell passed a
# symbolic Input together with the real attention mask (with different sequence
# lengths), which yields nothing that fit() can train on.
bert_embeddings = _extract_cls_embeddings(input_ids, attention_mask)

# Classifier head on top of the fixed embeddings.
input_layer = Input(shape=(bert_embeddings.shape[1],), name='input')

# Softmax over the classes.
output_layer = Dense(num_classes, activation='softmax', name='output')(input_layer)

# Build and compile model_bert (the original compiled `model` by mistake,
# leaving model_bert uncompiled).
model_bert = Model(inputs=input_layer, outputs=output_layer)
model_bert.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary.
model_bert.summary()

In [None]:
# Train the classifier on the BERT embeddings. This rebinds `history_bert`,
# replacing the History object of the previous BERT run.
history_bert = model_bert.fit(
    x=bert_embeddings,
    y=labels,
    validation_split=validation_split,
    batch_size=batch_size,
    epochs=epochs,
)

In [None]:
import matplotlib.pyplot as plt
# Plot the training results.
# Each entry: (panel title, legend prefix, History object).
runs = [
    ('LSTM', 'lstm', history),
    ('Conv1D-LSTM', 'con', con_history),
    ('Bidirectional-LSTM', 'bi', model_bi_history),
    ('BERT', 'bert', history_bert),
]

num_model = len(runs)
plt_size = num_model + 1  # one overview row plus one row per model

plt.figure(figsize=(12, 5 * plt_size))

# Overview row, left: training loss of every model.
plt.subplot(plt_size, 2, 1)
for (_, prefix, run), line_style in zip(runs, ('b-', 'r--', 'g-.', 'k:')):
    plt.plot(run.history['loss'], line_style, label=f'{prefix}_loss')
plt.title('All_loss')
plt.legend()

# Overview row, right: training accuracy of every model. The original drew only
# the first two models here although the loss panel shows all four.
plt.subplot(plt_size, 2, 2)
for (_, prefix, run), line_style in zip(runs, ('g-', 'k--', 'b-.', 'r:')):
    plt.plot(run.history['accuracy'], line_style, label=f'{prefix}_accuracy')
plt.title('All_accuracy')
plt.legend()

# One row per model: loss vs. val_loss on the left, accuracy vs. val_accuracy
# on the right.
for row, (panel_title, _, run) in enumerate(runs, start=1):
    plt.subplot(plt_size, 2, 2 * row + 1)
    plt.plot(run.history['loss'], 'b-', label='loss')
    plt.plot(run.history['val_loss'], 'r--', label='val_loss')
    plt.xlabel('Epoch')
    plt.title(panel_title)
    plt.legend()

    plt.subplot(plt_size, 2, 2 * row + 2)
    plt.plot(run.history['accuracy'], 'g-', label='accuracy')
    plt.plot(run.history['val_accuracy'], 'k--', label='val_accuracy')
    plt.xlabel('Epoch')
    plt.title(panel_title)
    plt.legend()

plt.show()