- 레스토랑 리뷰 감성 분류하기 :
https://github.com/rickiepark/nlp-with-pytorch/blob/main/chapter_3/3_5_Classifying_Yelp_Review_Sentiment.ipynb

- NLP using GloVe Embeddings(FAKE NEWS) : 
https://www.kaggle.com/code/madz2000/nlp-using-glove-embeddings-99-87-accuracy
- https://www.kaggle.com/code/lorwohl/fake-news-detection

In [None]:
# 모듈 로딩
import numpy as np
import pandas as pd
import re
import string
from string import punctuation

import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize import word_tokenize, sent_tokenize
# from nltk.stem import WordNetLemmatizer # 표제어 추출

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer # used to tokenize text sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences # padding sequences to the same length
from tensorflow.keras.models import Sequential # building sequential models like FF layers in the transformer encoder
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU, SimpleRNN, Dropout # used for parts of the transformer encoder

---
### [1] 데이터 보기 및 시각화

In [None]:
# Load the real-news and fake-news tables from the working directory.
real, fake = (pd.read_csv(csv_path) for csv_path in ("./True.csv", "./Fake.csv"))

In [None]:
# Preview the first rows of the table loaded from True.csv.
real.head()

In [None]:
# Preview the first rows of the table loaded from Fake.csv.
fake.head()

In [None]:
# Attach the target label to each table: 0 marks fake news, 1 marks real news.
for frame, label in ((fake, 0), (real, 1)):
    frame['category'] = label

In [None]:
# Stack the real and fake tables row-wise into a single DataFrame.
df = pd.concat((real, fake), axis=0)

In [None]:
# Class balance check.
# category 1 is real news and category 0 is fake news, so look the counts up
# by label explicitly instead of relying on position.
category_counts = df.category.value_counts()
print(category_counts)

category_total = df.category.count()
# Format the 0-1 ratios as real percentages (e.g. 0.48 -> "48%").
print(f'RealNEws : {category_counts.loc[1] / category_total:.0%}')
print(f'FakeNews : {category_counts.loc[0] / category_total:.0%}')

# Bar chart of the number of rows per category. Pass the column by keyword:
# newer seaborn versions treat the first positional argument as `data`.
sns.countplot(x='category', data=df)

In [None]:
# Count fully duplicated rows (this only reports them; nothing is dropped here).
duplicate_mask = df.duplicated()
duplicate_mask.sum()

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Check the (rows, columns) shape of the combined table.
df.shape

In [None]:
# How many articles fall under each news subject.
subject_counts = df.subject.value_counts()
print(subject_counts)

# Per-subject article counts, split by category (0 = fake, 1 = real).
plt.xticks(rotation=70)
sns.countplot(data=df, x='subject', hue='category')

In [None]:
# Word cloud built from every real-news article body (category == 1).
real_corpus = " ".join(df.loc[df.category == 1, 'text'])
plt.figure(figsize=(20, 20))
wc = WordCloud(stopwords=STOPWORDS, max_words=2000, width=1600, height=800).generate(real_corpus)
plt.imshow(wc, interpolation='bilinear')

In [None]:
# Word cloud built from every fake-news article body (category == 0).
fake_corpus = " ".join(df.loc[df.category == 0, 'text'])
plt.figure(figsize=(20, 20))
wc = WordCloud(stopwords=STOPWORDS, max_words=2000, width=1600, height=800).generate(fake_corpus)
plt.imshow(wc, interpolation='bilinear')

---
## [2] 데이터 처리

In [None]:
# Merge the article body and the headline into a single text column.
df['text_all'] = df['text'] + ' ' + df['title']

# Remove the title, subject and date columns in place.
df.drop(columns=['title', 'subject', 'date'], inplace=True)

# Preview the merged column. This has to be the last expression of the cell;
# an expression in the middle of a cell is evaluated but never displayed.
df['text_all'].head()

### 데이터 분할

In [None]:
# Hold out 20% of the rows as the test set; stratify so that both parts keep
# the same class ratio as the full data.
features, labels = df.text_all, df.category
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=11,
    stratify=labels,
)

print(f'X_train : {X_train.shape}, X_test : {X_test.shape}')

In [None]:
# Carve a validation set (20% of the remaining training rows) out of the
# training split, again stratified by label.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=11,
    stratify=y_train,
)
print(f'X_train : {X_train.shape}, X_test : {X_test.shape} , X_val : {X_val.shape}')

In [None]:
# Report the size and class balance of every split.
# category 0 is fake news and category 1 is real news. The class shares are
# formatted as real percentages (0.52 -> "52%") rather than a 0-1 ratio
# followed by a percent sign.
split_report = (
    ('학습 데이터', 'X_train ', X_train, 'y_train', y_train),
    ('테스트 데이터', 'X_test   ', X_test, 'y_test', y_test),
    ('검증 데이터', 'X_val    ', X_val, 'y_val', y_val),
)
for split_title, x_name, x_split, y_name, y_split in split_report:
    class_counts = y_split.value_counts()
    print('\n', split_title, '-'*20)
    print(f'{x_name}: {x_split.shape}, {y_name} : {y_split.shape}')
    print(f'가짜뉴스 : {class_counts.loc[0] / len(y_split):.0%}')
    print(f'진짜뉴스 : {class_counts.loc[1] / len(y_split):.0%}')

### 불용어 처리
`string.punctuation` -> ``!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~``

In [None]:
from wordcloud import WordCloud, STOPWORDS  # open question: wordcloud ships its own stopword set too?
from nltk.corpus import stopwords

# Filter set used during cleaning: NLTK's English stopwords plus every
# single punctuation character.
punctuation = string.punctuation
stop = set(stopwords.words('english')) | set(punctuation)

### 텍스트 데이터 전처리

In [None]:
# Text-cleaning helper
def textProcess(textData):
    """Tokenize one document, lowercase it, and drop stopwords and punctuation.

    Args:
        textData: Raw text of a single document.

    Returns:
        The remaining lowercase tokens joined back into one string, separated
        by single spaces.
    """
    # Lowercase BEFORE the membership test: the filter set `stop` is built
    # from NLTK's lowercase English stopword list, so tokens such as "The" or
    # "And" would otherwise slip through the filter.
    lowered_tokens = (token.lower() for token in word_tokenize(textData))

    # Rebuild a sentence-like string from the tokens that survive the filter.
    return " ".join(token for token in lowered_tokens if token not in stop)

In [None]:
# Clean every document of each split with textProcess; each split becomes a
# plain list of cleaned strings.
X_train = list(map(textProcess, X_train))
X_test = list(map(textProcess, X_test))
X_val = list(map(textProcess, X_val))

In [None]:
# Inspect the first training document after cleaning.
X_train[:1]

### 텍스트 데이터 토큰화 및 수치화

In [None]:
# Tokenize text data and convert it to integer sequences
def makeToken(textData, numWord=0):
    """Fit a new Keras Tokenizer on ``textData`` and encode it as integer sequences.

    Args:
        textData: Iterable of text documents.
        numWord: When greater than 0 it is passed to the Tokenizer as
            ``num_words``; otherwise the Tokenizer is created with defaults.

    Returns:
        A ``(sequences, vocab_size)`` tuple: the encoded documents and the
        number of entries in the fitted ``word_index``.
    """
    tokenizer = Tokenizer(num_words=numWord) if numWord > 0 else Tokenizer()

    # Build the vocabulary, then map every document to its word indices.
    tokenizer.fit_on_texts(textData)
    sequences = tokenizer.texts_to_sequences(textData)

    return (sequences, len(tokenizer.word_index))

In [None]:
# Fit ONE vocabulary on the training split and reuse it for the validation
# and test splits. Fitting a separate Tokenizer per split would give the same
# integer a different word in each split, so the model could not transfer
# what it learned on the training data.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_val = tokenizer.texts_to_sequences(X_val)

# All splits share the training vocabulary; the per-split names are kept so
# that later cells referring to them keep working.
X_train_vocaNum = len(tokenizer.word_index)
X_test_vocaNum = X_train_vocaNum
X_val_vocaNum = X_train_vocaNum

### 단어 갯수 파악

In [None]:
# Helper that inspects how long the documents are
def checkLength(datas):
    """Plot a histogram of item lengths in ``datas``.

    The plot title shows the maximum, minimum and mean length (mean rounded
    to two decimals). Nothing is returned.
    """
    # Length of every item
    lengths = list(map(len, datas))

    # Histogram of the lengths
    plt.figure(figsize=(12, 8))
    plt.hist(lengths)

    longest, shortest = max(lengths), min(lengths)
    average = round(sum(lengths) / len(lengths), 2)
    plt.title(f'Max {longest}  Min {shortest}  AVG {average}')
    plt.xlabel('data length')
    plt.ylabel('data number')
    plt.show()


checkLength(X_train)

In [None]:
# fig,(ax1,ax2)=plt.subplots(1,2,figsize=(12,8))

# text_len=df[df['category']==1]['text_all'].str.len()
# ax1.hist(text_len,color='red')
# ax1.set_title('Real text')

# text_len=df[df['category']==0]['text_all'].str.len()
# ax2.hist(text_len,color='green')
# ax2.set_title('Fake text')

# fig.suptitle('Characters in texts')
# plt.show()

In [None]:
# fig,(ax1,ax2)=plt.subplots(1,2,figsize=(12,8))
# text_len=df[df['category']==1]['text_all'].str.split().map(lambda x: len(x))
# ax1.hist(text_len,color='red')
# ax1.set_title('Real text')
# text_len=df[df['category']==0]['text_all'].str.split().map(lambda x: len(x))
# ax2.hist(text_len,color='green')
# ax2.set_title('Fake text')
# fig.suptitle('Words in texts')
# plt.show()

### pad_sequences

In [None]:
# Bring every sequence of every split to the same length.
maxlen = 300
X_train, X_test, X_val = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (X_train, X_test, X_val)
)

---
## 모델 구성
https://jimmy-ai.tistory.com/281

In [None]:
EMB_DIM = 32                       # embedding vector size
# Tokenizer word indices start at 1 and index 0 is the padding value, so the
# Embedding layer needs (vocabulary size + 1) rows; without the +1 the word
# with the highest index falls outside the embedding table.
WORD_NUM = X_train_vocaNum + 1     # Embedding input_dim
HIDDEN_NODE = 64                   # units in each recurrent layer
INPUT_LENGTH = 300                 # tokens per document (same value as the padding maxlen)

In [None]:
model = Sequential()
# Embedding layer that turns word indices into dense vectors for the RNN layers.
# (Fixed the misspelled constant name INPUT_LENGH, which raised a NameError.)
model.add(Embedding(WORD_NUM, EMB_DIM, input_length=INPUT_LENGTH))

### Recurrent part: start ###

# Two stacked unidirectional recurrent layers: a GRU that returns the full
# sequence, followed by a SimpleRNN that returns only its final output.
model.add(GRU(HIDDEN_NODE, return_sequences=True))
model.add(SimpleRNN(HIDDEN_NODE))

### Recurrent part: end ###

# Classifier head: 32-unit dense layer -> dropout -> single sigmoid output.
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # final layer for binary classification

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Draw the model graph, including the tensor shapes of every layer.
plot_model(model, show_shapes=True)

## 모델 생성

In [None]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

## 모델 학습

In [None]:
# Training schedule: mini-batch size and number of passes over the data.
batch_size, epochs = 256, 10

In [None]:
# Halve the learning rate when validation accuracy has not improved for
# 2 epochs, never going below 1e-5; log each reduction.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.5,
    patience=2,
    min_lr=1e-5,
    verbose=1,
)

In [None]:
# Train the model and evaluate on the validation split after every epoch.
# The ReduceLROnPlateau callback must be passed to fit(); creating the object
# alone has no effect on training.
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_val, y_val),
    callbacks=[learning_rate_reduction],
)