In [51]:
import pandas as pd
import numpy as np

In [52]:
df= pd.read_csv("../dataset/240128_TripAdvisorHotelReviews/dataset_removed_stopwords_ver.csv")

### 1. 패딩 처리

In [53]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# 예시 데이터 (refined_df['tokens']에 토큰이 이미 들어가 있다고 가정)
# 토큰화된 데이터가 저장된 컬럼 이름이 'tokens'일 경우:
tokenized_text = df['tokens']

# 토크나이저 객체 생성
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_text)

# 토큰들을 시퀀스로 변환
sequences = tokenizer.texts_to_sequences(tokenized_text)

# 시퀀스 패딩
max_sequence_length = 200  # 시퀀스의 최대 길이 (임의 설정)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

### 2. Glove로 임베딩

In [54]:
import torch
embedding_file = "Embedding_model/glove.840B.300d.txt"

# 단어와 임베딩을 저장할 딕셔너리 생성
word_to_embedding = {}

# 임베딩 파일을 읽어서 딕셔너리에 저장
with open(embedding_file, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            values = line.split()
            word = values[0]
            embedding = torch.tensor([float(val) for val in values[1:]])
            word_to_embedding[word] = embedding
        except ValueError:
            continue  # ValueError가 발생하면 해당 행을 건너뜁니다.

In [55]:
len(word_to_embedding['the'])

300

In [56]:
# 임베딩된 시퀀스 저장할 리스트 생성

from tqdm import tqdm
embedded_sequences = []

# 각 토큰에 대해 GloVe 임베딩을 추출하여 리스트에 저장
for sequence in tqdm(padded_sequences, desc="Processing sequences", unit="sequence"):
    embedded_sequence = [word_to_embedding.get(token, torch.zeros(300)) for token in sequence]
    embedded_sequences.append(embedded_sequence)


Processing sequences: 100%|██████████| 10000/10000 [00:10<00:00, 963.55sequence/s]


In [57]:
import numpy as np
embedded_sequences = np.array(embedded_sequences)

In [58]:
labels = df['ratings']

In [59]:
df['ratings'].value_counts()

ratings
high    5000
low     5000
Name: count, dtype: int64

In [60]:
import numpy as np

# labels 배열
labels = np.array(labels)

# 값을 숫자로 변환하는 코드
labels = np.array([1 if value == 'high' else 0 for value in labels])

print("Modified labels:", labels)


Modified labels: [1 1 1 ... 0 0 0]


In [61]:
import numpy as np

# labels 배열


# 값이 1인 원소의 인덱스 찾기
indices_of_ones = np.where(labels == 1)[0]

print("Indices of ones:", indices_of_ones)


Indices of ones: [   0    1    2 ... 4997 4998 4999]


### 3. Train-Test data Split

In [62]:
from sklearn.model_selection import train_test_split

# PyTorch DataLoader로 사용할 데이터셋 클래스를 정의하고 데이터를 생성하는 작업이 필요

# 예시: X는 embedded_sequence, y는 refined_df['ratings']
X_train, X_temp, y_train, y_temp = train_test_split(embedded_sequences, labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 결과 확인
print("Train data shapes:", X_train.shape, y_train.shape)
print("Validation data shapes:", X_val.shape, y_val.shape)
print("Test data shapes:", X_test.shape, y_test.shape)

Train data shapes: (7000, 200, 300) (7000,)
Validation data shapes: (1500, 200, 300) (1500,)
Test data shapes: (1500, 200, 300) (1500,)


### 모델 학습

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 데이터셋을 플랫하게 펼침
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_val_flat = X_val.reshape(X_val.shape[0], -1)


In [64]:
X_train.shape

(7000, 200, 300)

In [49]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# 모델 초기화
model = LogisticRegression()

# 모델 학습
model.fit(X_train_flat, y_train)


In [50]:
# 예측
y_pred = model.predict(X_val_flat)

# 정확도 평가
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy}")

Validation Accuracy: 0.4866666666666667


In [65]:
from sklearn.ensemble import RandomForestClassifier

# 최적의 하이퍼파라미터
best_hyperparameters = {'max_depth': None, 'n_estimators': 150}

# RandomForestClassifier 초기화
model = RandomForestClassifier(**best_hyperparameters)



model.fit(X_train_flat, y_train)

In [69]:
X_test_flat = X_val.reshape(X_test.shape[0], -1)

# 예측
y_pred = model.predict(X_test_flat)



# 정확도 평가
accuracy = accuracy_score(y_test, y_pred)
print(f"Validation Accuracy: {accuracy}")

Validation Accuracy: 0.4886666666666667
